summaryrefslogtreecommitdiffstats
path: root/contrib/llvm/lib/Transforms/Scalar
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar')
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/ADCE.cpp43
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp16
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/BDCE.cpp48
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp341
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp6
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp68
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/DCE.cpp11
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp62
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp160
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp21
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/GVN.cpp519
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp94
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp883
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp28
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp58
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp162
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp969
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp654
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LICM.cpp170
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp280
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp261
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp46
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp483
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp6
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp165
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp108
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp7
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp330
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp10
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp91
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp3
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp40
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp706
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp196
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp336
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp3
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp176
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp147
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp181
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp22
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp3454
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp12
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp141
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp216
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SCCP.cpp199
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SROA.cpp127
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Scalar.cpp17
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp21
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp12
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp808
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp76
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/Sink.cpp14
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp2
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp4
-rw-r--r--contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp66
58 files changed, 9712 insertions, 3373 deletions
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
index adc903c..5b467dc 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -41,8 +41,8 @@ using namespace llvm;
STATISTIC(NumRemoved, "Number of instructions removed");
STATISTIC(NumBranchesRemoved, "Number of branch instructions removed");
-// This is a tempoary option until we change the interface
-// to this pass based on optimization level.
+// This is a temporary option until we change the interface to this pass based
+// on optimization level.
static cl::opt<bool> RemoveControlFlowFlag("adce-remove-control-flow",
cl::init(true), cl::Hidden);
@@ -110,7 +110,7 @@ class AggressiveDeadCodeElimination {
/// The set of blocks which we have determined whose control
/// dependence sources must be live and which have not had
- /// those dependences analyized.
+ /// those dependences analyzed.
SmallPtrSet<BasicBlock *, 16> NewLiveBlocks;
/// Set up auxiliary data structures for Instructions and BasicBlocks and
@@ -145,7 +145,7 @@ class AggressiveDeadCodeElimination {
/// was removed.
bool removeDeadInstructions();
- /// Identify connected sections of the control flow grap which have
+ /// Identify connected sections of the control flow graph which have
/// dead terminators and rewrite the control flow graph to remove them.
void updateDeadRegions();
@@ -234,7 +234,7 @@ void AggressiveDeadCodeElimination::initialize() {
return Iter != end() && Iter->second;
}
} State;
-
+
State.reserve(F.size());
// Iterate over blocks in depth-first pre-order and
// treat all edges to a block already seen as loop back edges
@@ -262,25 +262,6 @@ void AggressiveDeadCodeElimination::initialize() {
continue;
auto *BB = BBInfo.BB;
if (!PDT.getNode(BB)) {
- markLive(BBInfo.Terminator);
- continue;
- }
- for (auto *Succ : successors(BB))
- if (!PDT.getNode(Succ)) {
- markLive(BBInfo.Terminator);
- break;
- }
- }
-
- // Mark blocks live if there is no path from the block to the
- // return of the function or a successor for which this is true.
- // This protects IDFCalculator which cannot handle such blocks.
- for (auto &BBInfoPair : BlockInfo) {
- auto &BBInfo = BBInfoPair.second;
- if (BBInfo.terminatorIsLive())
- continue;
- auto *BB = BBInfo.BB;
- if (!PDT.getNode(BB)) {
DEBUG(dbgs() << "Not post-dominated by return: " << BB->getName()
<< '\n';);
markLive(BBInfo.Terminator);
@@ -579,7 +560,7 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
PreferredSucc = Info;
}
assert((PreferredSucc && PreferredSucc->PostOrder > 0) &&
- "Failed to find safe successor for dead branc");
+ "Failed to find safe successor for dead branch");
bool First = true;
for (auto *Succ : successors(BB)) {
if (!First || Succ != PreferredSucc->BB)
@@ -594,13 +575,13 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
// reverse top-sort order
void AggressiveDeadCodeElimination::computeReversePostOrder() {
-
- // This provides a post-order numbering of the reverse conrtol flow graph
+
+ // This provides a post-order numbering of the reverse control flow graph
// Note that it is incomplete in the presence of infinite loops but we don't
// need numbers blocks which don't reach the end of the functions since
// all branches in those blocks are forced live.
-
- // For each block without successors, extend the DFS from the bloack
+
+ // For each block without successors, extend the DFS from the block
// backward through the graph
SmallPtrSet<BasicBlock*, 16> Visited;
unsigned PostOrder = 0;
@@ -644,8 +625,8 @@ PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) {
if (!AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination())
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- auto PA = PreservedAnalyses();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index c1df317..99480f1 100644
--- a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -19,12 +19,11 @@
#define AA_NAME "alignment-from-assumptions"
#define DEBUG_TYPE AA_NAME
#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -35,6 +34,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
STATISTIC(NumLoadAlignChanged,
@@ -438,19 +438,13 @@ AlignmentFromAssumptionsPass::run(Function &F, FunctionAnalysisManager &AM) {
AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
- bool Changed = runImpl(F, AC, &SE, &DT);
-
- // FIXME: We need to invalidate this to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<ScalarEvolutionAnalysis>(F);
-
- if (!Changed)
+ if (!runImpl(F, AC, &SE, &DT))
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<AAManager>();
PA.preserve<ScalarEvolutionAnalysis>();
PA.preserve<GlobalsAA>();
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
index 251b387..2e56186 100644
--- a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
@@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/DemandedBits.h"
@@ -35,6 +36,46 @@ using namespace llvm;
STATISTIC(NumRemoved, "Number of instructions removed (unused)");
STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
+/// If an instruction is trivialized (dead), then the chain of users of that
+/// instruction may need to be cleared of assumptions that can no longer be
+/// guaranteed correct.
+static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
+ assert(I->getType()->isIntegerTy() && "Trivializing a non-integer value?");
+
+ // Initialize the worklist with eligible direct users.
+ SmallVector<Instruction *, 16> WorkList;
+ for (User *JU : I->users()) {
+ // If all bits of a user are demanded, then we know that nothing below that
+ // in the def-use chain needs to be changed.
+ auto *J = dyn_cast<Instruction>(JU);
+ if (J && !DB.getDemandedBits(J).isAllOnesValue())
+ WorkList.push_back(J);
+ }
+
+ // DFS through subsequent users while tracking visits to avoid cycles.
+ SmallPtrSet<Instruction *, 16> Visited;
+ while (!WorkList.empty()) {
+ Instruction *J = WorkList.pop_back_val();
+
+ // NSW, NUW, and exact are based on operands that might have changed.
+ J->dropPoisonGeneratingFlags();
+
+ // We do not have to worry about llvm.assume or range metadata:
+ // 1. llvm.assume demands its operand, so trivializing can't change it.
+ // 2. range metadata only applies to memory accesses which demand all bits.
+
+ Visited.insert(J);
+
+ for (User *KU : J->users()) {
+ // If all bits of a user are demanded, then we know that nothing below
+ // that in the def-use chain needs to be changed.
+ auto *K = dyn_cast<Instruction>(KU);
+ if (K && !Visited.count(K) && !DB.getDemandedBits(K).isAllOnesValue())
+ WorkList.push_back(K);
+ }
+ }
+}
+
static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
SmallVector<Instruction*, 128> Worklist;
bool Changed = false;
@@ -51,6 +92,9 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
// replacing all uses with something else. Then, if they don't need to
// remain live (because they have side effects, etc.) we can remove them.
DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+
+ clearAssumptionsOfUsers(&I, DB);
+
// FIXME: In theory we could substitute undef here instead of zero.
// This should be reconsidered once we settle on the semantics of
// undef, poison, etc.
@@ -80,8 +124,8 @@ PreservedAnalyses BDCEPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!bitTrackingDCE(F, DB))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- auto PA = PreservedAnalyses();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 3826251..122c931 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -38,11 +38,13 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <tuple>
using namespace llvm;
@@ -53,6 +55,12 @@ using namespace consthoist;
STATISTIC(NumConstantsHoisted, "Number of constants hoisted");
STATISTIC(NumConstantsRebased, "Number of constants rebased");
+static cl::opt<bool> ConstHoistWithBlockFrequency(
+ "consthoist-with-block-frequency", cl::init(true), cl::Hidden,
+ cl::desc("Enable the use of the block frequency analysis to reduce the "
+ "chance to execute const materialization more frequently than "
+ "without hoisting."));
+
namespace {
/// \brief The constant hoisting pass.
class ConstantHoistingLegacyPass : public FunctionPass {
@@ -68,6 +76,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ if (ConstHoistWithBlockFrequency)
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}
@@ -82,6 +92,7 @@ private:
char ConstantHoistingLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist",
"Constant Hoisting", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist",
@@ -99,9 +110,13 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
- bool MadeChange = Impl.runImpl(
- Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
- getAnalysis<DominatorTreeWrapperPass>().getDomTree(), Fn.getEntryBlock());
+ bool MadeChange =
+ Impl.runImpl(Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ ConstHoistWithBlockFrequency
+ ? &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI()
+ : nullptr,
+ Fn.getEntryBlock());
if (MadeChange) {
DEBUG(dbgs() << "********** Function after Constant Hoisting: "
@@ -136,37 +151,163 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
if (Idx != ~0U && isa<PHINode>(Inst))
return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator();
- BasicBlock *IDom = DT->getNode(Inst->getParent())->getIDom()->getBlock();
- return IDom->getTerminator();
+ // This must be an EH pad. Iterate over immediate dominators until we find a
+ // non-EH pad. We need to skip over catchswitch blocks, which are both EH pads
+ // and terminators.
+ auto IDom = DT->getNode(Inst->getParent())->getIDom();
+ while (IDom->getBlock()->isEHPad()) {
+ assert(Entry != IDom->getBlock() && "eh pad in entry block");
+ IDom = IDom->getIDom();
+ }
+
+ return IDom->getBlock()->getTerminator();
+}
+
+/// \brief Given \p BBs as input, find another set of BBs which collectively
+/// dominates \p BBs and have the minimal sum of frequencies. Return the BB
+/// set found in \p BBs.
+static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
+ BasicBlock *Entry,
+ SmallPtrSet<BasicBlock *, 8> &BBs) {
+ assert(!BBs.count(Entry) && "Assume Entry is not in BBs");
+ // Nodes on the current path to the root.
+ SmallPtrSet<BasicBlock *, 8> Path;
+ // Candidates includes any block 'BB' in set 'BBs' that is not strictly
+ // dominated by any other blocks in set 'BBs', and all nodes in the path
+ // in the dominator tree from Entry to 'BB'.
+ SmallPtrSet<BasicBlock *, 16> Candidates;
+ for (auto BB : BBs) {
+ Path.clear();
+ // Walk up the dominator tree until Entry or another BB in BBs
+ // is reached. Insert the nodes on the way to the Path.
+ BasicBlock *Node = BB;
+ // The "Path" is a candidate path to be added into Candidates set.
+ bool isCandidate = false;
+ do {
+ Path.insert(Node);
+ if (Node == Entry || Candidates.count(Node)) {
+ isCandidate = true;
+ break;
+ }
+ assert(DT.getNode(Node)->getIDom() &&
+ "Entry doens't dominate current Node");
+ Node = DT.getNode(Node)->getIDom()->getBlock();
+ } while (!BBs.count(Node));
+
+ // If isCandidate is false, Node is another Block in BBs dominating
+ // current 'BB'. Drop the nodes on the Path.
+ if (!isCandidate)
+ continue;
+
+ // Add nodes on the Path into Candidates.
+ Candidates.insert(Path.begin(), Path.end());
+ }
+
+ // Sort the nodes in Candidates in top-down order and save the nodes
+ // in Orders.
+ unsigned Idx = 0;
+ SmallVector<BasicBlock *, 16> Orders;
+ Orders.push_back(Entry);
+ while (Idx != Orders.size()) {
+ BasicBlock *Node = Orders[Idx++];
+ for (auto ChildDomNode : DT.getNode(Node)->getChildren()) {
+ if (Candidates.count(ChildDomNode->getBlock()))
+ Orders.push_back(ChildDomNode->getBlock());
+ }
+ }
+
+ // Visit Orders in bottom-up order.
+ typedef std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency>
+ InsertPtsCostPair;
+ // InsertPtsMap is a map from a BB to the best insertion points for the
+ // subtree of BB (subtree not including the BB itself).
+ DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap;
+ InsertPtsMap.reserve(Orders.size() + 1);
+ for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) {
+ BasicBlock *Node = *RIt;
+ bool NodeInBBs = BBs.count(Node);
+ SmallPtrSet<BasicBlock *, 16> &InsertPts = InsertPtsMap[Node].first;
+ BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second;
+
+ // Return the optimal insert points in BBs.
+ if (Node == Entry) {
+ BBs.clear();
+ if (InsertPtsFreq > BFI.getBlockFreq(Node) ||
+ (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1))
+ BBs.insert(Entry);
+ else
+ BBs.insert(InsertPts.begin(), InsertPts.end());
+ break;
+ }
+
+ BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock();
+ // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child
+ // will update its parent's ParentInsertPts and ParentPtsFreq.
+ SmallPtrSet<BasicBlock *, 16> &ParentInsertPts = InsertPtsMap[Parent].first;
+ BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second;
+ // Choose to insert in Node or in subtree of Node.
+ // Don't hoist to EHPad because we may not find a proper place to insert
+ // in EHPad.
+ // If the total frequency of InsertPts is the same as the frequency of the
+ // target Node, and InsertPts contains more than one nodes, choose hoisting
+ // to reduce code size.
+ if (NodeInBBs ||
+ (!Node->isEHPad() &&
+ (InsertPtsFreq > BFI.getBlockFreq(Node) ||
+ (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1)))) {
+ ParentInsertPts.insert(Node);
+ ParentPtsFreq += BFI.getBlockFreq(Node);
+ } else {
+ ParentInsertPts.insert(InsertPts.begin(), InsertPts.end());
+ ParentPtsFreq += InsertPtsFreq;
+ }
+ }
}
/// \brief Find an insertion point that dominates all uses.
-Instruction *ConstantHoistingPass::findConstantInsertionPoint(
+SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
const ConstantInfo &ConstInfo) const {
assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
// Collect all basic blocks.
SmallPtrSet<BasicBlock *, 8> BBs;
+ SmallPtrSet<Instruction *, 8> InsertPts;
for (auto const &RCI : ConstInfo.RebasedConstants)
for (auto const &U : RCI.Uses)
BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent());
- if (BBs.count(Entry))
- return &Entry->front();
+ if (BBs.count(Entry)) {
+ InsertPts.insert(&Entry->front());
+ return InsertPts;
+ }
+
+ if (BFI) {
+ findBestInsertionSet(*DT, *BFI, Entry, BBs);
+ for (auto BB : BBs) {
+ BasicBlock::iterator InsertPt = BB->begin();
+ for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
+ ;
+ InsertPts.insert(&*InsertPt);
+ }
+ return InsertPts;
+ }
while (BBs.size() >= 2) {
BasicBlock *BB, *BB1, *BB2;
BB1 = *BBs.begin();
BB2 = *std::next(BBs.begin());
BB = DT->findNearestCommonDominator(BB1, BB2);
- if (BB == Entry)
- return &Entry->front();
+ if (BB == Entry) {
+ InsertPts.insert(&Entry->front());
+ return InsertPts;
+ }
BBs.erase(BB1);
BBs.erase(BB2);
BBs.insert(BB);
}
assert((BBs.size() == 1) && "Expected only one element.");
Instruction &FirstInst = (*BBs.begin())->front();
- return findMatInsertPt(&FirstInst);
+ InsertPts.insert(findMatInsertPt(&FirstInst));
+ return InsertPts;
}
@@ -210,68 +351,65 @@ void ConstantHoistingPass::collectConstantCandidates(
}
}
-/// \brief Scan the instruction for expensive integer constants and record them
-/// in the constant candidate vector.
-void ConstantHoistingPass::collectConstantCandidates(
- ConstCandMapType &ConstCandMap, Instruction *Inst) {
- // Skip all cast instructions. They are visited indirectly later on.
- if (Inst->isCast())
- return;
-
- // Can't handle inline asm. Skip it.
- if (auto Call = dyn_cast<CallInst>(Inst))
- if (isa<InlineAsm>(Call->getCalledValue()))
- return;
- // Switch cases must remain constant, and if the value being tested is
- // constant the entire thing should disappear.
- if (isa<SwitchInst>(Inst))
- return;
+/// \brief Check the operand for instruction Inst at index Idx.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) {
+ Value *Opnd = Inst->getOperand(Idx);
- // Static allocas (constant size in the entry block) are handled by
- // prologue/epilogue insertion so they're free anyway. We definitely don't
- // want to make them non-constant.
- auto AI = dyn_cast<AllocaInst>(Inst);
- if (AI && AI->isStaticAlloca())
+ // Visit constant integers.
+ if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
return;
+ }
- // Scan all operands.
- for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
- Value *Opnd = Inst->getOperand(Idx);
+ // Visit cast instructions that have constant integers.
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+ // Only visit cast instructions, which have been skipped. All other
+ // instructions should have already been visited.
+ if (!CastInst->isCast())
+ return;
- // Visit constant integers.
- if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
+ if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the cast instruction.
collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- continue;
+ return;
}
+ }
- // Visit cast instructions that have constant integers.
- if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
- // Only visit cast instructions, which have been skipped. All other
- // instructions should have already been visited.
- if (!CastInst->isCast())
- continue;
+ // Visit constant expressions that have constant integers.
+ if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+ // Only visit constant cast expressions.
+ if (!ConstExpr->isCast())
+ return;
- if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
- // Pretend the constant is directly used by the instruction and ignore
- // the cast instruction.
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- continue;
- }
+ if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the constant expression.
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
}
+ }
+}
- // Visit constant expressions that have constant integers.
- if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
- // Only visit constant cast expressions.
- if (!ConstExpr->isCast())
- continue;
- if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
- // Pretend the constant is directly used by the instruction and ignore
- // the constant expression.
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- continue;
- }
+/// \brief Scan the instruction for expensive integer constants and record them
+/// in the constant candidate vector.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst) {
+ // Skip all cast instructions. They are visited indirectly later on.
+ if (Inst->isCast())
+ return;
+
+ // Scan all operands.
+ for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
+ // The cost of materializing the constants (defined in
+ // `TargetTransformInfo::getIntImmCost`) for instructions which only take
+ // constant variables is lower than `TargetTransformInfo::TCC_Basic`. So
+ // it's safe for us to collect constant candidates from all IntrinsicInsts.
+ if (canReplaceOperandWithVariable(Inst, Idx) || isa<IntrinsicInst>(Inst)) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx);
}
} // end of for all operands
}
@@ -289,8 +427,8 @@ void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
// bit widths (APInt Operator- does not like that). If the value cannot be
// represented in uint64 we return an "empty" APInt. This is then interpreted
// as the value is not in range.
-static llvm::Optional<APInt> calculateOffsetDiff(APInt V1, APInt V2)
-{
+static llvm::Optional<APInt> calculateOffsetDiff(const APInt &V1,
+ const APInt &V2) {
llvm::Optional<APInt> Res = None;
unsigned BW = V1.getBitWidth() > V2.getBitWidth() ?
V1.getBitWidth() : V2.getBitWidth();
@@ -549,29 +687,54 @@ bool ConstantHoistingPass::emitBaseConstants() {
bool MadeChange = false;
for (auto const &ConstInfo : ConstantVec) {
// Hoist and hide the base constant behind a bitcast.
- Instruction *IP = findConstantInsertionPoint(ConstInfo);
- IntegerType *Ty = ConstInfo.BaseConstant->getType();
- Instruction *Base =
- new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP);
- DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant << ") to BB "
- << IP->getParent()->getName() << '\n' << *Base << '\n');
- NumConstantsHoisted++;
+ SmallPtrSet<Instruction *, 8> IPSet = findConstantInsertionPoint(ConstInfo);
+ assert(!IPSet.empty() && "IPSet is empty");
+
+ unsigned UsesNum = 0;
+ unsigned ReBasesNum = 0;
+ for (Instruction *IP : IPSet) {
+ IntegerType *Ty = ConstInfo.BaseConstant->getType();
+ Instruction *Base =
+ new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP);
+ DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant
+ << ") to BB " << IP->getParent()->getName() << '\n'
+ << *Base << '\n');
+
+ // Emit materialization code for all rebased constants.
+ unsigned Uses = 0;
+ for (auto const &RCI : ConstInfo.RebasedConstants) {
+ for (auto const &U : RCI.Uses) {
+ Uses++;
+ BasicBlock *OrigMatInsertBB =
+ findMatInsertPt(U.Inst, U.OpndIdx)->getParent();
+ // If Base constant is to be inserted in multiple places,
+ // generate rebase for U using the Base dominating U.
+ if (IPSet.size() == 1 ||
+ DT->dominates(Base->getParent(), OrigMatInsertBB)) {
+ emitBaseConstants(Base, RCI.Offset, U);
+ ReBasesNum++;
+ }
+ }
+ }
+ UsesNum = Uses;
- // Emit materialization code for all rebased constants.
- for (auto const &RCI : ConstInfo.RebasedConstants) {
- NumConstantsRebased++;
- for (auto const &U : RCI.Uses)
- emitBaseConstants(Base, RCI.Offset, U);
+ // Use the same debug location as the last user of the constant.
+ assert(!Base->use_empty() && "The use list is empty!?");
+ assert(isa<Instruction>(Base->user_back()) &&
+ "All uses should be instructions.");
+ Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc());
}
+ (void)UsesNum;
+ (void)ReBasesNum;
+ // Expect all uses are rebased after rebase is done.
+ assert(UsesNum == ReBasesNum && "Not all uses are rebased");
+
+ NumConstantsHoisted++;
- // Use the same debug location as the last user of the constant.
- assert(!Base->use_empty() && "The use list is empty!?");
- assert(isa<Instruction>(Base->user_back()) &&
- "All uses should be instructions.");
- Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc());
+ // Base constant is also included in ConstInfo.RebasedConstants, so
+ // deduct 1 from ConstInfo.RebasedConstants.size().
+ NumConstantsRebased = ConstInfo.RebasedConstants.size() - 1;
- // Correct for base constant, which we counted above too.
- NumConstantsRebased--;
MadeChange = true;
}
return MadeChange;
@@ -587,9 +750,11 @@ void ConstantHoistingPass::deleteDeadCastInst() const {
/// \brief Optimize expensive integer constants in the given function.
bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
- DominatorTree &DT, BasicBlock &Entry) {
+ DominatorTree &DT, BlockFrequencyInfo *BFI,
+ BasicBlock &Entry) {
this->TTI = &TTI;
this->DT = &DT;
+ this->BFI = BFI;
this->Entry = &Entry;
// Collect all constant candidates.
collectConstantCandidates(Fn);
@@ -620,9 +785,13 @@ PreservedAnalyses ConstantHoistingPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- if (!runImpl(F, TTI, DT, F.getEntryBlock()))
+ auto BFI = ConstHoistWithBlockFrequency
+ ? &AM.getResult<BlockFrequencyAnalysis>(F)
+ : nullptr;
+ if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock()))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- return PreservedAnalyses::none();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
index 9e98219..4fa2789 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
@@ -18,15 +18,15 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <set>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 84f9373..2815778 100644
--- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyValueInfo.h"
@@ -26,6 +26,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -95,7 +96,8 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
return true;
}
-static bool processPHI(PHINode *P, LazyValueInfo *LVI) {
+static bool processPHI(PHINode *P, LazyValueInfo *LVI,
+ const SimplifyQuery &SQ) {
bool Changed = false;
BasicBlock *BB = P->getParent();
@@ -149,9 +151,7 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI) {
Changed = true;
}
- // FIXME: Provide TLI, DT, AT to SimplifyInstruction.
- const DataLayout &DL = BB->getModule()->getDataLayout();
- if (Value *V = SimplifyInstruction(P, DL)) {
+ if (Value *V = SimplifyInstruction(P, SQ)) {
P->replaceAllUsesWith(V);
P->eraseFromParent();
Changed = true;
@@ -232,12 +232,10 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
if (PB == PE) return false;
- // Analyse each switch case in turn. This is done in reverse order so that
- // removing a case doesn't cause trouble for the iteration.
+ // Analyse each switch case in turn.
bool Changed = false;
- for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE;
- ) {
- ConstantInt *Case = CI.getCaseValue();
+ for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
+ ConstantInt *Case = CI->getCaseValue();
// Check to see if the switch condition is equal to/not equal to the case
// value on every incoming edge, equal/not equal being the same each time.
@@ -270,8 +268,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
if (State == LazyValueInfo::False) {
// This case never fires - remove it.
- CI.getCaseSuccessor()->removePredecessor(BB);
- SI->removeCase(CI); // Does not invalidate the iterator.
+ CI->getCaseSuccessor()->removePredecessor(BB);
+ CI = SI->removeCase(CI);
+ CE = SI->case_end();
// The condition can be modified by removePredecessor's PHI simplification
// logic.
@@ -279,7 +278,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
++NumDeadCases;
Changed = true;
- } else if (State == LazyValueInfo::True) {
+ continue;
+ }
+ if (State == LazyValueInfo::True) {
// This case always fires. Arrange for the switch to be turned into an
// unconditional branch by replacing the switch condition with the case
// value.
@@ -288,6 +289,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
Changed = true;
break;
}
+
+ // Increment the case iterator since we didn't delete it.
+ ++CI;
}
if (Changed)
@@ -300,7 +304,7 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
/// Infer nonnull attributes for the arguments at the specified callsite.
static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
- SmallVector<unsigned, 4> Indices;
+ SmallVector<unsigned, 4> ArgNos;
unsigned ArgNo = 0;
for (Value *V : CS.args()) {
@@ -308,23 +312,24 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
// Try to mark pointer typed parameters as non-null. We skip the
// relatively expensive analysis for constants which are obviously either
// null or non-null to start with.
- if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) &&
+ if (Type && !CS.paramHasAttr(ArgNo, Attribute::NonNull) &&
!isa<Constant>(V) &&
LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
ConstantPointerNull::get(Type),
CS.getInstruction()) == LazyValueInfo::False)
- Indices.push_back(ArgNo + 1);
+ ArgNos.push_back(ArgNo);
ArgNo++;
}
assert(ArgNo == CS.arg_size() && "sanity check");
- if (Indices.empty())
+ if (ArgNos.empty())
return false;
- AttributeSet AS = CS.getAttributes();
+ AttributeList AS = CS.getAttributes();
LLVMContext &Ctx = CS.getInstruction()->getContext();
- AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull));
+ AS = AS.addParamAttribute(Ctx, ArgNos,
+ Attribute::get(Ctx, Attribute::NonNull));
CS.setAttributes(AS);
return true;
@@ -437,9 +442,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
bool Changed = false;
if (!NUW) {
- ConstantRange NUWRange =
- LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
- OBO::NoUnsignedWrap);
+ ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoUnsignedWrap);
if (!NUWRange.isEmptySet()) {
bool NewNUW = NUWRange.contains(LazyRRange());
AddOp->setHasNoUnsignedWrap(NewNUW);
@@ -447,9 +451,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
}
}
if (!NSW) {
- ConstantRange NSWRange =
- LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
- OBO::NoSignedWrap);
+ ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoSignedWrap);
if (!NSWRange.isEmptySet()) {
bool NewNSW = NSWRange.contains(LazyRRange());
AddOp->setHasNoSignedWrap(NewNSW);
@@ -483,9 +486,8 @@ static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
ConstantInt::getFalse(C->getContext());
}
-static bool runImpl(Function &F, LazyValueInfo *LVI) {
+static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
bool FnChanged = false;
-
// Visiting in a pre-order depth-first traversal causes us to simplify early
// blocks before querying later blocks (which require us to analyze early
// blocks). Eagerly simplifying shallow blocks means there is strictly less
@@ -500,7 +502,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI) {
BBChanged |= processSelect(cast<SelectInst>(II), LVI);
break;
case Instruction::PHI:
- BBChanged |= processPHI(cast<PHINode>(II), LVI);
+ BBChanged |= processPHI(cast<PHINode>(II), LVI, SQ);
break;
case Instruction::ICmp:
case Instruction::FCmp:
@@ -548,7 +550,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI) {
BBChanged = true;
}
}
- };
+ }
FnChanged |= BBChanged;
}
@@ -561,18 +563,14 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) {
return false;
LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
- return runImpl(F, LVI);
+ return runImpl(F, LVI, getBestSimplifyQuery(*this, F));
}
PreservedAnalyses
CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
- bool Changed = runImpl(F, LVI);
-
- // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<LazyValueAnalysis>(F);
+ bool Changed = runImpl(F, LVI, getBestSimplifyQuery(AM, F));
if (!Changed)
return PreservedAnalyses::all();
diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
index cc2a3cf..fa4806e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -19,10 +19,10 @@
#include "llvm/Transforms/Scalar/DCE.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -124,9 +124,12 @@ static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) {
}
PreservedAnalyses DCEPass::run(Function &F, FunctionAnalysisManager &AM) {
- if (eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
+ if (!eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
namespace {
diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 4d4c3ba..1ec38e5 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -135,13 +135,13 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
if (auto CS = CallSite(I)) {
if (Function *F = CS.getCalledFunction()) {
StringRef FnName = F->getName();
- if (TLI.has(LibFunc::strcpy) && FnName == TLI.getName(LibFunc::strcpy))
+ if (TLI.has(LibFunc_strcpy) && FnName == TLI.getName(LibFunc_strcpy))
return true;
- if (TLI.has(LibFunc::strncpy) && FnName == TLI.getName(LibFunc::strncpy))
+ if (TLI.has(LibFunc_strncpy) && FnName == TLI.getName(LibFunc_strncpy))
return true;
- if (TLI.has(LibFunc::strcat) && FnName == TLI.getName(LibFunc::strcat))
+ if (TLI.has(LibFunc_strcat) && FnName == TLI.getName(LibFunc_strcat))
return true;
- if (TLI.has(LibFunc::strncat) && FnName == TLI.getName(LibFunc::strncat))
+ if (TLI.has(LibFunc_strncat) && FnName == TLI.getName(LibFunc_strncat))
return true;
}
}
@@ -287,19 +287,14 @@ static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
}
namespace {
-enum OverwriteResult {
- OverwriteBegin,
- OverwriteComplete,
- OverwriteEnd,
- OverwriteUnknown
-};
+enum OverwriteResult { OW_Begin, OW_Complete, OW_End, OW_Unknown };
}
-/// Return 'OverwriteComplete' if a store to the 'Later' location completely
-/// overwrites a store to the 'Earlier' location, 'OverwriteEnd' if the end of
-/// the 'Earlier' location is completely overwritten by 'Later',
-/// 'OverwriteBegin' if the beginning of the 'Earlier' location is overwritten
-/// by 'Later', or 'OverwriteUnknown' if nothing can be determined.
+/// Return 'OW_Complete' if a store to the 'Later' location completely
+/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the
+/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the
+/// beginning of the 'Earlier' location is overwritten by 'Later', or
+/// 'OW_Unknown' if nothing can be determined.
static OverwriteResult isOverwrite(const MemoryLocation &Later,
const MemoryLocation &Earlier,
const DataLayout &DL,
@@ -310,7 +305,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If we don't know the sizes of either access, then we can't do a comparison.
if (Later.Size == MemoryLocation::UnknownSize ||
Earlier.Size == MemoryLocation::UnknownSize)
- return OverwriteUnknown;
+ return OW_Unknown;
const Value *P1 = Earlier.Ptr->stripPointerCasts();
const Value *P2 = Later.Ptr->stripPointerCasts();
@@ -320,7 +315,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (P1 == P2) {
// Make sure that the Later size is >= the Earlier size.
if (Later.Size >= Earlier.Size)
- return OverwriteComplete;
+ return OW_Complete;
}
// Check to see if the later store is to the entire object (either a global,
@@ -332,13 +327,13 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If we can't resolve the same pointers to the same object, then we can't
// analyze them at all.
if (UO1 != UO2)
- return OverwriteUnknown;
+ return OW_Unknown;
// If the "Later" store is to a recognizable object, get its size.
uint64_t ObjectSize = getPointerSize(UO2, DL, TLI);
if (ObjectSize != MemoryLocation::UnknownSize)
if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
- return OverwriteComplete;
+ return OW_Complete;
// Okay, we have stores to two completely different pointers. Try to
// decompose the pointer into a "base + constant_offset" form. If the base
@@ -350,7 +345,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If the base pointers still differ, we have two completely different stores.
if (BP1 != BP2)
- return OverwriteUnknown;
+ return OW_Unknown;
// The later store completely overlaps the earlier store if:
//
@@ -370,7 +365,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (EarlierOff >= LaterOff &&
Later.Size >= Earlier.Size &&
uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size)
- return OverwriteComplete;
+ return OW_Complete;
// We may now overlap, although the overlap is not complete. There might also
// be other incomplete overlaps, and together, they might cover the complete
@@ -428,7 +423,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
") Composite Later [" <<
ILI->second << ", " << ILI->first << ")\n");
++NumCompletePartials;
- return OverwriteComplete;
+ return OW_Complete;
}
}
@@ -443,7 +438,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (!EnablePartialOverwriteTracking &&
(LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + Earlier.Size) &&
int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size)))
- return OverwriteEnd;
+ return OW_End;
// Finally, we also need to check if the later store overwrites the beginning
// of the earlier store.
@@ -458,11 +453,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
(LaterOff <= EarlierOff && int64_t(LaterOff + Later.Size) > EarlierOff)) {
assert(int64_t(LaterOff + Later.Size) <
int64_t(EarlierOff + Earlier.Size) &&
- "Expect to be handled as OverwriteComplete");
- return OverwriteBegin;
+ "Expect to be handled as OW_Complete");
+ return OW_Begin;
}
// Otherwise, they don't completely overlap.
- return OverwriteUnknown;
+ return OW_Unknown;
}
/// If 'Inst' might be a self read (i.e. a noop copy of a
@@ -551,7 +546,7 @@ static bool memoryIsNotModifiedBetween(Instruction *FirstI,
Instruction *I = &*BI;
if (I->mayWriteToMemory() && I != SecondI) {
auto Res = AA->getModRefInfo(I, MemLoc);
- if (Res != MRI_NoModRef)
+ if (Res & MRI_Mod)
return false;
}
}
@@ -909,7 +904,7 @@ static bool tryToShortenBegin(Instruction *EarlierWrite,
if (LaterStart <= EarlierStart && LaterStart + LaterSize > EarlierStart) {
assert(LaterStart + LaterSize < EarlierStart + EarlierSize &&
- "Should have been handled as OverwriteComplete");
+ "Should have been handled as OW_Complete");
if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
LaterSize, false)) {
IntervalMap.erase(OII);
@@ -1105,7 +1100,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
OverwriteResult OR =
isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset,
DepWrite, IOL);
- if (OR == OverwriteComplete) {
+ if (OR == OW_Complete) {
DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
<< *DepWrite << "\n KILLER: " << *Inst << '\n');
@@ -1117,15 +1112,15 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
// We erased DepWrite; start over.
InstDep = MD->getDependency(Inst);
continue;
- } else if ((OR == OverwriteEnd && isShortenableAtTheEnd(DepWrite)) ||
- ((OR == OverwriteBegin &&
+ } else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
+ ((OR == OW_Begin &&
isShortenableAtTheBeginning(DepWrite)))) {
assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
"when partial-overwrite "
"tracking is enabled");
int64_t EarlierSize = DepLoc.Size;
int64_t LaterSize = Loc.Size;
- bool IsOverwriteEnd = (OR == OverwriteEnd);
+ bool IsOverwriteEnd = (OR == OW_End);
MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
InstWriteOffset, LaterSize, IsOverwriteEnd);
}
@@ -1186,8 +1181,9 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!eliminateDeadStores(F, AA, MD, DT, TLI))
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
PA.preserve<MemoryDependenceAnalysis>();
return PA;
diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 16e08ee..c5c9b2c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -15,10 +15,13 @@
#include "llvm/Transforms/Scalar/EarlyCSE.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -32,7 +35,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
#include <deque>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -252,7 +254,9 @@ public:
const TargetTransformInfo &TTI;
DominatorTree &DT;
AssumptionCache &AC;
+ const SimplifyQuery SQ;
MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
typedef RecyclingAllocator<
BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
@@ -313,9 +317,12 @@ public:
unsigned CurrentGeneration;
/// \brief Set up the EarlyCSE runner for a particular function.
- EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI,
- DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA)
- : TLI(TLI), TTI(TTI), DT(DT), AC(AC), MSSA(MSSA), CurrentGeneration(0) {}
+ EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI,
+ const TargetTransformInfo &TTI, DominatorTree &DT,
+ AssumptionCache &AC, MemorySSA *MSSA)
+ : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA),
+ MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)), CurrentGeneration(0) {
+ }
bool run();
@@ -388,7 +395,7 @@ private:
ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
: IsTargetMemInst(false), Inst(Inst) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
- if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1)
+ if (TTI.getTgtMemIntrinsic(II, Info))
IsTargetMemInst = true;
}
bool isLoad() const {
@@ -400,17 +407,14 @@ private:
return isa<StoreInst>(Inst);
}
bool isAtomic() const {
- if (IsTargetMemInst) {
- assert(Info.IsSimple && "need to refine IsSimple in TTI");
- return false;
- }
+ if (IsTargetMemInst)
+ return Info.Ordering != AtomicOrdering::NotAtomic;
return Inst->isAtomic();
}
bool isUnordered() const {
- if (IsTargetMemInst) {
- assert(Info.IsSimple && "need to refine IsSimple in TTI");
- return true;
- }
+ if (IsTargetMemInst)
+ return Info.isUnordered();
+
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
return LI->isUnordered();
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
@@ -421,10 +425,9 @@ private:
}
bool isVolatile() const {
- if (IsTargetMemInst) {
- assert(Info.IsSimple && "need to refine IsSimple in TTI");
- return false;
- }
+ if (IsTargetMemInst)
+ return Info.IsVolatile;
+
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
return LI->isVolatile();
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
@@ -504,7 +507,7 @@ private:
if (MemoryAccess *MA = MSSA->getMemoryAccess(Inst)) {
// Optimize MemoryPhi nodes that may become redundant by having all the
// same input values once MA is removed.
- SmallVector<MemoryPhi *, 4> PhisToCheck;
+ SmallSetVector<MemoryPhi *, 4> PhisToCheck;
SmallVector<MemoryAccess *, 8> WorkQueue;
WorkQueue.push_back(MA);
// Process MemoryPhi nodes in FIFO order using a ever-growing vector since
@@ -515,9 +518,9 @@ private:
for (auto *U : WI->users())
if (MemoryPhi *MP = dyn_cast<MemoryPhi>(U))
- PhisToCheck.push_back(MP);
+ PhisToCheck.insert(MP);
- MSSA->removeMemoryAccess(WI);
+ MSSAUpdater->removeMemoryAccess(WI);
for (MemoryPhi *MP : PhisToCheck) {
MemoryAccess *FirstIn = MP->getIncomingValue(0);
@@ -559,13 +562,27 @@ bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
if (!MSSA)
return false;
+ // If MemorySSA has determined that one of EarlierInst or LaterInst does not
+ // read/write memory, then we can safely return true here.
+ // FIXME: We could be more aggressive when checking doesNotAccessMemory(),
+ // onlyReadsMemory(), mayReadFromMemory(), and mayWriteToMemory() in this pass
+ // by also checking the MemorySSA MemoryAccess on the instruction. Initial
+ // experiments suggest this isn't worthwhile, at least for C/C++ code compiled
+ // with the default optimization pipeline.
+ auto *EarlierMA = MSSA->getMemoryAccess(EarlierInst);
+ if (!EarlierMA)
+ return true;
+ auto *LaterMA = MSSA->getMemoryAccess(LaterInst);
+ if (!LaterMA)
+ return true;
+
// Since we know LaterDef dominates LaterInst and EarlierInst dominates
// LaterInst, if LaterDef dominates EarlierInst then it can't occur between
// EarlierInst and LaterInst and neither can any other write that potentially
// clobbers LaterInst.
MemoryAccess *LaterDef =
MSSA->getWalker()->getClobberingMemoryAccess(LaterInst);
- return MSSA->dominates(LaterDef, MSSA->getMemoryAccess(EarlierInst));
+ return MSSA->dominates(LaterDef, EarlierMA);
}
bool EarlyCSE::processNode(DomTreeNode *Node) {
@@ -587,27 +604,28 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// which reaches this block where the condition might hold a different
// value. Since we're adding this to the scoped hash table (like any other
// def), it will have been popped if we encounter a future merge block.
- if (BasicBlock *Pred = BB->getSinglePredecessor())
- if (auto *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
- if (BI->isConditional())
- if (auto *CondInst = dyn_cast<Instruction>(BI->getCondition()))
- if (SimpleValue::canHandle(CondInst)) {
- assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
- auto *ConditionalConstant = (BI->getSuccessor(0) == BB) ?
- ConstantInt::getTrue(BB->getContext()) :
- ConstantInt::getFalse(BB->getContext());
- AvailableValues.insert(CondInst, ConditionalConstant);
- DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
- << CondInst->getName() << "' as " << *ConditionalConstant
- << " in " << BB->getName() << "\n");
- // Replace all dominated uses with the known value.
- if (unsigned Count =
- replaceDominatedUsesWith(CondInst, ConditionalConstant, DT,
- BasicBlockEdge(Pred, BB))) {
- Changed = true;
- NumCSECVP = NumCSECVP + Count;
- }
- }
+ if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+ auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (BI && BI->isConditional()) {
+ auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
+ if (CondInst && SimpleValue::canHandle(CondInst)) {
+ assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
+ auto *TorF = (BI->getSuccessor(0) == BB)
+ ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
+ AvailableValues.insert(CondInst, TorF);
+ DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
+ << CondInst->getName() << "' as " << *TorF << " in "
+ << BB->getName() << "\n");
+ // Replace all dominated uses with the known value.
+ if (unsigned Count = replaceDominatedUsesWith(
+ CondInst, TorF, DT, BasicBlockEdge(Pred, BB))) {
+ Changed = true;
+ NumCSECVP += Count;
+ }
+ }
+ }
+ }
/// LastStore - Keep track of the last non-volatile store that we saw... for
/// as long as there in no instruction that reads memory. If we see a store
@@ -615,8 +633,6 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
/// stores which can occur in bitfield code among other things.
Instruction *LastStore = nullptr;
- const DataLayout &DL = BB->getModule()->getDataLayout();
-
// See if any instructions in the block can be eliminated. If so, do it. If
// not, add them to AvailableValues.
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
@@ -634,10 +650,16 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// Skip assume intrinsics, they don't really have side effects (although
// they're marked as such to ensure preservation of control dependencies),
- // and this pass will not disturb any of the assumption's control
- // dependencies.
+ // and this pass will not bother with its removal. However, we should mark
+ // its condition as true for all dominated blocks.
if (match(Inst, m_Intrinsic<Intrinsic::assume>())) {
- DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
+ auto *CondI =
+ dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0));
+ if (CondI && SimpleValue::canHandle(CondI)) {
+ DEBUG(dbgs() << "EarlyCSE considering assumption: " << *Inst << '\n');
+ AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
+ } else
+ DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
continue;
}
@@ -657,10 +679,25 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) {
if (auto *CondI =
dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0))) {
- // The condition we're on guarding here is true for all dominated
- // locations.
- if (SimpleValue::canHandle(CondI))
+ if (SimpleValue::canHandle(CondI)) {
+ // Do we already know the actual value of this condition?
+ if (auto *KnownCond = AvailableValues.lookup(CondI)) {
+ // Is the condition known to be true?
+ if (isa<ConstantInt>(KnownCond) &&
+ cast<ConstantInt>(KnownCond)->isOne()) {
+ DEBUG(dbgs() << "EarlyCSE removing guard: " << *Inst << '\n');
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ continue;
+ } else
+ // Use the known value if it wasn't true.
+ cast<CallInst>(Inst)->setArgOperand(0, KnownCond);
+ }
+ // The condition we're on guarding here is true for all dominated
+ // locations.
AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
+ }
}
// Guard intrinsics read all memory, but don't write any memory.
@@ -672,7 +709,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// If the instruction can be simplified (e.g. X+0 = X) then replace it with
// its simpler value.
- if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) {
+ if (Value *V = SimplifyInstruction(Inst, SQ)) {
DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n');
bool Killed = false;
if (!Inst->use_empty()) {
@@ -761,12 +798,13 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
- // If this instruction may read from memory, forget LastStore.
- // Load/store intrinsics will indicate both a read and a write to
- // memory. The target may override this (e.g. so that a store intrinsic
- // does not read from memory, and thus will be treated the same as a
- // regular store for commoning purposes).
- if (Inst->mayReadFromMemory() &&
+ // If this instruction may read from memory or throw (and potentially read
+ // from memory in the exception handler), forget LastStore. Load/store
+ // intrinsics will indicate both a read and a write to memory. The target
+ // may override this (e.g. so that a store intrinsic does not read from
+ // memory, and thus will be treated the same as a regular store for
+ // commoning purposes).
+ if ((Inst->mayReadFromMemory() || Inst->mayThrow()) &&
!(MemInst.isValid() && !MemInst.mayReadFromMemory()))
LastStore = nullptr;
@@ -962,15 +1000,13 @@ PreservedAnalyses EarlyCSEPass::run(Function &F,
auto *MSSA =
UseMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() : nullptr;
- EarlyCSE CSE(TLI, TTI, DT, AC, MSSA);
+ EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
if (!CSE.run())
return PreservedAnalyses::all();
- // CSE preserves the dominator tree because it doesn't mutate the CFG.
- // FIXME: Bundle this with other CFG-preservation.
PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
if (UseMemorySSA)
PA.preserve<MemorySSAAnalysis>();
@@ -1008,7 +1044,7 @@ public:
auto *MSSA =
UseMemorySSA ? &getAnalysis<MemorySSAWrapperPass>().getMSSA() : nullptr;
- EarlyCSE CSE(TLI, TTI, DT, AC, MSSA);
+ EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
return CSE.run();
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
index 185cdbd..063df77 100644
--- a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -11,10 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/CFG.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
index 545036d..b105ece 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -137,13 +137,13 @@ void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) {
}
// Helper - mark I as having been traversed, having range R.
-ConstantRange Float2IntPass::seen(Instruction *I, ConstantRange R) {
+void Float2IntPass::seen(Instruction *I, ConstantRange R) {
DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
- if (SeenInsts.find(I) != SeenInsts.end())
- SeenInsts.find(I)->second = R;
+ auto IT = SeenInsts.find(I);
+ if (IT != SeenInsts.end())
+ IT->second = std::move(R);
else
- SeenInsts.insert(std::make_pair(I, R));
- return R;
+ SeenInsts.insert(std::make_pair(I, std::move(R)));
}
// Helper - get a range representing a poison value.
@@ -516,11 +516,10 @@ FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); }
PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &) {
if (!runImpl(F))
return PreservedAnalyses::all();
- else {
- // FIXME: This should also 'preserve the CFG'.
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- return PA;
- }
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
}
} // End namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
index 0137378..ea28705 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -36,7 +36,6 @@
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalVariable.h"
@@ -51,9 +50,12 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+
#include <vector>
using namespace llvm;
using namespace llvm::gvn;
+using namespace llvm::VNCoercion;
using namespace PatternMatch;
#define DEBUG_TYPE "gvn"
@@ -595,11 +597,12 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<GlobalsAA>();
+ PA.preserve<TargetLibraryAnalysis>();
return PA;
}
-LLVM_DUMP_METHOD
-void GVN::dump(DenseMap<uint32_t, Value*>& d) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const {
errs() << "{\n";
for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
E = d.end(); I != E; ++I) {
@@ -608,6 +611,7 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) {
}
errs() << "}\n";
}
+#endif
/// Return true if we can prove that the value
/// we're analyzing is fully available in the specified block. As we go, keep
@@ -690,442 +694,6 @@ SpeculationFailure:
}
-/// Return true if CoerceAvailableValueToLoadType will succeed.
-static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
- Type *LoadTy,
- const DataLayout &DL) {
- // If the loaded or stored value is an first class array or struct, don't try
- // to transform them. We need to be able to bitcast to integer.
- if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
- StoredVal->getType()->isStructTy() ||
- StoredVal->getType()->isArrayTy())
- return false;
-
- // The store has to be at least as big as the load.
- if (DL.getTypeSizeInBits(StoredVal->getType()) <
- DL.getTypeSizeInBits(LoadTy))
- return false;
-
- return true;
-}
-
-/// If we saw a store of a value to memory, and
-/// then a load from a must-aliased pointer of a different type, try to coerce
-/// the stored value. LoadedTy is the type of the load we want to replace.
-/// IRB is IRBuilder used to insert new instructions.
-///
-/// If we can't do it, return null.
-static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
- IRBuilder<> &IRB,
- const DataLayout &DL) {
- assert(CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
- "precondition violation - materialization can't fail");
-
- if (auto *C = dyn_cast<Constant>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
-
- // If this is already the right type, just return it.
- Type *StoredValTy = StoredVal->getType();
-
- uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy);
- uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
-
- // If the store and reload are the same size, we can always reuse it.
- if (StoredValSize == LoadedValSize) {
- // Pointer to Pointer -> use bitcast.
- if (StoredValTy->getScalarType()->isPointerTy() &&
- LoadedTy->getScalarType()->isPointerTy()) {
- StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy);
- } else {
- // Convert source pointers to integers, which can be bitcast.
- if (StoredValTy->getScalarType()->isPointerTy()) {
- StoredValTy = DL.getIntPtrType(StoredValTy);
- StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
- }
-
- Type *TypeToCastTo = LoadedTy;
- if (TypeToCastTo->getScalarType()->isPointerTy())
- TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
-
- if (StoredValTy != TypeToCastTo)
- StoredVal = IRB.CreateBitCast(StoredVal, TypeToCastTo);
-
- // Cast to pointer if the load needs a pointer type.
- if (LoadedTy->getScalarType()->isPointerTy())
- StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy);
- }
-
- if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
-
- return StoredVal;
- }
-
- // If the loaded value is smaller than the available value, then we can
- // extract out a piece from it. If the available value is too small, then we
- // can't do anything.
- assert(StoredValSize >= LoadedValSize &&
- "CanCoerceMustAliasedValueToLoad fail");
-
- // Convert source pointers to integers, which can be manipulated.
- if (StoredValTy->getScalarType()->isPointerTy()) {
- StoredValTy = DL.getIntPtrType(StoredValTy);
- StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
- }
-
- // Convert vectors and fp to integer, which can be manipulated.
- if (!StoredValTy->isIntegerTy()) {
- StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize);
- StoredVal = IRB.CreateBitCast(StoredVal, StoredValTy);
- }
-
- // If this is a big-endian system, we need to shift the value down to the low
- // bits so that a truncate will work.
- if (DL.isBigEndian()) {
- uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) -
- DL.getTypeStoreSizeInBits(LoadedTy);
- StoredVal = IRB.CreateLShr(StoredVal, ShiftAmt, "tmp");
- }
-
- // Truncate the integer to the right size now.
- Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize);
- StoredVal = IRB.CreateTrunc(StoredVal, NewIntTy, "trunc");
-
- if (LoadedTy != NewIntTy) {
- // If the result is a pointer, inttoptr.
- if (LoadedTy->getScalarType()->isPointerTy())
- StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy, "inttoptr");
- else
- // Otherwise, bitcast.
- StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy, "bitcast");
- }
-
- if (auto *C = dyn_cast<Constant>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
-
- return StoredVal;
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering memory write (store,
-/// memset, memcpy, memmove). This means that the write *may* provide bits used
-/// by the load but we can't be sure because the pointers don't mustalias.
-///
-/// Check this case to see if there is anything more we can do before we give
-/// up. This returns -1 if we have to give up, or a byte number in the stored
-/// value of the piece that feeds the load.
-static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
- Value *WritePtr,
- uint64_t WriteSizeInBits,
- const DataLayout &DL) {
- // If the loaded or stored value is a first class array or struct, don't try
- // to transform them. We need to be able to bitcast to integer.
- if (LoadTy->isStructTy() || LoadTy->isArrayTy())
- return -1;
-
- int64_t StoreOffset = 0, LoadOffset = 0;
- Value *StoreBase =
- GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
- Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
- if (StoreBase != LoadBase)
- return -1;
-
- // If the load and store are to the exact same address, they should have been
- // a must alias. AA must have gotten confused.
- // FIXME: Study to see if/when this happens. One case is forwarding a memset
- // to a load from the base of the memset.
-
- // If the load and store don't overlap at all, the store doesn't provide
- // anything to the load. In this case, they really don't alias at all, AA
- // must have gotten confused.
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
-
- if ((WriteSizeInBits & 7) | (LoadSize & 7))
- return -1;
- uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes.
- LoadSize /= 8;
-
-
- bool isAAFailure = false;
- if (StoreOffset < LoadOffset)
- isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset;
- else
- isAAFailure = LoadOffset+int64_t(LoadSize) <= StoreOffset;
-
- if (isAAFailure)
- return -1;
-
- // If the Load isn't completely contained within the stored bits, we don't
- // have all the bits to feed it. We could do something crazy in the future
- // (issue a smaller load then merge the bits in) but this seems unlikely to be
- // valuable.
- if (StoreOffset > LoadOffset ||
- StoreOffset+StoreSize < LoadOffset+LoadSize)
- return -1;
-
- // Okay, we can do this transformation. Return the number of bytes into the
- // store that the load is.
- return LoadOffset-StoreOffset;
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering store.
-static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
- StoreInst *DepSI) {
- // Cannot handle reading from store of first-class aggregate yet.
- if (DepSI->getValueOperand()->getType()->isStructTy() ||
- DepSI->getValueOperand()->getType()->isArrayTy())
- return -1;
-
- const DataLayout &DL = DepSI->getModule()->getDataLayout();
- Value *StorePtr = DepSI->getPointerOperand();
- uint64_t StoreSize =DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
- return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
- StorePtr, StoreSize, DL);
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being clobbered by another load. See if
-/// the other load can feed into the second load.
-static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
- LoadInst *DepLI, const DataLayout &DL){
- // Cannot handle reading from store of first-class aggregate yet.
- if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
- return -1;
-
- Value *DepPtr = DepLI->getPointerOperand();
- uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
- int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
- if (R != -1) return R;
-
- // If we have a load/load clobber an DepLI can be widened to cover this load,
- // then we should widen it!
- int64_t LoadOffs = 0;
- const Value *LoadBase =
- GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
-
- unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
- LoadBase, LoadOffs, LoadSize, DepLI);
- if (Size == 0) return -1;
-
- // Check non-obvious conditions enforced by MDA which we rely on for being
- // able to materialize this potentially available value
- assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
- assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
-
- return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL);
-}
-
-
-
-static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
- MemIntrinsic *MI,
- const DataLayout &DL) {
- // If the mem operation is a non-constant size, we can't handle it.
- ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
- if (!SizeCst) return -1;
- uint64_t MemSizeInBits = SizeCst->getZExtValue()*8;
-
- // If this is memset, we just need to see if the offset is valid in the size
- // of the memset..
- if (MI->getIntrinsicID() == Intrinsic::memset)
- return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
- MemSizeInBits, DL);
-
- // If we have a memcpy/memmove, the only case we can handle is if this is a
- // copy from constant memory. In that case, we can read directly from the
- // constant memory.
- MemTransferInst *MTI = cast<MemTransferInst>(MI);
-
- Constant *Src = dyn_cast<Constant>(MTI->getSource());
- if (!Src) return -1;
-
- GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL));
- if (!GV || !GV->isConstant()) return -1;
-
- // See if the access is within the bounds of the transfer.
- int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
- MI->getDest(), MemSizeInBits, DL);
- if (Offset == -1)
- return Offset;
-
- unsigned AS = Src->getType()->getPointerAddressSpace();
- // Otherwise, see if we can constant fold a load from the constant with the
- // offset applied as appropriate.
- Src = ConstantExpr::getBitCast(Src,
- Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
- OffsetCst);
- Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
- return Offset;
- return -1;
-}
-
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering store. This means
-/// that the store provides bits used by the load but we the pointers don't
-/// mustalias. Check this case to see if there is anything more we can do
-/// before we give up.
-static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
- Type *LoadTy,
- Instruction *InsertPt, const DataLayout &DL){
- LLVMContext &Ctx = SrcVal->getType()->getContext();
-
- uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
- uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
-
- IRBuilder<> Builder(InsertPt);
-
- // Compute which bits of the stored value are being used by the load. Convert
- // to an integer type to start with.
- if (SrcVal->getType()->getScalarType()->isPointerTy())
- SrcVal = Builder.CreatePtrToInt(SrcVal,
- DL.getIntPtrType(SrcVal->getType()));
- if (!SrcVal->getType()->isIntegerTy())
- SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8));
-
- // Shift the bits to the least significant depending on endianness.
- unsigned ShiftAmt;
- if (DL.isLittleEndian())
- ShiftAmt = Offset*8;
- else
- ShiftAmt = (StoreSize-LoadSize-Offset)*8;
-
- if (ShiftAmt)
- SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt);
-
- if (LoadSize != StoreSize)
- SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8));
-
- return CoerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, DL);
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering load. This means
-/// that the load *may* provide bits used by the load but we can't be sure
-/// because the pointers don't mustalias. Check this case to see if there is
-/// anything more we can do before we give up.
-static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
- Type *LoadTy, Instruction *InsertPt,
- GVN &gvn) {
- const DataLayout &DL = SrcVal->getModule()->getDataLayout();
- // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
- // widen SrcVal out to a larger load.
- unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
- if (Offset+LoadSize > SrcValStoreSize) {
- assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
- assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
- // If we have a load/load clobber an DepLI can be widened to cover this
- // load, then we should widen it to the next power of 2 size big enough!
- unsigned NewLoadSize = Offset+LoadSize;
- if (!isPowerOf2_32(NewLoadSize))
- NewLoadSize = NextPowerOf2(NewLoadSize);
-
- Value *PtrVal = SrcVal->getPointerOperand();
-
- // Insert the new load after the old load. This ensures that subsequent
- // memdep queries will find the new load. We can't easily remove the old
- // load completely because it is already in the value numbering table.
- IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
- Type *DestPTy =
- IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
- DestPTy = PointerType::get(DestPTy,
- PtrVal->getType()->getPointerAddressSpace());
- Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
- PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
- LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
- NewLoad->takeName(SrcVal);
- NewLoad->setAlignment(SrcVal->getAlignment());
-
- DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
- DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
-
- // Replace uses of the original load with the wider load. On a big endian
- // system, we need to shift down to get the relevant bits.
- Value *RV = NewLoad;
- if (DL.isBigEndian())
- RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8);
- RV = Builder.CreateTrunc(RV, SrcVal->getType());
- SrcVal->replaceAllUsesWith(RV);
-
- // We would like to use gvn.markInstructionForDeletion here, but we can't
- // because the load is already memoized into the leader map table that GVN
- // tracks. It is potentially possible to remove the load from the table,
- // but then there all of the operations based on it would need to be
- // rehashed. Just leave the dead load around.
- gvn.getMemDep().removeInstruction(SrcVal);
- SrcVal = NewLoad;
- }
-
- return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
-}
-
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering mem intrinsic.
-static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
- Type *LoadTy, Instruction *InsertPt,
- const DataLayout &DL){
- LLVMContext &Ctx = LoadTy->getContext();
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy)/8;
-
- IRBuilder<> Builder(InsertPt);
-
- // We know that this method is only called when the mem transfer fully
- // provides the bits for the load.
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
- // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
- // independently of what the offset is.
- Value *Val = MSI->getValue();
- if (LoadSize != 1)
- Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8));
-
- Value *OneElt = Val;
-
- // Splat the value out to the right number of bits.
- for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) {
- // If we can double the number of bytes set, do it.
- if (NumBytesSet*2 <= LoadSize) {
- Value *ShVal = Builder.CreateShl(Val, NumBytesSet*8);
- Val = Builder.CreateOr(Val, ShVal);
- NumBytesSet <<= 1;
- continue;
- }
-
- // Otherwise insert one byte at a time.
- Value *ShVal = Builder.CreateShl(Val, 1*8);
- Val = Builder.CreateOr(OneElt, ShVal);
- ++NumBytesSet;
- }
-
- return CoerceAvailableValueToLoadType(Val, LoadTy, Builder, DL);
- }
-
- // Otherwise, this is a memcpy/memmove from a constant global.
- MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
- Constant *Src = cast<Constant>(MTI->getSource());
- unsigned AS = Src->getType()->getPointerAddressSpace();
-
- // Otherwise, see if we can constant fold a load from the constant with the
- // offset applied as appropriate.
- Src = ConstantExpr::getBitCast(Src,
- Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
- OffsetCst);
- Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
-}
/// Given a set of loads specified by ValuesPerBlock,
@@ -1171,7 +739,7 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
if (isSimpleValue()) {
Res = getSimpleValue();
if (Res->getType() != LoadTy) {
- Res = GetStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
+ Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " "
<< *getSimpleValue() << '\n'
@@ -1182,14 +750,20 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
if (Load->getType() == LoadTy && Offset == 0) {
Res = Load;
} else {
- Res = GetLoadValueForLoad(Load, Offset, LoadTy, InsertPt, gvn);
-
+ Res = getLoadValueForLoad(Load, Offset, LoadTy, InsertPt, DL);
+ // We would like to use gvn.markInstructionForDeletion here, but we can't
+ // because the load is already memoized into the leader map table that GVN
+ // tracks. It is potentially possible to remove the load from the table,
+ // but then there all of the operations based on it would need to be
+ // rehashed. Just leave the dead load around.
+ gvn.getMemDep().removeInstruction(Load);
DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " "
<< *getCoercedLoadValue() << '\n'
- << *Res << '\n' << "\n\n\n");
+ << *Res << '\n'
+ << "\n\n\n");
}
} else if (isMemIntrinValue()) {
- Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
+ Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
InsertPt, DL);
DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
<< " " << *getMemIntrinValue() << '\n'
@@ -1258,7 +832,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// Can't forward from non-atomic to atomic without violating memory model.
if (Address && LI->isAtomic() <= DepSI->isAtomic()) {
int Offset =
- AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI);
+ analyzeLoadFromClobberingStore(LI->getType(), Address, DepSI, DL);
if (Offset != -1) {
Res = AvailableValue::get(DepSI->getValueOperand(), Offset);
return true;
@@ -1276,7 +850,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// Can't forward from non-atomic to atomic without violating memory model.
if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) {
int Offset =
- AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
+ analyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
if (Offset != -1) {
Res = AvailableValue::getLoad(DepLI, Offset);
@@ -1289,7 +863,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// forward a value on from it.
if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
if (Address && !LI->isAtomic()) {
- int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address,
+ int Offset = analyzeLoadFromClobberingMemInst(LI->getType(), Address,
DepMI, DL);
if (Offset != -1) {
Res = AvailableValue::getMI(DepMI, Offset);
@@ -1334,7 +908,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// different types if we have to. If the stored value is larger or equal to
// the loaded value, we can reuse it.
if (S->getValueOperand()->getType() != LI->getType() &&
- !CanCoerceMustAliasedValueToLoad(S->getValueOperand(),
+ !canCoerceMustAliasedValueToLoad(S->getValueOperand(),
LI->getType(), DL))
return false;
@@ -1351,7 +925,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// If the stored value is larger or equal to the loaded value, we can reuse
// it.
if (LD->getType() != LI->getType() &&
- !CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
+ !canCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
return false;
// Can't forward from non-atomic to atomic without violating memory model.
@@ -1592,8 +1166,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
auto *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre",
LI->isVolatile(), LI->getAlignment(),
- LI->getOrdering(), LI->getSynchScope(),
+ LI->getOrdering(), LI->getSyncScopeID(),
UnavailablePred->getTerminator());
+ NewLoad->setDebugLoc(LI->getDebugLoc());
// Transfer the old load's AA tags to the new load.
AAMDNodes Tags;
@@ -1628,7 +1203,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
V->takeName(LI);
if (Instruction *I = dyn_cast<Instruction>(V))
I->setDebugLoc(LI->getDebugLoc());
- if (V->getType()->getScalarType()->isPointerTy())
+ if (V->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(V);
markInstructionForDeletion(LI);
ORE->emit(OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI)
@@ -1713,9 +1288,9 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// If instruction I has debug info, then we should not update it.
// Also, if I has a null DebugLoc, then it is still potentially incorrect
// to propagate LI's DebugLoc because LI may not post-dominate I.
- if (LI->getDebugLoc() && ValuesPerBlock.size() != 1)
+ if (LI->getDebugLoc() && LI->getParent() == I->getParent())
I->setDebugLoc(LI->getDebugLoc());
- if (V->getType()->getScalarType()->isPointerTy())
+ if (V->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(V);
markInstructionForDeletion(LI);
++NumGVNLoad;
@@ -1795,7 +1370,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) {
// Patch the replacement so that it is not more restrictive than the value
// being replaced.
- // Note that if 'I' is a load being replaced by some operation,
+ // Note that if 'I' is a load being replaced by some operation,
// for example, by an arithmetic operation, then andIRFlags()
// would just erase all math flags from the original arithmetic
// operation, which is clearly not wanted and not needed.
@@ -1869,7 +1444,7 @@ bool GVN::processLoad(LoadInst *L) {
reportLoadElim(L, AvailableValue, ORE);
// Tell MDA to rexamine the reused pointer since we might have more
// information after forwarding it.
- if (MD && AvailableValue->getType()->getScalarType()->isPointerTy())
+ if (MD && AvailableValue->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(AvailableValue);
return true;
}
@@ -2024,7 +1599,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
// RHS neither 'true' nor 'false' - bail out.
continue;
// Whether RHS equals 'true'. Otherwise it equals 'false'.
- bool isKnownTrue = CI->isAllOnesValue();
+ bool isKnownTrue = CI->isMinusOne();
bool isKnownFalse = !isKnownTrue;
// If "A && B" is known true then both A and B are known true. If "A || B"
@@ -2113,7 +1688,7 @@ bool GVN::processInstruction(Instruction *I) {
// example if it determines that %y is equal to %x then the instruction
// "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
const DataLayout &DL = I->getModule()->getDataLayout();
- if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
+ if (Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC})) {
bool Changed = false;
if (!I->use_empty()) {
I->replaceAllUsesWith(V);
@@ -2124,7 +1699,7 @@ bool GVN::processInstruction(Instruction *I) {
Changed = true;
}
if (Changed) {
- if (MD && V->getType()->getScalarType()->isPointerTy())
+ if (MD && V->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(V);
++NumGVNSimpl;
return true;
@@ -2187,11 +1762,11 @@ bool GVN::processInstruction(Instruction *I) {
for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
i != e; ++i) {
- BasicBlock *Dst = i.getCaseSuccessor();
+ BasicBlock *Dst = i->getCaseSuccessor();
// If there is only a single edge, propagate the case value into it.
if (SwitchEdges.lookup(Dst) == 1) {
BasicBlockEdge E(Parent, Dst);
- Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E, true);
+ Changed |= propagateEquality(SwitchCond, i->getCaseValue(), E, true);
}
}
return Changed;
@@ -2235,7 +1810,7 @@ bool GVN::processInstruction(Instruction *I) {
// Remove it!
patchAndReplaceAllUsesWith(I, Repl);
- if (MD && Repl->getType()->getScalarType()->isPointerTy())
+ if (MD && Repl->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(Repl);
markInstructionForDeletion(I);
return true;
@@ -2483,7 +2058,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) {
// If we failed insertion, make sure we remove the instruction.
DEBUG(verifyRemoved(PREInstr));
- delete PREInstr;
+ PREInstr->deleteValue();
return false;
}
}
@@ -2509,7 +2084,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
addToLeaderTable(ValNo, Phi, CurrentBlock);
Phi->setDebugLoc(CurInst->getDebugLoc());
CurInst->replaceAllUsesWith(Phi);
- if (MD && Phi->getType()->getScalarType()->isPointerTy())
+ if (MD && Phi->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(Phi);
VN.erase(CurInst);
removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
@@ -2581,21 +2156,12 @@ bool GVN::iterateOnFunction(Function &F) {
// Top-down walk of the dominator tree
bool Changed = false;
- // Save the blocks this function have before transformation begins. GVN may
- // split critical edge, and hence may invalidate the RPO/DT iterator.
- //
- std::vector<BasicBlock *> BBVect;
- BBVect.reserve(256);
// Needed for value numbering with phi construction to work.
+ // RPOT walks the graph in its constructor and will not be invalidated during
+ // processBlock.
ReversePostOrderTraversal<Function *> RPOT(&F);
- for (ReversePostOrderTraversal<Function *>::rpo_iterator RI = RPOT.begin(),
- RE = RPOT.end();
- RI != RE; ++RI)
- BBVect.push_back(*RI);
-
- for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end();
- I != E; I++)
- Changed |= processBlock(*I);
+ for (BasicBlock *BB : RPOT)
+ Changed |= processBlock(BB);
return Changed;
}
@@ -2783,6 +2349,7 @@ public:
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
index f8e1d2e..29de792 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
@@ -17,16 +17,40 @@
// is disabled in the following cases.
// 1. Scalars across calls.
// 2. geps when corresponding load/store cannot be hoisted.
+//
+// TODO: Hoist from >2 successors. Currently GVNHoist will not hoist stores
+// in this case because it works on two instructions at a time.
+// entry:
+// switch i32 %c1, label %exit1 [
+// i32 0, label %sw0
+// i32 1, label %sw1
+// ]
+//
+// sw0:
+// store i32 1, i32* @G
+// br label %exit
+//
+// sw1:
+// store i32 1, i32* @G
+// br label %exit
+//
+// exit1:
+// store i32 1, i32* @G
+// ret void
+// exit:
+// ret void
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
using namespace llvm;
@@ -60,7 +84,7 @@ static cl::opt<int>
cl::desc("Maximum length of dependent chains to hoist "
"(default = 10, unlimited = -1)"));
-namespace {
+namespace llvm {
// Provides a sorting function based on the execution order of two instructions.
struct SortByDFSIn {
@@ -72,13 +96,6 @@ public:
// Returns true when A executes before B.
bool operator()(const Instruction *A, const Instruction *B) const {
- // FIXME: libc++ has a std::sort() algorithm that will call the compare
- // function on the same element. Once PR20837 is fixed and some more years
- // pass by and all the buildbots have moved to a corrected std::sort(),
- // enable the following assert:
- //
- // assert(A != B);
-
const BasicBlock *BA = A->getParent();
const BasicBlock *BB = B->getParent();
unsigned ADFS, BDFS;
@@ -202,6 +219,7 @@ public:
GVNHoist(DominatorTree *DT, AliasAnalysis *AA, MemoryDependenceResults *MD,
MemorySSA *MSSA)
: DT(DT), AA(AA), MD(MD), MSSA(MSSA),
+ MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)),
HoistingGeps(false),
HoistedCtr(0)
{ }
@@ -249,9 +267,11 @@ private:
AliasAnalysis *AA;
MemoryDependenceResults *MD;
MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
const bool HoistingGeps;
DenseMap<const Value *, unsigned> DFSNumber;
BBSideEffectsSet BBSideEffects;
+ DenseSet<const BasicBlock*> HoistBarrier;
int HoistedCtr;
enum InsKind { Unknown, Scalar, Load, Store };
@@ -307,8 +327,8 @@ private:
continue;
}
- // Check for end of function, calls that do not return, etc.
- if (!isGuaranteedToTransferExecutionToSuccessor(BB->getTerminator()))
+ // We reached the leaf Basic Block => not all paths have this instruction.
+ if (!BB->getTerminator()->getNumSuccessors())
return false;
// When reaching the back-edge of a loop, there may be a path through the
@@ -360,7 +380,7 @@ private:
ReachedNewPt = true;
}
}
- if (defClobbersUseOrDef(Def, MU, *AA))
+ if (MemorySSAUtil::defClobbersUseOrDef(Def, MU, *AA))
return true;
}
@@ -387,7 +407,8 @@ private:
// executed between the execution of NewBB and OldBB. Hoisting an expression
// from OldBB into NewBB has to be safe on all execution paths.
for (auto I = idf_begin(OldBB), E = idf_end(OldBB); I != E;) {
- if (*I == NewBB) {
+ const BasicBlock *BB = *I;
+ if (BB == NewBB) {
// Stop traversal when reaching HoistPt.
I.skipChildren();
continue;
@@ -398,11 +419,17 @@ private:
return true;
// Impossible to hoist with exceptions on the path.
- if (hasEH(*I))
+ if (hasEH(BB))
+ return true;
+
+ // No such instruction after HoistBarrier in a basic block was
+ // selected for hoisting so instructions selected within basic block with
+ // a hoist barrier can be hoisted.
+ if ((BB != OldBB) && HoistBarrier.count(BB))
return true;
// Check that we do not move a store past loads.
- if (hasMemoryUse(NewPt, Def, *I))
+ if (hasMemoryUse(NewPt, Def, BB))
return true;
// -1 is unlimited number of blocks on all paths.
@@ -419,17 +446,18 @@ private:
// Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
// return true when the counter NBBsOnAllPaths reaches 0, except when it is
// initialized to -1 which is unlimited.
- bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *BB,
+ bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
int &NBBsOnAllPaths) {
- assert(DT->dominates(HoistPt, BB) && "Invalid path");
+ assert(DT->dominates(HoistPt, SrcBB) && "Invalid path");
// Walk all basic blocks reachable in depth-first iteration on
// the inverse CFG from BBInsn to NewHoistPt. These blocks are all the
// blocks that may be executed between the execution of NewHoistPt and
// BBInsn. Hoisting an expression from BBInsn into NewHoistPt has to be safe
// on all execution paths.
- for (auto I = idf_begin(BB), E = idf_end(BB); I != E;) {
- if (*I == HoistPt) {
+ for (auto I = idf_begin(SrcBB), E = idf_end(SrcBB); I != E;) {
+ const BasicBlock *BB = *I;
+ if (BB == HoistPt) {
// Stop traversal when reaching NewHoistPt.
I.skipChildren();
continue;
@@ -440,7 +468,13 @@ private:
return true;
// Impossible to hoist with exceptions on the path.
- if (hasEH(*I))
+ if (hasEH(BB))
+ return true;
+
+ // Instructions located after a hoist barrier are never selected for
+ // hoisting, so a barrier inside SrcBB itself is harmless; a barrier in any
+ // other block on the path makes hoisting unsafe.
+ if ((BB != SrcBB) && HoistBarrier.count(BB))
return true;
// -1 is unlimited number of blocks on all paths.
@@ -626,6 +660,8 @@ private:
// Compute the insertion point and the list of expressions to be hoisted.
SmallVecInsn InstructionsToHoist;
for (auto I : V)
+ // We don't need to check for hoist-barriers here because if
+ // I->getParent() is a barrier then I precedes the barrier.
if (!hasEH(I->getParent()))
InstructionsToHoist.push_back(I);
@@ -809,9 +845,9 @@ private:
// legal when the ld/st is not moved past its current definition.
MemoryAccess *Def = OldMemAcc->getDefiningAccess();
NewMemAcc =
- MSSA->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End);
+ MSSAUpdater->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End);
OldMemAcc->replaceAllUsesWith(NewMemAcc);
- MSSA->removeMemoryAccess(OldMemAcc);
+ MSSAUpdater->removeMemoryAccess(OldMemAcc);
}
}
@@ -850,7 +886,7 @@ private:
// Update the uses of the old MSSA access with NewMemAcc.
MemoryAccess *OldMA = MSSA->getMemoryAccess(I);
OldMA->replaceAllUsesWith(NewMemAcc);
- MSSA->removeMemoryAccess(OldMA);
+ MSSAUpdater->removeMemoryAccess(OldMA);
}
Repl->andIRFlags(I);
@@ -872,7 +908,7 @@ private:
auto In = Phi->incoming_values();
if (all_of(In, [&](Use &U) { return U == NewMemAcc; })) {
Phi->replaceAllUsesWith(NewMemAcc);
- MSSA->removeMemoryAccess(Phi);
+ MSSAUpdater->removeMemoryAccess(Phi);
}
}
}
@@ -896,6 +932,12 @@ private:
for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
int InstructionNb = 0;
for (Instruction &I1 : *BB) {
+ // If I1 cannot guarantee progress, subsequent instructions
+ // in BB cannot be hoisted anyways.
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) {
+ HoistBarrier.insert(BB);
+ break;
+ }
// Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting
// deeper may increase the register pressure and compilation time.
if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB)
@@ -969,6 +1011,7 @@ public:
AU.addRequired<MemorySSAWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<MemorySSAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
}
};
} // namespace
@@ -985,6 +1028,7 @@ PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) {
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<MemorySSAAnalysis>();
+ PA.preserve<GlobalsAA>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp
new file mode 100644
index 0000000..5fd2dfc
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -0,0 +1,883 @@
+//===- GVNSink.cpp - sink expressions into successors -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file GVNSink.cpp
+/// This pass attempts to sink instructions into successors, reducing static
+/// instruction count and enabling if-conversion.
+///
+/// We use a variant of global value numbering to decide what can be sunk.
+/// Consider:
+///
+/// [ %a1 = add i32 %b, 1 ] [ %c1 = add i32 %d, 1 ]
+/// [ %a2 = xor i32 %a1, 1 ] [ %c2 = xor i32 %c1, 1 ]
+/// \ /
+/// [ %e = phi i32 %a2, %c2 ]
+/// [ add i32 %e, 4 ]
+///
+///
+/// GVN would number %a1 and %c1 differently because they compute different
+/// results - the VN of an instruction is a function of its opcode and the
+/// transitive closure of its operands. This is the key property for hoisting
+/// and CSE.
+///
+/// What we want when sinking however is for a numbering that is a function of
+/// the *uses* of an instruction, which allows us to answer the question "if I
+/// replace %a1 with %c1, will it contribute in an equivalent way to all
+/// successive instructions?". The ValueTable class defined below provides this
+/// mapping.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/GVNExpression.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <unordered_set>
+using namespace llvm;
+
+#define DEBUG_TYPE "gvn-sink"
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+
+namespace llvm {
+namespace GVNExpression {
+
+/// Debug helper: print this expression to the debug stream, followed by a
+/// newline.
+LLVM_DUMP_METHOD void Expression::dump() const {
+  raw_ostream &OS = dbgs();
+  print(OS);
+  OS << "\n";
+}
+
+}
+}
+
+namespace {
+
+/// Return true if \p I is a load, a store, or a call/invoke that is not known
+/// to leave memory untouched.
+static bool isMemoryInst(const Instruction *I) {
+  if (isa<LoadInst>(I) || isa<StoreInst>(I))
+    return true;
+  if (const auto *Invoke = dyn_cast<InvokeInst>(I))
+    return !Invoke->doesNotAccessMemory();
+  if (const auto *Call = dyn_cast<CallInst>(I))
+    return !Call->doesNotAccessMemory();
+  return false;
+}
+
+/// Iterates through instructions in a set of blocks in reverse order from the
+/// first non-terminator. For example (assume all blocks have size n):
+///   LockstepReverseIterator I([B1, B2, B3]);
+///   *I-- = [B1[n], B2[n], B3[n]];
+///   *I-- = [B1[n-1], B2[n-1], B3[n-1]];
+///   *I-- = [B1[n-2], B2[n-2], B3[n-2]];
+///   ...
+///
+/// It continues until all blocks have been exhausted. Use \c getActiveBlocks()
+/// to determine which blocks are still going and the order they appear in the
+/// list returned by operator*.
+class LockstepReverseIterator {
+  ArrayRef<BasicBlock *> Blocks;
+  SmallPtrSet<BasicBlock *, 4> ActiveBlocks;
+  SmallVector<Instruction *, 4> Insts;
+  bool Fail;
+
+public:
+  LockstepReverseIterator(ArrayRef<BasicBlock *> Blocks) : Blocks(Blocks) {
+    reset();
+  }
+
+  /// Rewind to the last non-terminator instruction of every block. Blocks that
+  /// hold nothing but a terminator are dropped from the active set; if no
+  /// block contributes an instruction the iterator becomes invalid.
+  void reset() {
+    Fail = false;
+    ActiveBlocks.clear();
+    for (BasicBlock *BB : Blocks)
+      ActiveBlocks.insert(BB);
+    Insts.clear();
+    for (BasicBlock *BB : Blocks) {
+      if (BB->size() <= 1) {
+        // Block wasn't big enough - only contained a terminator.
+        ActiveBlocks.erase(BB);
+        continue;
+      }
+      Insts.push_back(BB->getTerminator()->getPrevNode());
+    }
+    if (Insts.empty())
+      Fail = true;
+  }
+
+  /// False once every block has been exhausted (or none had instructions).
+  bool isValid() const { return !Fail; }
+
+  /// The current instruction of each block that is still being walked.
+  ArrayRef<Instruction *> operator*() const { return Insts; }
+
+  /// Mutable access to the set of blocks that are still being walked.
+  SmallPtrSet<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; }
+
+  /// Stop walking every block that is not a member of \p Blocks.
+  ///
+  /// \p Blocks may alias the set returned by getActiveBlocks(); membership is
+  /// tested before anything is erased, so that is safe.
+  void restrictToBlocks(SmallPtrSetImpl<BasicBlock *> &Blocks) {
+    for (auto II = Insts.begin(); II != Insts.end();) {
+      BasicBlock *Parent = (*II)->getParent();
+      // Use the set's own lookup rather than a linear std::find over it.
+      if (Blocks.count(Parent)) {
+        ++II;
+        continue;
+      }
+      ActiveBlocks.erase(Parent);
+      II = Insts.erase(II);
+    }
+  }
+
+  /// Step every active block back by one instruction. A block whose current
+  /// instruction is already its first one is retired from the active set.
+  void operator--() {
+    if (Fail)
+      return;
+    SmallVector<Instruction *, 4> NewInsts;
+    for (auto *Inst : Insts) {
+      if (Inst == &Inst->getParent()->front())
+        ActiveBlocks.erase(Inst->getParent());
+      else
+        NewInsts.push_back(Inst->getPrevNode());
+    }
+    if (NewInsts.empty()) {
+      Fail = true;
+      return;
+    }
+    Insts = NewInsts;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+
+/// Candidate solution for sinking. There may be different ways to
+/// sink instructions, differing in the number of instructions sunk,
+/// the number of predecessors sunk from and the number of PHIs
+/// required.
+///
+/// Despite its name, a larger \c Cost is better: sinkBB() sorts candidates in
+/// descending order of Cost and only acts on the first one if it is positive.
+struct SinkingInstructionCandidate {
+ // Number of predecessor blocks instructions would be sunk from.
+ unsigned NumBlocks;
+ // Number of instructions sunk from each of those blocks.
+ unsigned NumInstructions;
+ // Number of PHIs needed in the successor after sinking.
+ unsigned NumPHIs;
+ // How many of the sunk instructions are memory instructions.
+ unsigned NumMemoryInsts;
+ // Stays at -1 until calculateCost() has been called.
+ int Cost = -1;
+ // The predecessor blocks themselves (NumBlocks entries).
+ SmallVector<BasicBlock *, 4> Blocks;
+
+ /// Compute Cost as: instructions saved (NumInstructions per block beyond the
+ /// first), minus the square of the number of PHIs added relative to
+ /// \p NumOrigPHIs, minus a flat penalty of 2 when fewer than
+ /// \p NumOrigBlocks blocks take part.
+ void calculateCost(unsigned NumOrigPHIs, unsigned NumOrigBlocks) {
+ // NOTE(review): all operands are unsigned. If NumPHIs < NumOrigPHIs the
+ // subtraction wraps; this looks harmless because the difference is only
+ // ever squared, but confirm before changing any of the types involved.
+ unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs;
+ unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0;
+ Cost = (NumInstructions * (NumBlocks - 1)) -
+ (NumExtraPHIs *
+ NumExtraPHIs) // PHIs are expensive, so make sure they're worth it.
+ - SplitEdgeCost;
+ }
+ /// Order candidates by Cost only.
+ bool operator>(const SinkingInstructionCandidate &Other) const {
+ return Cost > Other.Cost;
+ }
+};
+
+#ifndef NDEBUG
+/// Write a one-line summary of \p C (cost, block count, instruction count and
+/// PHI count) to \p OS.
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+                              const SinkingInstructionCandidate &C) {
+  OS << "<Candidate Cost=" << C.Cost;
+  OS << " #Blocks=" << C.NumBlocks;
+  OS << " #Insts=" << C.NumInstructions;
+  OS << " #PHIs=" << C.NumPHIs;
+  return OS << ">";
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+
+/// Describes a PHI node that may or may not exist. These track the PHIs
+/// that must be created if we sunk a sequence of instructions. It provides
+/// a hash function for efficient equality comparisons.
+///
+/// \c Values and \c Blocks are parallel arrays: Values[i] is the value that
+/// flows in from Blocks[i].
+class ModelledPHI {
+  SmallVector<Value *, 4> Values;
+  SmallVector<BasicBlock *, 4> Blocks;
+
+public:
+  ModelledPHI() {}
+
+  /// Model an existing PHI. Incoming blocks are stored in sorted (pointer)
+  /// order so that two models of equivalent PHIs compare equal.
+  ModelledPHI(const PHINode *PN) {
+    for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I)
+      Blocks.push_back(PN->getIncomingBlock(I));
+    std::sort(Blocks.begin(), Blocks.end());
+
+    // This assumes the PHI is already well-formed and there aren't conflicting
+    // incoming values for the same block.
+    for (auto *B : Blocks)
+      Values.push_back(PN->getIncomingValueForBlock(B));
+  }
+
+  /// Create a dummy ModelledPHI that will compare unequal to any other
+  /// ModelledPHI without the same ID.
+  /// \note This is specifically for DenseMapInfo - do not use this!
+  static ModelledPHI createDummy(size_t ID) {
+    ModelledPHI M;
+    M.Values.push_back(reinterpret_cast<Value *>(ID));
+    return M;
+  }
+
+  /// Create a PHI from an array of incoming values and incoming blocks.
+  template <typename VArray, typename BArray>
+  ModelledPHI(const VArray &V, const BArray &B) {
+    std::copy(V.begin(), V.end(), std::back_inserter(Values));
+    std::copy(B.begin(), B.end(), std::back_inserter(Blocks));
+  }
+
+  /// Create a PHI from [I[OpNum] for I in Insts].
+  template <typename BArray>
+  ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, const BArray &B) {
+    std::copy(B.begin(), B.end(), std::back_inserter(Blocks));
+    for (auto *I : Insts)
+      Values.push_back(I->getOperand(OpNum));
+  }
+
+  /// Restrict the PHI's contents down to only \c NewBlocks.
+  /// \c NewBlocks must be a subset of \c this->Blocks.
+  void restrictToBlocks(const SmallPtrSetImpl<BasicBlock *> &NewBlocks) {
+    auto BI = Blocks.begin();
+    auto VI = Values.begin();
+    while (BI != Blocks.end()) {
+      assert(VI != Values.end());
+      // Use the set's own lookup rather than a linear std::find over it.
+      if (!NewBlocks.count(*BI)) {
+        // Drop the block and its value together to keep the arrays parallel.
+        BI = Blocks.erase(BI);
+        VI = Values.erase(VI);
+      } else {
+        ++BI;
+        ++VI;
+      }
+    }
+    assert(Blocks.size() == NewBlocks.size());
+  }
+
+  ArrayRef<Value *> getValues() const { return Values; }
+
+  /// True if every incoming value is the same Value.
+  bool areAllIncomingValuesSame() const {
+    return all_of(Values, [&](Value *V) { return V == Values[0]; });
+  }
+
+  /// True if every incoming value has the same type as the first one.
+  bool areAllIncomingValuesSameType() const {
+    return all_of(
+        Values, [&](Value *V) { return V->getType() == Values[0]->getType(); });
+  }
+
+  /// True if at least one incoming value is a Constant.
+  bool areAnyIncomingValuesConstant() const {
+    return any_of(Values, [&](Value *V) { return isa<Constant>(V); });
+  }
+
+  /// Hash of the incoming values only; the blocks do not contribute.
+  unsigned hash() const {
+    return (unsigned)hash_combine_range(Values.begin(), Values.end());
+  }
+
+  /// Equality compares both the values and the blocks.
+  bool operator==(const ModelledPHI &Other) const {
+    return Values == Other.Values && Blocks == Other.Blocks;
+  }
+};
+
+// Key traits used by ModelledPHISet below.
+//
+// NOTE(review): this is a new primary template declared inside the anonymous
+// namespace, not a specialization; its template parameter happens to be
+// *named* ModelledPHI. Inside this namespace the name DenseMapInfo resolves to
+// this template. Presumably intentional, but easy to misread - confirm before
+// refactoring.
+template <typename ModelledPHI> struct DenseMapInfo {
+ // Sentinel key for empty buckets: a dummy PHI built from ID 0.
+ static inline ModelledPHI &getEmptyKey() {
+ static ModelledPHI Dummy = ModelledPHI::createDummy(0);
+ return Dummy;
+ }
+ // Sentinel key for erased buckets: a dummy PHI built from ID 1.
+ static inline ModelledPHI &getTombstoneKey() {
+ static ModelledPHI Dummy = ModelledPHI::createDummy(1);
+ return Dummy;
+ }
+ // Delegates to ModelledPHI::hash().
+ static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); }
+ // Delegates to ModelledPHI::operator==.
+ static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) {
+ return LHS == RHS;
+ }
+};
+
+// Set of modelled PHIs, using the key traits declared just above.
+typedef DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>> ModelledPHISet;
+
+//===----------------------------------------------------------------------===//
+// ValueTable
+//===----------------------------------------------------------------------===//
+// This is a value number table where the value number is a function of the
+// *uses* of a value, rather than its operands. Thus, if VN(A) == VN(B) we know
+// that the program would be equivalent if we replaced A with PHI(A, B).
+//===----------------------------------------------------------------------===//
+
+/// A GVN expression describing how an instruction is used. The operands
+/// field of BasicExpression is used to store uses, not operands.
+///
+/// This class also contains fields for discriminators used when determining
+/// equivalence of instructions with sideeffects.
+class InstructionUseExpr : public GVNExpression::BasicExpression {
+  // Discriminator for memory instructions; stays at (unsigned)-1 unless
+  // setMemoryUseOrder() is called.
+  unsigned MemoryUseOrder = -1;
+  // Discriminator for volatile accesses.
+  bool Volatile = false;
+
+public:
+  /// Build the expression for \p I: its opcode, its type, and the users of its
+  /// result stored (sorted by pointer) in the operand slots.
+  InstructionUseExpr(Instruction *I, ArrayRecycler<Value *> &R,
+                     BumpPtrAllocator &A)
+      : GVNExpression::BasicExpression(I->getNumUses()) {
+    allocateOperands(R, A);
+    setOpcode(I->getOpcode());
+    setType(I->getType());
+
+    for (auto &U : I->uses())
+      op_push_back(U.getUser());
+    std::sort(op_begin(), op_end());
+  }
+
+  void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; }
+  void setVolatile(bool V) { Volatile = V; }
+
+  /// Hash of the base expression combined with both discriminators.
+  ///
+  /// Declared with \c override (rather than a bare \c virtual) so that the
+  /// compiler rejects this declaration if the base-class signature ever stops
+  /// matching.
+  hash_code getHashValue() const override {
+    return hash_combine(GVNExpression::BasicExpression::getHashValue(),
+                        MemoryUseOrder, Volatile);
+  }
+
+  /// Hash of opcode, type and discriminators, combined with \p MapFn applied
+  /// to each stored user in turn.
+  template <typename Function> hash_code getHashValue(Function MapFn) {
+    hash_code H =
+        hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile);
+    for (auto *V : operands())
+      H = hash_combine(H, MapFn(V));
+    return H;
+  }
+};
+
+/// Value-number table keyed on how a value is *used*: instructions receive the
+/// same number when their InstructionUseExpr hashes (opcode, type,
+/// discriminators and the numbers of their users) coincide.
+class ValueTable {
+ // Value -> assigned number (cache of every lookupOrAdd() result).
+ DenseMap<Value *, uint32_t> ValueNumbering;
+ // Expression object -> number, keyed by pointer.
+ DenseMap<GVNExpression::Expression *, uint32_t> ExpressionNumbering;
+ // Expression hash -> number; this is what makes two instructions share a
+ // number.
+ DenseMap<size_t, uint32_t> HashNumbering;
+ // Backing storage for the expressions created by createExpr().
+ BumpPtrAllocator Allocator;
+ ArrayRecycler<Value *> Recycler;
+ // Next unused number; numbering starts at 1, so 0 never names a value.
+ uint32_t nextValueNumber;
+
+ /// Create an expression for I based on its opcode and its uses. If I
+ /// touches or reads memory, the expression is also based upon its memory
+ /// order - see \c getMemoryUseOrder().
+ InstructionUseExpr *createExpr(Instruction *I) {
+ InstructionUseExpr *E =
+ new (Allocator) InstructionUseExpr(I, Recycler, Allocator);
+ if (isMemoryInst(I))
+ E->setMemoryUseOrder(getMemoryUseOrder(I));
+
+ // Fold the comparison predicate into the opcode so that compares with
+ // different predicates hash differently.
+ if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+ CmpInst::Predicate Predicate = C->getPredicate();
+ E->setOpcode((C->getOpcode() << 8) | Predicate);
+ }
+ return E;
+ }
+
+ /// Helper to compute the value number for a memory instruction
+ /// (LoadInst/StoreInst), including checking the memory ordering and
+ /// volatility.
+ ///
+ /// Returns nullptr for atomic accesses; the caller then gives the
+ /// instruction a fresh, unshared number.
+ template <class Inst> InstructionUseExpr *createMemoryExpr(Inst *I) {
+ if (isStrongerThanUnordered(I->getOrdering()) || I->isAtomic())
+ return nullptr;
+ InstructionUseExpr *E = createExpr(I);
+ E->setVolatile(I->isVolatile());
+ return E;
+ }
+
+public:
+ /// Returns the value number for the specified value, assigning
+ /// it a new number if it did not have one before.
+ uint32_t lookupOrAdd(Value *V) {
+ auto VI = ValueNumbering.find(V);
+ if (VI != ValueNumbering.end())
+ return VI->second;
+
+ // Non-instructions always get a fresh number.
+ if (!isa<Instruction>(V)) {
+ ValueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ Instruction *I = cast<Instruction>(V);
+ InstructionUseExpr *exp = nullptr;
+ switch (I->getOpcode()) {
+ case Instruction::Load:
+ exp = createMemoryExpr(cast<LoadInst>(I));
+ break;
+ case Instruction::Store:
+ exp = createMemoryExpr(cast<StoreInst>(I));
+ break;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ case Instruction::Select:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::ShuffleVector:
+ case Instruction::InsertValue:
+ case Instruction::GetElementPtr:
+ exp = createExpr(I);
+ break;
+ default:
+ break;
+ }
+
+ // Unhandled opcodes and rejected memory accesses get a fresh number.
+ if (!exp) {
+ ValueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ // NOTE(review): ExpressionNumbering is keyed by pointer and exp was only
+ // just allocated, so this lookup presumably always yields 0 and the hash
+ // path below does the real work - confirm.
+ uint32_t e = ExpressionNumbering[exp];
+ if (!e) {
+ // Hash over the value numbers of the users; this recurses into
+ // lookupOrAdd() for each of them.
+ hash_code H = exp->getHashValue([=](Value *V) { return lookupOrAdd(V); });
+ auto I = HashNumbering.find(H);
+ if (I != HashNumbering.end()) {
+ e = I->second;
+ } else {
+ e = nextValueNumber++;
+ HashNumbering[H] = e;
+ ExpressionNumbering[exp] = e;
+ }
+ }
+ ValueNumbering[V] = e;
+ return e;
+ }
+
+ /// Returns the value number of the specified value. Fails if the value has
+ /// not yet been numbered.
+ uint32_t lookup(Value *V) const {
+ auto VI = ValueNumbering.find(V);
+ assert(VI != ValueNumbering.end() && "Value not numbered?");
+ return VI->second;
+ }
+
+ /// Removes all value numberings and resets the value table.
+ void clear() {
+ ValueNumbering.clear();
+ ExpressionNumbering.clear();
+ HashNumbering.clear();
+ Recycler.clear(Allocator);
+ nextValueNumber = 1;
+ }
+
+ ValueTable() : nextValueNumber(1) {}
+
+ /// \c Inst uses or touches memory. Return an ID describing the memory state
+ /// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2),
+ /// the exact same memory operations happen after I1 and I2.
+ ///
+ /// This is a very hard problem in general, so we use domain-specific
+ /// knowledge that we only ever check for equivalence between blocks sharing a
+ /// single immediate successor that is common, and when determining if I1 ==
+ /// I2 we will have already determined that next(I1) == next(I2). This
+ /// inductive property allows us to simply return the value number of the next
+ /// instruction that defines memory.
+ ///
+ /// Returns 0 when no such instruction follows \c Inst before the block's
+ /// terminator.
+ uint32_t getMemoryUseOrder(Instruction *Inst) {
+ auto *BB = Inst->getParent();
+ for (auto I = std::next(Inst->getIterator()), E = BB->end();
+ I != E && !I->isTerminator(); ++I) {
+ // Skip everything that cannot write memory: non-memory instructions,
+ // loads, and read-only calls/invokes.
+ if (!isMemoryInst(&*I))
+ continue;
+ if (isa<LoadInst>(&*I))
+ continue;
+ CallInst *CI = dyn_cast<CallInst>(&*I);
+ if (CI && CI->onlyReadsMemory())
+ continue;
+ InvokeInst *II = dyn_cast<InvokeInst>(&*I);
+ if (II && II->onlyReadsMemory())
+ continue;
+ return lookupOrAdd(&*I);
+ }
+ return 0;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+
+/// Driver for the sinking transformation: visits each block of a function and
+/// tries to sink common instructions from its predecessors into it.
+class GVNSink {
+public:
+  GVNSink() : VN() {}
+
+  /// Run over \p F in reverse post-order. Returns true if anything was sunk.
+  bool run(Function &F) {
+    DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n");
+
+    ReversePostOrderTraversal<Function *> RPOT(&F);
+    unsigned TotalSunk = 0;
+    for (auto *Block : RPOT)
+      TotalSunk += sinkBB(Block);
+
+    return TotalSunk > 0;
+  }
+
+private:
+  ValueTable VN;
+
+  /// PHIs, EH pads, allocas and token-typed instructions may change or break
+  /// semantics if moved, so they are never sunk.
+  bool isInstructionBlacklisted(Instruction *I) {
+    return isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
+           I->getType()->isTokenTy();
+  }
+
+  /// The main heuristic function. Analyze the set of instructions pointed to by
+  /// LRI and return a candidate solution if these instructions can be sunk, or
+  /// None otherwise.
+  Optional<SinkingInstructionCandidate> analyzeInstructionForSinking(
+      LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
+      ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents);
+
+  /// Model every leading PHI of \p BB, recording the models in \p PHIs and
+  /// their incoming values in \p PHIContents. Stops at the first non-PHI.
+  void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs,
+                          SmallPtrSetImpl<Value *> &PHIContents) {
+    for (Instruction &Inst : *BB) {
+      PHINode *Phi = dyn_cast<PHINode>(&Inst);
+      if (Phi == nullptr)
+        return;
+
+      ModelledPHI Model(Phi);
+      PHIs.insert(Model);
+      for (Value *Incoming : Model.getValues())
+        PHIContents.insert(Incoming);
+    }
+  }
+
+  /// The main instruction sinking driver. Set up state and try and sink
+  /// instructions into BBEnd from its predecessors.
+  unsigned sinkBB(BasicBlock *BBEnd);
+
+  /// Perform the actual mechanics of sinking an instruction from Blocks into
+  /// BBEnd, which is their only successor.
+  void sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, BasicBlock *BBEnd);
+
+  /// Delete each leading PHI of \p BB whose incoming values are all identical,
+  /// forwarding its uses to that value (or to undef when the PHI only refers
+  /// to itself).
+  void foldPointlessPHINodes(BasicBlock *BB) {
+    // The iterator is advanced before the PHI it pointed at may be erased.
+    for (auto It = BB->begin(); PHINode *PN = dyn_cast<PHINode>(It++);) {
+      auto MatchesFirst = [&](const Value *V) {
+        return V == PN->getIncomingValue(0);
+      };
+      if (!all_of(PN->incoming_values(), MatchesFirst))
+        continue;
+
+      Value *Common = PN->getIncomingValue(0);
+      Value *Replacement =
+          (Common != PN) ? Common : UndefValue::get(PN->getType());
+      PN->replaceAllUsesWith(Replacement);
+      PN->eraseFromParent();
+    }
+  }
+};
+
+/// Examine the instructions \p LRI currently points at (one per active
+/// predecessor) and decide whether they can be sunk as a group.
+///
+/// On success, returns a candidate describing the cumulative state after
+/// sinking this instruction; \p InstNum and \p MemoryInstNum are incremented
+/// and \p NeededPHIs / \p PHIContents are updated with the PHIs sinking would
+/// require. As a side effect, the active block set of \p LRI may be narrowed
+/// to the blocks sharing the most common value number. Returns None when the
+/// instructions cannot be sunk.
+Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
+ LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
+ ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents) {
+ auto Insts = *LRI;
+ DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I
+ : Insts) {
+ I->dump();
+ } dbgs() << " ]\n";);
+
+ // Count how many of the instructions share each value number.
+ DenseMap<uint32_t, unsigned> VNums;
+ for (auto *I : Insts) {
+ uint32_t N = VN.lookupOrAdd(I);
+ DEBUG(dbgs() << " VN=" << utohexstr(N) << " for" << *I << "\n");
+ if (N == ~0U)
+ return None;
+ VNums[N]++;
+ }
+ // Pick the value number shared by the most instructions.
+ unsigned VNumToSink =
+ std::max_element(VNums.begin(), VNums.end(),
+ [](const std::pair<uint32_t, unsigned> &I,
+ const std::pair<uint32_t, unsigned> &J) {
+ return I.second < J.second;
+ })
+ ->first;
+
+ if (VNums[VNumToSink] == 1)
+ // Can't sink anything!
+ return None;
+
+ // Now restrict the number of incoming blocks down to only those with
+ // VNumToSink.
+ auto &ActivePreds = LRI.getActiveBlocks();
+ unsigned InitialActivePredSize = ActivePreds.size();
+ SmallVector<Instruction *, 4> NewInsts;
+ for (auto *I : Insts) {
+ if (VN.lookup(I) != VNumToSink)
+ ActivePreds.erase(I->getParent());
+ else
+ NewInsts.push_back(I);
+ }
+ for (auto *I : NewInsts)
+ if (isInstructionBlacklisted(I))
+ return None;
+
+ // If we've restricted the incoming blocks, restrict all needed PHIs also
+ // to that set.
+ bool RecomputePHIContents = false;
+ if (ActivePreds.size() != InitialActivePredSize) {
+ ModelledPHISet NewNeededPHIs;
+ for (auto P : NeededPHIs) {
+ P.restrictToBlocks(ActivePreds);
+ NewNeededPHIs.insert(P);
+ }
+ NeededPHIs = NewNeededPHIs;
+ LRI.restrictToBlocks(ActivePreds);
+ RecomputePHIContents = true;
+ }
+
+ // The sunk instruction's results.
+ ModelledPHI NewPHI(NewInsts, ActivePreds);
+
+ // Does sinking this instruction render previous PHIs redundant?
+ if (NeededPHIs.find(NewPHI) != NeededPHIs.end()) {
+ NeededPHIs.erase(NewPHI);
+ RecomputePHIContents = true;
+ }
+
+ if (RecomputePHIContents) {
+ // The needed PHIs have changed, so recompute the set of all needed
+ // values.
+ PHIContents.clear();
+ for (auto &PHI : NeededPHIs)
+ PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+ }
+
+ // Is this instruction required by a later PHI that doesn't match this PHI?
+ // if so, we can't sink this instruction.
+ for (auto *V : NewPHI.getValues())
+ if (PHIContents.count(V))
+ // V exists in this PHI, but the whole PHI is different to NewPHI
+ // (else it would have been removed earlier). We cannot continue
+ // because this isn't representable.
+ return None;
+
+ // Which operands need PHIs?
+ // FIXME: If any of these fail, we should partition up the candidates to
+ // try and continue making progress.
+ Instruction *I0 = NewInsts[0];
+ for (unsigned OpNum = 0, E = I0->getNumOperands(); OpNum != E; ++OpNum) {
+ ModelledPHI PHI(NewInsts, OpNum, ActivePreds);
+ if (PHI.areAllIncomingValuesSame())
+ continue;
+ if (!canReplaceOperandWithVariable(I0, OpNum))
+ // We can't create a PHI from this instruction!
+ return None;
+ if (NeededPHIs.count(PHI))
+ continue;
+ if (!PHI.areAllIncomingValuesSameType())
+ return None;
+ // Don't create indirect calls! The called value is the final operand.
+ if ((isa<CallInst>(I0) || isa<InvokeInst>(I0)) && OpNum == E - 1 &&
+ PHI.areAnyIncomingValuesConstant())
+ return None;
+
+ // NOTE(review): reserving the current size looks like a no-op; its purpose
+ // is not evident from this file - confirm before removing.
+ NeededPHIs.reserve(NeededPHIs.size());
+ NeededPHIs.insert(PHI);
+ PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+ }
+
+ if (isMemoryInst(NewInsts[0]))
+ ++MemoryInstNum;
+
+ // Snapshot the cumulative state; the caller computes the cost afterwards.
+ SinkingInstructionCandidate Cand;
+ Cand.NumInstructions = ++InstNum;
+ Cand.NumMemoryInsts = MemoryInstNum;
+ Cand.NumBlocks = ActivePreds.size();
+ Cand.NumPHIs = NeededPHIs.size();
+ for (auto *C : ActivePreds)
+ Cand.Blocks.push_back(C);
+
+ return Cand;
+}
+
+/// Try to sink instructions from the predecessors of \p BBEnd into it (or into
+/// a new block split off its incoming edges). Returns the number of
+/// instructions sunk per predecessor, or 0 if nothing was done.
+unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
+ DEBUG(dbgs() << "GVNSink: running on basic block ";
+ BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
+ // Give up entirely if any predecessor ends in something other than a
+ // branch or a switch.
+ SmallVector<BasicBlock *, 4> Preds;
+ for (auto *B : predecessors(BBEnd)) {
+ auto *T = B->getTerminator();
+ if (isa<BranchInst>(T) || isa<SwitchInst>(T))
+ Preds.push_back(B);
+ else
+ return 0;
+ }
+ if (Preds.size() < 2)
+ return 0;
+ std::sort(Preds.begin(), Preds.end());
+
+ // Remember the predecessor count before filtering; it decides below whether
+ // the incoming edges have to be split.
+ unsigned NumOrigPreds = Preds.size();
+ // We can only sink instructions through unconditional branches.
+ for (auto I = Preds.begin(); I != Preds.end();) {
+ if ((*I)->getTerminator()->getNumSuccessors() != 1)
+ I = Preds.erase(I);
+ else
+ ++I;
+ }
+
+ LockstepReverseIterator LRI(Preds);
+ SmallVector<SinkingInstructionCandidate, 4> Candidates;
+ unsigned InstNum = 0, MemoryInstNum = 0;
+ ModelledPHISet NeededPHIs;
+ SmallPtrSet<Value *, 4> PHIContents;
+ analyzeInitialPHIs(BBEnd, NeededPHIs, PHIContents);
+ unsigned NumOrigPHIs = NeededPHIs.size();
+
+ // Walk backwards through the predecessors in lockstep. Each successful step
+ // yields a candidate for sinking that many instructions; stop at the first
+ // step that cannot be sunk.
+ while (LRI.isValid()) {
+ auto Cand = analyzeInstructionForSinking(LRI, InstNum, MemoryInstNum,
+ NeededPHIs, PHIContents);
+ if (!Cand)
+ break;
+ // Note: Preds.size() here is the count after the filtering above.
+ Cand->calculateCost(NumOrigPHIs, Preds.size());
+ Candidates.emplace_back(*Cand);
+ --LRI;
+ }
+
+ // Best (highest Cost) candidate first; ties keep their discovery order.
+ std::stable_sort(
+ Candidates.begin(), Candidates.end(),
+ [](const SinkingInstructionCandidate &A,
+ const SinkingInstructionCandidate &B) { return A > B; });
+ DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C
+ : Candidates) dbgs()
+ << " " << C << "\n";);
+
+ // Pick the top candidate, as long it is positive!
+ if (Candidates.empty() || Candidates.front().Cost <= 0)
+ return 0;
+ auto C = Candidates.front();
+
+ DEBUG(dbgs() << " -- Sinking: " << C << "\n");
+ BasicBlock *InsertBB = BBEnd;
+ // If only some of the original predecessors take part, split their edges
+ // into a new block and sink into that instead of BBEnd.
+ if (C.Blocks.size() < NumOrigPreds) {
+ DEBUG(dbgs() << " -- Splitting edge to "; BBEnd->printAsOperand(dbgs());
+ dbgs() << "\n");
+ InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split");
+ if (!InsertBB) {
+ DEBUG(dbgs() << " -- FAILED to split edge!\n");
+ // Edge couldn't be split.
+ return 0;
+ }
+ }
+
+ // Sink one instruction per iteration, always the one that is currently last
+ // before the terminator in each participating block.
+ for (unsigned I = 0; I < C.NumInstructions; ++I)
+ sinkLastInstruction(C.Blocks, InsertBB);
+
+ return C.NumInstructions;
+}
+
+/// Merge the instruction immediately preceding the terminator of each block in
+/// \p Blocks into a single instruction placed in \p BBEnd. Operands that
+/// differ between the blocks are routed through newly created PHIs in
+/// \p BBEnd; the remaining copies are erased and counted in NumRemoved.
+void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks,
+ BasicBlock *BBEnd) {
+ // Collect the instruction just before the terminator of every block.
+ SmallVector<Instruction *, 4> Insts;
+ for (BasicBlock *BB : Blocks)
+ Insts.push_back(BB->getTerminator()->getPrevNode());
+ Instruction *I0 = Insts.front();
+
+ // Work out the operand list of the merged instruction.
+ SmallVector<Value *, 4> NewOperands;
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
+ bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) {
+ return I->getOperand(O) != I0->getOperand(O);
+ });
+ if (!NeedPHI) {
+ NewOperands.push_back(I0->getOperand(O));
+ continue;
+ }
+
+ // Create a new PHI in the successor block and populate it.
+ auto *Op = I0->getOperand(O);
+ assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
+ auto *PN = PHINode::Create(Op->getType(), Insts.size(),
+ Op->getName() + ".sink", &BBEnd->front());
+ for (auto *I : Insts)
+ PN->addIncoming(I->getOperand(O), I->getParent());
+ NewOperands.push_back(PN);
+ }
+
+ // Arbitrarily use I0 as the new "common" instruction; remap its operands
+ // and move it to the start of the successor block.
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
+ I0->getOperandUse(O).set(NewOperands[O]);
+ I0->moveBefore(&*BBEnd->getFirstInsertionPt());
+
+ // Update metadata and IR flags.
+ for (auto *I : Insts)
+ if (I != I0) {
+ combineMetadataForCSE(I0, I);
+ I0->andIRFlags(I);
+ }
+
+ // Redirect users of the duplicates to I0 before folding PHIs, so that PHIs
+ // whose incoming values have all become I0 are cleaned up.
+ for (auto *I : Insts)
+ if (I != I0)
+ I->replaceAllUsesWith(I0);
+ foldPointlessPHINodes(BBEnd);
+
+ // Finally nuke all instructions apart from the common instruction.
+ for (auto *I : Insts)
+ if (I != I0)
+ I->eraseFromParent();
+
+ NumRemoved += Insts.size() - 1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Pass machinery / boilerplate
+
+/// Legacy pass-manager wrapper that runs GVNSink over a single function.
+class GVNSinkLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  GVNSinkLegacyPass() : FunctionPass(ID) {
+    PassRegistry &Registry = *PassRegistry::getPassRegistry();
+    initializeGVNSinkLegacyPassPass(Registry);
+  }
+
+  /// Returns true if the function was modified. Nothing is done (and false is
+  /// returned) when skipFunction() says this function must be left alone.
+  bool runOnFunction(Function &F) override {
+    return !skipFunction(F) && GVNSink().run(F);
+  }
+
+  /// The only analysis declared as preserved is GlobalsAA.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+};
+} // namespace
+
+/// New pass-manager entry point. Reports every analysis as preserved when the
+/// function is unchanged, and only GlobalsAA otherwise.
+PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) {
+  GVNSink Sinker;
+  if (Sinker.run(F)) {
+    PreservedAnalyses Preserved;
+    Preserved.preserve<GlobalsAA>();
+    return Preserved;
+  }
+  return PreservedAnalyses::all();
+}
+
+// Storage for the legacy pass's ID member.
+char GVNSinkLegacyPass::ID = 0;
+// Register the legacy pass under the command-line name "gvn-sink".
+INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink",
+ "Early GVN sinking of Expressions", false, false)
+// NOTE(review): the two dependencies below are declared here, but
+// GVNSinkLegacyPass::getAnalysisUsage() does not addRequired either of them.
+// Confirm whether they are still needed.
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(GVNSinkLegacyPass, "gvn-sink",
+ "Early GVN sinking of Expressions", false, false)
+
+// Factory for the legacy pass; returns a newly allocated pass object.
+FunctionPass *llvm::createGVNSinkPass() { return new GVNSinkLegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index b05ef00..fb7c6e1 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -40,7 +40,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/GuardWidening.h"
-#include "llvm/Pass.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -50,7 +49,9 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
@@ -536,10 +537,8 @@ bool GuardWideningImpl::parseRangeChecks(
Changed = true;
} else if (match(Check.getBase(),
m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
- unsigned BitWidth = OpLHS->getType()->getScalarSizeInBits();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- computeKnownBits(OpLHS, KnownZero, KnownOne, DL);
- if ((OpRHS->getValue() & KnownZero) == OpRHS->getValue()) {
+ KnownBits Known = computeKnownBits(OpLHS, DL);
+ if ((OpRHS->getValue() & Known.Zero) == OpRHS->getValue()) {
Check.setBase(OpLHS);
APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
Check.setOffset(ConstantInt::get(Ctx, NewOffset));
@@ -568,8 +567,7 @@ bool GuardWideningImpl::combineRangeChecks(
return RC.getBase() == CurrentBase && RC.getLength() == CurrentLength;
};
- std::copy_if(Checks.begin(), Checks.end(),
- std::back_inserter(CurrentChecks), IsCurrentCheck);
+ copy_if(Checks, std::back_inserter(CurrentChecks), IsCurrentCheck);
Checks.erase(remove_if(Checks, IsCurrentCheck), Checks.end());
assert(CurrentChecks.size() != 0 && "We know we have at least one!");
@@ -613,16 +611,16 @@ bool GuardWideningImpl::combineRangeChecks(
// We have a series of f+1 checks as:
//
// I+k_0 u< L ... Chk_0
- // I_k_1 u< L ... Chk_1
+ // I+k_1 u< L ... Chk_1
// ...
- // I_k_f u< L ... Chk_(f+1)
+ // I+k_f u< L ... Chk_f
//
- // with forall i in [0,f): k_f-k_i u< k_f-k_0 ... Precond_0
+ // with forall i in [0,f]: k_f-k_i u< k_f-k_0 ... Precond_0
// k_f-k_0 u< INT_MIN+k_f ... Precond_1
// k_f != k_0 ... Precond_2
//
// Claim:
- // Chk_0 AND Chk_(f+1) implies all the other checks
+ // Chk_0 AND Chk_f implies all the other checks
//
// Informal proof sketch:
//
@@ -658,8 +656,12 @@ PreservedAnalyses GuardWideningPass::run(Function &F,
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LI = AM.getResult<LoopAnalysis>(F);
auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- bool Changed = GuardWideningImpl(DT, PDT, LI).run();
- return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+ if (!GuardWideningImpl(DT, PDT, LI).run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 1752fb7..1078296 100644
--- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -86,6 +86,10 @@ static cl::opt<bool> UsePostIncrementRanges(
cl::desc("Use post increment control-dependent ranges in IndVarSimplify"),
cl::init(true));
+static cl::opt<bool>
+DisableLFTR("disable-lftr", cl::Hidden, cl::init(false),
+ cl::desc("Disable Linear Function Test Replace optimization"));
+
namespace {
struct RewritePhi;
@@ -97,7 +101,7 @@ class IndVarSimplify {
TargetLibraryInfo *TLI;
const TargetTransformInfo *TTI;
- SmallVector<WeakVH, 16> DeadInsts;
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
bool Changed = false;
bool isValidRewrite(Value *FromVal, Value *ToVal);
@@ -231,8 +235,9 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
bool isExact = false;
// See if we can convert this to an int64_t
uint64_t UIntVal;
- if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero,
- &isExact) != APFloat::opOK || !isExact)
+ if (APF.convertToInteger(makeMutableArrayRef(UIntVal), 64, true,
+ APFloat::rmTowardZero, &isExact) != APFloat::opOK ||
+ !isExact)
return false;
IntVal = UIntVal;
return true;
@@ -414,8 +419,8 @@ void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
Compare->getName());
// In the following deletions, PN may become dead and may be deleted.
- // Use a WeakVH to observe whether this happens.
- WeakVH WeakPH = PN;
+ // Use a WeakTrackingVH to observe whether this happens.
+ WeakTrackingVH WeakPH = PN;
// Delete the old floating point exit comparison. The branch starts using the
// new comparison.
@@ -450,7 +455,7 @@ void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
//
BasicBlock *Header = L->getHeader();
- SmallVector<WeakVH, 8> PHIs;
+ SmallVector<WeakTrackingVH, 8> PHIs;
for (BasicBlock::iterator I = Header->begin();
PHINode *PN = dyn_cast<PHINode>(I); ++I)
PHIs.push_back(PN);
@@ -900,13 +905,13 @@ class WidenIV {
PHINode *WidePhi;
Instruction *WideInc;
const SCEV *WideIncExpr;
- SmallVectorImpl<WeakVH> &DeadInsts;
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts;
SmallPtrSet<Instruction *,16> Widened;
SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
enum ExtendKind { ZeroExtended, SignExtended, Unknown };
- // A map tracking the kind of extension used to widen each narrow IV
+ // A map tracking the kind of extension used to widen each narrow IV
// and narrow IV user.
// Key: pointer to a narrow IV or IV user.
// Value: the kind of extension used to widen this Instruction.
@@ -940,20 +945,13 @@ class WidenIV {
}
public:
- WidenIV(const WideIVInfo &WI, LoopInfo *LInfo,
- ScalarEvolution *SEv, DominatorTree *DTree,
- SmallVectorImpl<WeakVH> &DI, bool HasGuards) :
- OrigPhi(WI.NarrowIV),
- WideType(WI.WidestNativeType),
- LI(LInfo),
- L(LI->getLoopFor(OrigPhi->getParent())),
- SE(SEv),
- DT(DTree),
- HasGuards(HasGuards),
- WidePhi(nullptr),
- WideInc(nullptr),
- WideIncExpr(nullptr),
- DeadInsts(DI) {
+ WidenIV(const WideIVInfo &WI, LoopInfo *LInfo, ScalarEvolution *SEv,
+ DominatorTree *DTree, SmallVectorImpl<WeakTrackingVH> &DI,
+ bool HasGuards)
+ : OrigPhi(WI.NarrowIV), WideType(WI.WidestNativeType), LI(LInfo),
+ L(LI->getLoopFor(OrigPhi->getParent())), SE(SEv), DT(DTree),
+ HasGuards(HasGuards), WidePhi(nullptr), WideInc(nullptr),
+ WideIncExpr(nullptr), DeadInsts(DI) {
assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV");
ExtendKindMap[OrigPhi] = WI.IsSigned ? SignExtended : ZeroExtended;
}
@@ -1608,7 +1606,7 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
return;
CmpInst::Predicate P =
- TrueDest ? Pred : CmpInst::getInversePredicate(Pred);
+ TrueDest ? Pred : CmpInst::getInversePredicate(Pred);
auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS));
auto CmpConstrainedLHSRange =
@@ -1634,7 +1632,7 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
UpdateRangeFromGuards(NarrowUser);
BasicBlock *NarrowUserBB = NarrowUser->getParent();
- // If NarrowUserBB is statically unreachable asking dominator queries may
+ // If NarrowUserBB is statically unreachable asking dominator queries may
// yield surprising results. (e.g. the block may not have a dom tree node)
if (!DT->isReachableFromEntry(NarrowUserBB))
return;
@@ -1829,6 +1827,7 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) {
// An IV counter must preserve its type.
if (IncI->getNumOperands() == 2)
break;
+ LLVM_FALLTHROUGH;
default:
return nullptr;
}
@@ -2152,6 +2151,8 @@ linearFunctionTestReplace(Loop *L,
Value *CmpIndVar = IndVar;
const SCEV *IVCount = BackedgeTakenCount;
+ assert(L->getLoopLatch() && "Loop no longer in simplified form?");
+
// If the exiting block is the same as the backedge block, we prefer to
// compare against the post-incremented value, otherwise we must compare
// against the preincremented value.
@@ -2376,6 +2377,7 @@ bool IndVarSimplify::run(Loop *L) {
// Loop::getCanonicalInductionVariable only supports loops with preheaders,
// and we're in trouble if we can't find the induction variable even when
// we've manually inserted one.
+ // - LFTR relies on having a single backedge.
if (!L->isLoopSimplifyForm())
return false;
@@ -2415,7 +2417,8 @@ bool IndVarSimplify::run(Loop *L) {
// If we have a trip count expression, rewrite the loop's exit condition
// using it. We can currently only handle loops with a single exit.
- if (canExpandBackedgeTakenCount(L, SE, Rewriter) && needsLFTR(L, DT)) {
+ if (!DisableLFTR && canExpandBackedgeTakenCount(L, SE, Rewriter) &&
+ needsLFTR(L, DT)) {
PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT);
if (IndVar) {
// Check preconditions for proper SCEVExpander operation. SCEV does not
@@ -2492,8 +2495,9 @@ PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
if (!IVS.run(&L))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
namespace {
diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 8e81541..99b4458 100644
--- a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -59,8 +59,8 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
@@ -446,6 +446,15 @@ struct LoopStructure {
BasicBlock *LatchExit;
unsigned LatchBrExitIdx;
+ // The loop represented by this instance of LoopStructure is semantically
+ // equivalent to:
+ //
+ // intN_ty inc = IndVarIncreasing ? 1 : -1;
+ // pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT;
+ //
+ // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarNext)
+ // ... body ...
+
Value *IndVarNext;
Value *IndVarStart;
Value *LoopExitAt;
@@ -789,9 +798,32 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
+ const SCEV *StartNext = IndVarNext->getStart();
+ const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
+ const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+
ConstantInt *One = ConstantInt::get(IndVarTy, 1);
// TODO: generalize the predicates here to also match their unsigned variants.
if (IsIncreasing) {
+ bool DecreasedRightValueByOne = false;
+ // Try to turn eq/ne predicates to those we can work with.
+ if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+ // while (++i != len) { while (++i < len) {
+ // ... ---> ...
+ // } }
+ Pred = ICmpInst::ICMP_SLT;
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
+ !CanBeSMin(SE, RightSCEV)) {
+ // while (true) { while (true) {
+ // if (++i == len) ---> if (++i > len - 1)
+ // break; break;
+ // ... ...
+ // } }
+ Pred = ICmpInst::ICMP_SGT;
+ RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
+ DecreasedRightValueByOne = true;
+ }
+
bool FoundExpectedPred =
(Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) ||
(Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0);
@@ -809,11 +841,48 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
- IRBuilder<> B(Preheader->getTerminator());
- RightValue = B.CreateAdd(RightValue, One);
- }
+ if (!SE.isLoopEntryGuardedByCond(
+ &L, CmpInst::ICMP_SLT, IndVarStart,
+ SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())))) {
+ FailureReason = "Induction variable start not bounded by upper limit";
+ return None;
+ }
+ // We need to increase the right value unless we have already decreased
+ // it virtually when we replaced EQ with SGT.
+ if (!DecreasedRightValueByOne) {
+ IRBuilder<> B(Preheader->getTerminator());
+ RightValue = B.CreateAdd(RightValue, One);
+ }
+ } else {
+ if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SLT, IndVarStart,
+ RightSCEV)) {
+ FailureReason = "Induction variable start not bounded by upper limit";
+ return None;
+ }
+ assert(!DecreasedRightValueByOne &&
+ "Right value can be decreased only for LatchBrExitIdx == 0!");
+ }
} else {
+ bool IncreasedRightValueByOne = false;
+ // Try to turn eq/ne predicates to those we can work with.
+ if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+ // while (--i != len) { while (--i > len) {
+ // ... ---> ...
+ // } }
+ Pred = ICmpInst::ICMP_SGT;
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
+ !CanBeSMax(SE, RightSCEV)) {
+ // while (true) { while (true) {
+ // if (--i == len) ---> if (--i < len + 1)
+ // break; break;
+ // ... ...
+ // } }
+ Pred = ICmpInst::ICMP_SLT;
+ RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ IncreasedRightValueByOne = true;
+ }
+
bool FoundExpectedPred =
(Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) ||
(Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0);
@@ -831,15 +900,30 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
- IRBuilder<> B(Preheader->getTerminator());
- RightValue = B.CreateSub(RightValue, One);
+ if (!SE.isLoopEntryGuardedByCond(
+ &L, CmpInst::ICMP_SGT, IndVarStart,
+ SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) {
+ FailureReason = "Induction variable start not bounded by lower limit";
+ return None;
+ }
+
+ // We need to decrease the right value unless we have already increased
+ // it virtually when we replaced EQ with SLT.
+ if (!IncreasedRightValueByOne) {
+ IRBuilder<> B(Preheader->getTerminator());
+ RightValue = B.CreateSub(RightValue, One);
+ }
+ } else {
+ if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SGT, IndVarStart,
+ RightSCEV)) {
+ FailureReason = "Induction variable start not bounded by lower limit";
+ return None;
+ }
+ assert(!IncreasedRightValueByOne &&
+ "Right value can be increased only for LatchBrExitIdx == 0!");
}
}
- const SCEV *StartNext = IndVarNext->getStart();
- const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
- const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
-
BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
assert(SE.getLoopDisposition(LatchCount, &L) ==
@@ -883,20 +967,23 @@ LoopConstrainer::calculateSubRanges() const {
// I think we can be more aggressive here and make this nuw / nsw if the
// addition that feeds into the icmp for the latch's terminating branch is nuw
// / nsw. In any case, a wrapping 2's complement addition is safe.
- ConstantInt *One = ConstantInt::get(Ty, 1);
const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart);
const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt);
bool Increasing = MainLoopStructure.IndVarIncreasing;
- // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the
- // range of values the induction variable takes.
+ // We compute `Smallest` and `Greatest` such that [Smallest, Greatest), or
+ // [Smallest, GreatestSeen] is the range of values the induction variable
+ // takes.
- const SCEV *Smallest = nullptr, *Greatest = nullptr;
+ const SCEV *Smallest = nullptr, *Greatest = nullptr, *GreatestSeen = nullptr;
+ const SCEV *One = SE.getOne(Ty);
if (Increasing) {
Smallest = Start;
Greatest = End;
+ // No overflow, because the range [Smallest, GreatestSeen] is not empty.
+ GreatestSeen = SE.getMinusSCEV(End, One);
} else {
// These two computations may sign-overflow. Here is why that is okay:
//
@@ -914,8 +1001,9 @@ LoopConstrainer::calculateSubRanges() const {
// will be an empty range. Returning an empty range is always safe.
//
- Smallest = SE.getAddExpr(End, SE.getSCEV(One));
- Greatest = SE.getAddExpr(Start, SE.getSCEV(One));
+ Smallest = SE.getAddExpr(End, One);
+ Greatest = SE.getAddExpr(Start, One);
+ GreatestSeen = Start;
}
auto Clamp = [this, Smallest, Greatest](const SCEV *S) {
@@ -930,7 +1018,7 @@ LoopConstrainer::calculateSubRanges() const {
Result.LowLimit = Clamp(Range.getBegin());
bool ProvablyNoPostLoop =
- SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd());
+ SE.isKnownPredicate(ICmpInst::ICMP_SLT, GreatestSeen, Range.getEnd());
if (!ProvablyNoPostLoop)
Result.HighLimit = Clamp(Range.getEnd());
@@ -1194,7 +1282,12 @@ void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
ValueToValueMapTy &VM) {
- Loop &New = LPM.addLoop(Parent);
+ Loop &New = *new Loop();
+ if (Parent)
+ Parent->addChildLoop(&New);
+ else
+ LI.addTopLevelLoop(&New);
+ LPM.addLoop(New);
// Add all of the blocks in Original to the new loop.
for (auto *BB : Original->blocks())
@@ -1332,28 +1425,35 @@ bool LoopConstrainer::run() {
DT.recalculate(F);
+ // We need to first add all the pre and post loop blocks into the loop
+ // structures (as part of createClonedLoopStructure), and then update the
+ // LCSSA form and LoopSimplifyForm. This is necessary for correctly updating
+ // LI when LoopSimplifyForm is generated.
+ Loop *PreL = nullptr, *PostL = nullptr;
if (!PreLoop.Blocks.empty()) {
- auto *L = createClonedLoopStructure(
+ PreL = createClonedLoopStructure(
&OriginalLoop, OriginalLoop.getParentLoop(), PreLoop.Map);
- formLCSSARecursively(*L, DT, &LI, &SE);
- simplifyLoop(L, &DT, &LI, &SE, nullptr, true);
- // Pre loops are slow paths, we do not need to perform any loop
- // optimizations on them.
- DisableAllLoopOptsOnLoop(*L);
}
if (!PostLoop.Blocks.empty()) {
- auto *L = createClonedLoopStructure(
+ PostL = createClonedLoopStructure(
&OriginalLoop, OriginalLoop.getParentLoop(), PostLoop.Map);
+ }
+
+ // This function canonicalizes the loop into Loop-Simplify and LCSSA forms.
+ auto CanonicalizeLoop = [&] (Loop *L, bool IsOriginalLoop) {
formLCSSARecursively(*L, DT, &LI, &SE);
simplifyLoop(L, &DT, &LI, &SE, nullptr, true);
- // Post loops are slow paths, we do not need to perform any loop
+ // Pre/post loops are slow paths, we do not need to perform any loop
// optimizations on them.
- DisableAllLoopOptsOnLoop(*L);
- }
-
- formLCSSARecursively(OriginalLoop, DT, &LI, &SE);
- simplifyLoop(&OriginalLoop, &DT, &LI, &SE, nullptr, true);
+ if (!IsOriginalLoop)
+ DisableAllLoopOptsOnLoop(*L);
+ };
+ if (PreL)
+ CanonicalizeLoop(PreL, false);
+ if (PostL)
+ CanonicalizeLoop(PostL, false);
+ CanonicalizeLoop(&OriginalLoop, true);
return true;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
new file mode 100644
index 0000000..89b28f0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -0,0 +1,969 @@
+//===-- InferAddressSpaces.cpp - -------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// CUDA C/C++ includes memory space designation as variable type qualifiers (such
+// as __global__ and __shared__). Knowing the space of a memory access allows
+// CUDA compilers to emit faster PTX loads and stores. For example, a load from
+// shared memory can be translated to `ld.shared` which is roughly 10% faster
+// than a generic `ld` on an NVIDIA Tesla K40c.
+//
+// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
+// compilers must infer the memory space of an address expression from
+// type-qualified variables.
+//
+// LLVM IR uses non-zero (so-called) specific address spaces to represent memory
+// spaces (e.g. addrspace(3) means shared memory). The Clang frontend
+// places only type-qualified variables in specific address spaces, and then
+// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
+// (the so-called generic address space) for other instructions to use.
+//
+// For example, Clang translates the following CUDA code
+// __shared__ float a[10];
+// float v = a[i];
+// to
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
+// %v = load float, float* %1 ; emits ld.f32
+// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
+// redirected to %0 (the generic version of @a).
+//
+// The optimization implemented in this file propagates specific address spaces
+// from type-qualified variable declarations to their users. For example, it
+// optimizes the above IR to
+// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %v = load float addrspace(3)* %1 ; emits ld.shared.f32
+// propagating the addrspace(3) from @a to %1. As a result, the NVPTX
+// codegen is able to emit ld.shared.f32 for %v.
+//
+// Address space inference works in two steps. First, it uses a data-flow
+// analysis to infer as many generic pointers as possible to point to only one
+// specific address space. In the above example, it can prove that %1 only
+// points to addrspace(3). This algorithm was published in
+// CUDA: Compiling and optimizing for a GPU platform
+// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
+// ICCS 2012
+//
+// Then, address space inference replaces all refinable generic pointers with
+// equivalent specific pointers.
+//
+// The major challenge of implementing this optimization is handling PHINodes,
+// which may create loops in the data flow graph. This brings two complications.
+//
+// First, the data flow analysis in Step 1 needs to be circular. For example,
+// %generic.input = addrspacecast float addrspace(3)* %input to float*
+// loop:
+// %y = phi [ %generic.input, %y2 ]
+// %y2 = getelementptr %y, 1
+// %v = load %y2
+// br ..., label %loop, ...
+// proving %y specific requires proving both %generic.input and %y2 specific,
+// but proving %y2 specific circles back to %y. To address this complication,
+// the data flow analysis operates on a lattice:
+// uninitialized > specific address spaces > generic.
+// All address expressions (only phi, bitcast, addrspacecast, getelementptr,
+// and select are considered) start with the uninitialized address space.
+// The monotone transfer function moves the address space of a pointer down a
+// lattice path from uninitialized to specific and then to generic. A join
+// operation of two different specific address spaces pushes the expression down
+// to the generic address space. The analysis completes once it reaches a fixed
+// point.
+//
+// Second, IR rewriting in Step 2 also needs to be circular. For example,
+// converting %y to addrspace(3) requires the compiler to know the converted
+// %y2, but converting %y2 needs the converted %y. To address this complication,
+// we break these cycles using "undef" placeholders. When converting an
+// instruction `I` to a new address space, if its operand `Op` is not converted
+// yet, we let `I` temporarily use `undef` and fix all the uses of undef later.
+// For instance, our algorithm first converts %y to
+// %y' = phi float addrspace(3)* [ %input, undef ]
+// Then, it converts %y2 to
+// %y2' = getelementptr %y', 1
+// Finally, it fixes the undef in %y' so that
+// %y' = phi float addrspace(3)* [ %input, %y2' ]
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+#define DEBUG_TYPE "infer-address-spaces"
+
+using namespace llvm;
+
+namespace {
+static const unsigned UninitializedAddressSpace = ~0u;
+
+using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
+
+/// \brief InferAddressSpaces
+class InferAddressSpaces : public FunctionPass {
+ /// Target-specific address space whose uses should be replaced if
+ /// possible.
+ unsigned FlatAddrSpace;
+
+public:
+ static char ID;
+
+ InferAddressSpaces() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // Returns the new address space of V if updated; otherwise, returns None.
+ Optional<unsigned>
+ updateAddressSpace(const Value &V,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace) const;
+
+ // Tries to infer the specific address space of each address expression in
+ // Postorder.
+ void inferAddressSpaces(ArrayRef<WeakTrackingVH> Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const;
+
+ bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
+
+ // Changes the flat address expressions in function F to point to specific
+ // address spaces if InferredAddrSpace says so. Postorder is the postorder of
+ // all flat expressions in the use-def graph of function F.
+ bool
+ rewriteWithNewAddressSpaces(ArrayRef<WeakTrackingVH> Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace,
+ Function *F) const;
+
+ void appendsFlatAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const;
+
+ bool rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV, Value *NewV) const;
+ void collectRewritableIntrinsicOperands(
+ IntrinsicInst *II,
+ std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const;
+
+ std::vector<WeakTrackingVH> collectFlatAddressExpressions(Function &F) const;
+
+ Value *cloneValueWithNewAddressSpace(
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const;
+ unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const;
+};
+} // end anonymous namespace
+
+char InferAddressSpaces::ID = 0;
+
+namespace llvm {
+void initializeInferAddressSpacesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
+ false, false)
+
+// Returns true if V is an address expression.
+// TODO: Currently, we consider only phi, bitcast, addrspacecast,
+// getelementptr, and select operators.
+static bool isAddressExpression(const Value &V) {
+ if (!isa<Operator>(V))
+ return false;
+
+ switch (cast<Operator>(V).getOpcode()) {
+ case Instruction::PHI:
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ case Instruction::Select:
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Returns the pointer operands of V.
+//
+// Precondition: V is an address expression.
+static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
+ const Operator &Op = cast<Operator>(V);
+ switch (Op.getOpcode()) {
+ case Instruction::PHI: {
+ auto IncomingValues = cast<PHINode>(Op).incoming_values();
+ return SmallVector<Value *, 2>(IncomingValues.begin(),
+ IncomingValues.end());
+ }
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return {Op.getOperand(0)};
+ case Instruction::Select:
+ return {Op.getOperand(1), Op.getOperand(2)};
+ default:
+ llvm_unreachable("Unexpected instruction type.");
+ }
+}
+
+// TODO: Move logic to TTI?
+bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV,
+ Value *NewV) const {
+ Module *M = II->getParent()->getParent()->getParent();
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:{
+ const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
+ if (!IsVolatile || !IsVolatile->isZero())
+ return false;
+
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::objectsize: {
+ Type *DestTy = II->getType();
+ Type *SrcTy = NewV->getType();
+ Function *NewDecl =
+ Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
+ II->setArgOperand(0, NewV);
+ II->setCalledFunction(NewDecl);
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+// TODO: Move logic to TTI?
+void InferAddressSpaces::collectRewritableIntrinsicOperands(
+ IntrinsicInst *II, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::objectsize:
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
+ PostorderStack, Visited);
+ break;
+ default:
+ break;
+ }
+}
+
+// If V is an unvisited flat address expression, appends V to PostorderStack
+// and marks it as visited. Address expressions hidden in constant-expression
+// operands of V are appended as well.
+void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const {
+ assert(V->getType()->isPointerTy());
+
+ // Generic addressing expressions may be hidden in nested constant
+ // expressions.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ // TODO: Look in non-address parts, like icmp operands.
+ if (isAddressExpression(*CE) && Visited.insert(CE).second)
+ PostorderStack.push_back(std::make_pair(CE, false));
+
+ return;
+ }
+
+ if (isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
+ if (Visited.insert(V).second) {
+ PostorderStack.push_back(std::make_pair(V, false));
+
+ Operator *Op = cast<Operator>(V);
+ for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op->getOperand(I))) {
+ if (isAddressExpression(*CE) && Visited.insert(CE).second)
+ PostorderStack.emplace_back(CE, false);
+ }
+ }
+ }
+ }
+}
+
+// Returns all flat address expressions in function F. The elements are
+// ordered in postorder.
+std::vector<WeakTrackingVH>
+InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
+ // This function implements a non-recursive postorder traversal of a partial
+ // use-def graph of function F.
+ std::vector<std::pair<Value *, bool>> PostorderStack;
+ // The set of visited expressions.
+ DenseSet<Value *> Visited;
+
+ auto PushPtrOperand = [&](Value *Ptr) {
+ appendsFlatAddressExpressionToPostorderStack(Ptr, PostorderStack,
+ Visited);
+ };
+
+ // Look at operations that may be interesting to accelerate by moving to a
+ // known address space. We are mainly after loads and stores, but pure
+ // addressing calculations may also be faster.
+ for (Instruction &I : instructions(F)) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (!GEP->getType()->isVectorTy())
+ PushPtrOperand(GEP->getPointerOperand());
+ } else if (auto *LI = dyn_cast<LoadInst>(&I))
+ PushPtrOperand(LI->getPointerOperand());
+ else if (auto *SI = dyn_cast<StoreInst>(&I))
+ PushPtrOperand(SI->getPointerOperand());
+ else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
+ PushPtrOperand(RMW->getPointerOperand());
+ else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
+ PushPtrOperand(CmpX->getPointerOperand());
+ else if (auto *MI = dyn_cast<MemIntrinsic>(&I)) {
+ // For memset/memcpy/memmove, any pointer operand can be replaced.
+ PushPtrOperand(MI->getRawDest());
+
+ // Handle 2nd operand for memcpy/memmove.
+ if (auto *MTI = dyn_cast<MemTransferInst>(MI))
+ PushPtrOperand(MTI->getRawSource());
+ } else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ collectRewritableIntrinsicOperands(II, PostorderStack, Visited);
+ else if (ICmpInst *Cmp = dyn_cast<ICmpInst>(&I)) {
+ // FIXME: Handle vectors of pointers
+ if (Cmp->getOperand(0)->getType()->isPointerTy()) {
+ PushPtrOperand(Cmp->getOperand(0));
+ PushPtrOperand(Cmp->getOperand(1));
+ }
+ } else if (auto *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
+ if (!ASC->getType()->isVectorTy())
+ PushPtrOperand(ASC->getPointerOperand());
+ }
+ }
+
+ std::vector<WeakTrackingVH> Postorder; // The resultant postorder.
+ while (!PostorderStack.empty()) {
+ Value *TopVal = PostorderStack.back().first;
+ // If the operands of the expression on the top are already explored,
+ // adds that expression to the resultant postorder.
+ if (PostorderStack.back().second) {
+ if (TopVal->getType()->getPointerAddressSpace() == FlatAddrSpace)
+ Postorder.push_back(TopVal);
+ PostorderStack.pop_back();
+ continue;
+ }
+ // Otherwise, adds its operands to the stack and explores them.
+ PostorderStack.back().second = true;
+ for (Value *PtrOperand : getPointerOperands(*TopVal)) {
+ appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
+ Visited);
+ }
+ }
+ return Postorder;
+}
+
+// Helper for cloneInstructionWithNewAddressSpace. Produces the value that
+// should stand in for OperandUse.get() in address space NewAddrSpace:
+//   * a constant operand is wrapped in a constant addrspacecast;
+//   * an operand that already has a (non-null) entry in ValueWithNewAddrSpace
+//     is replaced by that entry;
+//   * otherwise the use is recorded in UndefUsesToFix and an undef of the new
+//     pointer type is returned as a placeholder.
+static Value *operandWithNewAddressSpaceOrCreateUndef(
+    const Use &OperandUse, unsigned NewAddrSpace,
+    const ValueToValueMapTy &ValueWithNewAddrSpace,
+    SmallVectorImpl<const Use *> *UndefUsesToFix) {
+  Value *OldOperand = OperandUse.get();
+
+  // Same pointee type as the old operand, but in the requested address space.
+  Type *PointeeTy = OldOperand->getType()->getPointerElementType();
+  Type *ConvertedPtrTy = PointeeTy->getPointerTo(NewAddrSpace);
+
+  if (auto *ConstOperand = dyn_cast<Constant>(OldOperand))
+    return ConstantExpr::getAddrSpaceCast(ConstOperand, ConvertedPtrTy);
+
+  Value *AlreadyCloned = ValueWithNewAddrSpace.lookup(OldOperand);
+  if (AlreadyCloned)
+    return AlreadyCloned;
+
+  // The clone is not available yet; the caller patches this use later.
+  UndefUsesToFix->push_back(&OperandUse);
+  return UndefValue::get(ConvertedPtrTy);
+}
+
+// Builds the counterpart of instruction `I` in address space NewAddrSpace,
+// taking converted pointer operands from ValueWithNewAddrSpace. Because the
+// data flow graph may contain cycles, a pointer operand might not have a clone
+// yet; such an operand is temporarily replaced by undef and its use is
+// appended to UndefUsesToFix so the caller can patch it afterwards.
+//
+// The result is a Value* rather than an Instruction* because no new
+// instruction is created when `I` is an addrspacecast whose source already has
+// the desired type: the source itself is returned in that case.
+//
+// Supported opcodes are AddrSpaceCast, BitCast, PHI, GetElementPtr and Select;
+// anything else hits llvm_unreachable. Newly created instructions are not
+// given an insertion point here, except that the Select case passes `I` as the
+// trailing argument of SelectInst::Create.
+static Value *cloneInstructionWithNewAddressSpace(
+    Instruction *I, unsigned NewAddrSpace,
+    const ValueToValueMapTy &ValueWithNewAddrSpace,
+    SmallVectorImpl<const Use *> *UndefUsesToFix) {
+  Type *NewPtrType =
+      I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+  // Handled before any operand conversion so that no placeholder uses are
+  // recorded for an addrspacecast.
+  if (I->getOpcode() == Instruction::AddrSpaceCast) {
+    Value *Src = I->getOperand(0);
+    // Because `I` is flat, the source address space must be specific.
+    // Therefore, the inferred address space must be the source space, according
+    // to our algorithm.
+    assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+    if (Src->getType() == NewPtrType)
+      return Src;
+    return new BitCastInst(Src, NewPtrType);
+  }
+
+  // One slot per operand of `I`: the converted value for pointer operands,
+  // nullptr for everything else.
+  SmallVector<Value *, 4> ConvertedOperands;
+  for (const Use &Op : I->operands()) {
+    Value *Converted = nullptr;
+    if (Op.get()->getType()->isPointerTy())
+      Converted = operandWithNewAddressSpaceOrCreateUndef(
+          Op, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
+    ConvertedOperands.push_back(Converted);
+  }
+
+  switch (I->getOpcode()) {
+  case Instruction::BitCast:
+    return new BitCastInst(ConvertedOperands[0], NewPtrType);
+  case Instruction::PHI: {
+    assert(I->getType()->isPointerTy());
+    auto *OldPHI = cast<PHINode>(I);
+    const unsigned NumIncoming = OldPHI->getNumIncomingValues();
+    PHINode *NewPHI = PHINode::Create(NewPtrType, NumIncoming);
+    for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
+      const unsigned OpNo = PHINode::getOperandNumForIncomingValue(Idx);
+      NewPHI->addIncoming(ConvertedOperands[OpNo],
+                          OldPHI->getIncomingBlock(Idx));
+    }
+    return NewPHI;
+  }
+  case Instruction::GetElementPtr: {
+    auto *OldGEP = cast<GetElementPtrInst>(I);
+    // The index operands are reused as-is; only the base pointer changes.
+    SmallVector<Value *, 4> Indices(OldGEP->idx_begin(), OldGEP->idx_end());
+    GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+        OldGEP->getSourceElementType(), ConvertedOperands[0], Indices);
+    NewGEP->setIsInBounds(OldGEP->isInBounds());
+    return NewGEP;
+  }
+  case Instruction::Select: {
+    assert(I->getType()->isPointerTy());
+    // Operand 0 is the condition and is kept unchanged.
+    Value *Condition = I->getOperand(0);
+    return SelectInst::Create(Condition, ConvertedOperands[1],
+                              ConvertedOperands[2], "", nullptr, I);
+  }
+  default:
+    llvm_unreachable("Unexpected opcode");
+  }
+}
+
+// Constant-expression counterpart of cloneInstructionWithNewAddressSpace:
+// returns a version of `CE` in address space NewAddrSpace with its operands
+// replaced as recorded in ValueWithNewAddrSpace. Returns nullptr when none of
+// the operands has a replacement in the generic path (see below).
+static Value *cloneConstantExprWithNewAddressSpace(
+    ConstantExpr *CE, unsigned NewAddrSpace,
+    const ValueToValueMapTy &ValueWithNewAddrSpace) {
+  Type *TargetType =
+      CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+  const unsigned Opcode = CE->getOpcode();
+
+  if (Opcode == Instruction::AddrSpaceCast) {
+    Constant *Src = CE->getOperand(0);
+    // Because CE is flat, the source address space must be specific.
+    // Therefore, the inferred address space must be the source space according
+    // to our algorithm.
+    assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+    return ConstantExpr::getBitCast(Src, TargetType);
+  }
+
+  if (Opcode == Instruction::BitCast) {
+    Value *NewSrc = ValueWithNewAddrSpace.lookup(CE->getOperand(0));
+    // No converted source available: cast the whole expression instead.
+    if (!NewSrc)
+      return ConstantExpr::getAddrSpaceCast(CE, TargetType);
+    return ConstantExpr::getBitCast(cast<Constant>(NewSrc), TargetType);
+  }
+
+  if (Opcode == Instruction::Select) {
+    Constant *TrueVal = CE->getOperand(1);
+    Constant *FalseVal = CE->getOperand(2);
+    const bool SameAddrSpace = TrueVal->getType()->getPointerAddressSpace() ==
+                               FalseVal->getType()->getPointerAddressSpace();
+    // When both arms agree on the address space, cast each arm directly.
+    // Otherwise fall through to the generic operand replacement below.
+    if (SameAddrSpace) {
+      return ConstantExpr::getSelect(
+          CE->getOperand(0), ConstantExpr::getAddrSpaceCast(TrueVal, TargetType),
+          ConstantExpr::getAddrSpaceCast(FalseVal, TargetType));
+    }
+  }
+
+  // Generic path: rebuild the operand list, substituting every operand that
+  // has a replacement. Such a replacement should already be present in
+  // ValueWithNewAddrSpace because (1) the constant expressions we consider
+  // (i.e. addrspacecast, bitcast, and getelementptr) do not incur cycles in
+  // the data flow graph and (2) this function is called on constant
+  // expressions in postorder.
+  bool AnyOperandReplaced = false;
+  SmallVector<Constant *, 4> RebuiltOperands;
+  for (unsigned Idx = 0, NumOps = CE->getNumOperands(); Idx != NumOps; ++Idx) {
+    Constant *OldOperand = CE->getOperand(Idx);
+    Value *Mapped = ValueWithNewAddrSpace.lookup(OldOperand);
+    if (!Mapped) {
+      // Nothing to substitute; keep the original operand.
+      RebuiltOperands.push_back(OldOperand);
+      continue;
+    }
+    AnyOperandReplaced = true;
+    RebuiltOperands.push_back(cast<Constant>(Mapped));
+  }
+
+  // With no operand replaced we would be replacing the value with itself.
+  // However, replaced values are assumed to be wrapped in an addrspacecast
+  // later, so drop it now.
+  if (!AnyOperandReplaced)
+    return nullptr;
+
+  if (Opcode == Instruction::GetElementPtr) {
+    // A getelementptr constant expression needs its source element type
+    // specified explicitly when it is rebuilt.
+    Type *SrcElemTy = RebuiltOperands[0]->getType()->getPointerElementType();
+    return CE->getWithOperands(RebuiltOperands, TargetType,
+                               /*OnlyIfReduced=*/false, SrcElemTy);
+  }
+
+  return CE->getWithOperands(RebuiltOperands, TargetType);
+}
+
+// Produces the counterpart of the flat address expression `V` in address space
+// NewAddrSpace, with operands replaced as recorded in ValueWithNewAddrSpace.
+// It is invoked, in postorder, on every flat address expression whose address
+// space has to change.
+//
+// Instructions are delegated to cloneInstructionWithNewAddressSpace (which
+// also explains UndefUsesToFix); everything else is treated as a ConstantExpr.
+// A freshly created instruction that has no parent yet is inserted right
+// before the original and takes over its name.
+Value *InferAddressSpaces::cloneValueWithNewAddressSpace(
+    Value *V, unsigned NewAddrSpace,
+    const ValueToValueMapTy &ValueWithNewAddrSpace,
+    SmallVectorImpl<const Use *> *UndefUsesToFix) const {
+  // All values in Postorder are flat address expressions.
+  assert(isAddressExpression(*V) &&
+         V->getType()->getPointerAddressSpace() == FlatAddrSpace);
+
+  auto *OldInst = dyn_cast<Instruction>(V);
+  if (!OldInst)
+    return cloneConstantExprWithNewAddressSpace(
+        cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace);
+
+  Value *Clone = cloneInstructionWithNewAddressSpace(
+      OldInst, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
+
+  // The clone may be an existing value or an instruction that is already
+  // placed in a block; only unplaced instructions are inserted here.
+  auto *CloneInst = dyn_cast<Instruction>(Clone);
+  if (CloneInst && CloneInst->getParent() == nullptr) {
+    CloneInst->insertBefore(OldInst);
+    CloneInst->takeName(OldInst);
+  }
+  return Clone;
+}
+
+// Join operation on the address space lattice (see the file header comments):
+//   * flat joined with anything is flat;
+//   * uninitialized joined with X is X;
+//   * two equal specific address spaces join to themselves;
+//   * two different specific address spaces join to flat.
+unsigned InferAddressSpaces::joinAddressSpaces(unsigned AS1,
+                                               unsigned AS2) const {
+  const bool EitherIsFlat = AS1 == FlatAddrSpace || AS2 == FlatAddrSpace;
+  if (EitherIsFlat)
+    return FlatAddrSpace;
+
+  if (AS1 == UninitializedAddressSpace)
+    return AS2;
+
+  // AS1 is a specific address space from here on; it wins if AS2 carries no
+  // information or agrees with it.
+  if (AS2 == UninitializedAddressSpace || AS1 == AS2)
+    return AS1;
+
+  return FlatAddrSpace;
+}
+
+// Pass entry point. Returns false without doing anything when the function is
+// skipped or the target reports no flat address space; otherwise returns the
+// result of rewriteWithNewAddressSpaces.
+bool InferAddressSpaces::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  // Ask the target which address space is the flat one; the
+  // UninitializedAddressSpace sentinel means there is nothing to infer.
+  FlatAddrSpace = getAnalysis<TargetTransformInfoWrapperPass>()
+                      .getTTI(F)
+                      .getFlatAddressSpace();
+  if (FlatAddrSpace == UninitializedAddressSpace)
+    return false;
+
+  // Step 1: gather all flat address expressions, in postorder.
+  std::vector<WeakTrackingVH> FlatExprs = collectFlatAddressExpressions(F);
+
+  // Step 2: data-flow analysis refining the address space of each of them.
+  ValueToAddrSpaceMapTy RefinedAddrSpaces;
+  inferAddressSpaces(FlatExprs, &RefinedAddrSpaces);
+
+  // Step 3: rewrite the expressions that were inferred to point to a specific
+  // address space.
+  return rewriteWithNewAddressSpaces(FlatExprs, RefinedAddrSpaces, &F);
+}
+
+// Worklist-driven data-flow analysis that fills *InferredAddrSpace with an
+// address space for every value in Postorder.
+//
+// Constants need to be tracked through RAUW to handle cases with nested
+// constant expressions, which is why the values arrive wrapped in
+// WeakTrackingVH.
+void InferAddressSpaces::inferAddressSpaces(
+    ArrayRef<WeakTrackingVH> Postorder,
+    ValueToAddrSpaceMapTy *InferredAddrSpace) const {
+  SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
+
+  // Every expression starts out in the uninitialized address space.
+  for (Value *Expr : Postorder)
+    (*InferredAddrSpace)[Expr] = UninitializedAddressSpace;
+
+  while (!Worklist.empty()) {
+    Value *V = Worklist.pop_back_val();
+
+    // Recompute the address space of V from the address spaces of its
+    // operands; nothing more to do when it did not change.
+    DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
+    Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
+    if (!NewAS.hasValue())
+      continue;
+
+    DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
+    (*InferredAddrSpace)[V] = NewAS.getValue();
+
+    // V changed, so the address spaces of its users may change as well.
+    for (Value *User : V->users()) {
+      // Already scheduled.
+      if (Worklist.count(User))
+        continue;
+
+      // Only flat address expressions (those present in InferredAddrSpace) are
+      // updated by this algorithm. updateAddressSpace only moves a value down
+      // a lattice path, so a user already inferred as flat (the bottom
+      // element) cannot change any further.
+      auto Entry = InferredAddrSpace->find(User);
+      const bool Tracked = Entry != InferredAddrSpace->end();
+      if (Tracked && Entry->second != FlatAddrSpace)
+        Worklist.insert(User);
+    }
+  }
+}
+
+// Recomputes the inferred address space of `V` from its pointer operands.
+// Returns the new address space when it differs from the one currently
+// recorded for `V` in InferredAddrSpace, and None when it is unchanged or when
+// the decision for a select has to be deferred.
+Optional<unsigned> InferAddressSpaces::updateAddressSpace(
+    const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const {
+  assert(InferredAddrSpace.count(&V));
+
+  // Address space currently associated with a pointer: the inferred one when
+  // the pointer is tracked, otherwise the one from its type.
+  auto CurrentAddrSpaceOf = [&InferredAddrSpace](Value *Ptr) -> unsigned {
+    auto Known = InferredAddrSpace.find(Ptr);
+    if (Known != InferredAddrSpace.end())
+      return Known->second;
+    return Ptr->getType()->getPointerAddressSpace();
+  };
+
+  // In general the result is the join of the address spaces of all pointer
+  // operands.
+  unsigned JoinedAS = UninitializedAddressSpace;
+
+  const Operator &Op = cast<Operator>(V);
+  if (Op.getOpcode() != Instruction::Select) {
+    for (Value *PtrOperand : getPointerOperands(V)) {
+      JoinedAS = joinAddressSpaces(JoinedAS, CurrentAddrSpaceOf(PtrOperand));
+      // join(flat, *) = flat, so there is no point in looking further.
+      if (JoinedAS == FlatAddrSpace)
+        break;
+    }
+  } else {
+    Value *TrueVal = Op.getOperand(1);
+    Value *FalseVal = Op.getOperand(2);
+    const unsigned TrueAS = CurrentAddrSpaceOf(TrueVal);
+    const unsigned FalseAS = CurrentAddrSpaceOf(FalseVal);
+
+    auto *TrueConst = dyn_cast<Constant>(TrueVal);
+    auto *FalseConst = dyn_cast<Constant>(FalseVal);
+
+    // If one of the inputs is a constant, we may be able to do a constant
+    // addrspacecast of it. Defer inferring the address space until the other
+    // input's address space is known.
+    const bool WaitingOnTrueVal =
+        FalseConst && TrueAS == UninitializedAddressSpace;
+    const bool WaitingOnFalseVal =
+        TrueConst && FalseAS == UninitializedAddressSpace;
+    if (WaitingOnTrueVal || WaitingOnFalseVal)
+      return None;
+
+    // A constant arm that can safely be cast adopts the other arm's address
+    // space; otherwise fall back to the regular join.
+    if (TrueConst && isSafeToCastConstAddrSpace(TrueConst, FalseAS))
+      JoinedAS = FalseAS;
+    else if (FalseConst && isSafeToCastConstAddrSpace(FalseConst, TrueAS))
+      JoinedAS = TrueAS;
+    else
+      JoinedAS = joinAddressSpaces(TrueAS, FalseAS);
+  }
+
+  const unsigned PreviousAS = InferredAddrSpace.lookup(&V);
+  assert(PreviousAS != FlatAddrSpace);
+  if (PreviousAS != JoinedAS)
+    return JoinedAS;
+  return None;
+}
+
+/// \returns true if \p U is the pointer operand of a non-volatile load, store,
+/// atomicrmw or cmpxchg, i.e. a memory instruction with a single pointer
+/// operand whose address space can be changed by simply pointing the use at a
+/// new value. Any other use yields false.
+static bool isSimplePointerUseValidToReplace(Use &U) {
+  User *Owner = U.getUser();
+  const unsigned Idx = U.getOperandNo();
+
+  if (auto *Load = dyn_cast<LoadInst>(Owner)) {
+    if (Load->isVolatile())
+      return false;
+    return Idx == LoadInst::getPointerOperandIndex();
+  }
+
+  if (auto *Store = dyn_cast<StoreInst>(Owner)) {
+    if (Store->isVolatile())
+      return false;
+    return Idx == StoreInst::getPointerOperandIndex();
+  }
+
+  if (auto *AtomicOp = dyn_cast<AtomicRMWInst>(Owner)) {
+    if (AtomicOp->isVolatile())
+      return false;
+    return Idx == AtomicRMWInst::getPointerOperandIndex();
+  }
+
+  if (auto *Exchange = dyn_cast<AtomicCmpXchgInst>(Owner)) {
+    if (Exchange->isVolatile())
+      return false;
+    return Idx == AtomicCmpXchgInst::getPointerOperandIndex();
+  }
+
+  return false;
+}
+
+/// Rewrites a memset/memcpy/memmove so that every pointer operand equal to
+/// \p OldV uses \p NewV instead. These intrinsics need more than a simple
+/// operand swap: they require re-mangling and may have multiple pointer
+/// operands, so a replacement call is built in front of \p MI (carrying over
+/// its tbaa, alias.scope, noalias and, for memcpy, tbaa.struct metadata) and
+/// \p MI is then erased. The replacement is always created non-volatile.
+/// Always returns true.
+static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
+                                     Value *NewV) {
+  IRBuilder<> Builder(MI);
+  MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa);
+  MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope);
+  MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias);
+
+  if (auto *Set = dyn_cast<MemSetInst>(MI)) {
+    Builder.CreateMemSet(NewV, Set->getValue(), Set->getLength(),
+                         Set->getAlignment(),
+                         false, // isVolatile
+                         TBAA, ScopeMD, NoAliasMD);
+    MI->eraseFromParent();
+    return true;
+  }
+
+  auto *Transfer = dyn_cast<MemTransferInst>(MI);
+  if (!Transfer)
+    llvm_unreachable("unhandled MemIntrinsic");
+
+  // Source and destination are checked independently so that a self-to-self
+  // copy has both operands replaced.
+  Value *OldSrc = Transfer->getRawSource();
+  Value *OldDest = Transfer->getRawDest();
+  Value *Src = (OldSrc == OldV) ? NewV : OldSrc;
+  Value *Dest = (OldDest == OldV) ? NewV : OldDest;
+
+  if (isa<MemCpyInst>(Transfer)) {
+    MDNode *TBAAStruct = Transfer->getMetadata(LLVMContext::MD_tbaa_struct);
+    Builder.CreateMemCpy(Dest, Src, Transfer->getLength(),
+                         Transfer->getAlignment(),
+                         false, // isVolatile
+                         TBAA, TBAAStruct, ScopeMD, NoAliasMD);
+  } else {
+    assert(isa<MemMoveInst>(Transfer));
+    Builder.CreateMemMove(Dest, Src, Transfer->getLength(),
+                          Transfer->getAlignment(),
+                          false, // isVolatile
+                          TBAA, ScopeMD, NoAliasMD);
+  }
+
+  MI->eraseFromParent();
+  return true;
+}
+
+// \returns true if it is OK to change the address space of constant \p C to
+// \p NewAS with a ConstantExpr addrspacecast. \p NewAS must not be
+// UninitializedAddressSpace.
+bool InferAddressSpaces::isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const {
+  assert(NewAS != UninitializedAddressSpace);
+
+  const unsigned SrcAS = C->getType()->getPointerAddressSpace();
+
+  // Nothing changes, or the value is undef anyway.
+  if (SrcAS == NewAS || isa<UndefValue>(C))
+    return true;
+
+  // Prevent illegal casts between different non-flat address spaces: at least
+  // one side of the cast has to be the flat address space.
+  const bool InvolvesFlat = SrcAS == FlatAddrSpace || NewAS == FlatAddrSpace;
+  if (!InvolvesFlat)
+    return false;
+
+  if (isa<ConstantPointerNull>(C))
+    return true;
+
+  auto *Op = dyn_cast<Operator>(C);
+  if (!Op)
+    return false;
+
+  switch (Op->getOpcode()) {
+  case Instruction::AddrSpaceCast:
+    // If we already have a constant addrspacecast, it should be safe to cast
+    // it off; decide based on the value underneath.
+    return isSafeToCastConstAddrSpace(cast<Constant>(Op->getOperand(0)), NewAS);
+  case Instruction::IntToPtr:
+    // An inttoptr producing a flat pointer is accepted.
+    return Op->getType()->getPointerAddressSpace() == FlatAddrSpace;
+  default:
+    return false;
+  }
+}
+
+// Starting from the use at \p I (which must be dereferenceable), returns the
+// iterator to the first following use that belongs to a different user, or
+// \p End if the remaining uses all belong to the same user as *I.
+static Value::use_iterator skipToNextUser(Value::use_iterator I,
+                                          Value::use_iterator End) {
+  User *Owner = I->getUser();
+
+  // Step past the current use, then past every use adjacent to it in the use
+  // list that has the same owner.
+  for (++I; I != End && I->getUser() == Owner; ++I) {
+  }
+
+  return I;
+}
+
+// Rewrites the flat address expressions in Postorder whose inferred address
+// space (from InferredAddrSpace) differs from the address space of their type.
+//
+// The work is done in three phases:
+//   1. clone every such expression into its new address space;
+//   2. patch the undef placeholders that cloning left behind for operands that
+//      were not yet available;
+//   3. redirect the uses of each old expression to its clone, either directly
+//      (simple memory operations, memory intrinsics, rewritable intrinsics,
+//      pointer compares, addrspacecasts) or through an addrspacecast back to
+//      the original type.
+// Old instructions left without uses are deleted at the end.
+//
+// Returns false when phase 1 recorded nothing, true otherwise. F is not
+// referenced by this function's body.
+bool InferAddressSpaces::rewriteWithNewAddressSpaces(
+    ArrayRef<WeakTrackingVH> Postorder,
+    const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
+  // For each address expression to be modified, creates a clone of it with its
+  // pointer operands converted to the new address space. Since the pointer
+  // operands are converted, the clone is naturally in the new address space by
+  // construction.
+  ValueToValueMapTy ValueWithNewAddrSpace;
+  SmallVector<const Use *, 32> UndefUsesToFix;
+  for (Value *V : Postorder) {
+    unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
+    if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
+      ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace(
+          V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+    }
+  }
+
+  if (ValueWithNewAddrSpace.empty())
+    return false;
+
+  // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
+  for (const Use *UndefUse : UndefUsesToFix) {
+    User *V = UndefUse->getUser();
+    User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V));
+    unsigned OperandNo = UndefUse->getOperandNo();
+    assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
+    NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
+  }
+
+  SmallVector<Instruction *, 16> DeadInstructions;
+
+  // Replaces the uses of the old address expressions with the new ones.
+  for (const WeakTrackingVH &WVH : Postorder) {
+    assert(WVH && "value was unexpectedly deleted");
+    Value *V = WVH;
+    Value *NewV = ValueWithNewAddrSpace.lookup(V);
+    if (NewV == nullptr)
+      continue;
+
+    DEBUG(dbgs() << "Replacing the uses of " << *V
+                 << "\n with\n " << *NewV << '\n');
+
+    if (Constant *C = dyn_cast<Constant>(V)) {
+      Constant *Replace = ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+                                                         C->getType());
+      if (C != Replace) {
+        DEBUG(dbgs() << "Inserting replacement const cast: "
+                     << Replace << ": " << *Replace << '\n');
+        C->replaceAllUsesWith(Replace);
+        V = Replace;
+      }
+    }
+
+    Value::use_iterator I, E;
+    for (I = V->use_begin(), E = V->use_end(); I != E; ) {
+      Use &U = *I;
+
+      // Some users may see the same pointer operand in multiple operands. Skip
+      // to the next instruction. The iterator is advanced before U is touched
+      // so that rewriting U does not disturb the traversal.
+      I = skipToNextUser(I, E);
+
+      if (isSimplePointerUseValidToReplace(U)) {
+        // If V is used as the pointer operand of a compatible memory operation,
+        // sets the pointer operand to NewV. This replacement does not change
+        // the element type, so the resultant load/store is still valid.
+        U.set(NewV);
+        continue;
+      }
+
+      User *CurUser = U.getUser();
+      // Handle more complex cases like intrinsic that need to be remangled.
+      if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
+        if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
+          continue;
+      }
+
+      if (auto *II = dyn_cast<IntrinsicInst>(CurUser)) {
+        if (rewriteIntrinsicOperands(II, V, NewV))
+          continue;
+      }
+
+      if (isa<Instruction>(CurUser)) {
+        if (ICmpInst *Cmp = dyn_cast<ICmpInst>(CurUser)) {
+          // If we can infer that both pointers are in the same addrspace,
+          // transform e.g.
+          //   %cmp = icmp eq float* %p, %q
+          // into
+          //   %cmp = icmp eq float addrspace(3)* %new_p, %new_q
+
+          unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+          int SrcIdx = U.getOperandNo();
+          int OtherIdx = (SrcIdx == 0) ? 1 : 0;
+          Value *OtherSrc = Cmp->getOperand(OtherIdx);
+
+          if (Value *OtherNewV = ValueWithNewAddrSpace.lookup(OtherSrc)) {
+            if (OtherNewV->getType()->getPointerAddressSpace() == NewAS) {
+              Cmp->setOperand(OtherIdx, OtherNewV);
+              Cmp->setOperand(SrcIdx, NewV);
+              continue;
+            }
+          }
+
+          // Even if the type mismatches, we can cast the constant.
+          if (auto *KOtherSrc = dyn_cast<Constant>(OtherSrc)) {
+            if (isSafeToCastConstAddrSpace(KOtherSrc, NewAS)) {
+              Cmp->setOperand(SrcIdx, NewV);
+              Cmp->setOperand(OtherIdx,
+                ConstantExpr::getAddrSpaceCast(KOtherSrc, NewV->getType()));
+              continue;
+            }
+          }
+        }
+
+        if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(CurUser)) {
+          unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+          if (ASC->getDestAddressSpace() == NewAS) {
+            // The cast back out of the flat address space is now redundant.
+            // An addrspacecast may also change the pointee type, in which case
+            // NewV does not have the cast's type and cannot replace it
+            // directly: bridge the difference with a bitcast placed at the
+            // cast's position. A local is used so that NewV itself stays
+            // untouched for the remaining uses of V.
+            Value *Replacement = NewV;
+            if (ASC->getType() != NewV->getType())
+              Replacement = new BitCastInst(NewV, ASC->getType(), "", ASC);
+            ASC->replaceAllUsesWith(Replacement);
+            DeadInstructions.push_back(ASC);
+            continue;
+          }
+        }
+
+        // Otherwise, replaces the use with flat(NewV).
+        if (Instruction *VInst = dyn_cast<Instruction>(V)) {
+          // Insert the cast right after V, skipping any PHI nodes found at
+          // that position.
+          BasicBlock::iterator InsertPos = std::next(VInst->getIterator());
+          while (isa<PHINode>(InsertPos))
+            ++InsertPos;
+          U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+        } else {
+          U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+                                               V->getType()));
+        }
+      }
+    }
+
+    if (V->use_empty()) {
+      if (Instruction *DeadInst = dyn_cast<Instruction>(V))
+        DeadInstructions.push_back(DeadInst);
+    }
+  }
+
+  for (Instruction *I : DeadInstructions)
+    RecursivelyDeleteTriviallyDeadInstructions(I);
+
+  return true;
+}
+
+// Allocates a new InferAddressSpaces pass and returns it as a FunctionPass
+// pointer. NOTE(review): presumably the caller (a pass manager) takes
+// ownership of the returned object -- confirm.
+FunctionPass *llvm::createInferAddressSpacesPass() {
+ return new InferAddressSpaces();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 1870c3d..dc9143b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -12,29 +12,33 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/JumpThreading.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
@@ -60,6 +64,11 @@ ImplicationSearchThreshold(
"condition to use to thread over a weaker condition"),
cl::init(3), cl::Hidden);
+static cl::opt<bool> PrintLVIAfterJumpThreading(
+ "print-lvi-after-jump-threading",
+ cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false),
+ cl::Hidden);
+
namespace {
/// This pass performs 'jump threading', which looks at blocks that have
/// multiple predecessors and multiple successors. If one or more of the
@@ -89,8 +98,10 @@ namespace {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (PrintLVIAfterJumpThreading)
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LazyValueInfoWrapperPass>();
- AU.addPreserved<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
@@ -104,6 +115,7 @@ INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
"Jump Threading", false, false)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(JumpThreading, "jump-threading",
"Jump Threading", false, false)
@@ -121,16 +133,24 @@ bool JumpThreading::runOnFunction(Function &F) {
return false;
auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
bool HasProfileData = F.getEntryCount().hasValue();
if (HasProfileData) {
LoopInfo LI{DominatorTree(F)};
- BPI.reset(new BranchProbabilityInfo(F, LI));
+ BPI.reset(new BranchProbabilityInfo(F, LI, TLI));
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- return Impl.runImpl(F, TLI, LVI, HasProfileData, std::move(BFI),
- std::move(BPI));
+
+ bool Changed = Impl.runImpl(F, TLI, LVI, AA, HasProfileData, std::move(BFI),
+ std::move(BPI));
+ if (PrintLVIAfterJumpThreading) {
+ dbgs() << "LVI for function '" << F.getName() << "':\n";
+ LVI->printLVI(F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ dbgs());
+ }
+ return Changed;
}
PreservedAnalyses JumpThreadingPass::run(Function &F,
@@ -138,20 +158,19 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &LVI = AM.getResult<LazyValueAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
bool HasProfileData = F.getEntryCount().hasValue();
if (HasProfileData) {
LoopInfo LI{DominatorTree(F)};
- BPI.reset(new BranchProbabilityInfo(F, LI));
+ BPI.reset(new BranchProbabilityInfo(F, LI, &TLI));
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- bool Changed =
- runImpl(F, &TLI, &LVI, HasProfileData, std::move(BFI), std::move(BPI));
- // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<LazyValueAnalysis>(F);
+ bool Changed = runImpl(F, &TLI, &LVI, &AA, HasProfileData, std::move(BFI),
+ std::move(BPI));
if (!Changed)
return PreservedAnalyses::all();
@@ -161,18 +180,23 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
}
bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
- LazyValueInfo *LVI_, bool HasProfileData_,
+ LazyValueInfo *LVI_, AliasAnalysis *AA_,
+ bool HasProfileData_,
std::unique_ptr<BlockFrequencyInfo> BFI_,
std::unique_ptr<BranchProbabilityInfo> BPI_) {
DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
TLI = TLI_;
LVI = LVI_;
+ AA = AA_;
BFI.reset();
BPI.reset();
// When profile data is available, we need to update edge weights after
// successful jump threading, which requires both BPI and BFI being available.
HasProfileData = HasProfileData_;
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ HasGuards = GuardDecl && !GuardDecl->use_empty();
if (HasProfileData) {
BPI = std::move(BPI_);
BFI = std::move(BFI_);
@@ -219,33 +243,22 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
// Can't thread an unconditional jump, but if the block is "almost
// empty", we can replace uses of it with uses of the successor and make
// this dead.
- // We should not eliminate the loop header either, because eliminating
- // a loop header might later prevent LoopSimplify from transforming nested
- // loops into simplified form.
+ // We should not eliminate the loop header or latch either, because
+ // eliminating a loop header or latch might later prevent LoopSimplify
+ // from transforming nested loops into simplified form. We will rely on
+ // later passes in backend to clean up empty blocks.
if (BI && BI->isUnconditional() &&
BB != &BB->getParent()->getEntryBlock() &&
// If the terminator is the only non-phi instruction, try to nuke it.
- BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB)) {
- // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the
- // block, we have to make sure it isn't in the LoopHeaders set. We
- // reinsert afterward if needed.
- bool ErasedFromLoopHeaders = LoopHeaders.erase(BB);
- BasicBlock *Succ = BI->getSuccessor(0);
-
+ BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB) &&
+ !LoopHeaders.count(BI->getSuccessor(0))) {
// FIXME: It is always conservatively correct to drop the info
// for a block even if it doesn't get erased. This isn't totally
// awesome, but it allows us to use AssertingVH to prevent nasty
// dangling pointer issues within LazyValueInfo.
LVI->eraseBlock(BB);
- if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) {
+ if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
Changed = true;
- // If we deleted BB and BB was the header of a loop, then the
- // successor is now the header of the loop.
- BB = Succ;
- }
-
- if (ErasedFromLoopHeaders)
- LoopHeaders.insert(BB);
}
}
EverChanged |= Changed;
@@ -255,10 +268,42 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
return EverChanged;
}
-/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
-/// thread across it. Stop scanning the block when passing the threshold.
-static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
+// Substitutes ToVal for Cond wherever that is safe, and erases Cond if it ends
+// up unused and has no side effects. A blanket RAUW would be wrong here: a
+// guard/assume may itself be a use of `Cond`, and we may have relied on that
+// guard/assume to reason about the value of `Cond` at the end of the block.
+// RAUW would rewrite the guard/assume itself as well as the uses that come
+// before it.
+static void ReplaceFoldableUses(Instruction *Cond, Value *ToVal) {
+  assert(Cond->getType() == ToVal->getType());
+  BasicBlock *CondBB = Cond->getParent();
+
+  // Uses in other blocks (i.e. uses strictly dominated by CondBB) can be
+  // replaced unconditionally, since the LVI information holds from the
+  // terminator of CondBB onwards.
+  replaceNonLocalUsesWith(Cond, ToVal);
+
+  // Inside CondBB, walk backwards from the end of the block. Stop on reaching
+  // Cond itself (no uses can precede it), or at the first instruction that is
+  // not guaranteed to transfer execution to its successor: only instructions
+  // that are guaranteed to reach the end of the block, where Cond is known to
+  // be ToVal, may be rewritten.
+  for (auto It = CondBB->rbegin(), End = CondBB->rend(); It != End; ++It) {
+    Instruction &Cur = *It;
+    if (&Cur == Cond || !isGuaranteedToTransferExecutionToSuccessor(&Cur))
+      break;
+    Cur.replaceUsesOfWith(Cond, ToVal);
+  }
+
+  if (!Cond->use_empty() || Cond->mayHaveSideEffects())
+    return;
+  Cond->eraseFromParent();
+}
+
+/// Return the cost of duplicating a piece of this block from first non-phi
+/// and before StopAt instruction to thread across it. Stop scanning the block
+/// when exceeding the threshold. If duplication is impossible, returns ~0U.
+static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
+ Instruction *StopAt,
unsigned Threshold) {
+ assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
/// Ignore PHI nodes, these will be flattened when duplication happens.
BasicBlock::const_iterator I(BB->getFirstNonPHI());
@@ -266,15 +311,17 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
// branch, so they shouldn't count against the duplication cost.
unsigned Bonus = 0;
- const TerminatorInst *BBTerm = BB->getTerminator();
- // Threading through a switch statement is particularly profitable. If this
- // block ends in a switch, decrease its cost to make it more likely to happen.
- if (isa<SwitchInst>(BBTerm))
- Bonus = 6;
-
- // The same holds for indirect branches, but slightly more so.
- if (isa<IndirectBrInst>(BBTerm))
- Bonus = 8;
+ if (BB->getTerminator() == StopAt) {
+ // Threading through a switch statement is particularly profitable. If this
+ // block ends in a switch, decrease its cost to make it more likely to
+ // happen.
+ if (isa<SwitchInst>(StopAt))
+ Bonus = 6;
+
+ // The same holds for indirect branches, but slightly more so.
+ if (isa<IndirectBrInst>(StopAt))
+ Bonus = 8;
+ }
// Bump the threshold up so the early exit from the loop doesn't skip the
// terminator-based Size adjustment at the end.
@@ -283,7 +330,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
// Sum up the cost of each instruction until we get to the terminator. Don't
// include the terminator because the copy won't include it.
unsigned Size = 0;
- for (; !isa<TerminatorInst>(I); ++I) {
+ for (; &*I != StopAt; ++I) {
// Stop scanning the block if we've reached the threshold.
if (Size > Threshold)
@@ -544,7 +591,12 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
// Handle compare with phi operand, where the PHI is defined in this block.
if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
assert(Preference == WantInteger && "Compares only produce integers");
- PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0));
+ Type *CmpType = Cmp->getType();
+ Value *CmpLHS = Cmp->getOperand(0);
+ Value *CmpRHS = Cmp->getOperand(1);
+ CmpInst::Predicate Pred = Cmp->getPredicate();
+
+ PHINode *PN = dyn_cast<PHINode>(CmpLHS);
if (PN && PN->getParent() == BB) {
const DataLayout &DL = PN->getModule()->getDataLayout();
// We can do this simplification if any comparisons fold to true or false.
@@ -552,15 +604,15 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PredBB = PN->getIncomingBlock(i);
Value *LHS = PN->getIncomingValue(i);
- Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB);
+ Value *RHS = CmpRHS->DoPHITranslation(BB, PredBB);
- Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, DL);
+ Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});
if (!Res) {
if (!isa<Constant>(RHS))
continue;
LazyValueInfo::Tristate
- ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS,
+ ResT = LVI->getPredicateOnEdge(Pred, LHS,
cast<Constant>(RHS), PredBB, BB,
CxtI ? CxtI : Cmp);
if (ResT == LazyValueInfo::Unknown)
@@ -577,44 +629,81 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
// If comparing a live-in value against a constant, see if we know the
// live-in value on any predecessors.
- if (isa<Constant>(Cmp->getOperand(1)) && Cmp->getType()->isIntegerTy()) {
- if (!isa<Instruction>(Cmp->getOperand(0)) ||
- cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) {
- Constant *RHSCst = cast<Constant>(Cmp->getOperand(1));
+ if (isa<Constant>(CmpRHS) && !CmpType->isVectorTy()) {
+ Constant *CmpConst = cast<Constant>(CmpRHS);
+ if (!isa<Instruction>(CmpLHS) ||
+ cast<Instruction>(CmpLHS)->getParent() != BB) {
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
LazyValueInfo::Tristate Res =
- LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0),
- RHSCst, P, BB, CxtI ? CxtI : Cmp);
+ LVI->getPredicateOnEdge(Pred, CmpLHS,
+ CmpConst, P, BB, CxtI ? CxtI : Cmp);
if (Res == LazyValueInfo::Unknown)
continue;
- Constant *ResC = ConstantInt::get(Cmp->getType(), Res);
+ Constant *ResC = ConstantInt::get(CmpType, Res);
Result.push_back(std::make_pair(ResC, P));
}
return !Result.empty();
}
+ // InstCombine can fold some forms of constant range checks into
+ // (icmp (add (x, C1)), C2). See if we have such a thing with
+ // x as a live-in.
+ {
+ using namespace PatternMatch;
+ Value *AddLHS;
+ ConstantInt *AddConst;
+ if (isa<ConstantInt>(CmpConst) &&
+ match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
+ if (!isa<Instruction>(AddLHS) ||
+ cast<Instruction>(AddLHS)->getParent() != BB) {
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a ConstantRange in
+ // a predecessor, use that information to try to thread this
+ // block.
+ ConstantRange CR = LVI->getConstantRangeOnEdge(
+ AddLHS, P, BB, CxtI ? CxtI : cast<Instruction>(CmpLHS));
+ // Propagate the range through the addition.
+ CR = CR.add(AddConst->getValue());
+
+ // Get the range where the compare returns true.
+ ConstantRange CmpRange = ConstantRange::makeExactICmpRegion(
+ Pred, cast<ConstantInt>(CmpConst)->getValue());
+
+ Constant *ResC;
+ if (CmpRange.contains(CR))
+ ResC = ConstantInt::getTrue(CmpType);
+ else if (CmpRange.inverse().contains(CR))
+ ResC = ConstantInt::getFalse(CmpType);
+ else
+ continue;
+
+ Result.push_back(std::make_pair(ResC, P));
+ }
+
+ return !Result.empty();
+ }
+ }
+ }
+
// Try to find a constant value for the LHS of a comparison,
// and evaluate it statically if we can.
- if (Constant *CmpConst = dyn_cast<Constant>(Cmp->getOperand(1))) {
- PredValueInfoTy LHSVals;
- ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
- WantInteger, CxtI);
-
- for (const auto &LHSVal : LHSVals) {
- Constant *V = LHSVal.first;
- Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(),
- V, CmpConst);
- if (Constant *KC = getKnownConstant(Folded, WantInteger))
- Result.push_back(std::make_pair(KC, LHSVal.second));
- }
+ PredValueInfoTy LHSVals;
+ ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
+ WantInteger, CxtI);
- return !Result.empty();
+ for (const auto &LHSVal : LHSVals) {
+ Constant *V = LHSVal.first;
+ Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst);
+ if (Constant *KC = getKnownConstant(Folded, WantInteger))
+ Result.push_back(std::make_pair(KC, LHSVal.second));
}
+
+ return !Result.empty();
}
}
@@ -722,6 +811,37 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
LVI->eraseBlock(SinglePred);
MergeBasicBlockIntoOnlyPred(BB);
+ // Now that BB is merged into SinglePred (i.e. SinglePred Code followed by
+ // BB code within one basic block `BB`), we need to invalidate the LVI
+ // information associated with BB, because the LVI information need not be
+ // true for all of BB after the merge. For example,
+ // Before the merge, LVI info and code is as follows:
+ // SinglePred: <LVI info1 for %p val>
+ // %y = use of %p
+ // call @exit() // need not transfer execution to successor.
+ // assume(%p) // from this point on %p is true
+ // br label %BB
+ // BB: <LVI info2 for %p val, i.e. %p is true>
+ // %x = use of %p
+ // br label exit
+ //
+ // Note that this LVI info for blocks BB and SinglePred is correct for %p
+ // (info2 and info1 respectively). After the merge and the deletion of the
+ // LVI info1 for SinglePred, we have the following code:
+ // BB: <LVI info2 for %p val>
+ // %y = use of %p
+ // call @exit()
+ // assume(%p)
+ // %x = use of %p <-- LVI info2 is correct from here onwards.
+ // br label exit
+ // LVI info2 for BB is incorrect at the beginning of BB.
+
+ // Invalidate LVI information for BB if the LVI is not provably true for
+ // all of BB.
+ if (any_of(*BB, [](Instruction &I) {
+ return !isGuaranteedToTransferExecutionToSuccessor(&I);
+ }))
+ LVI->eraseBlock(BB);
return true;
}
}
@@ -729,6 +849,10 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (TryToUnfoldSelectInCurrBB(BB))
return true;
+ // Look if we can propagate guards to predecessors.
+ if (HasGuards && ProcessGuards(BB))
+ return true;
+
// What kind of constant we're looking for.
ConstantPreference Preference = WantInteger;
@@ -804,7 +928,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
return false;
}
-
if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
// If we're branching on a conditional, LVI might be able to determine
// it's value at the branch instruction. We only handle comparisons
@@ -812,7 +935,12 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// TODO: This should be extended to handle switches as well.
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
- if (CondBr && CondConst && CondBr->isConditional()) {
+ if (CondBr && CondConst) {
+ // We should have returned as soon as we turned a conditional branch into
+ // an unconditional one, because it's no longer interesting as far as jump
+ // threading is concerned.
+ assert(CondBr->isConditional() && "Threading on unconditional terminator");
+
LazyValueInfo::Tristate Ret =
LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
CondConst, CondBr);
@@ -824,21 +952,27 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
CondBr->eraseFromParent();
if (CondCmp->use_empty())
CondCmp->eraseFromParent();
+ // We can safely replace *some* uses of the CondInst if it has
+ // exactly one value as returned by LVI. RAUW is incorrect in the
+ // presence of guards and assumes, that have the `Cond` as the use. This
+ // is because we use the guards/assume to reason about the `Cond` value
+ // at the end of block, but RAUW unconditionally replaces all uses
+ // including the guards/assumes themselves and the uses before the
+ // guard/assume.
else if (CondCmp->getParent() == BB) {
- // If the fact we just learned is true for all uses of the
- // condition, replace it with a constant value
auto *CI = Ret == LazyValueInfo::True ?
ConstantInt::getTrue(CondCmp->getType()) :
ConstantInt::getFalse(CondCmp->getType());
- CondCmp->replaceAllUsesWith(CI);
- CondCmp->eraseFromParent();
+ ReplaceFoldableUses(CondCmp, CI);
}
return true;
}
- }
- if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB))
- return true;
+ // We did not manage to simplify this branch, try to see whether
+ // CondCmp depends on a known phi-select pattern.
+ if (TryToUnfoldSelect(CondCmp, BB))
+ return true;
+ }
}
// Check for some cases that are worth simplifying. Right now we want to look
@@ -857,7 +991,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (SimplifyPartiallyRedundantLoad(LI))
return true;
-
// Handle a variety of cases where we are branching on something derived from
// a PHI node in the current block. If we can prove that any predecessors
// compute a predictable value based on a PHI node, thread those predecessors.
@@ -871,7 +1004,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
return ProcessBranchOnPHI(PN);
-
// If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
if (CondInst->getOpcode() == Instruction::Xor &&
CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
@@ -920,6 +1052,14 @@ bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
return false;
}
+/// Return true if Op is an instruction defined in the given block.
+static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+ if (OpInst->getParent() == BB)
+ return true;
+ return false;
+}
+
/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
/// load instruction, eliminate it by replacing it with a PHI node. This is an
/// important optimization that encourages jump threading, and needs to be run
@@ -942,18 +1082,17 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
Value *LoadedPtr = LI->getOperand(0);
- // If the loaded operand is defined in the LoadBB, it can't be available.
- // TODO: Could do simple PHI translation, that would be fun :)
- if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
- if (PtrOp->getParent() == LoadBB)
- return false;
+ // If the loaded operand is defined in the LoadBB and it's not a phi,
+ // it can't be available in predecessors.
+ if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa<PHINode>(LoadedPtr))
+ return false;
// Scan a few instructions up from the load, to see if it is obviously live at
// the entry to its block.
BasicBlock::iterator BBIt(LI);
bool IsLoadCSE;
- if (Value *AvailableVal =
- FindAvailableLoadedValue(LI, LoadBB, BBIt, DefMaxInstsToScan, nullptr, &IsLoadCSE)) {
+ if (Value *AvailableVal = FindAvailableLoadedValue(
+ LI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
// If the value of the load is locally available within the block, just use
// it. This frequently occurs for reg2mem'd allocas.
@@ -997,12 +1136,34 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (!PredsScanned.insert(PredBB).second)
continue;
- // Scan the predecessor to see if the value is available in the pred.
BBIt = PredBB->end();
- Value *PredAvailable = FindAvailableLoadedValue(LI, PredBB, BBIt,
- DefMaxInstsToScan,
- nullptr,
- &IsLoadCSE);
+ unsigned NumScanedInst = 0;
+ Value *PredAvailable = nullptr;
+ // NOTE: We don't CSE load that is volatile or anything stronger than
+ // unordered, that should have been checked when we entered the function.
+ assert(LI->isUnordered() && "Attempting to CSE volatile or atomic loads");
+ // If this is a load on a phi pointer, phi-translate it and search
+ // for available load/store to the pointer in predecessors.
+ Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LI->getType(), LI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan,
+ AA, &IsLoadCSE, &NumScanedInst);
+
+ // If PredBB has a single predecessor, continue scanning through the
+ // single predecessor.
+ BasicBlock *SinglePredBB = PredBB;
+ while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
+ NumScanedInst < DefMaxInstsToScan) {
+ SinglePredBB = SinglePredBB->getSinglePredecessor();
+ if (SinglePredBB) {
+ BBIt = SinglePredBB->end();
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LI->getType(), LI->isAtomic(), SinglePredBB, BBIt,
+ (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
+ &NumScanedInst);
+ }
+ }
+
if (!PredAvailable) {
OneUnavailablePred = PredBB;
continue;
@@ -1062,10 +1223,10 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (UnavailablePred) {
assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
"Can't handle critical edge here!");
- LoadInst *NewVal =
- new LoadInst(LoadedPtr, LI->getName() + ".pr", false,
- LI->getAlignment(), LI->getOrdering(), LI->getSynchScope(),
- UnavailablePred->getTerminator());
+ LoadInst *NewVal = new LoadInst(
+ LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
+ LI->getName() + ".pr", false, LI->getAlignment(), LI->getOrdering(),
+ LI->getSyncScopeID(), UnavailablePred->getTerminator());
NewVal->setDebugLoc(LI->getDebugLoc());
if (AATags)
NewVal->setAAMetadata(AATags);
@@ -1210,37 +1371,53 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
BasicBlock *OnlyDest = nullptr;
BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
+ Constant *OnlyVal = nullptr;
+ Constant *MultipleVal = (Constant *)(intptr_t)~0ULL;
+ unsigned PredWithKnownDest = 0;
for (const auto &PredValue : PredValues) {
BasicBlock *Pred = PredValue.second;
if (!SeenPreds.insert(Pred).second)
continue; // Duplicate predecessor entry.
- // If the predecessor ends with an indirect goto, we can't change its
- // destination.
- if (isa<IndirectBrInst>(Pred->getTerminator()))
- continue;
-
Constant *Val = PredValue.first;
BasicBlock *DestBB;
if (isa<UndefValue>(Val))
DestBB = nullptr;
- else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+ else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+ assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
- else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
- DestBB = SI->findCaseValue(cast<ConstantInt>(Val)).getCaseSuccessor();
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
+ DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor();
} else {
assert(isa<IndirectBrInst>(BB->getTerminator())
&& "Unexpected terminator");
+ assert(isa<BlockAddress>(Val) && "Expecting a constant blockaddress");
DestBB = cast<BlockAddress>(Val)->getBasicBlock();
}
// If we have exactly one destination, remember it for efficiency below.
- if (PredToDestList.empty())
+ if (PredToDestList.empty()) {
OnlyDest = DestBB;
- else if (OnlyDest != DestBB)
- OnlyDest = MultipleDestSentinel;
+ OnlyVal = Val;
+ } else {
+ if (OnlyDest != DestBB)
+ OnlyDest = MultipleDestSentinel;
+ // It is possible to have the same destination but a different value, e.g.
+ // the default case in a SwitchInst.
+ if (Val != OnlyVal)
+ OnlyVal = MultipleVal;
+ }
+
+ // We know where this predecessor is going.
+ ++PredWithKnownDest;
+
+ // If the predecessor ends with an indirect goto, we can't change its
+ // destination.
+ if (isa<IndirectBrInst>(Pred->getTerminator()))
+ continue;
PredToDestList.push_back(std::make_pair(Pred, DestBB));
}
@@ -1249,6 +1426,45 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
if (PredToDestList.empty())
return false;
+ // If all the predecessors go to a single known successor, we want to fold,
+ // not thread. By doing so, we do not need to duplicate the current block, and
+ // we also avoid missing opportunities in case we don't/can't duplicate.
+ if (OnlyDest && OnlyDest != MultipleDestSentinel) {
+ if (PredWithKnownDest ==
+ (size_t)std::distance(pred_begin(BB), pred_end(BB))) {
+ bool SeenFirstBranchToOnlyDest = false;
+ for (BasicBlock *SuccBB : successors(BB)) {
+ if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest)
+ SeenFirstBranchToOnlyDest = true; // Don't modify the first branch.
+ else
+ SuccBB->removePredecessor(BB, true); // This is unreachable successor.
+ }
+
+ // Finally update the terminator.
+ TerminatorInst *Term = BB->getTerminator();
+ BranchInst::Create(OnlyDest, Term);
+ Term->eraseFromParent();
+
+ // If the condition is now dead due to the removal of the old terminator,
+ // erase it.
+ if (auto *CondInst = dyn_cast<Instruction>(Cond)) {
+ if (CondInst->use_empty() && !CondInst->mayHaveSideEffects())
+ CondInst->eraseFromParent();
+ // We can safely replace *some* uses of the CondInst if it has
+ // exactly one value as returned by LVI. RAUW is incorrect in the
+ // presence of guards and assumes, that have the `Cond` as the use. This
+ // is because we use the guards/assume to reason about the `Cond` value
+ // at the end of block, but RAUW unconditionally replaces all uses
+ // including the guards/assumes themselves and the uses before the
+ // guard/assume.
+ else if (OnlyVal && OnlyVal != MultipleVal &&
+ CondInst->getParent() == BB)
+ ReplaceFoldableUses(CondInst, OnlyVal);
+ }
+ return true;
+ }
+ }
+
// Determine which is the most common successor. If we have many inputs and
// this block is a switch, we want to start by threading the batch that goes
// to the most popular destination first. If we only know about one
@@ -1468,7 +1684,8 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
return false;
}
- unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+ unsigned JumpThreadCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (JumpThreadCost > BBDupThreshold) {
DEBUG(dbgs() << " Not threading BB '" << BB->getName()
<< "' - Cost is too high: " << JumpThreadCost << "\n");
@@ -1756,7 +1973,8 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
return false;
}
- unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+ unsigned DuplicationCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (DuplicationCost > BBDupThreshold) {
DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
<< "' - Cost is too high: " << DuplicationCost << "\n");
@@ -1811,11 +2029,12 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
// If this instruction can be simplified after the operands are updated,
// just use the simplified value instead. This frequently happens due to
// phi translation.
- if (Value *IV =
- SimplifyInstruction(New, BB->getModule()->getDataLayout())) {
+ if (Value *IV = SimplifyInstruction(
+ New,
+ {BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) {
ValueMapping[&*BI] = IV;
if (!New->mayHaveSideEffects()) {
- delete New;
+ New->deleteValue();
New = nullptr;
}
} else {
@@ -1888,10 +2107,10 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
/// TryToUnfoldSelect - Look for blocks of the form
/// bb1:
/// %a = select
-/// br bb
+/// br bb2
///
/// bb2:
-/// %p = phi [%a, %bb] ...
+/// %p = phi [%a, %bb1] ...
/// %c = icmp %p
/// br i1 %c
///
@@ -1963,11 +2182,19 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
return false;
}
-/// TryToUnfoldSelectInCurrBB - Look for PHI/Select in the same BB of the form
+/// TryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the
+/// same BB in the form
/// bb:
/// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
-/// %s = select p, trueval, falseval
+/// %s = select %p, trueval, falseval
///
+/// or
+///
+/// bb:
+/// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ...
+/// %c = cmp %p, 0
+/// %s = select %c, trueval, falseval
+///
/// And expand the select into a branch structure. This later enables
/// jump-threading over bb in this pass.
///
@@ -1981,43 +2208,180 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
if (LoopHeaders.count(BB))
return false;
- // Look for a Phi/Select pair in the same basic block. The Phi feeds the
- // condition of the Select and at least one of the incoming values is a
- // constant.
for (BasicBlock::iterator BI = BB->begin();
PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
- unsigned NumPHIValues = PN->getNumIncomingValues();
- if (NumPHIValues == 0 || !PN->hasOneUse())
+ // Look for a Phi having at least one constant incoming value.
+ if (llvm::all_of(PN->incoming_values(),
+ [](Value *V) { return !isa<ConstantInt>(V); }))
continue;
- SelectInst *SI = dyn_cast<SelectInst>(PN->user_back());
- if (!SI || SI->getParent() != BB)
- continue;
+ auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) {
+ // Check if SI is in BB and use V as condition.
+ if (SI->getParent() != BB)
+ return false;
+ Value *Cond = SI->getCondition();
+ return (Cond && Cond == V && Cond->getType()->isIntegerTy(1));
+ };
- Value *Cond = SI->getCondition();
- if (!Cond || Cond != PN || !Cond->getType()->isIntegerTy(1))
+ SelectInst *SI = nullptr;
+ for (Use &U : PN->uses()) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
+ // Look for a ICmp in BB that compares PN with a constant and is the
+ // condition of a Select.
+ if (Cmp->getParent() == BB && Cmp->hasOneUse() &&
+ isa<ConstantInt>(Cmp->getOperand(1 - U.getOperandNo())))
+ if (SelectInst *SelectI = dyn_cast<SelectInst>(Cmp->user_back()))
+ if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) {
+ SI = SelectI;
+ break;
+ }
+ } else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) {
+ // Look for a Select in BB that uses PN as condition.
+ if (isUnfoldCandidate(SelectI, U.get())) {
+ SI = SelectI;
+ break;
+ }
+ }
+ }
+
+ if (!SI)
continue;
+ // Expand the select.
+ TerminatorInst *Term =
+ SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
+ PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
+ NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
+ NewPN->addIncoming(SI->getFalseValue(), BB);
+ SI->replaceAllUsesWith(NewPN);
+ SI->eraseFromParent();
+ return true;
+ }
+ return false;
+}
- bool HasConst = false;
- for (unsigned i = 0; i != NumPHIValues; ++i) {
- if (PN->getIncomingBlock(i) == BB)
- return false;
- if (isa<ConstantInt>(PN->getIncomingValue(i)))
- HasConst = true;
- }
+/// Try to propagate a guard from the current BB into one of its predecessors
+/// in case another branch of execution implies that the condition of this
+/// guard is always true. Currently we only process the simplest case that
+/// looks like:
+///
+/// Start:
+/// %cond = ...
+/// br i1 %cond, label %T1, label %F1
+/// T1:
+/// br label %Merge
+/// F1:
+/// br label %Merge
+/// Merge:
+/// %condGuard = ...
+/// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+///
+/// And cond either implies condGuard or !condGuard. In this case all the
+/// instructions before the guard can be duplicated in both branches, and the
+/// guard is then threaded to one of them.
+bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
+ using namespace PatternMatch;
+ // We only want to deal with two predecessors.
+ BasicBlock *Pred1, *Pred2;
+ auto PI = pred_begin(BB), PE = pred_end(BB);
+ if (PI == PE)
+ return false;
+ Pred1 = *PI++;
+ if (PI == PE)
+ return false;
+ Pred2 = *PI++;
+ if (PI != PE)
+ return false;
+ if (Pred1 == Pred2)
+ return false;
- if (HasConst) {
- // Expand the select.
- TerminatorInst *Term =
- SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
- PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
- NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
- NewPN->addIncoming(SI->getFalseValue(), BB);
- SI->replaceAllUsesWith(NewPN);
- SI->eraseFromParent();
- return true;
+ // Try to thread one of the guards of the block.
+ // TODO: Look up deeper than to immediate predecessor?
+ auto *Parent = Pred1->getSinglePredecessor();
+ if (!Parent || Parent != Pred2->getSinglePredecessor())
+ return false;
+
+ if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
+ for (auto &I : *BB)
+ if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
+ if (ThreadGuard(BB, cast<IntrinsicInst>(&I), BI))
+ return true;
+
+ return false;
+}
+
+/// Try to propagate the guard from BB, which is the lower block of a diamond,
+/// to one of its branches, in case the diamond's condition implies the guard's
+/// condition.
+bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
+ BranchInst *BI) {
+ assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
+ assert(BI->isConditional() && "Unconditional branch has 2 successors?");
+ Value *GuardCond = Guard->getArgOperand(0);
+ Value *BranchCond = BI->getCondition();
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = BI->getSuccessor(1);
+
+ auto &DL = BB->getModule()->getDataLayout();
+ bool TrueDestIsSafe = false;
+ bool FalseDestIsSafe = false;
+
+ // True dest is safe if BranchCond => GuardCond.
+ auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
+ if (Impl && *Impl)
+ TrueDestIsSafe = true;
+ else {
+ // False dest is safe if !BranchCond => GuardCond.
+ Impl =
+ isImpliedCondition(BranchCond, GuardCond, DL, /* InvertAPred */ true);
+ if (Impl && *Impl)
+ FalseDestIsSafe = true;
+ }
+
+ if (!TrueDestIsSafe && !FalseDestIsSafe)
+ return false;
+
+ BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+ BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+
+ ValueToValueMapTy UnguardedMapping, GuardedMapping;
+ Instruction *AfterGuard = Guard->getNextNode();
+ unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
+ if (Cost > BBDupThreshold)
+ return false;
+ // Duplicate all instructions before the guard and the guard itself to the
+ // branch where implication is not proved.
+ GuardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, GuardedBlock, AfterGuard, GuardedMapping);
+ assert(GuardedBlock && "Could not create the guarded block?");
+ // Duplicate all instructions before the guard in the unguarded branch.
+ // Since we have successfully duplicated the guarded block and this block
+ // has fewer instructions, we expect it to succeed.
+ UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock,
+ Guard, UnguardedMapping);
+ assert(UnguardedBlock && "Could not create the unguarded block?");
+ DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
+ << GuardedBlock->getName() << "\n");
+
+ // Some instructions before the guard may still have uses. For them, we need
+ // to create Phi nodes merging their copies in both guarded and unguarded
+ // branches. Those instructions that have no uses can be just removed.
+ SmallVector<Instruction *, 4> ToRemove;
+ for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI)
+ if (!isa<PHINode>(&*BI))
+ ToRemove.push_back(&*BI);
+
+ Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
+ assert(InsertionPoint && "Empty block?");
+ // Substitute with Phis & remove.
+ for (auto *Inst : reverse(ToRemove)) {
+ if (!Inst->use_empty()) {
+ PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
+ NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
+ NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
+ NewPN->insertBefore(InsertionPoint);
+ Inst->replaceAllUsesWith(NewPN);
}
+ Inst->eraseFromParent();
}
-
- return false;
+ return true;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
index f51d11c..37b9c4b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -77,10 +77,16 @@ STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
STATISTIC(NumPromoted, "Number of memory locations promoted to registers");
+/// Memory promotion is enabled by default.
static cl::opt<bool>
- DisablePromotion("disable-licm-promotion", cl::Hidden,
+ DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
cl::desc("Disable memory promotion in LICM pass"));
+static cl::opt<uint32_t> MaxNumUsesTraversed(
+ "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
+ cl::desc("Max num uses visited for identifying load "
+ "invariance in loop using invariant start (default = 8)"));
+
static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo);
@@ -201,9 +207,9 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, ORE, true))
return PreservedAnalyses::all();
- // FIXME: There is no setPreservesCFG in the new PM. When that becomes
- // available, it should be used here.
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
char LegacyLICMPass::ID = 0;
@@ -425,6 +431,29 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
continue;
}
+ // Attempt to remove floating point division out of the loop by converting
+ // it to a reciprocal multiplication.
+ if (I.getOpcode() == Instruction::FDiv &&
+ CurLoop->isLoopInvariant(I.getOperand(1)) &&
+ I.hasAllowReciprocal()) {
+ auto Divisor = I.getOperand(1);
+ auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
+ auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
+ ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+ ReciprocalDivisor->insertBefore(&I);
+
+ auto Product = BinaryOperator::CreateFMul(I.getOperand(0),
+ ReciprocalDivisor);
+ Product->setFastMathFlags(I.getFastMathFlags());
+ Product->insertAfter(&I);
+ I.replaceAllUsesWith(Product);
+ I.eraseFromParent();
+
+ hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
+ Changed = true;
+ continue;
+ }
+
// Try hoisting the instruction out to the preheader. We can only do this
// if all of the operands of the instruction are loop invariant and if it
// is safe to hoist the instruction.
@@ -461,7 +490,10 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
// Iterate over loop instructions and compute safety info.
- for (Loop::block_iterator BB = CurLoop->block_begin(),
+ // Skip header as it has been computed and stored in HeaderMayThrow.
+ // The first block in loopinfo.Blocks is guaranteed to be the header.
+ assert(Header == *CurLoop->getBlocks().begin() && "First block must be header");
+ for (Loop::block_iterator BB = std::next(CurLoop->block_begin()),
BBE = CurLoop->block_end();
(BB != BBE) && !SafetyInfo->MayThrow; ++BB)
for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
@@ -477,6 +509,59 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
SafetyInfo->BlockColors = colorEHFunclets(*Fn);
}
+// Return true if LI is invariant within the scope of the loop. LI is treated as
+// invariant if an llvm.invariant.start on the same (bitcast-stripped) address,
+// covering at least the loaded size, sits in a block that properly dominates
+// the loop header, and that invariant.start has no uses.
+static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
+ Loop *CurLoop) {
+ Value *Addr = LI->getOperand(0);
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ const uint32_t LocSizeInBits = DL.getTypeSizeInBits(
+ cast<PointerType>(Addr->getType())->getElementType());
+
+ // llvm.invariant.start takes an i8 pointer in the load's address space;
+ // build that type so we know when to stop looking through bitcasts.
+ auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()),
+ LI->getPointerAddressSpace());
+ unsigned BitcastsVisited = 0;
+ // Look through bitcasts until we reach the i8* type (this is the
+ // invariant.start operand type).
+ while (Addr->getType() != PtrInt8Ty) {
+ auto *BC = dyn_cast<BitCastInst>(Addr);
+ // Give up after too many bitcasts, or on reaching a non-bitcast value.
+ if (++BitcastsVisited > MaxNumUsesTraversed || !BC)
+ return false;
+ Addr = BC->getOperand(0);
+ }
+
+ unsigned UsesVisited = 0;
+ // Traverse all users of the stripped address to see whether one of them is
+ // an invariant.start that dominates the loop.
+ for (auto *U : Addr->users()) {
+ // Avoid traversing an address that has a high number of users.
+ if (++UsesVisited > MaxNumUsesTraversed)
+ return false;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+ // Skip users that are not invariant.start. If the invariant.start itself
+ // has uses, the load may be non-invariant, so skip that one as well.
+ if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
+ !II->use_empty())
+ continue;
+ unsigned InvariantSizeInBits =
+ cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8;
+ // The invariant.start must cover at least the loaded size, and its block must
+ // properly dominate the loop header (so it is not inside the loop itself).
+ // NOTE(review): a negative size argument would wrap in the unsigned value
+ // above and pass the size check - confirm whether -1 can occur here.
+ if (LocSizeInBits <= InvariantSizeInBits &&
+ DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
+ return true;
+ }
+
+ return false;
+}
+
bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
Loop *CurLoop, AliasSetTracker *CurAST,
LoopSafetyInfo *SafetyInfo,
@@ -493,6 +578,10 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
if (LI->getMetadata(LLVMContext::MD_invariant_load))
return true;
+ // This checks for an invariant.start dominating the load.
+ if (isLoadInvariantInLoop(LI, DT, CurLoop))
+ return true;
+
// Don't hoist loads which have may-aliased stores in loop.
uint64_t Size = 0;
if (LI->getType()->isSized())
@@ -782,7 +871,7 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
<< "\n");
ORE->emit(OptimizationRemark(DEBUG_TYPE, "Hoisted", &I)
- << "hosting " << ore::NV("Inst", &I));
+ << "hoisting " << ore::NV("Inst", &I));
// Metadata can be dependent on conditions we are hoisting above.
// Conservatively strip all metadata on the instruction unless we were
@@ -852,6 +941,7 @@ class LoopPromoter : public LoadAndStorePromoter {
LoopInfo &LI;
DebugLoc DL;
int Alignment;
+ bool UnorderedAtomic;
AAMDNodes AATags;
Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
@@ -875,10 +965,11 @@ public:
SmallVectorImpl<BasicBlock *> &LEB,
SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
- const AAMDNodes &AATags)
+ bool UnorderedAtomic, const AAMDNodes &AATags)
: LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
- LI(li), DL(std::move(dl)), Alignment(alignment), AATags(AATags) {}
+ LI(li), DL(std::move(dl)), Alignment(alignment),
+ UnorderedAtomic(UnorderedAtomic),AATags(AATags) {}
bool isInstInList(Instruction *I,
const SmallVectorImpl<Instruction *> &) const override {
@@ -902,6 +993,8 @@ public:
Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
Instruction *InsertPos = LoopInsertPts[i];
StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
+ if (UnorderedAtomic)
+ NewSI->setOrdering(AtomicOrdering::Unordered);
NewSI->setAlignment(Alignment);
NewSI->setDebugLoc(DL);
if (AATags)
@@ -992,18 +1085,41 @@ bool llvm::promoteLoopAccessesToScalars(
// We start with an alignment of one and try to find instructions that allow
// us to prove better alignment.
unsigned Alignment = 1;
+ // Keep track of which types of access we see
+ bool SawUnorderedAtomic = false;
+ bool SawNotAtomic = false;
AAMDNodes AATags;
const DataLayout &MDL = Preheader->getModule()->getDataLayout();
+ // Do we know this object does not escape?
+ bool IsKnownNonEscapingObject = false;
if (SafetyInfo->MayThrow) {
// If a loop can throw, we have to insert a store along each unwind edge.
// That said, we can't actually make the unwind edge explicit. Therefore,
// we have to prove that the store is dead along the unwind edge.
//
- // Currently, this code just special-cases alloca instructions.
- if (!isa<AllocaInst>(GetUnderlyingObject(SomePtr, MDL)))
- return false;
+ // If the underlying object is not an alloca, nor a pointer that does not
+ // escape, then we can not effectively prove that the store is dead along
+ // the unwind edge. i.e. the caller of this function could have ways to
+ // access the pointed object.
+ Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ // If this is a base pointer we do not understand, simply bail.
+ // We only handle alloca and return value from alloc-like fn right now.
+ if (!isa<AllocaInst>(Object)) {
+ if (!isAllocLikeFn(Object, TLI))
+ return false;
+ // If this is an alloc-like function, there are more constraints we need to
+ // verify. More specifically, we must make sure that the pointer cannot escape.
+ //
+ // NOTE: PointerMayBeCaptured is not enough as the pointer may have escaped
+ // even though it's not captured by the enclosing function. Standard allocation
+ // functions like malloc, calloc, and operator new return values which can
+ // be assumed not to have previously escaped.
+ if (PointerMayBeCaptured(Object, true, true))
+ return false;
+ IsKnownNonEscapingObject = true;
+ }
}
// Check that all of the pointers in the alias set have the same type. We
@@ -1029,8 +1145,11 @@ bool llvm::promoteLoopAccessesToScalars(
// it.
if (LoadInst *Load = dyn_cast<LoadInst>(UI)) {
assert(!Load->isVolatile() && "AST broken");
- if (!Load->isSimple())
+ if (!Load->isUnordered())
return false;
+
+ SawUnorderedAtomic |= Load->isAtomic();
+ SawNotAtomic |= !Load->isAtomic();
if (!DereferenceableInPH)
DereferenceableInPH = isSafeToExecuteUnconditionally(
@@ -1041,9 +1160,12 @@ bool llvm::promoteLoopAccessesToScalars(
if (UI->getOperand(1) != ASIV)
continue;
assert(!Store->isVolatile() && "AST broken");
- if (!Store->isSimple())
+ if (!Store->isUnordered())
return false;
+ SawUnorderedAtomic |= Store->isAtomic();
+ SawNotAtomic |= !Store->isAtomic();
+
// If the store is guaranteed to execute, both properties are satisfied.
// We may want to check if a store is guaranteed to execute even if we
// already know that promotion is safe, since it may have higher
@@ -1096,6 +1218,12 @@ bool llvm::promoteLoopAccessesToScalars(
}
}
+ // If we found both an unordered atomic instruction and a non-atomic memory
+ // access, bail. We can't blindly promote non-atomic to atomic since we
+ // might not be able to lower the result. We can't downgrade since that
+ // would violate the memory model. Also, align 0 is an error for atomics.
+ if (SawUnorderedAtomic && SawNotAtomic)
+ return false;
// If we couldn't prove we can hoist the load, bail.
if (!DereferenceableInPH)
@@ -1106,10 +1234,15 @@ bool llvm::promoteLoopAccessesToScalars(
// stores along paths which originally didn't have them without violating the
// memory model.
if (!SafeToInsertStore) {
- Value *Object = GetUnderlyingObject(SomePtr, MDL);
- SafeToInsertStore =
- (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
+ // If this is a known non-escaping object, it is safe to insert the stores.
+ if (IsKnownNonEscapingObject)
+ SafeToInsertStore = true;
+ else {
+ Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ SafeToInsertStore =
+ (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
!PointerMayBeCaptured(Object, true, true);
+ }
}
// If we've still failed to prove we can sink the store, give up.
@@ -1134,12 +1267,15 @@ bool llvm::promoteLoopAccessesToScalars(
SmallVector<PHINode *, 16> NewPHIs;
SSAUpdater SSA(&NewPHIs);
LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
- InsertPts, PIC, *CurAST, *LI, DL, Alignment, AATags);
+ InsertPts, PIC, *CurAST, *LI, DL, Alignment,
+ SawUnorderedAtomic, AATags);
// Set up the preheader to have a definition of the value. It is the live-out
// value from the preheader that uses in the loop will use.
LoadInst *PreheaderLoad = new LoadInst(
SomePtr, SomePtr->getName() + ".promoted", Preheader->getTerminator());
+ if (SawUnorderedAtomic)
+ PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
PreheaderLoad->setAlignment(Alignment);
PreheaderLoad->setDebugLoc(DL);
if (AATags)
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp
deleted file mode 100644
index 389f1c5..0000000
--- a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp
+++ /dev/null
@@ -1,280 +0,0 @@
-//===- LoadCombine.cpp - Combine Adjacent Loads ---------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This transformation combines adjacent loads.
-///
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/TargetFolder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "load-combine"
-
-STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining");
-STATISTIC(NumLoadsCombined, "Number of loads combined");
-
-#define LDCOMBINE_NAME "Combine Adjacent Loads"
-
-namespace {
-struct PointerOffsetPair {
- Value *Pointer;
- APInt Offset;
-};
-
-struct LoadPOPPair {
- LoadInst *Load;
- PointerOffsetPair POP;
- /// \brief The new load needs to be created before the first load in IR order.
- unsigned InsertOrder;
-};
-
-class LoadCombine : public BasicBlockPass {
- LLVMContext *C;
- AliasAnalysis *AA;
-
-public:
- LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) {
- initializeLoadCombinePass(*PassRegistry::getPassRegistry());
- }
-
- using llvm::Pass::doInitialization;
- bool doInitialization(Function &) override;
- bool runOnBasicBlock(BasicBlock &BB) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-
- StringRef getPassName() const override { return LDCOMBINE_NAME; }
- static char ID;
-
- typedef IRBuilder<TargetFolder> BuilderTy;
-
-private:
- BuilderTy *Builder;
-
- PointerOffsetPair getPointerOffsetPair(LoadInst &);
- bool combineLoads(DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &);
- bool aggregateLoads(SmallVectorImpl<LoadPOPPair> &);
- bool combineLoads(SmallVectorImpl<LoadPOPPair> &);
-};
-}
-
-bool LoadCombine::doInitialization(Function &F) {
- DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n");
- C = &F.getContext();
- return true;
-}
-
-PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) {
- auto &DL = LI.getModule()->getDataLayout();
-
- PointerOffsetPair POP;
- POP.Pointer = LI.getPointerOperand();
- unsigned BitWidth = DL.getPointerSizeInBits(LI.getPointerAddressSpace());
- POP.Offset = APInt(BitWidth, 0);
-
- while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) {
- if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) {
- APInt LastOffset = POP.Offset;
- if (!GEP->accumulateConstantOffset(DL, POP.Offset)) {
- // Can't handle GEPs with variable indices.
- POP.Offset = LastOffset;
- return POP;
- }
- POP.Pointer = GEP->getPointerOperand();
- } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer)) {
- POP.Pointer = BC->getOperand(0);
- }
- }
- return POP;
-}
-
-bool LoadCombine::combineLoads(
- DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &LoadMap) {
- bool Combined = false;
- for (auto &Loads : LoadMap) {
- if (Loads.second.size() < 2)
- continue;
- std::sort(Loads.second.begin(), Loads.second.end(),
- [](const LoadPOPPair &A, const LoadPOPPair &B) {
- return A.POP.Offset.slt(B.POP.Offset);
- });
- if (aggregateLoads(Loads.second))
- Combined = true;
- }
- return Combined;
-}
-
-/// \brief Try to aggregate loads from a sorted list of loads to be combined.
-///
-/// It is guaranteed that no writes occur between any of the loads. All loads
-/// have the same base pointer. There are at least two loads.
-bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
- assert(Loads.size() >= 2 && "Insufficient loads!");
- LoadInst *BaseLoad = nullptr;
- SmallVector<LoadPOPPair, 8> AggregateLoads;
- bool Combined = false;
- bool ValidPrevOffset = false;
- APInt PrevOffset;
- uint64_t PrevSize = 0;
- for (auto &L : Loads) {
- if (ValidPrevOffset == false) {
- BaseLoad = L.Load;
- PrevOffset = L.POP.Offset;
- PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize(
- L.Load->getType());
- AggregateLoads.push_back(L);
- ValidPrevOffset = true;
- continue;
- }
- if (L.Load->getAlignment() > BaseLoad->getAlignment())
- continue;
- APInt PrevEnd = PrevOffset + PrevSize;
- if (L.POP.Offset.sgt(PrevEnd)) {
- // No other load will be combinable
- if (combineLoads(AggregateLoads))
- Combined = true;
- AggregateLoads.clear();
- ValidPrevOffset = false;
- continue;
- }
- if (L.POP.Offset != PrevEnd)
- // This load is offset less than the size of the last load.
- // FIXME: We may want to handle this case.
- continue;
- PrevOffset = L.POP.Offset;
- PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize(
- L.Load->getType());
- AggregateLoads.push_back(L);
- }
- if (combineLoads(AggregateLoads))
- Combined = true;
- return Combined;
-}
-
-/// \brief Given a list of combinable load. Combine the maximum number of them.
-bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
- // Remove loads from the end while the size is not a power of 2.
- unsigned TotalSize = 0;
- for (const auto &L : Loads)
- TotalSize += L.Load->getType()->getPrimitiveSizeInBits();
- while (TotalSize != 0 && !isPowerOf2_32(TotalSize))
- TotalSize -= Loads.pop_back_val().Load->getType()->getPrimitiveSizeInBits();
- if (Loads.size() < 2)
- return false;
-
- DEBUG({
- dbgs() << "***** Combining Loads ******\n";
- for (const auto &L : Loads) {
- dbgs() << L.POP.Offset << ": " << *L.Load << "\n";
- }
- });
-
- // Find first load. This is where we put the new load.
- LoadPOPPair FirstLP;
- FirstLP.InsertOrder = -1u;
- for (const auto &L : Loads)
- if (L.InsertOrder < FirstLP.InsertOrder)
- FirstLP = L;
-
- unsigned AddressSpace =
- FirstLP.POP.Pointer->getType()->getPointerAddressSpace();
-
- Builder->SetInsertPoint(FirstLP.Load);
- Value *Ptr = Builder->CreateConstGEP1_64(
- Builder->CreatePointerCast(Loads[0].POP.Pointer,
- Builder->getInt8PtrTy(AddressSpace)),
- Loads[0].POP.Offset.getSExtValue());
- LoadInst *NewLoad = new LoadInst(
- Builder->CreatePointerCast(
- Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalSize),
- Ptr->getType()->getPointerAddressSpace())),
- Twine(Loads[0].Load->getName()) + ".combined", false,
- Loads[0].Load->getAlignment(), FirstLP.Load);
-
- for (const auto &L : Loads) {
- Builder->SetInsertPoint(L.Load);
- Value *V = Builder->CreateExtractInteger(
- L.Load->getModule()->getDataLayout(), NewLoad,
- cast<IntegerType>(L.Load->getType()),
- (L.POP.Offset - Loads[0].POP.Offset).getZExtValue(), "combine.extract");
- L.Load->replaceAllUsesWith(V);
- }
-
- NumLoadsCombined = NumLoadsCombined + Loads.size();
- return true;
-}
-
-bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
- if (skipBasicBlock(BB))
- return false;
-
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
-
- IRBuilder<TargetFolder> TheBuilder(
- BB.getContext(), TargetFolder(BB.getModule()->getDataLayout()));
- Builder = &TheBuilder;
-
- DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap;
- AliasSetTracker AST(*AA);
-
- bool Combined = false;
- unsigned Index = 0;
- for (auto &I : BB) {
- if (I.mayThrow() || (I.mayWriteToMemory() && AST.containsUnknown(&I))) {
- if (combineLoads(LoadMap))
- Combined = true;
- LoadMap.clear();
- AST.clear();
- continue;
- }
- LoadInst *LI = dyn_cast<LoadInst>(&I);
- if (!LI)
- continue;
- ++NumLoadsAnalyzed;
- if (!LI->isSimple() || !LI->getType()->isIntegerTy())
- continue;
- auto POP = getPointerOffsetPair(*LI);
- if (!POP.Pointer)
- continue;
- LoadMap[POP.Pointer].push_back({LI, std::move(POP), Index++});
- AST.add(LI);
- }
- if (combineLoads(LoadMap))
- Combined = true;
- return Combined;
-}
-
-char LoadCombine::ID = 0;
-
-BasicBlockPass *llvm::createLoadCombinePass() {
- return new LoadCombine();
-}
-
-INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
index cca75a3..ac4dd44 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -29,32 +30,45 @@ using namespace llvm;
STATISTIC(NumDeleted, "Number of loops deleted");
-/// isLoopDead - Determined if a loop is dead. This assumes that we've already
-/// checked for unique exit and exiting blocks, and that the code is in LCSSA
-/// form.
-bool LoopDeletionPass::isLoopDead(Loop *L, ScalarEvolution &SE,
- SmallVectorImpl<BasicBlock *> &exitingBlocks,
- SmallVectorImpl<BasicBlock *> &exitBlocks,
- bool &Changed, BasicBlock *Preheader) {
- BasicBlock *exitBlock = exitBlocks[0];
-
+/// This function deletes dead loops. The caller of this function needs to
+/// guarantee that the loop is in fact dead. Here we handle two kinds of dead
+/// loop. The first kind (\p isLoopDead) is where only invariant values from
+/// within the loop are used outside of it. The second kind (\p
+/// isLoopNeverExecuted) is where the loop is provably never executed. We can
+/// always remove never executed loops since they will not cause any difference
+/// to program behaviour.
+///
+/// This also updates the relevant analysis information in \p DT, \p SE, and \p
+/// LI. It also updates the loop PM if an updater struct is provided.
+// TODO: This function will be used by loop-simplifyCFG as well. So, move this
+// to LoopUtils.cpp
+static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
+ LoopInfo &LI, LPMUpdater *Updater = nullptr);
+/// Determines if a loop is dead.
+///
+/// This assumes that we've already checked for unique exit and exiting blocks,
+/// and that the code is in LCSSA form.
+static bool isLoopDead(Loop *L, ScalarEvolution &SE,
+ SmallVectorImpl<BasicBlock *> &ExitingBlocks,
+ BasicBlock *ExitBlock, bool &Changed,
+ BasicBlock *Preheader) {
// Make sure that all PHI entries coming from the loop are loop invariant.
// Because the code is in LCSSA form, any values used outside of the loop
// must pass through a PHI in the exit block, meaning that this check is
// sufficient to guarantee that no loop-variant values are used outside
// of the loop.
- BasicBlock::iterator BI = exitBlock->begin();
+ BasicBlock::iterator BI = ExitBlock->begin();
bool AllEntriesInvariant = true;
bool AllOutgoingValuesSame = true;
while (PHINode *P = dyn_cast<PHINode>(BI)) {
- Value *incoming = P->getIncomingValueForBlock(exitingBlocks[0]);
+ Value *incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
// Make sure all exiting blocks produce the same incoming value for the exit
// block. If there are different incoming values for different exiting
// blocks, then it is impossible to statically determine which value should
// be used.
AllOutgoingValuesSame =
- all_of(makeArrayRef(exitingBlocks).slice(1), [&](BasicBlock *BB) {
+ all_of(makeArrayRef(ExitingBlocks).slice(1), [&](BasicBlock *BB) {
return incoming == P->getIncomingValueForBlock(BB);
});
@@ -78,95 +92,187 @@ bool LoopDeletionPass::isLoopDead(Loop *L, ScalarEvolution &SE,
// Make sure that no instructions in the block have potential side-effects.
// This includes instructions that could write to memory, and loads that are
- // marked volatile. This could be made more aggressive by using aliasing
- // information to identify readonly and readnone calls.
- for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
- LI != LE; ++LI) {
- for (Instruction &I : **LI) {
- if (I.mayHaveSideEffects())
- return false;
- }
- }
-
+ // marked volatile.
+ for (auto &I : L->blocks())
+ if (any_of(*I, [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ return false;
return true;
}
-/// Remove dead loops, by which we mean loops that do not impact the observable
-/// behavior of the program other than finite running time. Note we do ensure
-/// that this never remove a loop that might be infinite, as doing so could
-/// change the halting/non-halting nature of a program. NOTE: This entire
-/// process relies pretty heavily on LoopSimplify and LCSSA in order to make
-/// various safety checks work.
-bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
- LoopInfo &loopInfo) {
- assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
+/// This function returns true if there is no viable path from the
+/// entry block to the header of \p L. Right now, it only does
+/// a local search to save compile time.
+static bool isLoopNeverExecuted(Loop *L) {
+ using namespace PatternMatch;
- // We can only remove the loop if there is a preheader that we can
- // branch from after removing it.
- BasicBlock *preheader = L->getLoopPreheader();
- if (!preheader)
- return false;
+ auto *Preheader = L->getLoopPreheader();
+ // TODO: We can relax this constraint, since we just need a loop
+ // predecessor.
+ assert(Preheader && "Needs preheader!");
- // If LoopSimplify form is not available, stay out of trouble.
- if (!L->hasDedicatedExits())
+ if (Preheader == &Preheader->getParent()->getEntryBlock())
return false;
+ // All predecessors of the preheader should have a constant conditional
+ // branch, with the loop's preheader as not-taken.
+ for (auto *Pred: predecessors(Preheader)) {
+ BasicBlock *Taken, *NotTaken;
+ ConstantInt *Cond;
+ if (!match(Pred->getTerminator(),
+ m_Br(m_ConstantInt(Cond), Taken, NotTaken)))
+ return false;
+ if (!Cond->getZExtValue())
+ std::swap(Taken, NotTaken);
+ if (Taken == Preheader)
+ return false;
+ }
+ assert(!pred_empty(Preheader) &&
+ "Preheader should have predecessors at this point!");
+ // All the predecessors have the loop preheader as not-taken target.
+ return true;
+}
+/// Remove a loop if it is dead.
+///
+/// A loop is considered dead if it does not impact the observable behavior of
+/// the program other than finite running time. This never removes a loop that
+/// might be infinite (unless it is never executed), as doing so could change
+/// the halting/non-halting nature of a program.
+///
+/// This entire process relies pretty heavily on LoopSimplify form and LCSSA in
+/// order to make various safety checks work.
+///
+/// \returns true if any changes were made. This may mutate the loop even if it
+/// is unable to delete it due to hoisting trivially loop invariant
+/// instructions out of the loop.
+static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
+ LoopInfo &LI, LPMUpdater *Updater = nullptr) {
+ assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
+
+ // We can only remove the loop if there is a preheader that we can branch from
+ // after removing it. Also, if LoopSimplify form is not available, stay out
+ // of trouble.
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader || !L->hasDedicatedExits()) {
+ DEBUG(dbgs()
+ << "Deletion requires Loop with preheader and dedicated exits.\n");
+ return false;
+ }
// We can't remove loops that contain subloops. If the subloops were dead,
// they would already have been removed in earlier executions of this pass.
- if (L->begin() != L->end())
+ if (L->begin() != L->end()) {
+ DEBUG(dbgs() << "Loop contains subloops.\n");
return false;
+ }
- SmallVector<BasicBlock *, 4> exitingBlocks;
- L->getExitingBlocks(exitingBlocks);
- SmallVector<BasicBlock *, 4> exitBlocks;
- L->getUniqueExitBlocks(exitBlocks);
+ BasicBlock *ExitBlock = L->getUniqueExitBlock();
+
+ if (ExitBlock && isLoopNeverExecuted(L)) {
+ DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
+ // Set incoming value to undef for phi nodes in the exit block.
+ BasicBlock::iterator BI = ExitBlock->begin();
+ while (PHINode *P = dyn_cast<PHINode>(BI)) {
+ for (unsigned i = 0; i < P->getNumIncomingValues(); i++)
+ P->setIncomingValue(i, UndefValue::get(P->getType()));
+ BI++;
+ }
+ deleteDeadLoop(L, DT, SE, LI, Updater);
+ ++NumDeleted;
+ return true;
+ }
+
+ // The remaining checks below are for a loop being dead because all statements
+ // in the loop are invariant.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
// We require that the loop only have a single exit block. Otherwise, we'd
// be in the situation of needing to be able to solve statically which exit
// block will be branched to, or trying to preserve the branching logic in
// a loop invariant manner.
- if (exitBlocks.size() != 1)
+ if (!ExitBlock) {
+ DEBUG(dbgs() << "Deletion requires single exit block\n");
return false;
-
+ }
// Finally, we have to check that the loop really is dead.
bool Changed = false;
- if (!isLoopDead(L, SE, exitingBlocks, exitBlocks, Changed, preheader))
+ if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) {
+ DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
return Changed;
+ }
// Don't remove loops for which we can't solve the trip count.
// They could be infinite, in which case we'd be changing program behavior.
const SCEV *S = SE.getMaxBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(S))
+ if (isa<SCEVCouldNotCompute>(S)) {
+ DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n");
return Changed;
+ }
+
+ DEBUG(dbgs() << "Loop is invariant, delete it!");
+ deleteDeadLoop(L, DT, SE, LI, Updater);
+ ++NumDeleted;
+
+ return true;
+}
+
+static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
+ LoopInfo &LI, LPMUpdater *Updater) {
+ assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
+ auto *Preheader = L->getLoopPreheader();
+ assert(Preheader && "Preheader should exist!");
// Now that we know the removal is safe, remove the loop by changing the
// branch from the preheader to go to the single exit block.
- BasicBlock *exitBlock = exitBlocks[0];
-
+ //
// Because we're deleting a large chunk of code at once, the sequence in which
- // we remove things is very important to avoid invalidation issues. Don't
- // mess with this unless you have good reason and know what you're doing.
+ // we remove things is very important to avoid invalidation issues.
+
+ // If we have an LPM updater, tell it about the loop being removed.
+ if (Updater)
+ Updater->markLoopAsDeleted(*L);
// Tell ScalarEvolution that the loop is deleted. Do this before
// deleting the loop so that ScalarEvolution can look at the loop
// to determine what it needs to clean up.
SE.forgetLoop(L);
- // Connect the preheader directly to the exit block.
- TerminatorInst *TI = preheader->getTerminator();
- TI->replaceUsesOfWith(L->getHeader(), exitBlock);
+ auto *ExitBlock = L->getUniqueExitBlock();
+ assert(ExitBlock && "Should have a unique exit block!");
- // Rewrite phis in the exit block to get their inputs from
- // the preheader instead of the exiting block.
- BasicBlock *exitingBlock = exitingBlocks[0];
- BasicBlock::iterator BI = exitBlock->begin();
+ assert(L->hasDedicatedExits() && "Loop should have dedicated exits!");
+
+ // Connect the preheader directly to the exit block.
+ // Even when the loop is never executed, we cannot remove the edge from the
+ // source block to the exit block. Consider the case where the unexecuted loop
+ // branches back to an outer loop. If we deleted the loop and removed the edge
+ // coming to this inner loop, this will break the outer loop structure (by
+ // deleting the backedge of the outer loop). If the outer loop is indeed a
+ // non-loop, it will be deleted in a future iteration of loop deletion pass.
+ Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(), ExitBlock);
+
+ // Rewrite phis in the exit block to get their inputs from the Preheader
+ // instead of the exiting block.
+ BasicBlock::iterator BI = ExitBlock->begin();
while (PHINode *P = dyn_cast<PHINode>(BI)) {
- int j = P->getBasicBlockIndex(exitingBlock);
- assert(j >= 0 && "Can't find exiting block in exit block's phi node!");
- P->setIncomingBlock(j, preheader);
- for (unsigned i = 1; i < exitingBlocks.size(); ++i)
- P->removeIncomingValue(exitingBlocks[i]);
+ // Set the zero'th element of Phi to be from the preheader and remove all
+ // other incoming values. Given the loop has dedicated exits, all other
+ // incoming values must be from the exiting blocks.
+ int PredIndex = 0;
+ P->setIncomingBlock(PredIndex, Preheader);
+ // Removes all incoming values from all other exiting blocks (including
+ // duplicate values from an exiting block).
+ // Nuke all entries except the zero'th entry which is the preheader entry.
+ // NOTE! We need to remove Incoming Values in the reverse order as done
+ // below, to keep the indices valid for deletion (removeIncomingValue
+ // updates getNumIncomingValues and shifts all values down into the operand
+ // being deleted).
+ for (unsigned i = 0, e = P->getNumIncomingValues() - 1; i != e; ++i)
+ P->removeIncomingValue(e-i, false);
+
+ assert((P->getNumIncomingValues() == 1 &&
+ P->getIncomingBlock(PredIndex) == Preheader) &&
+ "Should have exactly one value and that's from the preheader!");
++BI;
}
@@ -175,11 +281,11 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
SmallVector<DomTreeNode*, 8> ChildNodes;
for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
LI != LE; ++LI) {
- // Move all of the block's children to be children of the preheader, which
+ // Move all of the block's children to be children of the Preheader, which
// allows us to remove the domtree entry for the block.
ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end());
for (DomTreeNode *ChildNode : ChildNodes) {
- DT.changeImmediateDominator(ChildNode, DT[preheader]);
+ DT.changeImmediateDominator(ChildNode, DT[Preheader]);
}
ChildNodes.clear();
@@ -204,22 +310,19 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
SmallPtrSet<BasicBlock *, 8> blocks;
blocks.insert(L->block_begin(), L->block_end());
for (BasicBlock *BB : blocks)
- loopInfo.removeBlock(BB);
+ LI.removeBlock(BB);
// The last step is to update LoopInfo now that we've eliminated this loop.
- loopInfo.markAsRemoved(L);
- Changed = true;
-
- ++NumDeleted;
-
- return Changed;
+ LI.markAsRemoved(L);
}
PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
- bool Changed = runImpl(&L, AR.DT, AR.SE, AR.LI);
- if (!Changed)
+ LPMUpdater &Updater) {
+
+ DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ DEBUG(L.dump());
+ if (!deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, &Updater))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
@@ -254,11 +357,11 @@ Pass *llvm::createLoopDeletionPass() { return new LoopDeletionLegacyPass(); }
bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) {
if (skipLoop(L))
return false;
-
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- LoopDeletionPass Impl;
- return Impl.runImpl(L, DT, SE, loopInfo);
+ DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ DEBUG(L->dump());
+ return deleteLoopIfDead(L, DT, SE, LI);
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index 19716b2..3624bba 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -812,29 +812,29 @@ private:
const RuntimePointerChecking *RtPtrChecking) {
SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
- std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
- [&](const RuntimePointerChecking::PointerCheck &Check) {
- for (unsigned PtrIdx1 : Check.first->Members)
- for (unsigned PtrIdx2 : Check.second->Members)
- // Only include this check if there is a pair of pointers
- // that require checking and the pointers fall into
- // separate partitions.
- //
- // (Note that we already know at this point that the two
- // pointer groups need checking but it doesn't follow
- // that each pair of pointers within the two groups need
- // checking as well.
- //
- // In other words we don't want to include a check just
- // because there is a pair of pointers between the two
- // pointer groups that require checks and a different
- // pair whose pointers fall into different partitions.)
- if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
- !RuntimePointerChecking::arePointersInSamePartition(
- PtrToPartition, PtrIdx1, PtrIdx2))
- return true;
- return false;
- });
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (unsigned PtrIdx1 : Check.first->Members)
+ for (unsigned PtrIdx2 : Check.second->Members)
+ // Only include this check if there is a pair of pointers
+ // that require checking and the pointers fall into
+ // separate partitions.
+ //
+ // (Note that we already know at this point that the two
+ // pointer groups need checking but it doesn't follow
+ // that each pair of pointers within the two groups need
+ // checking as well.
+ //
+ // In other words we don't want to include a check just
+ // because there is a pair of pointers between the two
+ // pointer groups that require checks and a different
+ // pair whose pointers fall into different partitions.)
+ if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
+ !RuntimePointerChecking::arePointersInSamePartition(
+ PtrToPartition, PtrIdx1, PtrIdx2))
+ return true;
+ return false;
+ });
return Checks;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 5fec51c..4a6a35c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -110,6 +110,16 @@ private:
bool HasMemset;
bool HasMemsetPattern;
bool HasMemcpy;
+ /// Return code for isLegalStore(): classifies which memory idiom, if any, a
+ /// store may be turned into.
+ enum LegalStoreKind {
+ None = 0, // The store can't be transformed into a memset/memcpy.
+ Memset, // A loop-invariant splat value is stored; memset can be used.
+ MemsetPattern, // A pattern value is stored; memset_pattern16 can be used.
+ Memcpy, // The store is fed by a load with the same stride.
+ UnorderedAtomicMemcpy, // As Memcpy, but the load or the store is atomic.
+ DontUse // Dummy retval never to be used. Allows catching errors in retval
+ // handling.
+ };
/// \name Countable Loop Idiom Handling
/// @{
@@ -119,8 +129,7 @@ private:
SmallVectorImpl<BasicBlock *> &ExitBlocks);
void collectStores(BasicBlock *BB);
- bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemsetPattern,
- bool &ForMemcpy);
+ LegalStoreKind isLegalStore(StoreInst *SI);
bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
bool ForMemset);
bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
@@ -144,6 +153,10 @@ private:
bool recognizePopcount();
void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
PHINode *CntPhi, Value *Var);
+ bool recognizeAndInsertCTLZ();
+ void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
+ PHINode *CntPhi, Value *Var, const DebugLoc DL,
+ bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
/// @}
};
@@ -236,9 +249,9 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
ApplyCodeSizeHeuristics =
L->getHeader()->getParent()->optForSize() && UseLIRCodeSizeHeurs;
- HasMemset = TLI->has(LibFunc::memset);
- HasMemsetPattern = TLI->has(LibFunc::memset_pattern16);
- HasMemcpy = TLI->has(LibFunc::memcpy);
+ HasMemset = TLI->has(LibFunc_memset);
+ HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
+ HasMemcpy = TLI->has(LibFunc_memcpy);
if (HasMemset || HasMemsetPattern || HasMemcpy)
if (SE->hasLoopInvariantBackedgeTakenCount(L))
@@ -339,15 +352,24 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}
-bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
- bool &ForMemsetPattern, bool &ForMemcpy) {
+LoopIdiomRecognize::LegalStoreKind
+LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
+
// Don't touch volatile stores.
- if (!SI->isSimple())
- return false;
+ if (SI->isVolatile())
+ return LegalStoreKind::None;
+ // We only want simple or unordered-atomic stores.
+ if (!SI->isUnordered())
+ return LegalStoreKind::None;
+
+ // Don't convert stores of non-integral pointer types to memsets (which store
+ // integers).
+ if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
+ return LegalStoreKind::None;
// Avoid merging nontemporal stores.
if (SI->getMetadata(LLVMContext::MD_nontemporal))
- return false;
+ return LegalStoreKind::None;
Value *StoredVal = SI->getValueOperand();
Value *StorePtr = SI->getPointerOperand();
@@ -355,7 +377,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
// Reject stores that are so large that they overflow an unsigned.
uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
- return false;
+ return LegalStoreKind::None;
// See if the pointer expression is an AddRec like {base,+,1} on the current
// loop, which indicates a strided store. If we have something else, it's a
@@ -363,11 +385,11 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
const SCEVAddRecExpr *StoreEv =
dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
- return false;
+ return LegalStoreKind::None;
// Check to see if we have a constant stride.
if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
- return false;
+ return LegalStoreKind::None;
// See if the store can be turned into a memset.
@@ -378,22 +400,23 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
Value *SplatValue = isBytewiseValue(StoredVal);
Constant *PatternValue = nullptr;
+ // Note: memset and memset_pattern on unordered-atomic are not yet supported
+ bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
+
// If we're allowed to form a memset, and the stored value would be
// acceptable for memset, use it.
- if (HasMemset && SplatValue &&
+ if (!UnorderedAtomic && HasMemset && SplatValue &&
// Verify that the stored value is loop invariant. If not, we can't
// promote the memset.
CurLoop->isLoopInvariant(SplatValue)) {
// It looks like we can use SplatValue.
- ForMemset = true;
- return true;
- } else if (HasMemsetPattern &&
+ return LegalStoreKind::Memset;
+ } else if (!UnorderedAtomic && HasMemsetPattern &&
// Don't create memset_pattern16s with address spaces.
StorePtr->getType()->getPointerAddressSpace() == 0 &&
(PatternValue = getMemSetPatternValue(StoredVal, DL))) {
// It looks like we can use PatternValue!
- ForMemsetPattern = true;
- return true;
+ return LegalStoreKind::MemsetPattern;
}
// Otherwise, see if the store can be turned into a memcpy.
@@ -403,12 +426,17 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
APInt Stride = getStoreStride(StoreEv);
unsigned StoreSize = getStoreSizeInBytes(SI, DL);
if (StoreSize != Stride && StoreSize != -Stride)
- return false;
+ return LegalStoreKind::None;
// The store must be feeding a non-volatile load.
LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
- if (!LI || !LI->isSimple())
- return false;
+
+ // Only allow non-volatile loads
+ if (!LI || LI->isVolatile())
+ return LegalStoreKind::None;
+ // Only allow simple or unordered-atomic loads
+ if (!LI->isUnordered())
+ return LegalStoreKind::None;
// See if the pointer expression is an AddRec like {base,+,1} on the current
// loop, which indicates a strided load. If we have something else, it's a
@@ -416,18 +444,19 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
const SCEVAddRecExpr *LoadEv =
dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
- return false;
+ return LegalStoreKind::None;
// The store and load must share the same stride.
if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
- return false;
+ return LegalStoreKind::None;
// Success. This store can be converted into a memcpy.
- ForMemcpy = true;
- return true;
+ UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
+ return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
+ : LegalStoreKind::Memcpy;
}
// This store can't be transformed into a memset/memcpy.
- return false;
+ return LegalStoreKind::None;
}
void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
@@ -439,24 +468,29 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
if (!SI)
continue;
- bool ForMemset = false;
- bool ForMemsetPattern = false;
- bool ForMemcpy = false;
// Make sure this is a strided store with a constant stride.
- if (!isLegalStore(SI, ForMemset, ForMemsetPattern, ForMemcpy))
- continue;
-
- // Save the store locations.
- if (ForMemset) {
+ switch (isLegalStore(SI)) {
+ case LegalStoreKind::None:
+ // Nothing to do
+ break;
+ case LegalStoreKind::Memset: {
// Find the base pointer.
Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
StoreRefsForMemset[Ptr].push_back(SI);
- } else if (ForMemsetPattern) {
+ } break;
+ case LegalStoreKind::MemsetPattern: {
// Find the base pointer.
Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
StoreRefsForMemsetPattern[Ptr].push_back(SI);
- } else if (ForMemcpy)
+ } break;
+ case LegalStoreKind::Memcpy:
+ case LegalStoreKind::UnorderedAtomicMemcpy:
StoreRefsForMemcpy.push_back(SI);
+ break;
+ default:
+ assert(false && "unhandled return value");
+ break;
+ }
}
}
@@ -494,7 +528,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(
Instruction *Inst = &*I++;
// Look for memset instructions, which may be optimized to a larger memset.
if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
- WeakVH InstPtr(&*I);
+ WeakTrackingVH InstPtr(&*I);
if (!processLoopMemSet(MSI, BECount))
continue;
MadeChange = true;
@@ -778,6 +812,11 @@ bool LoopIdiomRecognize::processLoopStridedStore(
if (NegStride)
Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);
+ // TODO: ideally we should still be able to generate memset if SCEV expander
+ // is taught to generate the dependencies at the latest point.
+ if (!isSafeToExpand(Start, *SE))
+ return false;
+
// Okay, we have a strided store "p[i]" of a splattable value. We can turn
// this into a memset in the loop preheader now if we want. However, this
// would be unsafe to do if there is anything else in the loop that may read
@@ -809,6 +848,11 @@ bool LoopIdiomRecognize::processLoopStridedStore(
SCEV::FlagNUW);
}
+ // TODO: ideally we should still be able to generate memset if SCEV expander
+ // is taught to generate the dependencies at the latest point.
+ if (!isSafeToExpand(NumBytesS, *SE))
+ return false;
+
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
@@ -823,7 +867,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
Module *M = TheStore->getModule();
Value *MSP =
M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
- Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr);
+ Int8PtrTy, Int8PtrTy, IntPtr);
inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);
// Otherwise we should form a memset_pattern16. PatternValue is known to be
@@ -851,10 +895,10 @@ bool LoopIdiomRecognize::processLoopStridedStore(
/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy. This kicks in for stuff like
-/// for (i) A[i] = B[i];
+/// for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
const SCEV *BECount) {
- assert(SI->isSimple() && "Expected only non-volatile stores.");
+ assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
Value *StorePtr = SI->getPointerOperand();
const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
@@ -864,7 +908,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
// The store must be feeding a non-volatile load.
LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
- assert(LI->isSimple() && "Expected only non-volatile stores.");
+ assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
// See if the pointer expression is an AddRec like {base,+,1} on the current
// loop, which indicates a strided load. If we have something else, it's a
@@ -938,6 +982,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
const SCEV *NumBytesS =
SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
+
if (StoreSize != 1)
NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
SCEV::FlagNUW);
@@ -945,9 +990,37 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
- CallInst *NewCall =
- Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
- std::min(SI->getAlignment(), LI->getAlignment()));
+ unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
+ CallInst *NewCall = nullptr;
+ // Check whether to generate an unordered atomic memcpy:
+ // If the load or store are atomic, then they must necessarily be unordered
+ // by previous checks.
+ if (!SI->isAtomic() && !LI->isAtomic())
+ NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align);
+ else {
+ // We cannot allow unaligned ops for unordered load/store, so reject
+ // anything where the alignment isn't at least the element size.
+ if (Align < StoreSize)
+ return false;
+
+ // If the element.atomic memcpy is not lowered into explicit
+ // loads/stores later, then it will be lowered into an element-size
+ // specific lib call. If the lib call doesn't exist for our store size, then
+ // we shouldn't generate the memcpy.
+ if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
+ return false;
+
+ NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
+ StoreBasePtr, LoadBasePtr, NumBytes, StoreSize);
+
+ // Propagate alignment info onto the pointer args. Note that unordered
+ // atomic loads/stores are *required* by the spec to have an alignment
+ // but non-atomic loads/stores may not.
+ NewCall->addParamAttr(0, Attribute::getWithAlignment(NewCall->getContext(),
+ SI->getAlignment()));
+ NewCall->addParamAttr(1, Attribute::getWithAlignment(NewCall->getContext(),
+ LI->getAlignment()));
+ }
NewCall->setDebugLoc(SI->getDebugLoc());
DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
@@ -979,7 +1052,7 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
}
bool LoopIdiomRecognize::runOnNoncountableLoop() {
- return recognizePopcount();
+ return recognizePopcount() || recognizeAndInsertCTLZ();
}
/// Check if the given conditional branch is based on the comparison between
@@ -1007,6 +1080,17 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
return nullptr;
}
+// Decide whether `VarX` has the shape of a loop recurrence usable by the idiom
+// recognizers: it must be a PHI node that lives in `LoopEntry` and has `DefX`
+// as its first or second operand. Returns `VarX` as a PHINode when it matches
+// and nullptr otherwise.
+static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
+                                 BasicBlock *LoopEntry) {
+  auto *Phi = dyn_cast<PHINode>(VarX);
+  if (!Phi)
+    return nullptr;
+  if (Phi->getParent() != LoopEntry)
+    return nullptr;
+
+  // The recurrence must be fed by DefX through one of the first two operands.
+  if (Phi->getOperand(0) != DefX && Phi->getOperand(1) != DefX)
+    return nullptr;
+  return Phi;
+}
+
/// Return true iff the idiom is detected in the loop.
///
/// Additionally:
@@ -1076,19 +1160,15 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
if (!Dec ||
!((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
(SubInst->getOpcode() == Instruction::Add &&
- Dec->isAllOnesValue()))) {
+ Dec->isMinusOne()))) {
return false;
}
}
// step 3: Check the recurrence of variable X
- {
- PhiX = dyn_cast<PHINode>(VarX1);
- if (!PhiX ||
- (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
- return false;
- }
- }
+ PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
+ if (!PhiX)
+ return false;
// step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
{
@@ -1104,8 +1184,8 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
if (!Inc || !Inc->isOne())
continue;
- PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
- if (!Phi || Phi->getParent() != LoopEntry)
+ PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+ if (!Phi)
continue;
// Check if the result of the instruction is live of the loop.
@@ -1144,6 +1224,169 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
return true;
}
+/// Return true if the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction counting the iterations
+///       (cnt.next = cnt + 1), or nullptr if there is no such.
+/// 2) \p CntPhi is set to the corresponding phi node
+///       or nullptr if there is no such.
+/// 3) \p PhiX is set to the phi node of the recurrence on the shifted value x
+///       or nullptr if there is no such.
+/// 4) \p DefX is set to the instruction calculating Loop exit condition.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+///    if (x0 == 0)
+///      goto loop-exit // the precondition of the loop
+///    cnt0 = init-val;
+///    do {
+///       x = phi (x0, x.next);   //PhiX
+///       cnt = phi(cnt0, cnt.next);
+///
+///       cnt.next = cnt + 1;
+///        ...
+///       x.next = x >> 1;   // DefX
+///        ...
+///    } while(x.next != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
+                            Instruction *&CntInst, PHINode *&CntPhi,
+                            Instruction *&DefX) {
+  BasicBlock *LoopEntry;
+  Value *VarX = nullptr;
+
+  DefX = nullptr;
+  PhiX = nullptr;
+  CntInst = nullptr;
+  CntPhi = nullptr;
+  LoopEntry = *(CurLoop->block_begin());
+
+  // step 1: Check if the loop-back branch is in desirable form.
+  if (Value *T = matchCondition(
+          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+    DefX = dyn_cast<Instruction>(T);
+  else
+    return false;
+
+  // step 2: detect instructions corresponding to "x.next = x >> 1"
+  if (!DefX || DefX->getOpcode() != Instruction::AShr)
+    return false;
+  // The shift amount must be the constant 1. The check has to reject a
+  // non-constant shift amount as well; guarding it behind a successful
+  // dyn_cast would let "x >> y" slip through as if it were "x >> 1".
+  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+  if (!Shft || !Shft->isOne())
+    return false;
+  VarX = DefX->getOperand(0);
+
+  // step 3: Check the recurrence of variable X
+  PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
+  if (!PhiX)
+    return false;
+
+  // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
+  // TODO: We can skip the step. If loop trip count is known (CTLZ),
+  //       then all uses of "cnt.next" could be optimized to the trip count
+  //       plus "cnt0". Currently it is not optimized.
+  //       This step could be used to detect POPCNT instruction:
+  //       cnt.next = cnt + (x.next & 1)
+  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+                            IterE = LoopEntry->end();
+       Iter != IterE; Iter++) {
+    Instruction *Inst = &*Iter;
+    if (Inst->getOpcode() != Instruction::Add)
+      continue;
+
+    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+    if (!Inc || !Inc->isOne())
+      continue;
+
+    PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+    if (!Phi)
+      continue;
+
+    // The first matching "phi + 1" in the block is taken as the counter.
+    CntInst = Inst;
+    CntPhi = Phi;
+    break;
+  }
+  if (!CntInst)
+    return false;
+
+  return true;
+}
+
+/// Recognize CTLZ idiom in a non-countable loop and convert the loop
+/// to countable (with CTLZ trip count).
+/// If CTLZ inserted as a new trip count returns true; otherwise, returns false.
+bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
+
+  Instruction *CntInst, *DefX;
+  PHINode *CntPhi, *PhiX;
+  if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
+    return false;
+
+  // Returns true if V has at least one user outside of CurLoop. Every user of
+  // an Instruction is itself an Instruction, so cast<> is used: it asserts on
+  // a mismatch instead of handing a null pointer to Loop::contains() the way
+  // a failed dyn_cast<> would.
+  auto IsUsedOutsideLoop = [this](const Value *V) {
+    for (const User *U : V->users())
+      if (!CurLoop->contains(cast<Instruction>(U)))
+        return true;
+    return false;
+  };
+
+  bool IsCntPhiUsedOutsideLoop = IsUsedOutsideLoop(CntPhi);
+  bool IsCntInstUsedOutsideLoop = IsUsedOutsideLoop(CntInst);
+  // If both CntInst and CntPhi are used outside the loop the profitability
+  // is questionable.
+  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
+    return false;
+
+  // For some CPUs result of CTLZ(X) intrinsic is undefined
+  // when X is 0. If we can not guarantee X != 0, we need to check this
+  // when expand.
+  bool ZeroCheck = false;
+  // It is safe to assume Preheader exist as it was checked in
+  // parent function RunOnLoop.
+  BasicBlock *PH = CurLoop->getLoopPreheader();
+  Value *InitX = PhiX->getIncomingValueForBlock(PH);
+  // If we check X != 0 before entering the loop we don't need a zero
+  // check in CTLZ intrinsic, but only if Cnt Phi is not used outside of the
+  // loop (if it is used we count CTLZ(X >> 1)).
+  if (!IsCntPhiUsedOutsideLoop)
+    if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
+      if (BranchInst *PreCondBr =
+              dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
+        if (matchCondition(PreCondBr, PH) == InitX)
+          ZeroCheck = true;
+      }
+
+  // Check if CTLZ intrinsic is profitable. Assume it is always profitable
+  // if we delete the loop (the loop has only 6 instructions):
+  //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+  //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+  //  %shr = ashr %n.addr.0, 1
+  //  %tobool = icmp eq %shr, 0
+  //  %inc = add nsw %i.0, 1
+  //  br i1 %tobool
+
+  IRBuilder<> Builder(PH->getTerminator());
+  SmallVector<const Value *, 2> Ops =
+      {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
+  ArrayRef<const Value *> Args(Ops);
+  if (CurLoop->getHeader()->size() != 6 &&
+      TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
+          TargetTransformInfo::TCC_Basic)
+    return false;
+
+  const DebugLoc DL = DefX->getDebugLoc();
+  transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
+                           IsCntPhiUsedOutsideLoop);
+  return true;
+}
+
/// Recognizes a population count idiom in a non-countable loop.
///
/// If detected, transforms the relevant code to issue the popcount intrinsic
@@ -1207,6 +1450,134 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
return CI;
}
+/// Emit a call to the ctlz intrinsic on \p Val at the builder's current
+/// insertion point. \p ZeroCheck is forwarded as the intrinsic's second (i1)
+/// operand, and \p DL is attached to the new call as its debug location.
+static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+                                     const DebugLoc &DL, bool ZeroCheck) {
+  Value *ZeroFlag = IRBuilder.getFalse();
+  if (ZeroCheck)
+    ZeroFlag = IRBuilder.getTrue();
+  Value *CallArgs[] = {Val, ZeroFlag};
+
+  // The intrinsic is overloaded on the type of its first operand.
+  Type *OverloadTys[] = {Val->getType()};
+  Module *Mod = IRBuilder.GetInsertBlock()->getParent()->getParent();
+  Value *Callee = Intrinsic::getDeclaration(Mod, Intrinsic::ctlz, OverloadTys);
+
+  CallInst *Call = IRBuilder.CreateCall(Callee, CallArgs);
+  Call->setDebugLoc(DL);
+  return Call;
+}
+
+/// Transform the following loop:
+/// loop:
+///   CntPhi = PHI [Cnt0, CntInst]
+///   PhiX = PHI [InitX, DefX]
+///   CntInst = CntPhi + 1
+///   DefX = PhiX >> 1
+///   LOOP_BODY
+///   Br: loop if (DefX != 0)
+/// Use(CntPhi) or Use(CntInst)
+///
+/// Into:
+/// If CntPhi used outside the loop:
+///   CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
+///   Count = CountPrev + 1
+/// else
+///   Count = BitWidth(InitX) - CTLZ(InitX)
+/// loop:
+///   CntPhi = PHI [Cnt0, CntInst]
+///   PhiX = PHI [InitX, DefX]
+///   PhiCount = PHI [Count, Dec]
+///   CntInst = CntPhi + 1
+///   DefX = PhiX >> 1
+///   Dec = PhiCount - 1
+///   LOOP_BODY
+///   Br: loop if (Dec != 0)
+/// Use(CountPrev + Cnt0) // Use(CntPhi)
+/// or
+/// Use(Count + Cnt0) // Use(CntInst)
+///
+/// If LOOP_BODY is empty the loop will be deleted.
+/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
+void LoopIdiomRecognize::transformLoopToCountable(
+    BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
+    const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
+  // The terminator is used unconditionally below, so use cast<> (which
+  // asserts on a mismatch) rather than dyn_cast<> (which would silently
+  // yield a null pointer).
+  BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
+
+  // Step 1: Insert the CTLZ instruction at the end of the preheader block
+  //   Count = BitWidth - CTLZ(InitX);
+  // If there are uses of CntPhi create:
+  //   CountPrev = BitWidth - CTLZ(InitX >> 1);
+  IRBuilder<> Builder(PreheaderBr);
+  Builder.SetCurrentDebugLocation(DL);
+  Value *CTLZ, *Count, *NewCount, *InitXNext;
+  // Only meaningful when CntPhi is used outside the loop.
+  Value *CountPrev = nullptr;
+
+  if (IsCntPhiUsedOutsideLoop)
+    InitXNext = Builder.CreateAShr(InitX,
+                                   ConstantInt::get(InitX->getType(), 1));
+  else
+    InitXNext = InitX;
+  CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
+  Count = Builder.CreateSub(
+      ConstantInt::get(CTLZ->getType(),
+                       CTLZ->getType()->getIntegerBitWidth()),
+      CTLZ);
+  if (IsCntPhiUsedOutsideLoop) {
+    CountPrev = Count;
+    Count = Builder.CreateAdd(
+        CountPrev,
+        ConstantInt::get(CountPrev->getType(), 1));
+  }
+  // Outside users of CntPhi see the count before the final increment; outside
+  // users of CntInst see the full count.
+  NewCount = Builder.CreateZExtOrTrunc(
+      IsCntPhiUsedOutsideLoop ? CountPrev : Count,
+      cast<IntegerType>(CntInst->getType()));
+
+  // If the CTLZ counter's initial value is not zero, insert Add Inst.
+  Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
+  ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+  if (!InitConst || !InitConst->isZero())
+    NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+
+  // Step 2: Insert new IV and loop condition:
+  // loop:
+  //   ...
+  //   PhiCount = PHI [Count, Dec]
+  //   ...
+  //   Dec = PhiCount - 1
+  //   ...
+  //   Br: loop if (Dec != 0)
+  BasicBlock *Body = *(CurLoop->block_begin());
+  // As above: the loop-back branch is dereferenced unconditionally.
+  auto *LbBr = cast<BranchInst>(Body->getTerminator());
+  ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+  Type *Ty = Count->getType();
+
+  PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+  Builder.SetInsertPoint(LbCond);
+  Instruction *TcDec = cast<Instruction>(
+      Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+                        "tcdec", false, true));
+
+  TcPhi->addIncoming(Count, Preheader);
+  TcPhi->addIncoming(TcDec, Body);
+
+  // Keep looping while the new counter is non-zero; which predicate expresses
+  // that depends on which successor of the branch is the loop body.
+  CmpInst::Predicate Pred =
+      (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
+  LbCond->setPredicate(Pred);
+  LbCond->setOperand(0, TcDec);
+  LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+
+  // Step 3: All the references to the original counter outside
+  //  the loop are replaced with the NewCount -- the value returned from
+  //  __builtin_ctlz(x).
+  if (IsCntPhiUsedOutsideLoop)
+    CntPhi->replaceUsesOutsideBlock(NewCount, Body);
+  else
+    CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+  // step 4: Forget the "non-computable" trip-count SCEV associated with the
+  //   loop. The loop would otherwise not be deleted even if it becomes empty.
+  SE->forgetLoop(CurLoop);
+}
+
void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
Instruction *CntInst,
PHINode *CntPhi, Value *Var) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 69102d1..af09556 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -77,7 +77,7 @@ static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI,
// Don't bother simplifying unused instructions.
if (!I->use_empty()) {
- Value *V = SimplifyInstruction(I, DL, TLI, DT, AC);
+ Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC});
if (V && LI->replacementPreservesLCSSAForm(I, V)) {
// Mark all uses for resimplification next time round the loop.
for (User *U : I->users())
@@ -189,7 +189,9 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI))
return PreservedAnalyses::all();
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
char LoopInstSimplifyLegacyPass::ID = 0;
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index e9f84ed..2e0d8e0 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -22,6 +22,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -39,7 +40,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
+
using namespace llvm;
#define DEBUG_TYPE "loop-interchange"
@@ -323,9 +324,10 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
class LoopInterchangeLegality {
public:
LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
- LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA)
+ LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA,
+ OptimizationRemarkEmitter *ORE)
: OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
- PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {}
+ PreserveLCSSA(PreserveLCSSA), ORE(ORE), InnerLoopHasReduction(false) {}
/// Check if the loops can be interchanged.
bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
@@ -353,6 +355,8 @@ private:
LoopInfo *LI;
DominatorTree *DT;
bool PreserveLCSSA;
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
bool InnerLoopHasReduction;
};
@@ -361,8 +365,9 @@ private:
/// loop.
class LoopInterchangeProfitability {
public:
- LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE)
- : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {}
+ LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
/// Check if the loop interchange is profitable.
bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
@@ -376,6 +381,8 @@ private:
/// Scev analysis.
ScalarEvolution *SE;
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
};
/// LoopInterchangeTransform interchanges the loop.
@@ -422,6 +429,9 @@ struct LoopInterchange : public FunctionPass {
DependenceInfo *DI;
DominatorTree *DT;
bool PreserveLCSSA;
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
LoopInterchange()
: FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) {
initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
@@ -435,6 +445,7 @@ struct LoopInterchange : public FunctionPass {
AU.addRequired<DependenceAnalysisWrapperPass>();
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
bool runOnFunction(Function &F) override {
@@ -446,6 +457,7 @@ struct LoopInterchange : public FunctionPass {
DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
// Build up a worklist of loop pairs to analyze.
@@ -575,18 +587,23 @@ struct LoopInterchange : public FunctionPass {
Loop *OuterLoop = LoopList[OuterLoopId];
LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
- PreserveLCSSA);
+ PreserveLCSSA, ORE);
if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
return false;
}
DEBUG(dbgs() << "Loops are legal to interchange\n");
- LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE);
+ LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
DEBUG(dbgs() << "Interchanging loops not profitable\n");
return false;
}
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "Interchanged",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Loop interchanged with enclosing loop.");
+
LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
LoopNestExit, LIL.hasInnerLoopReduction());
LIT.transform();
@@ -757,13 +774,28 @@ bool LoopInterchangeLegality::currentLimitations() {
PHINode *InnerInductionVar;
SmallVector<PHINode *, 8> Inductions;
SmallVector<PHINode *, 8> Reductions;
- if (!findInductionAndReductions(InnerLoop, Inductions, Reductions))
+ if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
+ DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "UnsupportedPHIInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with induction or reduction PHI nodes can be"
+ " interchange currently.");
return true;
+ }
// TODO: Currently we handle only loops with 1 induction variable.
if (Inductions.size() != 1) {
DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
<< "Failed to interchange due to current limitation\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "MultiInductionInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with 1 induction variable can be "
+ "interchanged currently.");
return true;
}
if (Reductions.size() > 0)
@@ -771,32 +803,80 @@ bool LoopInterchangeLegality::currentLimitations() {
InnerInductionVar = Inductions.pop_back_val();
Reductions.clear();
- if (!findInductionAndReductions(OuterLoop, Inductions, Reductions))
+ if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
+ DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "UnsupportedPHIOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with induction or reduction PHI nodes can be"
+ " interchanged currently.");
return true;
+ }
// Outer loop cannot have reduction because then loops will not be tightly
// nested.
- if (!Reductions.empty())
+ if (!Reductions.empty()) {
+ DEBUG(dbgs() << "Outer loops with reductions are not supported "
+ << "currently.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "ReductionsOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Outer loops with reductions cannot be interchangeed "
+ "currently.");
return true;
+ }
// TODO: Currently we handle only loops with 1 induction variable.
- if (Inductions.size() != 1)
+ if (Inductions.size() != 1) {
+ DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
+ << "supported currently.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "MultiIndutionOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with 1 induction variable can be "
+ "interchanged currently.");
return true;
+ }
// TODO: Triangular loops are not handled for now.
if (!isLoopStructureUnderstood(InnerInductionVar)) {
DEBUG(dbgs() << "Loop structure not understood by pass\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "UnsupportedStructureInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Inner loop structure not understood currently.");
return true;
}
// TODO: We only handle LCSSA PHI's corresponding to reduction for now.
BasicBlock *LoopExitBlock =
getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
- if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true))
+ if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) {
+ DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "NoLCSSAPHIOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with LCSSA PHIs can be interchange "
+ "currently.");
return true;
+ }
LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
- if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false))
+ if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) {
+ DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "NoLCSSAPHIOuterInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with LCSSA PHIs can be interchange "
+ "currently.");
return true;
+ }
// TODO: Current limitation: Since we split the inner loop latch at the point
// were induction variable is incremented (induction.next); We cannot have
@@ -816,8 +896,16 @@ bool LoopInterchangeLegality::currentLimitations() {
InnerIndexVarInc =
dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
- if (!InnerIndexVarInc)
+ if (!InnerIndexVarInc) {
+ DEBUG(dbgs() << "Did not find an instruction to increment the induction "
+ << "variable.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "NoIncrementInInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "The inner loop does not increment the induction variable.");
return true;
+ }
// Since we split the inner loop latch on this induction variable. Make sure
// we do not have any instruction between the induction variable and branch
@@ -827,19 +915,35 @@ bool LoopInterchangeLegality::currentLimitations() {
for (const Instruction &I : reverse(*InnerLoopLatch)) {
if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I))
continue;
+
// We found an instruction. If this is not induction variable then it is not
// safe to split this loop latch.
- if (!I.isIdenticalTo(InnerIndexVarInc))
+ if (!I.isIdenticalTo(InnerIndexVarInc)) {
+ DEBUG(dbgs() << "Found unsupported instructions between induction "
+ << "variable increment and branch.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "UnsupportedInsBetweenInduction",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Found unsupported instruction between induction variable "
+ "increment and branch.");
return true;
+ }
FoundInduction = true;
break;
}
// The loop latch ended and we didn't find the induction variable return as
// current limitation.
- if (!FoundInduction)
+ if (!FoundInduction) {
+ DEBUG(dbgs() << "Did not find the induction variable.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "NoIndutionVariable",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Did not find the induction variable.");
return true;
-
+ }
return false;
}
@@ -851,6 +955,11 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
<< " and OuterLoopId = " << OuterLoopId
<< " due to dependence\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "Dependence",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Cannot interchange loops due to dependences.");
return false;
}
@@ -886,6 +995,12 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
// Check if the loops are tightly nested.
if (!tightlyNested(OuterLoop, InnerLoop)) {
DEBUG(dbgs() << "Loops not tightly nested\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "NotTightlyNested",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Cannot interchange loops because they are not tightly "
+ "nested.");
return false;
}
@@ -981,9 +1096,18 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
// It is not profitable as per current cache profitability model. But check if
// we can move this loop outside to improve parallelism.
- bool ImprovesPar =
- isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
- return ImprovesPar;
+ if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
+ return true;
+
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "InterchangeNotProfitable",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Interchanging loops is too costly (cost="
+ << ore::NV("Cost", Cost) << ", threshold="
+ << ore::NV("Threshold", LoopInterchangeCostThreshold) <<
+ ") and it does not improve parallelism.");
+ return false;
}
void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
@@ -1267,6 +1391,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
"Interchanges loops for cache reuse", false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 8fb5801..20b37c4 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -20,13 +20,14 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -45,9 +46,9 @@
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include <forward_list>
-#include <cassert>
#include <algorithm>
+#include <cassert>
+#include <forward_list>
#include <set>
#include <tuple>
#include <utility>
@@ -196,8 +197,7 @@ public:
continue;
// Only progagate the value if they are of the same type.
- if (Store->getPointerOperand()->getType() !=
- Load->getPointerOperand()->getType())
+ if (Store->getPointerOperandType() != Load->getPointerOperandType())
continue;
Candidates.emplace_front(Load, Store);
@@ -373,15 +373,15 @@ public:
const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
- std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
- [&](const RuntimePointerChecking::PointerCheck &Check) {
- for (auto PtrIdx1 : Check.first->Members)
- for (auto PtrIdx2 : Check.second->Members)
- if (needsChecking(PtrIdx1, PtrIdx2,
- PtrsWrittenOnFwdingPath, CandLoadPtrs))
- return true;
- return false;
- });
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (auto PtrIdx1 : Check.first->Members)
+ for (auto PtrIdx2 : Check.second->Members)
+ if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
+ CandLoadPtrs))
+ return true;
+ return false;
+ });
DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
@@ -558,6 +558,32 @@ private:
PredicatedScalarEvolution PSE;
};
+static bool
+eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
+ function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
+ // Build up a worklist of inner-loops to transform to avoid iterator
+ // invalidation.
+ // FIXME: This logic comes from other passes that actually change the loop
+ // nest structure. It isn't clear this is necessary (or useful) for a pass
+ // which merely optimizes the use of loads in a loop.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ // The actual work is performed by LoadEliminationForLoop.
+ LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT);
+ Changed |= LEL.processLoop();
+ }
+ return Changed;
+}
+
/// \brief The pass. Most of the work is delegated to the per-loop
/// LoadEliminationForLoop class.
class LoopLoadElimination : public FunctionPass {
@@ -570,32 +596,14 @@ public:
if (skipFunction(F))
return false;
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
- // Build up a worklist of inner-loops to vectorize. This is necessary as the
- // act of distributing a loop creates new loops and can invalidate iterators
- // across the loops.
- SmallVector<Loop *, 8> Worklist;
-
- for (Loop *TopLevelLoop : *LI)
- for (Loop *L : depth_first(TopLevelLoop))
- // We only handle inner-most loops.
- if (L->empty())
- Worklist.push_back(L);
-
- // Now walk the identified inner loops.
- bool Changed = false;
- for (Loop *L : Worklist) {
- const LoopAccessInfo &LAI = LAA->getInfo(L);
- // The actual work is performed by LoadEliminationForLoop.
- LoadEliminationForLoop LEL(L, LI, LAI, DT);
- Changed |= LEL.processLoop();
- }
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
// Process each loop nest in the function.
- return Changed;
+ return eliminateLoadsAcrossLoops(
+ F, LI, DT,
+ [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); });
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -631,4 +639,28 @@ FunctionPass *createLoopLoadEliminationPass() {
return new LoopLoadElimination();
}
+PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ bool Changed = eliminateLoadsAcrossLoops(
+ F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & {
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ });
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ return PA;
+}
+
} // end namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
index 028f4bb..10f6fcd 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -42,6 +42,13 @@ PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
break;
}
+#ifndef NDEBUG
+ // Verify the loop structure and LCSSA form before visiting the loop.
+ L.verifyLoop();
+ assert(L.isRecursivelyLCSSAForm(AR.DT, AR.LI) &&
+ "Loops must remain in LCSSA form!");
+#endif
+
// Update the analysis manager as each pass runs and potentially
// invalidates analyses.
AM.invalidate(L, PassPA);
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
new file mode 100644
index 0000000..9b12ba1
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -0,0 +1,330 @@
+//===-- LoopPredication.cpp - Guard based loop predication pass -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LoopPredication pass tries to convert loop variant range checks to loop
+// invariant by widening checks across loop iterations. For example, it will
+// convert
+//
+// for (i = 0; i < n; i++) {
+// guard(i < len);
+// ...
+// }
+//
+// to
+//
+// for (i = 0; i < n; i++) {
+// guard(n - 1 < len);
+// ...
+// }
+//
+// After this transformation the condition of the guard is loop invariant, so
+// loop-unswitch can later unswitch the loop by this condition which basically
+// predicates the loop by the widened condition:
+//
+// if (n - 1 < len)
+// for (i = 0; i < n; i++) {
+// ...
+// }
+// else
+// deoptimize
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopPredication.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "loop-predication"
+
+using namespace llvm;
+
+namespace {
+class LoopPredication {
+ /// Represents an induction variable check:
+ /// icmp Pred, <induction variable>, <loop invariant limit>
+ struct LoopICmp {
+ ICmpInst::Predicate Pred;
+ const SCEVAddRecExpr *IV;
+ const SCEV *Limit;
+ LoopICmp(ICmpInst::Predicate Pred, const SCEVAddRecExpr *IV,
+ const SCEV *Limit)
+ : Pred(Pred), IV(IV), Limit(Limit) {}
+ LoopICmp() {}
+ };
+
+ ScalarEvolution *SE;
+
+ Loop *L;
+ const DataLayout *DL;
+ BasicBlock *Preheader;
+
+ Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI);
+
+ Value *expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder,
+ ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS,
+ Instruction *InsertAt);
+
+ Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
+ IRBuilder<> &Builder);
+ bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
+
+public:
+ LoopPredication(ScalarEvolution *SE) : SE(SE){};
+ bool runOnLoop(Loop *L);
+};
+
+class LoopPredicationLegacyPass : public LoopPass {
+public:
+ static char ID;
+ LoopPredicationLegacyPass() : LoopPass(ID) {
+ initializeLoopPredicationLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LoopPredication LP(SE);
+ return LP.runOnLoop(L);
+ }
+};
+
+char LoopPredicationLegacyPass::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+
+Pass *llvm::createLoopPredicationPass() {
+ return new LoopPredicationLegacyPass();
+}
+
+PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ LoopPredication LP(&AR.SE);
+ if (!LP.runOnLoop(&L))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+Optional<LoopPredication::LoopICmp>
+LoopPredication::parseLoopICmp(ICmpInst *ICI) {
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+
+ Value *LHS = ICI->getOperand(0);
+ Value *RHS = ICI->getOperand(1);
+ const SCEV *LHSS = SE->getSCEV(LHS);
+ if (isa<SCEVCouldNotCompute>(LHSS))
+ return None;
+ const SCEV *RHSS = SE->getSCEV(RHS);
+ if (isa<SCEVCouldNotCompute>(RHSS))
+ return None;
+
+ // Canonicalize RHS to be loop invariant bound, LHS - a loop computable IV
+ if (SE->isLoopInvariant(LHSS, L)) {
+ std::swap(LHS, RHS);
+ std::swap(LHSS, RHSS);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHSS);
+ if (!AR || AR->getLoop() != L)
+ return None;
+
+ return LoopICmp(Pred, AR, RHSS);
+}
+
+Value *LoopPredication::expandCheck(SCEVExpander &Expander,
+ IRBuilder<> &Builder,
+ ICmpInst::Predicate Pred, const SCEV *LHS,
+ const SCEV *RHS, Instruction *InsertAt) {
+ Type *Ty = LHS->getType();
+ assert(Ty == RHS->getType() && "expandCheck operands have different types?");
+ Value *LHSV = Expander.expandCodeFor(LHS, Ty, InsertAt);
+ Value *RHSV = Expander.expandCodeFor(RHS, Ty, InsertAt);
+ return Builder.CreateICmp(Pred, LHSV, RHSV);
+}
+
+/// If ICI can be widened to a loop invariant condition, emits the loop
+/// invariant condition in the loop preheader and returns it; otherwise
+/// returns None.
+Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
+ SCEVExpander &Expander,
+ IRBuilder<> &Builder) {
+ DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
+ DEBUG(ICI->dump());
+
+ auto RangeCheck = parseLoopICmp(ICI);
+ if (!RangeCheck) {
+ DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ return None;
+ }
+
+ ICmpInst::Predicate Pred = RangeCheck->Pred;
+ const SCEVAddRecExpr *IndexAR = RangeCheck->IV;
+ const SCEV *RHSS = RangeCheck->Limit;
+
+ auto CanExpand = [this](const SCEV *S) {
+ return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
+ };
+ if (!CanExpand(RHSS))
+ return None;
+
+ DEBUG(dbgs() << "IndexAR: ");
+ DEBUG(IndexAR->dump());
+
+ bool IsIncreasing = false;
+ if (!SE->isMonotonicPredicate(IndexAR, Pred, IsIncreasing))
+ return None;
+
+ // If the predicate is increasing the condition can change from false to true
+ // as the loop progresses, in this case take the value on the first iteration
+ // for the widened check. Otherwise the condition can change from true to
+ // false as the loop progresses, so take the value on the last iteration.
+ const SCEV *NewLHSS = IsIncreasing
+ ? IndexAR->getStart()
+ : SE->getSCEVAtScope(IndexAR, L->getParentLoop());
+ if (NewLHSS == IndexAR) {
+ DEBUG(dbgs() << "Can't compute NewLHSS!\n");
+ return None;
+ }
+
+ DEBUG(dbgs() << "NewLHSS: ");
+ DEBUG(NewLHSS->dump());
+
+ if (!CanExpand(NewLHSS))
+ return None;
+
+ DEBUG(dbgs() << "NewLHSS is loop invariant and safe to expand. Expand!\n");
+
+ Instruction *InsertAt = Preheader->getTerminator();
+ return expandCheck(Expander, Builder, Pred, NewLHSS, RHSS, InsertAt);
+}
+
+bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
+ SCEVExpander &Expander) {
+ DEBUG(dbgs() << "Processing guard:\n");
+ DEBUG(Guard->dump());
+
+ IRBuilder<> Builder(cast<Instruction>(Preheader->getTerminator()));
+
+ // The guard condition is expected to be in form of:
+ // cond1 && cond2 && cond3 ...
+ // Iterate over subconditions looking for icmp conditions which can be
+ // widened across loop iterations. While widening these conditions, remember
+ // the resulting list of subconditions in the Checks vector.
+ SmallVector<Value *, 4> Worklist(1, Guard->getOperand(0));
+ SmallPtrSet<Value *, 4> Visited;
+
+ SmallVector<Value *, 4> Checks;
+
+ unsigned NumWidened = 0;
+ do {
+ Value *Condition = Worklist.pop_back_val();
+ if (!Visited.insert(Condition).second)
+ continue;
+
+ Value *LHS, *RHS;
+ using namespace llvm::PatternMatch;
+ if (match(Condition, m_And(m_Value(LHS), m_Value(RHS)))) {
+ Worklist.push_back(LHS);
+ Worklist.push_back(RHS);
+ continue;
+ }
+
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) {
+ if (auto NewRangeCheck = widenICmpRangeCheck(ICI, Expander, Builder)) {
+ Checks.push_back(NewRangeCheck.getValue());
+ NumWidened++;
+ continue;
+ }
+ }
+
+ // Save the condition as is if we can't widen it
+ Checks.push_back(Condition);
+ } while (Worklist.size() != 0);
+
+ if (NumWidened == 0)
+ return false;
+
+ // Emit the new guard condition
+ Builder.SetInsertPoint(Guard);
+ Value *LastCheck = nullptr;
+ for (auto *Check : Checks)
+ if (!LastCheck)
+ LastCheck = Check;
+ else
+ LastCheck = Builder.CreateAnd(LastCheck, Check);
+ Guard->setOperand(0, LastCheck);
+
+ DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
+ return true;
+}
+
+bool LoopPredication::runOnLoop(Loop *Loop) {
+ L = Loop;
+
+ DEBUG(dbgs() << "Analyzing ");
+ DEBUG(L->dump());
+
+ Module *M = L->getHeader()->getModule();
+
+ // There is nothing to do if the module doesn't use guards
+ auto *GuardDecl =
+ M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard));
+ if (!GuardDecl || GuardDecl->use_empty())
+ return false;
+
+ DL = &M->getDataLayout();
+
+ Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ // Collect all the guards into a vector and process later, so as not
+ // to invalidate the instruction iterator.
+ SmallVector<IntrinsicInst *, 4> Guards;
+ for (const auto BB : L->blocks())
+ for (auto &I : *BB)
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::experimental_guard)
+ Guards.push_back(II);
+
+ if (Guards.empty())
+ return false;
+
+ SCEVExpander Expander(*SE, *DL, "loop-predication");
+
+ bool Changed = false;
+ for (auto *Guard : Guards)
+ Changed |= widenGuardConditions(Guard, Expander);
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
index 86058fe..fc0216e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -11,10 +11,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -31,6 +30,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -557,7 +557,7 @@ bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) {
Instruction *UUser = dyn_cast<Instruction>(UU);
// Skip SExt if we are extending an nsw value
// TODO: Allow ZExt too
- if (BO->hasNoSignedWrap() && UUser && UUser->getNumUses() == 1 &&
+ if (BO->hasNoSignedWrap() && UUser && UUser->hasOneUse() &&
isa<SExtInst>(UUser))
UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
if (!isCompareUsedByBranch(UUser))
@@ -852,7 +852,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
for (auto &KV : Roots) {
if (KV.first == 0)
continue;
- if (KV.second->getNumUses() != NumBaseUses) {
+ if (!KV.second->hasNUses(NumBaseUses)) {
DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
<< "#Base=" << NumBaseUses << ", #Root=" <<
KV.second->getNumUses() << "\n");
@@ -867,7 +867,7 @@ void LoopReroll::DAGRootTracker::
findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
// Does the user look like it could be part of a root set?
// All its users must be simple arithmetic ops.
- if (I->getNumUses() > IL_MaxRerollIterations)
+ if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1))
return;
if (I != IV && findRootsBase(I, SubsumedInsts))
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
index cc83069..3506ac3 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -58,13 +58,14 @@ class LoopRotate {
AssumptionCache *AC;
DominatorTree *DT;
ScalarEvolution *SE;
+ const SimplifyQuery &SQ;
public:
LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
const TargetTransformInfo *TTI, AssumptionCache *AC,
- DominatorTree *DT, ScalarEvolution *SE)
- : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE) {
- }
+ DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ)
+ : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
+ SQ(SQ) {}
bool processLoop(Loop *L);
private:
@@ -79,7 +80,8 @@ private:
/// to merge the two values. Do this now.
static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
BasicBlock *OrigPreheader,
- ValueToValueMapTy &ValueMap) {
+ ValueToValueMapTy &ValueMap,
+ SmallVectorImpl<PHINode*> *InsertedPHIs) {
// Remove PHI node entries that are no longer live.
BasicBlock::iterator I, E = OrigHeader->end();
for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
@@ -87,7 +89,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
// Now fix up users of the instructions in OrigHeader, inserting PHI nodes
// as necessary.
- SSAUpdater SSA;
+ SSAUpdater SSA(InsertedPHIs);
for (I = OrigHeader->begin(); I != E; ++I) {
Value *OrigHeaderVal = &*I;
@@ -174,6 +176,38 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
}
}
+/// Propagate dbg.value intrinsics through the newly inserted Phis.
+static void insertDebugValues(BasicBlock *OrigHeader,
+ SmallVectorImpl<PHINode*> &InsertedPHIs) {
+ ValueToValueMapTy DbgValueMap;
+
+ // Map existing PHI nodes to their dbg.values.
+ for (auto &I : *OrigHeader) {
+ if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
+ DbgValueMap.insert({Loc, DbgII});
+ }
+ }
+
+ // Then iterate through the new PHIs and look to see if they use one of the
+ // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
+ // propagate the info through the new PHI.
+ LLVMContext &C = OrigHeader->getContext();
+ for (auto PHI : InsertedPHIs) {
+ for (auto VI : PHI->operand_values()) {
+ auto V = DbgValueMap.find(VI);
+ if (V != DbgValueMap.end()) {
+ auto *DbgII = cast<DbgInfoIntrinsic>(V->second);
+ Instruction *NewDbgII = DbgII->clone();
+ auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
+ NewDbgII->setOperand(0, PhiMAV);
+ BasicBlock *Parent = PHI->getParent();
+ NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime());
+ }
+ }
+ }
+}
+
/// Rotate loop LP. Return true if the loop is rotated.
///
/// \param SimplifiedLatch is true if the latch was just folded into the final
@@ -278,8 +312,6 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-
// For the rest of the instructions, either hoist to the OrigPreheader if
// possible or create a clone in the OldPreHeader if not.
TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
@@ -309,14 +341,13 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// With the operands remapped, see if the instruction constant folds or is
// otherwise simplifyable. This commonly occurs because the entry from PHI
// nodes allows icmps and other instructions to fold.
- // FIXME: Provide TLI, DT, AC to SimplifyInstruction.
- Value *V = SimplifyInstruction(C, DL);
+ Value *V = SimplifyInstruction(C, SQ);
if (V && LI->replacementPreservesLCSSAForm(C, V)) {
// If so, then delete the temporary instruction and stick the folded value
// in the map.
ValueMap[Inst] = V;
if (!C->mayHaveSideEffects()) {
- delete C;
+ C->deleteValue();
C = nullptr;
}
} else {
@@ -347,9 +378,18 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// remove the corresponding incoming values from the PHI nodes in OrigHeader.
LoopEntryBranch->eraseFromParent();
+
+ SmallVector<PHINode*, 2> InsertedPHIs;
// If there were any uses of instructions in the duplicated block outside the
// loop, update them, inserting PHI nodes as required
- RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap);
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+ &InsertedPHIs);
+
+ // Attach dbg.value intrinsics to the new phis if that phi uses a value that
+ // previously had debug metadata attached. This keeps the debug info
+ // up-to-date in the loop body.
+ if (!InsertedPHIs.empty())
+ insertDebugValues(OrigHeader, InsertedPHIs);
// NewHeader is now the header of the loop.
L->moveToHeader(NewHeader);
@@ -445,10 +485,22 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
DomTreeNode *Node = HeaderChildren[I];
BasicBlock *BB = Node->getBlock();
- pred_iterator PI = pred_begin(BB);
- BasicBlock *NearestDom = *PI;
- for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
- NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
+ BasicBlock *NearestDom = nullptr;
+ for (BasicBlock *Pred : predecessors(BB)) {
+ // Consider only reachable basic blocks.
+ if (!DT->getNode(Pred))
+ continue;
+
+ if (!NearestDom) {
+ NearestDom = Pred;
+ continue;
+ }
+
+ NearestDom = DT->findNearestCommonDominator(NearestDom, Pred);
+ assert(NearestDom && "No NearestCommonDominator found");
+ }
+
+ assert(NearestDom && "Nearest dominator not found");
// Remember if this changes the DomTree.
if (Node->getIDom()->getBlock() != NearestDom) {
@@ -629,11 +681,15 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
int Threshold = EnableHeaderDuplication ? DefaultRotationThreshold : 0;
- LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE);
+ const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+ const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
+ LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
+ SQ);
bool Changed = LR.processLoop(&L);
if (!Changed)
return PreservedAnalyses::all();
+
return getLoopPassPreservedAnalyses();
}
@@ -671,7 +727,8 @@ public:
auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
auto *SE = SEWP ? &SEWP->getSE() : nullptr;
- LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE);
+ const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
+ LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE, SQ);
return LR.processLoop(L);
}
};
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index 1606121..35c05e8 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -40,7 +40,7 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
bool Changed = false;
// Copy blocks into a temporary array to avoid iterator invalidation issues
// as we remove them.
- SmallVector<WeakVH, 16> Blocks(L.blocks());
+ SmallVector<WeakTrackingVH, 16> Blocks(L.blocks());
for (auto &Block : Blocks) {
// Attempt to merge blocks in the trivial case. Don't modify blocks which
@@ -69,6 +69,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
LPMUpdater &) {
if (!simplifyLoopCFG(L, AR.DT, AR.LI))
return PreservedAnalyses::all();
+
return getLoopPassPreservedAnalyses();
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
index f3f4152..c9d55b4 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -1,4 +1,4 @@
-//===-- LoopSink.cpp - Loop Sink Pass ------------------------===//
+//===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -28,8 +28,10 @@
// InsertBBs = UseBBs - DomBBs + BB
// For BB in InsertBBs:
// Insert I at BB's beginning
+//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/LoopSink.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
@@ -297,6 +299,42 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
return Changed;
}
+PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
+ LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+ // Nothing to do if there are no loops.
+ if (LI.empty())
+ return PreservedAnalyses::all();
+
+ AAResults &AA = FAM.getResult<AAManager>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+
+ // We want to do a postorder walk over the loops. Since loops are a tree this
+ // is equivalent to a reversed preorder walk and preorder is easy to compute
+ // without recursion. Since we reverse the preorder, we will visit siblings
+ // in reverse program order. This isn't expected to matter at all but is more
+ // consistent with sinking algorithms which generally work bottom-up.
+ SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder();
+
+ bool Changed = false;
+ do {
+ Loop &L = *PreorderLoops.pop_back_val();
+
+ // Note that we don't pass SCEV here because it is only used to invalidate
+ // loops in SCEV and we don't preserve (or request) SCEV at all making that
+ // unnecessary.
+ Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI,
+ /*ScalarEvolution*/ nullptr);
+ } while (!PreorderLoops.empty());
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
namespace {
struct LegacyLoopSinkPass : public LoopPass {
static char ID;
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 194587a..3638da1 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -129,6 +129,24 @@ static cl::opt<bool> EnablePhiElim(
"enable-lsr-phielim", cl::Hidden, cl::init(true),
cl::desc("Enable LSR phi elimination"));
+// The flag adds instruction count to solutions cost comparison.
+static cl::opt<bool> InsnsCost(
+ "lsr-insns-cost", cl::Hidden, cl::init(false),
+ cl::desc("Add instruction count to a LSR cost model"));
+
+// Flag to choose how to narrow complex lsr solution
+static cl::opt<bool> LSRExpNarrow(
+ "lsr-exp-narrow", cl::Hidden, cl::init(false),
+ cl::desc("Narrow LSR complex solution using"
+ " expectation of registers number"));
+
+// Flag to narrow search space by filtering non-optimal formulae with
+// the same ScaledReg and Scale.
+static cl::opt<bool> FilterSameScaledReg(
+ "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
+ cl::desc("Narrow LSR search space by filtering non-optimal formulae"
+ " with the same ScaledReg and Scale"));
+
#ifndef NDEBUG
// Stress test IV chain generation.
static cl::opt<bool> StressIVChain(
@@ -181,10 +199,11 @@ void RegSortData::print(raw_ostream &OS) const {
OS << "[NumUses=" << UsedByIndices.count() << ']';
}
-LLVM_DUMP_METHOD
-void RegSortData::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegSortData::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
@@ -295,9 +314,13 @@ struct Formula {
/// canonical representation of a formula is
/// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
/// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
+ /// 3. The reg containing recurrent expr related with current loop in the
+ /// formula should be put in the ScaledReg.
/// #1 enforces that the scaled register is always used when at least two
/// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
/// #2 enforces that 1 * reg is reg.
+ /// #3 ensures invariant regs with respect to current loop can be combined
+ /// together in LSR codegen.
/// This invariant can be temporarly broken while building a formula.
/// However, every formula inserted into the LSRInstance must be in canonical
/// form.
@@ -318,12 +341,14 @@ struct Formula {
void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
- bool isCanonical() const;
+ bool isCanonical(const Loop &L) const;
- void canonicalize();
+ void canonicalize(const Loop &L);
bool unscale();
+ bool hasZeroEnd() const;
+
size_t getNumRegs() const;
Type *getType() const;
@@ -410,16 +435,35 @@ void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
BaseRegs.push_back(Sum);
HasBaseReg = true;
}
- canonicalize();
+ canonicalize(*L);
}
/// \brief Check whether or not this formula statisfies the canonical
/// representation.
/// \see Formula::BaseRegs.
-bool Formula::isCanonical() const {
- if (ScaledReg)
- return Scale != 1 || !BaseRegs.empty();
- return BaseRegs.size() <= 1;
+bool Formula::isCanonical(const Loop &L) const {
+ if (!ScaledReg)
+ return BaseRegs.size() <= 1;
+
+ if (Scale != 1)
+ return true;
+
+ if (Scale == 1 && BaseRegs.empty())
+ return false;
+
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (SAR && SAR->getLoop() == &L)
+ return true;
+
+ // If ScaledReg is not a recurrent expr, or it is but its loop is not current
+ // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
+ // loop, we want to swap the reg in BaseRegs with ScaledReg.
+ auto I =
+ find_if(make_range(BaseRegs.begin(), BaseRegs.end()), [&](const SCEV *S) {
+ return isa<const SCEVAddRecExpr>(S) &&
+ (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+ });
+ return I == BaseRegs.end();
}
/// \brief Helper method to morph a formula into its canonical representation.
@@ -428,21 +472,33 @@ bool Formula::isCanonical() const {
/// field. Otherwise, we would have to do special cases everywhere in LSR
/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
/// On the other hand, 1*reg should be canonicalized into reg.
-void Formula::canonicalize() {
- if (isCanonical())
+void Formula::canonicalize(const Loop &L) {
+ if (isCanonical(L))
return;
// So far we did not need this case. This is easy to implement but it is
// useless to maintain dead code. Beside it could hurt compile time.
assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
+
// Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
- ScaledReg = BaseRegs.back();
- BaseRegs.pop_back();
- Scale = 1;
- size_t BaseRegsSize = BaseRegs.size();
- size_t Try = 0;
- // If ScaledReg is an invariant, try to find a variant expression.
- while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg))
- std::swap(ScaledReg, BaseRegs[Try++]);
+ if (!ScaledReg) {
+ ScaledReg = BaseRegs.back();
+ BaseRegs.pop_back();
+ Scale = 1;
+ }
+
+ // If ScaledReg is an invariant with respect to L, find the reg from
+ // BaseRegs containing the recurrent expr related with Loop L. Swap the
+ // reg with ScaledReg.
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (!SAR || SAR->getLoop() != &L) {
+ auto I = find_if(make_range(BaseRegs.begin(), BaseRegs.end()),
+ [&](const SCEV *S) {
+ return isa<const SCEVAddRecExpr>(S) &&
+ (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+ });
+ if (I != BaseRegs.end())
+ std::swap(ScaledReg, *I);
+ }
}
/// \brief Get rid of the scale in the formula.
@@ -458,6 +514,14 @@ bool Formula::unscale() {
return true;
}
+bool Formula::hasZeroEnd() const {
+ if (UnfoldedOffset || BaseOffset)
+ return false;
+ if (BaseRegs.size() != 1 || ScaledReg)
+ return false;
+ return true;
+}
+
/// Return the total number of register operands used by this formula. This does
/// not include register uses implied by non-constant addrec strides.
size_t Formula::getNumRegs() const {
@@ -534,10 +598,11 @@ void Formula::print(raw_ostream &OS) const {
}
}
-LLVM_DUMP_METHOD
-void Formula::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void Formula::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// Return true if the given addrec can be sign-extended without changing its
/// value.
@@ -711,7 +776,7 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
bool isAddress = isa<LoadInst>(Inst);
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- if (SI->getOperand(1) == OperandVal)
+ if (SI->getPointerOperand() == OperandVal)
isAddress = true;
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
// Addressing modes can also be folded into prefetches and a variety
@@ -723,6 +788,12 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
isAddress = true;
break;
}
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ if (RMW->getPointerOperand() == OperandVal)
+ isAddress = true;
+ } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ if (CmpX->getPointerOperand() == OperandVal)
+ isAddress = true;
}
return isAddress;
}
@@ -735,6 +806,10 @@ static MemAccessTy getAccessType(const Instruction *Inst) {
AccessTy.AddrSpace = SI->getPointerAddressSpace();
} else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
AccessTy.AddrSpace = LI->getPointerAddressSpace();
+ } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ AccessTy.AddrSpace = RMW->getPointerAddressSpace();
+ } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
}
// All pointers have the same requirements, so canonicalize them to an
@@ -832,7 +907,7 @@ static bool isHighCostExpansion(const SCEV *S,
/// If any of the instructions is the specified set are trivially dead, delete
/// them and see if this makes any of their operands subsequently dead.
static bool
-DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
+DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
bool Changed = false;
while (!DeadInsts.empty()) {
@@ -875,44 +950,44 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F);
// Get the cost of the scaling factor used in F for LU.
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F);
+ const LSRUse &LU, const Formula &F,
+ const Loop &L);
namespace {
/// This class is used to measure and compare candidate formulae.
class Cost {
- /// TODO: Some of these could be merged. Also, a lexical ordering
- /// isn't always optimal.
- unsigned NumRegs;
- unsigned AddRecCost;
- unsigned NumIVMuls;
- unsigned NumBaseAdds;
- unsigned ImmCost;
- unsigned SetupCost;
- unsigned ScaleCost;
+ TargetTransformInfo::LSRCost C;
public:
- Cost()
- : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
- SetupCost(0), ScaleCost(0) {}
+ Cost() {
+ C.Insns = 0;
+ C.NumRegs = 0;
+ C.AddRecCost = 0;
+ C.NumIVMuls = 0;
+ C.NumBaseAdds = 0;
+ C.ImmCost = 0;
+ C.SetupCost = 0;
+ C.ScaleCost = 0;
+ }
- bool operator<(const Cost &Other) const;
+ bool isLess(Cost &Other, const TargetTransformInfo &TTI);
void Lose();
#ifndef NDEBUG
// Once any of the metrics loses, they must all remain losers.
bool isValid() {
- return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
- | ImmCost | SetupCost | ScaleCost) != ~0u)
- || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
- & ImmCost & SetupCost & ScaleCost) == ~0u);
+ return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
+ | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
+ || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
+ & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
}
#endif
bool isLoser() {
assert(isValid() && "invalid cost");
- return NumRegs == ~0u;
+ return C.NumRegs == ~0u;
}
void RateFormula(const TargetTransformInfo &TTI,
@@ -1067,7 +1142,8 @@ public:
}
bool HasFormulaWithSameRegs(const Formula &F) const;
- bool InsertFormula(const Formula &F);
+ float getNotSelectedProbability(const SCEV *Reg) const;
+ bool InsertFormula(const Formula &F, const Loop &L);
void DeleteFormula(Formula &F);
void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
@@ -1083,20 +1159,26 @@ void Cost::RateRegister(const SCEV *Reg,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT) {
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
- // If this is an addrec for another loop, don't second-guess its addrec phi
- // nodes. LSR isn't currently smart enough to reason about more than one
- // loop at a time. LSR has already run on inner loops, will not run on outer
- // loops, and cannot be expected to change sibling loops.
+ // If this is an addrec for another loop, it should be an invariant
+ // with respect to L since L is the innermost loop (at least
+ // for now LSR only handles innermost loops).
if (AR->getLoop() != L) {
// If the AddRec exists, consider it's register free and leave it alone.
if (isExistingPhi(AR, SE))
return;
- // Otherwise, do not consider this formula at all.
- Lose();
+ // It is bad to allow LSR for current loop to add induction variables
+ // for its sibling loops.
+ if (!AR->getLoop()->contains(L)) {
+ Lose();
+ return;
+ }
+
+ // Otherwise, it will be an invariant with respect to Loop L.
+ ++C.NumRegs;
return;
}
- AddRecCost += 1; /// TODO: This should be a function of the stride.
+ C.AddRecCost += 1; /// TODO: This should be a function of the stride.
// Add the step value register, if it needs one.
// TODO: The non-affine case isn't precisely modeled here.
@@ -1108,7 +1190,7 @@ void Cost::RateRegister(const SCEV *Reg,
}
}
}
- ++NumRegs;
+ ++C.NumRegs;
// Rough heuristic; favor registers which don't require extra setup
// instructions in the preheader.
@@ -1117,9 +1199,9 @@ void Cost::RateRegister(const SCEV *Reg,
!(isa<SCEVAddRecExpr>(Reg) &&
(isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
- ++SetupCost;
+ ++C.SetupCost;
- NumIVMuls += isa<SCEVMulExpr>(Reg) &&
+ C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
SE.hasComputableLoopEvolution(Reg, L);
}
@@ -1150,8 +1232,11 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
ScalarEvolution &SE, DominatorTree &DT,
const LSRUse &LU,
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
- assert(F.isCanonical() && "Cost is accurate only for canonical formula");
+ assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
// Tally up the registers.
+ unsigned PrevAddRecCost = C.AddRecCost;
+ unsigned PrevNumRegs = C.NumRegs;
+ unsigned PrevNumBaseAdds = C.NumBaseAdds;
if (const SCEV *ScaledReg = F.ScaledReg) {
if (VisitedRegs.count(ScaledReg)) {
Lose();
@@ -1176,73 +1261,113 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
if (NumBaseParts > 1)
// Do not count the base and a possible second register if the target
// allows to fold 2 registers.
- NumBaseAdds +=
+ C.NumBaseAdds +=
NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
- NumBaseAdds += (F.UnfoldedOffset != 0);
+ C.NumBaseAdds += (F.UnfoldedOffset != 0);
// Accumulate non-free scaling amounts.
- ScaleCost += getScalingFactorCost(TTI, LU, F);
+ C.ScaleCost += getScalingFactorCost(TTI, LU, F, *L);
// Tally up the non-zero immediates.
for (const LSRFixup &Fixup : LU.Fixups) {
int64_t O = Fixup.Offset;
int64_t Offset = (uint64_t)O + F.BaseOffset;
if (F.BaseGV)
- ImmCost += 64; // Handle symbolic values conservatively.
+ C.ImmCost += 64; // Handle symbolic values conservatively.
// TODO: This should probably be the pointer size.
else if (Offset != 0)
- ImmCost += APInt(64, Offset, true).getMinSignedBits();
+ C.ImmCost += APInt(64, Offset, true).getMinSignedBits();
// Check with target if this offset with this instruction is
// specifically not supported.
if ((isa<LoadInst>(Fixup.UserInst) || isa<StoreInst>(Fixup.UserInst)) &&
!TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
- NumBaseAdds++;
+ C.NumBaseAdds++;
+ }
+
+ // If we don't count instruction cost exit here.
+ if (!InsnsCost) {
+ assert(isValid() && "invalid cost");
+ return;
+ }
+
+ // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
+ // additional instruction (at least fill).
+ unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+ if (C.NumRegs > TTIRegNum) {
+ // Cost already exceeded TTIRegNum, then only newly added register can add
+ // new instructions.
+ if (PrevNumRegs > TTIRegNum)
+ C.Insns += (C.NumRegs - PrevNumRegs);
+ else
+ C.Insns += (C.NumRegs - TTIRegNum);
}
+
+ // If ICmpZero formula ends with not 0, it could not be replaced by
+ // just add or sub. We'll need to compare final result of AddRec.
+ // That means we'll need an additional instruction.
+ // For -10 + {0, +, 1}:
+ // i = i + 1;
+ // cmp i, 10
+ //
+ // For {-10, +, 1}:
+ // i = i + 1;
+ if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+ C.Insns++;
+ // Each new AddRec adds 1 instruction to calculation.
+ C.Insns += (C.AddRecCost - PrevAddRecCost);
+
+ // BaseAdds adds instructions for unfolded registers.
+ if (LU.Kind != LSRUse::ICmpZero)
+ C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
assert(isValid() && "invalid cost");
}
/// Set this cost to a losing value.
void Cost::Lose() {
- NumRegs = ~0u;
- AddRecCost = ~0u;
- NumIVMuls = ~0u;
- NumBaseAdds = ~0u;
- ImmCost = ~0u;
- SetupCost = ~0u;
- ScaleCost = ~0u;
+ C.Insns = ~0u;
+ C.NumRegs = ~0u;
+ C.AddRecCost = ~0u;
+ C.NumIVMuls = ~0u;
+ C.NumBaseAdds = ~0u;
+ C.ImmCost = ~0u;
+ C.SetupCost = ~0u;
+ C.ScaleCost = ~0u;
}
/// Choose the lower cost.
-bool Cost::operator<(const Cost &Other) const {
- return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
- ImmCost, SetupCost) <
- std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
- Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
- Other.SetupCost);
+bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) {
+ if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
+ C.Insns != Other.C.Insns)
+ return C.Insns < Other.C.Insns;
+ return TTI.isLSRCostLess(C, Other.C);
}
void Cost::print(raw_ostream &OS) const {
- OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
- if (AddRecCost != 0)
- OS << ", with addrec cost " << AddRecCost;
- if (NumIVMuls != 0)
- OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
- if (NumBaseAdds != 0)
- OS << ", plus " << NumBaseAdds << " base add"
- << (NumBaseAdds == 1 ? "" : "s");
- if (ScaleCost != 0)
- OS << ", plus " << ScaleCost << " scale cost";
- if (ImmCost != 0)
- OS << ", plus " << ImmCost << " imm cost";
- if (SetupCost != 0)
- OS << ", plus " << SetupCost << " setup cost";
+ if (InsnsCost)
+ OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
+ OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
+ if (C.AddRecCost != 0)
+ OS << ", with addrec cost " << C.AddRecCost;
+ if (C.NumIVMuls != 0)
+ OS << ", plus " << C.NumIVMuls << " IV mul"
+ << (C.NumIVMuls == 1 ? "" : "s");
+ if (C.NumBaseAdds != 0)
+ OS << ", plus " << C.NumBaseAdds << " base add"
+ << (C.NumBaseAdds == 1 ? "" : "s");
+ if (C.ScaleCost != 0)
+ OS << ", plus " << C.ScaleCost << " scale cost";
+ if (C.ImmCost != 0)
+ OS << ", plus " << C.ImmCost << " imm cost";
+ if (C.SetupCost != 0)
+ OS << ", plus " << C.SetupCost << " setup cost";
}
-LLVM_DUMP_METHOD
-void Cost::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void Cost::dump() const {
print(errs()); errs() << '\n';
}
+#endif
LSRFixup::LSRFixup()
: UserInst(nullptr), OperandValToReplace(nullptr),
@@ -1285,10 +1410,11 @@ void LSRFixup::print(raw_ostream &OS) const {
OS << ", Offset=" << Offset;
}
-LLVM_DUMP_METHOD
-void LSRFixup::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRFixup::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// Test whether this use as a formula which has the same registers as the given
/// formula.
@@ -1300,10 +1426,19 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
return Uniquifier.count(Key);
}
+/// The function returns a probability of selecting formula without Reg.
+float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
+ unsigned FNum = 0;
+ for (const Formula &F : Formulae)
+ if (F.referencesReg(Reg))
+ FNum++;
+ return ((float)(Formulae.size() - FNum)) / Formulae.size();
+}
+
/// If the given formula has not yet been inserted, add it to the list, and
/// return true. Return false otherwise. The formula must be in canonical form.
-bool LSRUse::InsertFormula(const Formula &F) {
- assert(F.isCanonical() && "Invalid canonical representation");
+bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
+ assert(F.isCanonical(L) && "Invalid canonical representation");
if (!Formulae.empty() && RigidFormula)
return false;
@@ -1391,10 +1526,11 @@ void LSRUse::print(raw_ostream &OS) const {
OS << ", widest fixup type: " << *WidestFixupType;
}
-LLVM_DUMP_METHOD
-void LSRUse::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRUse::dump() const {
print(errs()); errs() << '\n';
}
+#endif
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
LSRUse::KindType Kind, MemAccessTy AccessTy,
@@ -1472,7 +1608,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
int64_t MinOffset, int64_t MaxOffset,
LSRUse::KindType Kind, MemAccessTy AccessTy,
- const Formula &F) {
+ const Formula &F, const Loop &L) {
// For the purpose of isAMCompletelyFolded either having a canonical formula
// or a scale not equal to zero is correct.
// Problems may arise from non canonical formulae having a scale == 0.
@@ -1480,7 +1616,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
// However, when we generate the scaled formulae, we first check that the
// scaling factor is profitable before computing the actual ScaledReg for
// compile time sake.
- assert((F.isCanonical() || F.Scale != 0));
+ assert((F.isCanonical(L) || F.Scale != 0));
return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
}
@@ -1515,14 +1651,15 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
}
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F) {
+ const LSRUse &LU, const Formula &F,
+ const Loop &L) {
if (!F.Scale)
return 0;
// If the use is not completely folded in that instruction, we will have to
// pay an extra cost only for scale != 1.
if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
- LU.AccessTy, F))
+ LU.AccessTy, F, L))
return F.Scale != 1;
switch (LU.Kind) {
@@ -1718,7 +1855,7 @@ class LSRInstance {
void FinalizeChain(IVChain &Chain);
void CollectChains();
void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts);
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts);
void CollectInterestingTypesAndFactors();
void CollectFixupsAndInitialFormulae();
@@ -1772,6 +1909,8 @@ class LSRInstance {
void NarrowSearchSpaceByDetectingSupersets();
void NarrowSearchSpaceByCollapsingUnrolledCode();
void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ void NarrowSearchSpaceByDeletingCostlyFormulas();
void NarrowSearchSpaceByPickingWinnerRegs();
void NarrowSearchSpaceUsingHeuristics();
@@ -1792,19 +1931,15 @@ class LSRInstance {
const LSRUse &LU,
SCEVExpander &Rewriter) const;
- Value *Expand(const LSRUse &LU, const LSRFixup &LF,
- const Formula &F,
- BasicBlock::iterator IP,
- SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts) const;
+ Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ BasicBlock::iterator IP, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
- const Formula &F,
- SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts) const;
- void Rewrite(const LSRUse &LU, const LSRFixup &LF,
- const Formula &F,
+ const Formula &F, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
+ void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts) const;
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
public:
@@ -2191,7 +2326,7 @@ LSRInstance::OptimizeLoopTermCond() {
dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
const ConstantInt *C = D->getValue();
// Stride of one or negative one can have reuse with non-addresses.
- if (C->isOne() || C->isAllOnesValue())
+ if (C->isOne() || C->isMinusOne())
goto decline_post_inc;
// Avoid weird situations.
if (C->getValue().getMinSignedBits() >= 64 ||
@@ -2492,7 +2627,12 @@ static Value *getWideOperand(Value *Oper) {
static bool isCompatibleIVType(Value *LVal, Value *RVal) {
Type *LType = LVal->getType();
Type *RType = RVal->getType();
- return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
+ return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
+ // Different address spaces means (possibly)
+ // different types of the pointer implementation,
+ // e.g. i16 vs i32 so disallow that.
+ (LType->getPointerAddressSpace() ==
+ RType->getPointerAddressSpace()));
}
/// Return an approximation of this SCEV expression's "base", or NULL for any
@@ -2881,7 +3021,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
/// Generate an add or subtract for each IVInc in a chain to materialize the IV
/// user's operand from the previous IV user's operand.
void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts) {
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
// Find the new IVOperand for the head of the chain. It may have been replaced
// by LSR.
const IVInc &Head = Chain.Incs[0];
@@ -2989,8 +3129,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
User::op_iterator UseI =
find(UserInst->operands(), U.getOperandValToReplace());
assert(UseI != UserInst->op_end() && "cannot find IV operand");
- if (IVIncSet.count(UseI))
+ if (IVIncSet.count(UseI)) {
+ DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
continue;
+ }
LSRUse::KindType Kind = LSRUse::Basic;
MemAccessTy AccessTy;
@@ -3025,8 +3167,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
// S is normalized, so normalize N before folding it into S
// to keep the result normalized.
- N = TransformForPostIncUse(Normalize, N, CI, nullptr,
- TmpPostIncLoops, SE, DT);
+ N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
Kind = LSRUse::ICmpZero;
S = SE.getMinusSCEV(N, S);
}
@@ -3108,7 +3249,8 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
// Do not insert formula that we will not be able to expand.
assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
"Formula is illegal");
- if (!LU.InsertFormula(F))
+
+ if (!LU.InsertFormula(F, *L))
return false;
CountRegisters(F, LUIdx);
@@ -3347,7 +3489,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
F.BaseRegs.push_back(*J);
// We may have changed the number of register in base regs, adjust the
// formula accordingly.
- F.canonicalize();
+ F.canonicalize(*L);
if (InsertFormula(LU, LUIdx, F))
// If that formula hadn't been seen before, recurse to find more like
@@ -3359,7 +3501,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
/// Split out subexpressions from adds and the bases of addrecs.
void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
Formula Base, unsigned Depth) {
- assert(Base.isCanonical() && "Input must be in the canonical form");
+ assert(Base.isCanonical(*L) && "Input must be in the canonical form");
// Arbitrarily cap recursion to protect compile time.
if (Depth >= 3)
return;
@@ -3400,7 +3542,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
// rather than proceed with zero in a register.
if (!Sum->isZero()) {
F.BaseRegs.push_back(Sum);
- F.canonicalize();
+ F.canonicalize(*L);
(void)InsertFormula(LU, LUIdx, F);
}
}
@@ -3457,7 +3599,7 @@ void LSRInstance::GenerateConstantOffsetsImpl(
F.ScaledReg = nullptr;
} else
F.deleteBaseReg(F.BaseRegs[Idx]);
- F.canonicalize();
+ F.canonicalize(*L);
} else if (IsScaledReg)
F.ScaledReg = NewG;
else
@@ -3620,10 +3762,10 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
if (LU.Kind == LSRUse::ICmpZero &&
!Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
continue;
- // For each addrec base reg, apply the scale, if possible.
- for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
- if (const SCEVAddRecExpr *AR =
- dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
+ // For each addrec base reg, if its loop is current loop, apply the scale.
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
+ if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
const SCEV *FactorS = SE.getConstant(IntTy, Factor);
if (FactorS->isZero())
continue;
@@ -3637,11 +3779,17 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
// The canonical representation of 1*reg is reg, which is already in
// Base. In that case, do not try to insert the formula, it will be
// rejected anyway.
- if (F.Scale == 1 && F.BaseRegs.empty())
+ if (F.Scale == 1 && (F.BaseRegs.empty() ||
+ (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
continue;
+ // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
+ // non canonical Formula with ScaledReg's loop not being L.
+ if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
+ F.canonicalize(*L);
(void)InsertFormula(LU, LUIdx, F);
}
}
+ }
}
}
@@ -3668,6 +3816,7 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
continue;
+ F.canonicalize(*L);
(void)InsertFormula(LU, LUIdx, F);
}
}
@@ -3697,10 +3846,11 @@ void WorkItem::print(raw_ostream &OS) const {
<< " , add offset " << Imm;
}
-LLVM_DUMP_METHOD
-void WorkItem::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void WorkItem::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// Look for registers which are a constant distance apart and try to form reuse
/// opportunities between them.
@@ -3764,8 +3914,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
// Compute the difference between the two.
int64_t Imm = (uint64_t)JImm - M->first;
- for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
- LUIdx = UsedByIndices.find_next(LUIdx))
+ for (unsigned LUIdx : UsedByIndices.set_bits())
// Make a memo of this use, offset, and register tuple.
if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
@@ -3821,7 +3970,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
continue;
// OK, looks good.
- NewF.canonicalize();
+ NewF.canonicalize(*this->L);
(void)InsertFormula(LU, LUIdx, NewF);
} else {
// Use the immediate in a base register.
@@ -3853,7 +4002,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
goto skip_formula;
// Ok, looks good.
- NewF.canonicalize();
+ NewF.canonicalize(*this->L);
(void)InsertFormula(LU, LUIdx, NewF);
break;
skip_formula:;
@@ -3967,7 +4116,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
Cost CostBest;
Regs.clear();
CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
- if (CostF < CostBest)
+ if (CostF.isLess(CostBest, TTI))
std::swap(F, Best);
DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
dbgs() << "\n"
@@ -4165,6 +4314,242 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
}
}
+/// If a LSRUse has multiple formulae with the same ScaledReg and Scale,
+/// pick the best one and delete the others.
+/// This narrowing heuristic is to keep as many formulae with different
+/// Scale and ScaledReg pair as possible while narrowing the search space.
+/// The benefit is that it is more likely to find out a better solution
+/// from a formulae set with more Scale and ScaledReg variations than
+/// a formulae set with the same Scale and ScaledReg. The picking winner
+/// reg heuristic will often keep the formulae with the same Scale and
+/// ScaledReg and filter others, and we want to avoid that if possible.
+void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+
+ DEBUG(dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by choosing the best Formula "
+ "from the Formulae with the same Scale and ScaledReg.\n");
+
+ // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
+ typedef DenseMap<std::pair<const SCEV *, int64_t>, size_t> BestFormulaeTy;
+ BestFormulaeTy BestFormulae;
+#ifndef NDEBUG
+ bool ChangedFormulae = false;
+#endif
+ DenseSet<const SCEV *> VisitedRegs;
+ SmallPtrSet<const SCEV *, 16> Regs;
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+
+ // Return true if Formula FA is better than Formula FB.
+ auto IsBetterThan = [&](Formula &FA, Formula &FB) {
+ // First we will try to choose the Formula with fewer new registers.
+ // For a register used by current Formula, the more the register is
+ // shared among LSRUses, the less we increase the register number
+ // counter of the formula.
+ size_t FARegNum = 0;
+ for (const SCEV *Reg : FA.BaseRegs) {
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+ FARegNum += (NumUses - UsedByIndices.count() + 1);
+ }
+ size_t FBRegNum = 0;
+ for (const SCEV *Reg : FB.BaseRegs) {
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+ FBRegNum += (NumUses - UsedByIndices.count() + 1);
+ }
+ if (FARegNum != FBRegNum)
+ return FARegNum < FBRegNum;
+
+ // If the new register numbers are the same, choose the Formula with
+ // less Cost.
+ Cost CostFA, CostFB;
+ Regs.clear();
+ CostFA.RateFormula(TTI, FA, Regs, VisitedRegs, L, SE, DT, LU);
+ Regs.clear();
+ CostFB.RateFormula(TTI, FB, Regs, VisitedRegs, L, SE, DT, LU);
+ return CostFA.isLess(CostFB, TTI);
+ };
+
+ bool Any = false;
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+ ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+ if (!F.ScaledReg)
+ continue;
+ auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
+ if (P.second)
+ continue;
+
+ Formula &Best = LU.Formulae[P.first->second];
+ if (IsBetterThan(F, Best))
+ std::swap(F, Best);
+ DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
+#ifndef NDEBUG
+ ChangedFormulae = true;
+#endif
+ LU.DeleteFormula(F);
+ --FIdx;
+ --NumForms;
+ Any = true;
+ }
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ // Reset this to prepare for the next use.
+ BestFormulae.clear();
+ }
+
+ DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
+}
+
+/// This function deletes formulas with a high expected number of registers.
+/// Assuming we don't know the value of each formula (the inefficient ones
+/// have already been deleted), compute for each register the probability
+/// of it not being selected.
+/// For example,
+/// Use1:
+/// reg(a) + reg({0,+,1})
+/// reg(a) + reg({-1,+,1}) + 1
+/// reg({a,+,1})
+/// Use2:
+/// reg(b) + reg({0,+,1})
+/// reg(b) + reg({-1,+,1}) + 1
+/// reg({b,+,1})
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1})
+/// reg(c) + reg({b,+,1})
+///
+/// Probability of not selecting
+/// Use1 Use2 Use3
+/// reg(a) (1/3) * 1 * 1
+/// reg(b) 1 * (1/3) * (1/2)
+/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
+/// reg({-1,+,1}) (2/3) * (2/3) * 1
+/// reg({a,+,1}) (2/3) * 1 * 1
+/// reg({b,+,1}) 1 * (2/3) * (2/3)
+/// reg(c) 1 * 1 * 0
+///
+/// Now compute the mathematical expectation of the number of registers for
+/// each formula. Note that for each use we exclude the probability of not
+/// selecting for that use. For example, for Use1 the probability for reg(a)
+/// would be just 1 * 1 (excluding the probability 1/3 of not selecting for Use1).
+/// Use1:
+/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
+/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
+/// reg({a,+,1}) 1
+/// Use2:
+/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
+/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
+/// reg({b,+,1}) 2/3
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
+/// reg(c) + reg({b,+,1}) 1 + 2/3
+
+void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+ // Ok, we have too many of formulae on our hands to conveniently handle.
+ // Use a rough heuristic to thin out the list.
+
+ // Set of Regs which will be 100% used in final solution.
+ // Used in each formula of a solution (in example above this is reg(c)).
+ // We can skip them in calculations.
+ SmallPtrSet<const SCEV *, 4> UniqRegs;
+ DEBUG(dbgs() << "The search space is too complex.\n");
+
+ // Map each register to probability of not selecting
+ DenseMap <const SCEV *, float> RegNumMap;
+ for (const SCEV *Reg : RegUses) {
+ if (UniqRegs.count(Reg))
+ continue;
+ float PNotSel = 1;
+ for (const LSRUse &LU : Uses) {
+ if (!LU.Regs.count(Reg))
+ continue;
+ float P = LU.getNotSelectedProbability(Reg);
+ if (P != 0.0)
+ PNotSel *= P;
+ else
+ UniqRegs.insert(Reg);
+ }
+ RegNumMap.insert(std::make_pair(Reg, PNotSel));
+ }
+
+ DEBUG(dbgs() << "Narrowing the search space by deleting costly formulas\n");
+
+ // Delete formulas where registers number expectation is high.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ // If nothing to delete - continue.
+ if (LU.Formulae.size() < 2)
+ continue;
+ // This is a temporary solution to test performance. Float should be
+ // replaced with a rounding-independent type (based on integers) to avoid
+ // different results for different target builds.
+ float FMinRegNum = LU.Formulae[0].getNumRegs();
+ float FMinARegNum = LU.Formulae[0].getNumRegs();
+ size_t MinIdx = 0;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ float FRegNum = 0;
+ float FARegNum = 0;
+ for (const SCEV *BaseReg : F.BaseRegs) {
+ if (UniqRegs.count(BaseReg))
+ continue;
+ FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ if (isa<SCEVAddRecExpr>(BaseReg))
+ FARegNum +=
+ RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ }
+ if (const SCEV *ScaledReg = F.ScaledReg) {
+ if (!UniqRegs.count(ScaledReg)) {
+ FRegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ if (isa<SCEVAddRecExpr>(ScaledReg))
+ FARegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ }
+ }
+ if (FMinRegNum > FRegNum ||
+ (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
+ FMinRegNum = FRegNum;
+ FMinARegNum = FARegNum;
+ MinIdx = i;
+ }
+ }
+ DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
+ dbgs() << " with min reg num " << FMinRegNum << '\n');
+ if (MinIdx != 0)
+ std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
+ while (LU.Formulae.size() != 1) {
+ DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
+ dbgs() << '\n');
+ LU.Formulae.pop_back();
+ }
+ LU.RecomputeRegs(LUIdx, RegUses);
+ assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
+ Formula &F = LU.Formulae[0];
+ DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
+ // When we choose the formula, the regs become unique.
+ UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ if (F.ScaledReg)
+ UniqRegs.insert(F.ScaledReg);
+ }
+ DEBUG(dbgs() << "After pre-selection:\n";
+ print_uses(dbgs()));
+}
+
+
/// Pick a register which seems likely to be profitable, and then in any use
/// which has any reference to that register, delete all formulae which do not
/// reference that register.
@@ -4237,7 +4622,12 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
NarrowSearchSpaceByDetectingSupersets();
NarrowSearchSpaceByCollapsingUnrolledCode();
NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
- NarrowSearchSpaceByPickingWinnerRegs();
+ if (FilterSameScaledReg)
+ NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ if (LSRExpNarrow)
+ NarrowSearchSpaceByDeletingCostlyFormulas();
+ else
+ NarrowSearchSpaceByPickingWinnerRegs();
}
/// This is the recursive solver.
@@ -4294,7 +4684,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
NewCost = CurCost;
NewRegs = CurRegs;
NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU);
- if (NewCost < SolutionCost) {
+ if (NewCost.isLess(SolutionCost, TTI)) {
Workspace.push_back(&F);
if (Workspace.size() != Uses.size()) {
SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
@@ -4476,12 +4866,10 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
/// Emit instructions for the leading candidate expression for this LSRUse (this
/// is called "expanding").
-Value *LSRInstance::Expand(const LSRUse &LU,
- const LSRFixup &LF,
- const Formula &F,
- BasicBlock::iterator IP,
+Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, BasicBlock::iterator IP,
SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts) const {
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
if (LU.RigidFormula)
return LF.OperandValToReplace;
@@ -4515,11 +4903,7 @@ Value *LSRInstance::Expand(const LSRUse &LU,
assert(!Reg->isZero() && "Zero allocated in a base register!");
// If we're expanding for a post-inc user, make the post-inc adjustment.
- PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
- Reg = TransformForPostIncUse(Denormalize, Reg,
- LF.UserInst, LF.OperandValToReplace,
- Loops, SE, DT);
-
+ Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
}
@@ -4530,9 +4914,7 @@ Value *LSRInstance::Expand(const LSRUse &LU,
// If we're expanding for a post-inc user, make the post-inc adjustment.
PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
- ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
- LF.UserInst, LF.OperandValToReplace,
- Loops, SE, DT);
+ ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
if (LU.Kind == LSRUse::ICmpZero) {
// Expand ScaleReg as if it was part of the base regs.
@@ -4662,12 +5044,9 @@ Value *LSRInstance::Expand(const LSRUse &LU,
/// Helper for Rewrite. PHI nodes are special because the use of their operands
/// effectively happens in their predecessor blocks, so the expression may need
/// to be expanded in multiple places.
-void LSRInstance::RewriteForPHI(PHINode *PN,
- const LSRUse &LU,
- const LSRFixup &LF,
- const Formula &F,
- SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts) const {
+void LSRInstance::RewriteForPHI(
+ PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ SCEVExpander &Rewriter, SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
DenseMap<BasicBlock *, Value *> Inserted;
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
@@ -4739,11 +5118,9 @@ void LSRInstance::RewriteForPHI(PHINode *PN,
/// Emit instructions for the leading candidate expression for this LSRUse (this
/// is called "expanding"), and update the UserInst to reference the newly
/// expanded value.
-void LSRInstance::Rewrite(const LSRUse &LU,
- const LSRFixup &LF,
- const Formula &F,
- SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts) const {
+void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
// First, find an insertion point that dominates UserInst. For PHI nodes,
// find the nearest block which dominates all the relevant uses.
if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
@@ -4781,7 +5158,7 @@ void LSRInstance::ImplementSolution(
const SmallVectorImpl<const Formula *> &Solution) {
// Keep track of instructions we may have made dead, so that
// we can remove them after we are done working.
- SmallVector<WeakVH, 16> DeadInsts;
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(),
"lsr");
@@ -4975,10 +5352,11 @@ void LSRInstance::print(raw_ostream &OS) const {
print_uses(OS);
}
-LLVM_DUMP_METHOD
-void LSRInstance::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRInstance::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
@@ -5030,7 +5408,7 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
// Remove any extra phis created by processing inner loops.
Changed |= DeleteDeadPHIs(L->getHeader());
if (EnablePhiElim && L->isLoopSimplifyForm()) {
- SmallVector<WeakVH, 16> DeadInsts;
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
SCEVExpander Rewriter(SE, DL, "lsr");
#ifndef NDEBUG
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index c7f9122..530a684 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -44,7 +44,11 @@ using namespace llvm;
static cl::opt<unsigned>
UnrollThreshold("unroll-threshold", cl::Hidden,
- cl::desc("The baseline cost threshold for loop unrolling"));
+ cl::desc("The cost threshold for loop unrolling"));
+
+static cl::opt<unsigned> UnrollPartialThreshold(
+ "unroll-partial-threshold", cl::Hidden,
+ cl::desc("The cost threshold for partial loop unrolling"));
static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
"unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
@@ -106,10 +110,19 @@ static cl::opt<unsigned> FlatLoopTripCountThreshold(
"aggressively unrolled."));
static cl::opt<bool>
- UnrollAllowPeeling("unroll-allow-peeling", cl::Hidden,
+ UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
cl::desc("Allows loops to be peeled when the dynamic "
"trip count is known to be low."));
+// This option isn't ever intended to be enabled; it serves to allow
+// experiments to check the assumptions about when this kind of revisit is
+// necessary.
+static cl::opt<bool> UnrollRevisitChildLoops(
+ "unroll-revisit-child-loops", cl::Hidden,
+ cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
+ "This shouldn't typically be needed as child loops (or their "
+ "clones) were already visited."));
+
/// A magic value for use with the Threshold parameter to indicate
/// that the loop unroll should be performed regardless of how much
/// code expansion would result.
@@ -118,16 +131,17 @@ static const unsigned NoThreshold = UINT_MAX;
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
- Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold,
- Optional<unsigned> UserCount, Optional<bool> UserAllowPartial,
- Optional<bool> UserRuntime, Optional<bool> UserUpperBound) {
+ Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
+ Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
+ Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
+ Optional<bool> UserUpperBound) {
TargetTransformInfo::UnrollingPreferences UP;
// Set up the defaults
- UP.Threshold = 150;
+ UP.Threshold = OptLevel > 2 ? 300 : 150;
UP.MaxPercentThresholdBoost = 400;
UP.OptSizeThreshold = 0;
- UP.PartialThreshold = UP.Threshold;
+ UP.PartialThreshold = 150;
UP.PartialOptSizeThreshold = 0;
UP.Count = 0;
UP.PeelCount = 0;
@@ -141,10 +155,10 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.AllowExpensiveTripCount = false;
UP.Force = false;
UP.UpperBound = false;
- UP.AllowPeeling = false;
+ UP.AllowPeeling = true;
// Override with any target specific settings
- TTI.getUnrollingPreferences(L, UP);
+ TTI.getUnrollingPreferences(L, SE, UP);
// Apply size attributes
if (L->getHeader()->getParent()->optForSize()) {
@@ -153,10 +167,10 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
}
// Apply any user values specified by cl::opt
- if (UnrollThreshold.getNumOccurrences() > 0) {
+ if (UnrollThreshold.getNumOccurrences() > 0)
UP.Threshold = UnrollThreshold;
- UP.PartialThreshold = UnrollThreshold;
- }
+ if (UnrollPartialThreshold.getNumOccurrences() > 0)
+ UP.PartialThreshold = UnrollPartialThreshold;
if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
if (UnrollMaxCount.getNumOccurrences() > 0)
@@ -495,7 +509,7 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
KnownSucc = SI->getSuccessor(0);
else if (ConstantInt *SimpleCondVal =
dyn_cast<ConstantInt>(SimpleCond))
- KnownSucc = SI->findCaseValue(SimpleCondVal).getCaseSuccessor();
+ KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
}
}
if (KnownSucc) {
@@ -685,7 +699,7 @@ static uint64_t getUnrolledLoopSize(
// Calculates unroll count and writes it to UP.Count.
static bool computeUnrollCount(
Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
+ ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
// Check for explicit Count.
@@ -756,7 +770,7 @@ static bool computeUnrollCount(
// helps to remove a significant number of instructions.
// To check that, run additional analysis on the loop.
if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
- L, FullUnrollTripCount, DT, *SE, TTI,
+ L, FullUnrollTripCount, DT, SE, TTI,
UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
unsigned Boost =
getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
@@ -770,7 +784,15 @@ static bool computeUnrollCount(
}
}
- // 4rd priority is partial unrolling.
+ // 4th priority is loop peeling
+ computePeelCount(L, LoopSize, UP, TripCount);
+ if (UP.PeelCount) {
+ UP.Runtime = false;
+ UP.Count = 1;
+ return ExplicitUnroll;
+ }
+
+ // 5th priority is partial unrolling.
// Try partial unroll only when TripCount could be staticaly calculated.
if (TripCount) {
UP.Partial |= ExplicitUnroll;
@@ -814,6 +836,8 @@ static bool computeUnrollCount(
} else {
UP.Count = TripCount;
}
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
UP.Count != TripCount)
ORE->emit(
@@ -833,14 +857,6 @@ static bool computeUnrollCount(
<< "Unable to fully unroll loop as directed by unroll(full) pragma "
"because loop has a runtime trip count.");
- // 5th priority is loop peeling
- computePeelCount(L, LoopSize, UP);
- if (UP.PeelCount) {
- UP.Runtime = false;
- UP.Count = 1;
- return ExplicitUnroll;
- }
-
// 6th priority is runtime unrolling.
// Don't unroll a runtime trip count loop when it is disabled.
if (HasRuntimeUnrollDisablePragma(L)) {
@@ -912,9 +928,9 @@ static bool computeUnrollCount(
}
static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution *SE, const TargetTransformInfo &TTI,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI,
AssumptionCache &AC, OptimizationRemarkEmitter &ORE,
- bool PreserveLCSSA,
+ bool PreserveLCSSA, int OptLevel,
Optional<unsigned> ProvidedCount,
Optional<unsigned> ProvidedThreshold,
Optional<bool> ProvidedAllowPartial,
@@ -934,8 +950,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
bool NotDuplicatable;
bool Convergent;
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
- L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
- ProvidedRuntime, ProvidedUpperBound);
+ L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount,
+ ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound);
// Exit early if unrolling is disabled.
if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
return false;
@@ -963,8 +979,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
ExitingBlock = L->getExitingBlock();
if (ExitingBlock) {
- TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
- TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
+ TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
+ TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
}
// If the loop contains a convergent operation, the prelude we'd add
@@ -986,8 +1002,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
// count.
bool MaxOrZero = false;
if (!TripCount) {
- MaxTripCount = SE->getSmallConstantMaxTripCount(L);
- MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L);
+ MaxTripCount = SE.getSmallConstantMaxTripCount(L);
+ MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
// We can unroll by the upper bound amount if it's generally allowed or if
// we know that the loop is executed either the upper bound or zero times.
// (MaxOrZero unrolling keeps only the first loop test, so the number of
@@ -1016,7 +1032,7 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
// Unroll the loop.
if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,
UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero,
- TripMultiple, UP.PeelCount, LI, SE, &DT, &AC, &ORE,
+ TripMultiple, UP.PeelCount, LI, &SE, &DT, &AC, &ORE,
PreserveLCSSA))
return false;
@@ -1034,16 +1050,17 @@ namespace {
class LoopUnroll : public LoopPass {
public:
static char ID; // Pass ID, replacement for typeid
- LoopUnroll(Optional<unsigned> Threshold = None,
+ LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None,
Optional<unsigned> Count = None,
Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
Optional<bool> UpperBound = None)
- : LoopPass(ID), ProvidedCount(std::move(Count)),
+ : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)),
ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) {
initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
}
+ int OptLevel;
Optional<unsigned> ProvidedCount;
Optional<unsigned> ProvidedThreshold;
Optional<bool> ProvidedAllowPartial;
@@ -1058,7 +1075,7 @@ public:
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -1068,7 +1085,7 @@ public:
OptimizationRemarkEmitter ORE(&F);
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
- return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA,
+ return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel,
ProvidedCount, ProvidedThreshold,
ProvidedAllowPartial, ProvidedRuntime,
ProvidedUpperBound);
@@ -1094,26 +1111,27 @@ INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
-Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
- int Runtime, int UpperBound) {
+Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count,
+ int AllowPartial, int Runtime,
+ int UpperBound) {
// TODO: It would make more sense for this function to take the optionals
// directly, but that's dangerous since it would silently break out of tree
// callers.
- return new LoopUnroll(Threshold == -1 ? None : Optional<unsigned>(Threshold),
- Count == -1 ? None : Optional<unsigned>(Count),
- AllowPartial == -1 ? None
- : Optional<bool>(AllowPartial),
- Runtime == -1 ? None : Optional<bool>(Runtime),
- UpperBound == -1 ? None : Optional<bool>(UpperBound));
+ return new LoopUnroll(
+ OptLevel, Threshold == -1 ? None : Optional<unsigned>(Threshold),
+ Count == -1 ? None : Optional<unsigned>(Count),
+ AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
+ Runtime == -1 ? None : Optional<bool>(Runtime),
+ UpperBound == -1 ? None : Optional<bool>(UpperBound));
}
-Pass *llvm::createSimpleLoopUnrollPass() {
- return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0);
+Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) {
+ return llvm::createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0);
}
PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
+ LPMUpdater &Updater) {
const auto &FAM =
AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
Function *F = L.getHeader()->getParent();
@@ -1124,12 +1142,84 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not "
"cached at a higher level");
- bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE,
- /*PreserveLCSSA*/ true, ProvidedCount,
- ProvidedThreshold, ProvidedAllowPartial,
- ProvidedRuntime, ProvidedUpperBound);
-
+ // Keep track of the previous loop structure so we can identify new loops
+ // created by unrolling.
+ Loop *ParentL = L.getParentLoop();
+ SmallPtrSet<Loop *, 4> OldLoops;
+ if (ParentL)
+ OldLoops.insert(ParentL->begin(), ParentL->end());
+ else
+ OldLoops.insert(AR.LI.begin(), AR.LI.end());
+
+ // The API here is quite complex to call, but there are only two interesting
+ // states we support: partial and full (or "simple") unrolling. However, to
+ // enable these things we actually pass "None" in for the optional to avoid
+ // providing an explicit choice.
+ Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam;
+ if (!AllowPartialUnrolling)
+ AllowPartialParam = RuntimeParam = UpperBoundParam = false;
+ bool Changed = tryToUnrollLoop(
+ &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
+ /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
+ /*Threshold*/ None, AllowPartialParam, RuntimeParam, UpperBoundParam);
if (!Changed)
return PreservedAnalyses::all();
+
+ // The parent must not be damaged by unrolling!
+#ifndef NDEBUG
+ if (ParentL)
+ ParentL->verifyLoop();
+#endif
+
+ // Unrolling can do several things to introduce new loops into a loop nest:
+ // - Partial unrolling clones child loops within the current loop. If it
+ // uses a remainder, then it can also create any number of sibling loops.
+ // - Full unrolling clones child loops within the current loop but then
+ // removes the current loop making all of the children appear to be new
+ // sibling loops.
+ // - Loop peeling can directly introduce new sibling loops by peeling one
+ // iteration.
+ //
+ // When a new loop appears as a sibling loop, either from peeling an
+ // iteration or fully unrolling, its nesting structure has fundamentally
+ // changed and we want to revisit it to reflect that.
+ //
+ // When unrolling has removed the current loop, we need to tell the
+ // infrastructure that it is gone.
+ //
+ // Finally, we support a debugging/testing mode where we revisit child loops
+ // as well. These are not expected to require further optimizations as either
+ // they or the loop they were cloned from have been directly visited already.
+ // But the debugging mode allows us to check this assumption.
+ bool IsCurrentLoopValid = false;
+ SmallVector<Loop *, 4> SibLoops;
+ if (ParentL)
+ SibLoops.append(ParentL->begin(), ParentL->end());
+ else
+ SibLoops.append(AR.LI.begin(), AR.LI.end());
+ erase_if(SibLoops, [&](Loop *SibLoop) {
+ if (SibLoop == &L) {
+ IsCurrentLoopValid = true;
+ return true;
+ }
+
+ // Otherwise erase the loop from the list if it was in the old loops.
+ return OldLoops.count(SibLoop) != 0;
+ });
+ Updater.addSiblingLoops(SibLoops);
+
+ if (!IsCurrentLoopValid) {
+ Updater.markLoopAsDeleted(L);
+ } else {
+ // We can only walk child loops if the current loop remained valid.
+ if (UnrollRevisitChildLoops) {
+ // Walk *all* of the child loops. This is a highly speculative mode
+ // anyways so look for any simplifications that arose from partial
+ // unrolling or peeling off of iterations.
+ SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
+ Updater.addChildLoops(ChildLoops);
+ }
+ }
+
return getLoopPassPreservedAnalyses();
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
index 76fe918..d0c96fa 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
// This pass transforms loops that contain branches on loop-invariant conditions
-// to have multiple loops. For example, it turns the left into the right code:
+// to multiple loops. For example, it turns the left into the right code:
//
// for (...) if (lic)
// A for (...)
@@ -26,32 +26,34 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Support/BranchProbability.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -77,19 +79,6 @@ static cl::opt<unsigned>
Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
cl::init(100), cl::Hidden);
-static cl::opt<bool>
-LoopUnswitchWithBlockFrequency("loop-unswitch-with-block-frequency",
- cl::init(false), cl::Hidden,
- cl::desc("Enable the use of the block frequency analysis to access PGO "
- "heuristics to minimize code growth in cold regions."));
-
-static cl::opt<unsigned>
-ColdnessThreshold("loop-unswitch-coldness-threshold", cl::init(1), cl::Hidden,
- cl::desc("Coldness threshold in percentage. The loop header frequency "
- "(relative to the entry frequency) is compared with this "
- "threshold to determine if non-trivial unswitching should be "
- "enabled."));
-
namespace {
class LUAnalysisCache {
@@ -174,13 +163,6 @@ namespace {
LUAnalysisCache BranchesInfo;
- bool EnabledPGO;
-
- // BFI and ColdEntryFreq are only used when PGO and
- // LoopUnswitchWithBlockFrequency are enabled.
- BlockFrequencyInfo BFI;
- BlockFrequency ColdEntryFreq;
-
bool OptimizeForSize;
bool redoLoop;
@@ -199,12 +181,14 @@ namespace {
// NewBlocks contained cloned copy of basic blocks from LoopBlocks.
std::vector<BasicBlock*> NewBlocks;
+ bool hasBranchDivergence;
+
public:
static char ID; // Pass ID, replacement for typeid
- explicit LoopUnswitch(bool Os = false) :
+ explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false) :
LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
currentLoop(nullptr), DT(nullptr), loopHeader(nullptr),
- loopPreheader(nullptr) {
+ loopPreheader(nullptr), hasBranchDivergence(hasBranchDivergence) {
initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
}
@@ -217,6 +201,8 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (hasBranchDivergence)
+ AU.addRequired<DivergenceAnalysis>();
getLoopAnalysisUsage(AU);
}
@@ -255,6 +241,11 @@ namespace {
TerminatorInst *TI);
void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
+
+ /// Given that the Invariant is not equal to Val, simplify instructions
+ /// in the loop.
+ Value *SimplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant,
+ Constant *Val);
};
}
@@ -381,16 +372,35 @@ INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
false, false)
-Pass *llvm::createLoopUnswitchPass(bool Os) {
- return new LoopUnswitch(Os);
+Pass *llvm::createLoopUnswitchPass(bool Os, bool hasBranchDivergence) {
+ return new LoopUnswitch(Os, hasBranchDivergence);
}
+/// Operator chain lattice.
+enum OperatorChain {
+ OC_OpChainNone, ///< There is no operator.
+ OC_OpChainOr, ///< There are only ORs.
+ OC_OpChainAnd, ///< There are only ANDs.
+ OC_OpChainMixed ///< There are ANDs and ORs.
+};
+
/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
/// an invariant piece, return the invariant. Otherwise, return null.
+///
+/// NOTE: FindLIVLoopCondition will not return a partial LIV by walking up a
+/// mixed operator chain, as we can not reliably find a value which will simplify
+/// the operator chain. If the chain is AND-only or OR-only, we can use 0 or ~0
+/// to simplify the chain.
+///
+/// NOTE: In the case of a partial LIV and a mixed operator chain, we may be
+/// able to simplify the condition itself to a loop variant condition, but at
+/// the cost of creating an entirely new loop.
static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+ OperatorChain &ParentChain,
DenseMap<Value *, Value *> &Cache) {
auto CacheIt = Cache.find(Cond);
if (CacheIt != Cache.end())
@@ -414,21 +424,53 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
return Cond;
}
+ // Walk up the operator chain to find partial invariant conditions.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
if (BO->getOpcode() == Instruction::And ||
BO->getOpcode() == Instruction::Or) {
- // If either the left or right side is invariant, we can unswitch on this,
- // which will cause the branch to go away in one loop and the condition to
- // simplify in the other one.
- if (Value *LHS =
- FindLIVLoopCondition(BO->getOperand(0), L, Changed, Cache)) {
- Cache[Cond] = LHS;
- return LHS;
+ // Given the previous operator, compute the current operator chain status.
+ OperatorChain NewChain;
+ switch (ParentChain) {
+ case OC_OpChainNone:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainOr;
+ break;
+ case OC_OpChainOr:
+ NewChain = BO->getOpcode() == Instruction::Or ? OC_OpChainOr :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainAnd:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainMixed:
+ NewChain = OC_OpChainMixed;
+ break;
}
- if (Value *RHS =
- FindLIVLoopCondition(BO->getOperand(1), L, Changed, Cache)) {
- Cache[Cond] = RHS;
- return RHS;
+
+ // If we reach a Mixed state, we do not want to keep walking up as we can not
+ // reliably find a value that will simplify the chain. With this check, we
+ // will return null on the first sight of mixed chain and the caller will
+ // either backtrack to find partial LIV in other operand or return null.
+ if (NewChain != OC_OpChainMixed) {
+ // Update the current operator chain type before we search up the chain.
+ ParentChain = NewChain;
+ // If either the left or right side is invariant, we can unswitch on this,
+ // which will cause the branch to go away in one loop and the condition to
+ // simplify in the other one.
+ if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed,
+ ParentChain, Cache)) {
+ Cache[Cond] = LHS;
+ return LHS;
+ }
+ // We did not manage to find a partial LIV in operand(0). Backtrack and try
+ // operand(1).
+ ParentChain = NewChain;
+ if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed,
+ ParentChain, Cache)) {
+ Cache[Cond] = RHS;
+ return RHS;
+ }
}
}
@@ -436,9 +478,21 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
return nullptr;
}
-static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant along with the operator chain type.
+/// Otherwise, return null.
+static std::pair<Value *, OperatorChain> FindLIVLoopCondition(Value *Cond,
+ Loop *L,
+ bool &Changed) {
DenseMap<Value *, Value *> Cache;
- return FindLIVLoopCondition(Cond, L, Changed, Cache);
+ OperatorChain OpChain = OC_OpChainNone;
+ Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache);
+
+ // In case we do find a LIV, it can not be obtained by walking up a mixed
+ // operator chain.
+ assert((!FCond || OpChain != OC_OpChainMixed) &&
+ "Do not expect a partial LIV with mixed operator chain");
+ return {FCond, OpChain};
}
bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
@@ -457,19 +511,6 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
if (SanitizeMemory)
computeLoopSafetyInfo(&SafetyInfo, L);
- EnabledPGO = F->getEntryCount().hasValue();
-
- if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
- BranchProbabilityInfo BPI(*F, *LI);
- BFI.calculate(*L->getHeader()->getParent(), BPI, *LI);
-
- // Use BranchProbability to compute a minimum frequency based on
- // function entry baseline frequency. Loops with headers below this
- // frequency are considered as cold.
- const BranchProbability ColdProb(ColdnessThreshold, 100);
- ColdEntryFreq = BlockFrequency(BFI.getEntryFreq()) * ColdProb;
- }
-
bool Changed = false;
do {
assert(currentLoop->isLCSSAForm(*DT));
@@ -581,19 +622,9 @@ bool LoopUnswitch::processCurrentLoop() {
loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
return false;
- if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
- // Compute the weighted frequency of the hottest block in the
- // loop (loopHeader in this case since inner loops should be
- // processed before outer loop). If it is less than ColdFrequency,
- // we should not unswitch.
- BlockFrequency LoopEntryFreq = BFI.getBlockFreq(loopHeader);
- if (LoopEntryFreq < ColdEntryFreq)
- return false;
- }
-
for (IntrinsicInst *Guard : Guards) {
Value *LoopCond =
- FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed);
+ FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first;
if (LoopCond &&
UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
// NB! Unswitching (if successful) could have erased some of the
@@ -634,7 +665,7 @@ bool LoopUnswitch::processCurrentLoop() {
// See if this, or some part of it, is loop invariant. If so, we can
// unswitch on it if we desire.
Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
if (LoopCond &&
UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
++NumBranches;
@@ -642,24 +673,48 @@ bool LoopUnswitch::processCurrentLoop() {
}
}
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
- currentLoop, Changed);
+ Value *SC = SI->getCondition();
+ Value *LoopCond;
+ OperatorChain OpChain;
+ std::tie(LoopCond, OpChain) =
+ FindLIVLoopCondition(SC, currentLoop, Changed);
+
unsigned NumCases = SI->getNumCases();
if (LoopCond && NumCases) {
// Find a value to unswitch on:
// FIXME: this should chose the most expensive case!
// FIXME: scan for a case with a non-critical edge?
Constant *UnswitchVal = nullptr;
-
- // Do not process same value again and again.
- // At this point we have some cases already unswitched and
- // some not yet unswitched. Let's find the first not yet unswitched one.
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
- Constant *UnswitchValCandidate = i.getCaseValue();
- if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
- UnswitchVal = UnswitchValCandidate;
- break;
+ // Find a case value such that at least one case value is unswitched
+ // out.
+ if (OpChain == OC_OpChainAnd) {
+ // If the chain only has ANDs and the switch has a case value of 0,
+ // dropping a 0 into the chain will unswitch out the 0-casevalue.
+ auto *AllZero = cast<ConstantInt>(Constant::getNullValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllZero))
+ continue;
+ // We are unswitching 0 out.
+ UnswitchVal = AllZero;
+ } else if (OpChain == OC_OpChainOr) {
+ // If the chain only has ORs and the switch has a case value of ~0,
+ // dropping a ~0 into the chain will unswitch out the ~0-casevalue.
+ auto *AllOne = cast<ConstantInt>(Constant::getAllOnesValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllOne))
+ continue;
+ // We are unswitching ~0 out.
+ UnswitchVal = AllOne;
+ } else {
+ assert(OpChain == OC_OpChainNone &&
+ "Expect to unswitch on trivial chain");
+ // Do not process same value again and again.
+ // At this point we have some cases already unswitched and
+ // some not yet unswitched. Let's find the first not yet unswitched one.
+ for (auto Case : SI->cases()) {
+ Constant *UnswitchValCandidate = Case.getCaseValue();
+ if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
+ UnswitchVal = UnswitchValCandidate;
+ break;
+ }
}
}
@@ -668,6 +723,11 @@ bool LoopUnswitch::processCurrentLoop() {
if (UnswitchIfProfitable(LoopCond, UnswitchVal)) {
++NumSwitches;
+ // In case of a full LIV, UnswitchVal is the value we unswitched out.
+ // In case of a partial LIV, we only unswitch when it's an AND-chain
+ // or OR-chain. In both cases switch input value simplifies to
+ // UnswitchVal.
+ BranchesInfo.setUnswitched(SI, UnswitchVal);
return true;
}
}
@@ -678,7 +738,7 @@ bool LoopUnswitch::processCurrentLoop() {
BBI != E; ++BBI)
if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
if (LoopCond && UnswitchIfProfitable(LoopCond,
ConstantInt::getTrue(Context))) {
++NumSelects;
@@ -753,6 +813,15 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
<< ". Cost too high.\n");
return false;
}
+ if (hasBranchDivergence &&
+ getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) {
+ DEBUG(dbgs() << "NOT unswitching loop %"
+ << currentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Condition is divergent.\n");
+ return false;
+ }
UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI);
return true;
@@ -762,7 +831,12 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
/// mapping the blocks with the specified map.
static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
LoopInfo *LI, LPPassManager *LPM) {
- Loop &New = LPM->addLoop(PL);
+ Loop &New = *new Loop();
+ if (PL)
+ PL->addChildLoop(&New);
+ else
+ LI->addTopLevelLoop(&New);
+ LPM->addLoop(New);
// Add all of the blocks in L to the new loop.
for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
@@ -899,7 +973,6 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
if (I.mayHaveSideEffects())
return false;
- // FIXME: add check for constant foldable switch instructions.
if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
if (BI->isUnconditional()) {
CurrentBB = BI->getSuccessor(0);
@@ -911,7 +984,16 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
// Found a trivial condition candidate: non-foldable conditional branch.
break;
}
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // At this point, any constant-foldable instructions should have probably
+ // been folded.
+ ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
+ if (!Cond)
+ break;
+ // Find the target block we are definitely going to.
+ CurrentBB = SI->findCaseValue(Cond)->getCaseSuccessor();
} else {
+ // We do not understand these terminator instructions.
break;
}
@@ -929,7 +1011,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
return false;
Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
// Unswitch only if the trivial condition itself is an LIV (not
// partial LIV which could occur in and/or)
@@ -960,7 +1042,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
// If this isn't switching on an invariant condition, we can't unswitch it.
Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
// Unswitch only if the trivial condition itself is an LIV (not
// partial LIV which could occur in and/or)
@@ -973,13 +1055,12 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
// this.
// Note that we can't trivially unswitch on the default case or
// on already unswitched cases.
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
+ for (auto Case : SI->cases()) {
BasicBlock *LoopExitCandidate;
- if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop,
- i.getCaseSuccessor()))) {
+ if ((LoopExitCandidate =
+ isTrivialLoopExitBlock(currentLoop, Case.getCaseSuccessor()))) {
// Okay, we found a trivial case, remember the value that is trivial.
- ConstantInt *CaseVal = i.getCaseValue();
+ ConstantInt *CaseVal = Case.getCaseValue();
// Check that it was not unswitched before, since already unswitched
// trivial vals are looks trivial too.
@@ -998,6 +1079,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
nullptr);
+
+ // We are only unswitching full LIV.
+ BranchesInfo.setUnswitched(SI, CondVal);
++NumSwitches;
return true;
}
@@ -1152,11 +1236,12 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
LoopProcessWorklist.push_back(NewLoop);
redoLoop = true;
- // Keep a WeakVH holding onto LIC. If the first call to RewriteLoopBody
+ // Keep a WeakTrackingVH holding onto LIC. If the first call to
+ // RewriteLoopBody
// deletes the instruction (for example by simplifying a PHI that feeds into
// the condition that we're unswitching on), we don't rewrite the second
// iteration.
- WeakVH LICHandle(LIC);
+ WeakTrackingVH LICHandle(LIC);
// Now we rewrite the original code to know that the condition is true and the
// new code to know that the condition is false.
@@ -1183,7 +1268,7 @@ static void RemoveFromWorklist(Instruction *I,
static void ReplaceUsesOfWith(Instruction *I, Value *V,
std::vector<Instruction*> &Worklist,
Loop *L, LPPassManager *LPM) {
- DEBUG(dbgs() << "Replace with '" << *V << "': " << *I);
+ DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
// Add uses to the worklist, which may be dead now.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
@@ -1196,7 +1281,8 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V,
LPM->deleteSimpleAnalysisValue(I, L);
RemoveFromWorklist(I, Worklist);
I->replaceAllUsesWith(V);
- I->eraseFromParent();
+ if (!I->mayHaveSideEffects())
+ I->eraseFromParent();
++NumSimplify;
}
@@ -1253,18 +1339,38 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
if (!UI || !L->contains(UI))
continue;
- Worklist.push_back(UI);
+ // At this point, we know LIC is definitely not Val. Try to use some simple
+ // logic to simplify the user w.r.t. the context.
+ if (Value *Replacement = SimplifyInstructionWithNotEqual(UI, LIC, Val)) {
+ if (LI->replacementPreservesLCSSAForm(UI, Replacement)) {
+ // This in-loop instruction has been simplified w.r.t. its context,
+ // i.e. LIC != Val, make sure we propagate its replacement value to
+ // all its users.
+ //
+ // We can not delete UI, the LIC user, yet, because that would invalidate
+ // the LIC->users() iterator! However, we can make this instruction
+ // dead by replacing all its users and push it onto the worklist so that
+ // it can be properly deleted and its operands simplified.
+ UI->replaceAllUsesWith(Replacement);
+ }
+ }
- // TODO: We could do other simplifications, for example, turning
- // 'icmp eq LIC, Val' -> false.
+ // This is a LIC user, push it into the worklist so that SimplifyCode can
+ // attempt to simplify it.
+ Worklist.push_back(UI);
// If we know that LIC is not Val, use this info to simplify code.
SwitchInst *SI = dyn_cast<SwitchInst>(UI);
if (!SI || !isa<ConstantInt>(Val)) continue;
- SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val));
+ // NOTE: if a case value for the switch is unswitched out, we record it
+ // after the unswitch finishes. We can not record it here as the switch
+ // is not a direct user of the partial LIV.
+ SwitchInst::CaseHandle DeadCase =
+ *SI->findCaseValue(cast<ConstantInt>(Val));
// Default case is live for multiple values.
- if (DeadCase == SI->case_default()) continue;
+ if (DeadCase == *SI->case_default())
+ continue;
// Found a dead case value. Don't remove PHI nodes in the
// successor if they become single-entry, those PHI nodes may
@@ -1274,8 +1380,6 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
BasicBlock *SISucc = DeadCase.getCaseSuccessor();
BasicBlock *Latch = L->getLoopLatch();
- BranchesInfo.setUnswitched(SI, Val);
-
if (!SI->findCaseDest(SISucc)) continue; // Edge is critical.
// If the DeadCase successor dominates the loop latch, then the
// transformation isn't safe since it will delete the sole predecessor edge
@@ -1334,7 +1438,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
// Simple DCE.
if (isInstructionTriviallyDead(I)) {
- DEBUG(dbgs() << "Remove dead instruction '" << *I);
+ DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
// Add uses to the worklist, which may be dead now.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
@@ -1397,3 +1501,27 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
}
}
}
+
+/// Simple simplifications we can do given the information that Invariant is
+/// definitely not equal to Val. Returns the replacement, or null if none.
+Value *LoopUnswitch::SimplifyInstructionWithNotEqual(Instruction *Inst,
+ Value *Invariant,
+ Constant *Val) {
+ // icmp eq Invariant, Val -> false; icmp ne Invariant, Val -> true
+ ICmpInst *CI = dyn_cast<ICmpInst>(Inst);
+ if (CI && CI->isEquality()) {
+ Value *Op0 = CI->getOperand(0);
+ Value *Op1 = CI->getOperand(1);
+ if ((Op0 == Invariant && Op1 == Val) || (Op0 == Val && Op1 == Invariant)) {
+ LLVMContext &Ctx = Inst->getContext();
+ if (CI->getPredicate() == CmpInst::ICMP_EQ)
+ return ConstantInt::getFalse(Ctx);
+ else
+ return ConstantInt::getTrue(Ctx);
+ }
+ }
+
+ // FIXME: there may be other opportunities, e.g. comparison with floating
+ // point, or Invariant - Val != 0, etc.
+ return nullptr;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
index 08e60b1..6f77c5b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -155,8 +155,7 @@ public:
}
bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
+ // Don't skip optnone functions; atomics still need to be lowered.
FunctionAnalysisManager DummyFAM;
auto PA = Impl.run(F, DummyFAM);
return !PA.areAllPreserved();
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 52975ef..46f8a35 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -14,6 +14,7 @@
#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -67,11 +68,11 @@ static bool handleSwitchExpect(SwitchInst &SI) {
if (!ExpectedValue)
return false;
- SwitchInst::CaseIt Case = SI.findCaseValue(ExpectedValue);
+ SwitchInst::CaseHandle Case = *SI.findCaseValue(ExpectedValue);
unsigned n = SI.getNumCases(); // +1 for default case.
SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight);
- if (Case == SI.case_default())
+ if (Case == *SI.case_default())
Weights[0] = LikelyBranchWeight;
else
Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight;
@@ -83,6 +84,151 @@ static bool handleSwitchExpect(SwitchInst &SI) {
return true;
}
+/// Handler for PHINodes that define the value argument to an
+/// @llvm.expect call.
+///
+/// If the operand of the phi has a constant value and it 'contradicts'
+/// with the expected value of phi def, then the corresponding incoming
+/// edge of the phi is unlikely to be taken. Using that information,
+/// the branch probability info for the originating branch can be inferred.
+static void handlePhiDef(CallInst *Expect) {
+ Value &Arg = *Expect->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(Expect->getArgOperand(1));
+ if (!ExpectedValue)
+ return;
+ const APInt &ExpectedPhiValue = ExpectedValue->getValue();
+
+ // Walk backward through a list of instructions that
+ // have 'copy' semantics by 'stripping' the copies
+ // until a PHI node or an instruction of unknown kind
+ // is reached. Negation via xor is also handled.
+ //
+ // C = PHI(...);
+ // B = C;
+ // A = B;
+ // D = __builtin_expect(A, 0);
+ //
+ Value *V = &Arg;
+ SmallVector<Instruction *, 4> Operations;
+ while (!isa<PHINode>(V)) {
+ if (ZExtInst *ZExt = dyn_cast<ZExtInst>(V)) {
+ V = ZExt->getOperand(0);
+ Operations.push_back(ZExt);
+ continue;
+ }
+
+ if (SExtInst *SExt = dyn_cast<SExtInst>(V)) {
+ V = SExt->getOperand(0);
+ Operations.push_back(SExt);
+ continue;
+ }
+
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
+ if (!BinOp || BinOp->getOpcode() != Instruction::Xor)
+ return;
+
+ ConstantInt *CInt = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+ if (!CInt)
+ return;
+
+ V = BinOp->getOperand(0);
+ Operations.push_back(BinOp);
+ }
+
+ // Executes the recorded operations on input 'Value'.
+ auto ApplyOperations = [&](const APInt &Value) {
+ APInt Result = Value;
+ for (auto Op : llvm::reverse(Operations)) {
+ switch (Op->getOpcode()) {
+ case Instruction::Xor:
+ Result ^= cast<ConstantInt>(Op->getOperand(1))->getValue();
+ break;
+ case Instruction::ZExt:
+ Result = Result.zext(Op->getType()->getIntegerBitWidth());
+ break;
+ case Instruction::SExt:
+ Result = Result.sext(Op->getType()->getIntegerBitWidth());
+ break;
+ default:
+ llvm_unreachable("Unexpected operation");
+ }
+ }
+ return Result;
+ };
+
+ auto *PhiDef = dyn_cast<PHINode>(V);
+
+ // Get the first dominating conditional branch of the operand
+ // i's incoming block.
+ auto GetDomConditional = [&](unsigned i) -> BranchInst * {
+ BasicBlock *BB = PhiDef->getIncomingBlock(i);
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (BI && BI->isConditional())
+ return BI;
+ BB = BB->getSinglePredecessor();
+ if (!BB)
+ return nullptr;
+ BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return nullptr;
+ return BI;
+ };
+
+ // Now walk through all Phi operands to find phi operands with values
+ // conflicting with the expected phi output value. Any such operand
+ // indicates the incoming edge to that operand is unlikely.
+ for (unsigned i = 0, e = PhiDef->getNumIncomingValues(); i != e; ++i) {
+
+ Value *PhiOpnd = PhiDef->getIncomingValue(i);
+ ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
+ if (!CI)
+ continue;
+
+ // Not an interesting case when the operand is not unlikely -- we can not
+ // infer anything useful when the operand value matches the expected phi
+ // output.
+ if (ExpectedPhiValue == ApplyOperations(CI->getValue()))
+ continue;
+
+ BranchInst *BI = GetDomConditional(i);
+ if (!BI)
+ continue;
+
+ MDBuilder MDB(PhiDef->getContext());
+
+ // There are two situations in which an operand of the PhiDef comes
+ // from a given successor of a branch instruction BI.
+ // 1) When the incoming block of the operand is the successor block;
+ // 2) When the incoming block is BI's enclosing block and the
+ // successor is the PhiDef's enclosing block.
+ //
+ // Returns true if the operand which comes from OpndIncomingBB
+ // comes from outgoing edge of BI that leads to Succ block.
+ auto *OpndIncomingBB = PhiDef->getIncomingBlock(i);
+ auto IsOpndComingFromSuccessor = [&](BasicBlock *Succ) {
+ if (OpndIncomingBB == Succ)
+ // If this successor is the incoming block for this
+ // Phi operand, then this successor does lead to the Phi.
+ return true;
+ if (OpndIncomingBB == BI->getParent() && Succ == PhiDef->getParent())
+ // Otherwise, if the edge is directly from the branch
+ // to the Phi, this successor is the one feeding this
+ // Phi operand.
+ return true;
+ return false;
+ };
+
+ if (IsOpndComingFromSuccessor(BI->getSuccessor(1)))
+ BI->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight));
+ else if (IsOpndComingFromSuccessor(BI->getSuccessor(0)))
+ BI->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight));
+ }
+}
+
// Handle both BranchInst and SelectInst.
template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
@@ -98,10 +244,18 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
CallInst *CI;
ICmpInst *CmpI = dyn_cast<ICmpInst>(BSI.getCondition());
+ CmpInst::Predicate Predicate;
+ ConstantInt *CmpConstOperand = nullptr;
if (!CmpI) {
CI = dyn_cast<CallInst>(BSI.getCondition());
+ Predicate = CmpInst::ICMP_NE;
} else {
- if (CmpI->getPredicate() != CmpInst::ICMP_NE)
+ Predicate = CmpI->getPredicate();
+ if (Predicate != CmpInst::ICMP_NE && Predicate != CmpInst::ICMP_EQ)
+ return false;
+
+ CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1));
+ if (!CmpConstOperand)
return false;
CI = dyn_cast<CallInst>(CmpI->getOperand(0));
}
@@ -109,6 +263,13 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
if (!CI)
return false;
+ uint64_t ValueComparedTo = 0;
+ if (CmpConstOperand) {
+ if (CmpConstOperand->getBitWidth() > 64)
+ return false;
+ ValueComparedTo = CmpConstOperand->getZExtValue();
+ }
+
Function *Fn = CI->getCalledFunction();
if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
return false;
@@ -121,9 +282,8 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
MDBuilder MDB(CI->getContext());
MDNode *Node;
- // If expect value is equal to 1 it means that we are more likely to take
- // branch 0, in other case more likely is branch 1.
- if (ExpectedValue->isOne())
+ if ((ExpectedValue->getZExtValue() == ValueComparedTo) ==
+ (Predicate == CmpInst::ICMP_EQ))
Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight);
else
Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight);
@@ -173,6 +333,10 @@ static bool lowerExpectIntrinsic(Function &F) {
Function *Fn = CI->getCalledFunction();
if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) {
+ // Before erasing the llvm.expect, walk backward to find
+ // the phi that defines llvm.expect's first arg, and
+ // infer branch probability:
+ handlePhiDef(CI);
Value *Exp = CI->getArgOperand(0);
CI->replaceAllUsesWith(Exp);
CI->eraseFromParent();
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
index 4f41371..070114a 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
@@ -17,10 +17,10 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 1b59014..7896396 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -13,19 +13,48 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "memcpyopt"
@@ -119,6 +148,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
return true;
}
+namespace {
/// Represents a range of memset'd bytes with the ByteVal value.
/// This allows us to analyze stores like:
@@ -130,7 +160,6 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
/// the first store, we make a range [1, 2). The second store extends the range
/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
/// two ranges into [0, 3) which is memset'able.
-namespace {
struct MemsetRange {
// Start/End - A semi range that describes the span that this range covers.
// The range is closed at the start and open at the end: [Start, End).
@@ -148,7 +177,8 @@ struct MemsetRange {
bool isProfitableToUseMemset(const DataLayout &DL) const;
};
-} // end anon namespace
+
+} // end anonymous namespace
bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
// If we found more than 4 stores to merge or 16 bytes, use memset.
@@ -192,13 +222,14 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
return TheStores.size() > NumPointerStores+NumByteStores;
}
-
namespace {
+
class MemsetRanges {
/// A sorted list of the memset ranges.
SmallVector<MemsetRange, 8> Ranges;
typedef SmallVectorImpl<MemsetRange>::iterator range_iterator;
const DataLayout &DL;
+
public:
MemsetRanges(const DataLayout &DL) : DL(DL) {}
@@ -231,8 +262,7 @@ public:
};
-} // end anon namespace
-
+} // end anonymous namespace
/// Add a new store to the MemsetRanges data structure. This adds a
/// new range for the specified store at the specified offset, merging into
@@ -299,48 +329,36 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
//===----------------------------------------------------------------------===//
namespace {
- class MemCpyOptLegacyPass : public FunctionPass {
- MemCpyOptPass Impl;
- public:
- static char ID; // Pass identification, replacement for typeid
- MemCpyOptLegacyPass() : FunctionPass(ID) {
- initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
-
- private:
- // This transformation requires dominator postdominator info
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<MemoryDependenceWrapperPass>();
- }
+class MemCpyOptLegacyPass : public FunctionPass {
+ MemCpyOptPass Impl;
- // Helper functions
- bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
- bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
- bool processMemCpy(MemCpyInst *M);
- bool processMemMove(MemMoveInst *M);
- bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
- uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
- bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
- bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
- bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep);
- bool processByValArgument(CallSite CS, unsigned ArgNo);
- Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
- Value *ByteVal);
-
- bool iterateOnFunction(Function &F);
- };
+public:
+ static char ID; // Pass identification, replacement for typeid
- char MemCpyOptLegacyPass::ID = 0;
-}
+ MemCpyOptLegacyPass() : FunctionPass(ID) {
+ initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // This transformation requires dominator postdominator info
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
+};
+
+char MemCpyOptLegacyPass::ID = 0;
+
+} // end anonymous namespace
/// The public interface to this file...
FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
@@ -523,14 +541,15 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
if (Args.erase(C))
NeedLift = true;
else if (MayAlias) {
- NeedLift = any_of(MemLocs, [C, &AA](const MemoryLocation &ML) {
+ NeedLift = llvm::any_of(MemLocs, [C, &AA](const MemoryLocation &ML) {
return AA.getModRefInfo(C, ML);
});
if (!NeedLift)
- NeedLift = any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) {
- return AA.getModRefInfo(C, CS);
- });
+ NeedLift =
+ llvm::any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) {
+ return AA.getModRefInfo(C, CS);
+ });
}
if (!NeedLift)
@@ -567,7 +586,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
}
// We made it, we need to lift
- for (auto *I : reverse(ToLift)) {
+ for (auto *I : llvm::reverse(ToLift)) {
DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
I->moveBefore(P);
}
@@ -761,7 +780,6 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
return false;
}
-
/// Takes a memcpy and a call that it depends on,
/// and checks for the possibility of a call slot optimization by having
/// the call write its result directly into the destination of the memcpy.
@@ -914,6 +932,17 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
if (MR != MRI_NoModRef)
return false;
+ // We can't create address space casts here because we don't know if they're
+ // safe for the target.
+ if (cpySrc->getType()->getPointerAddressSpace() !=
+ cpyDest->getType()->getPointerAddressSpace())
+ return false;
+ for (unsigned i = 0; i < CS.arg_size(); ++i)
+ if (CS.getArgument(i)->stripPointerCasts() == cpySrc &&
+ cpySrc->getType()->getPointerAddressSpace() !=
+ CS.getArgument(i)->getType()->getPointerAddressSpace())
+ return false;
+
// All the checks have passed, so do the transformation.
bool changedArgument = false;
for (unsigned i = 0; i < CS.arg_size(); ++i)
@@ -1240,7 +1269,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
AliasAnalysis &AA = LookupAliasAnalysis();
- if (!TLI->has(LibFunc::memmove))
+ if (!TLI->has(LibFunc_memmove))
return false;
// See if the pointers alias.
@@ -1294,7 +1323,7 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
// Get the alignment of the byval. If the call doesn't specify the alignment,
// then it is some target specific value that we can't know.
- unsigned ByValAlign = CS.getParamAlignment(ArgNo+1);
+ unsigned ByValAlign = CS.getParamAlignment(ArgNo);
if (ByValAlign == 0) return false;
// If it is greater than the memcpy, then we check to see if we can force the
@@ -1306,6 +1335,11 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
CS.getInstruction(), &AC, &DT) < ByValAlign)
return false;
+ // The address space of the memcpy source must match the byval argument
+ if (MDep->getSource()->getType()->getPointerAddressSpace() !=
+ ByValArg->getType()->getPointerAddressSpace())
+ return false;
+
// Verify that the copied-from memory doesn't change in between the memcpy and
// the byval call.
// memcpy(a <- b)
@@ -1375,7 +1409,6 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
}
PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
-
auto &MD = AM.getResult<MemoryDependenceAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
@@ -1393,7 +1426,9 @@ PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
LookupAssumptionCache, LookupDomTree);
if (!MadeChange)
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
PA.preserve<MemoryDependenceAnalysis>();
return PA;
@@ -1414,10 +1449,10 @@ bool MemCpyOptPass::runImpl(
// If we don't have at least memset and memcpy, there is little point of doing
// anything here. These are required by a freestanding implementation, so if
// even they are disabled, there is no point in trying hard.
- if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy))
+ if (!TLI->has(LibFunc_memset) || !TLI->has(LibFunc_memcpy))
return false;
- while (1) {
+ while (true) {
if (!iterateOnFunction(F))
break;
MadeChange = true;
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 6a64c6b..6727cf0 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -19,6 +19,8 @@
// thinks it safe to do so. This optimization helps with eg. hiding load
// latencies, triggering if-conversion, and reducing static code size.
//
+// NOTE: This code no longer performs load hoisting, it is subsumed by GVNHoist.
+//
//===----------------------------------------------------------------------===//
//
//
@@ -87,7 +89,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
@@ -118,16 +119,6 @@ private:
void removeInstruction(Instruction *Inst);
BasicBlock *getDiamondTail(BasicBlock *BB);
bool isDiamondHead(BasicBlock *BB);
- // Routines for hoisting loads
- bool isLoadHoistBarrierInRange(const Instruction &Start,
- const Instruction &End, LoadInst *LI,
- bool SafeToLoadUnconditionally);
- LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI);
- void hoistInstruction(BasicBlock *BB, Instruction *HoistCand,
- Instruction *ElseInst);
- bool isSafeToHoist(Instruction *I) const;
- bool hoistLoad(BasicBlock *BB, LoadInst *HoistCand, LoadInst *ElseInst);
- bool mergeLoads(BasicBlock *BB);
// Routines for sinking stores
StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
@@ -188,169 +179,6 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
return true;
}
-///
-/// \brief True when instruction is a hoist barrier for a load
-///
-/// Whenever an instruction could possibly modify the value
-/// being loaded or protect against the load from happening
-/// it is considered a hoist barrier.
-///
-bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(
- const Instruction &Start, const Instruction &End, LoadInst *LI,
- bool SafeToLoadUnconditionally) {
- if (!SafeToLoadUnconditionally)
- for (const Instruction &Inst :
- make_range(Start.getIterator(), End.getIterator()))
- if (!isGuaranteedToTransferExecutionToSuccessor(&Inst))
- return true;
- MemoryLocation Loc = MemoryLocation::get(LI);
- return AA->canInstructionRangeModRef(Start, End, Loc, MRI_Mod);
-}
-
-///
-/// \brief Decide if a load can be hoisted
-///
-/// When there is a load in \p BB to the same address as \p LI
-/// and it can be hoisted from \p BB, return that load.
-/// Otherwise return Null.
-///
-LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1,
- LoadInst *Load0) {
- BasicBlock *BB0 = Load0->getParent();
- BasicBlock *Head = BB0->getSinglePredecessor();
- bool SafeToLoadUnconditionally = isSafeToLoadUnconditionally(
- Load0->getPointerOperand(), Load0->getAlignment(),
- Load0->getModule()->getDataLayout(),
- /*ScanFrom=*/Head->getTerminator());
- for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE;
- ++BBI) {
- Instruction *Inst = &*BBI;
-
- // Only merge and hoist loads when their result in used only in BB
- auto *Load1 = dyn_cast<LoadInst>(Inst);
- if (!Load1 || Inst->isUsedOutsideOfBlock(BB1))
- continue;
-
- MemoryLocation Loc0 = MemoryLocation::get(Load0);
- MemoryLocation Loc1 = MemoryLocation::get(Load1);
- if (Load0->isSameOperationAs(Load1) && AA->isMustAlias(Loc0, Loc1) &&
- !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1,
- SafeToLoadUnconditionally) &&
- !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0,
- SafeToLoadUnconditionally)) {
- return Load1;
- }
- }
- return nullptr;
-}
-
-///
-/// \brief Merge two equivalent instructions \p HoistCand and \p ElseInst into
-/// \p BB
-///
-/// BB is the head of a diamond
-///
-void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB,
- Instruction *HoistCand,
- Instruction *ElseInst) {
- DEBUG(dbgs() << " Hoist Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; HoistCand->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; ElseInst->dump(); dbgs() << "\n");
- // Hoist the instruction.
- assert(HoistCand->getParent() != BB);
-
- // Intersect optional metadata.
- HoistCand->andIRFlags(ElseInst);
- HoistCand->dropUnknownNonDebugMetadata();
-
- // Prepend point for instruction insert
- Instruction *HoistPt = BB->getTerminator();
-
- // Merged instruction
- Instruction *HoistedInst = HoistCand->clone();
-
- // Hoist instruction.
- HoistedInst->insertBefore(HoistPt);
-
- HoistCand->replaceAllUsesWith(HoistedInst);
- removeInstruction(HoistCand);
- // Replace the else block instruction.
- ElseInst->replaceAllUsesWith(HoistedInst);
- removeInstruction(ElseInst);
-}
-
-///
-/// \brief Return true if no operand of \p I is defined in I's parent block
-///
-bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const {
- BasicBlock *Parent = I->getParent();
- for (Use &U : I->operands())
- if (auto *Instr = dyn_cast<Instruction>(&U))
- if (Instr->getParent() == Parent)
- return false;
- return true;
-}
-
-///
-/// \brief Merge two equivalent loads and GEPs and hoist into diamond head
-///
-bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0,
- LoadInst *L1) {
- // Only one definition?
- auto *A0 = dyn_cast<Instruction>(L0->getPointerOperand());
- auto *A1 = dyn_cast<Instruction>(L1->getPointerOperand());
- if (A0 && A1 && A0->isIdenticalTo(A1) && isSafeToHoist(A0) &&
- A0->hasOneUse() && (A0->getParent() == L0->getParent()) &&
- A1->hasOneUse() && (A1->getParent() == L1->getParent()) &&
- isa<GetElementPtrInst>(A0)) {
- DEBUG(dbgs() << "Hoist Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; L0->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; L1->dump(); dbgs() << "\n");
- hoistInstruction(BB, A0, A1);
- hoistInstruction(BB, L0, L1);
- return true;
- }
- return false;
-}
-
-///
-/// \brief Try to hoist two loads to same address into diamond header
-///
-/// Starting from a diamond head block, iterate over the instructions in one
-/// successor block and try to match a load in the second successor.
-///
-bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
- bool MergedLoads = false;
- assert(isDiamondHead(BB));
- BranchInst *BI = cast<BranchInst>(BB->getTerminator());
- BasicBlock *Succ0 = BI->getSuccessor(0);
- BasicBlock *Succ1 = BI->getSuccessor(1);
- // #Instructions in Succ1 for Compile Time Control
- int Size1 = Succ1->size();
- int NLoads = 0;
- for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end();
- BBI != BBE;) {
- Instruction *I = &*BBI;
- ++BBI;
-
- // Don't move non-simple (atomic, volatile) loads.
- auto *L0 = dyn_cast<LoadInst>(I);
- if (!L0 || !L0->isSimple() || L0->isUsedOutsideOfBlock(Succ0))
- continue;
-
- ++NLoads;
- if (NLoads * Size1 >= MagicCompileTimeControl)
- break;
- if (LoadInst *L1 = canHoistFromBlock(Succ1, L0)) {
- bool Res = hoistLoad(BB, L0, L1);
- MergedLoads |= Res;
- // Don't attempt to hoist above loads that had not been hoisted.
- if (!Res)
- break;
- }
- }
- return MergedLoads;
-}
///
/// \brief True when instruction is a sink barrier for a store
@@ -410,7 +238,7 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
&BB->front());
NewPN->addIncoming(Opd1, S0->getParent());
NewPN->addIncoming(Opd2, S1->getParent());
- if (MD && NewPN->getType()->getScalarType()->isPointerTy())
+ if (MD && NewPN->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(NewPN);
return NewPN;
}
@@ -534,7 +362,6 @@ bool MergedLoadStoreMotion::run(Function &F, MemoryDependenceResults *MD,
// Hoist equivalent loads and sink stores
// outside diamonds when possible
if (isDiamondHead(BB)) {
- Changed |= mergeLoads(BB);
Changed |= mergeStores(getDiamondTail(BB));
}
}
@@ -596,8 +423,8 @@ MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!Impl.run(F, MD, AA))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
PA.preserve<MemoryDependenceAnalysis>();
return PA;
diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index 0a3bf7b..d0bfe36 100644
--- a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -156,20 +156,12 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
- bool Changed = runImpl(F, AC, DT, SE, TLI, TTI);
-
- // FIXME: We need to invalidate this to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<ScalarEvolutionAnalysis>(F);
-
- if (!Changed)
+ if (!runImpl(F, AC, DT, SE, TLI, TTI))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<ScalarEvolutionAnalysis>();
- PA.preserve<TargetLibraryAnalysis>();
return PA;
}
@@ -219,7 +211,8 @@ bool NaryReassociatePass::doOneIteration(Function &F) {
Changed = true;
SE->forgetValue(&*I);
I->replaceAllUsesWith(NewI);
- // If SeenExprs constains I's WeakVH, that entry will be replaced with
+ // If SeenExprs contains I's WeakTrackingVH, that entry will be
+ // replaced with
// nullptr.
RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI);
I = NewI->getIterator();
@@ -227,7 +220,7 @@ bool NaryReassociatePass::doOneIteration(Function &F) {
// Add the rewritten instruction to SeenExprs; the original instruction
// is deleted.
const SCEV *NewSCEV = SE->getSCEV(&*I);
- SeenExprs[NewSCEV].push_back(WeakVH(&*I));
+ SeenExprs[NewSCEV].push_back(WeakTrackingVH(&*I));
// Ideally, NewSCEV should equal OldSCEV because tryReassociate(I)
// is equivalent to I. However, ScalarEvolution::getSCEV may
// weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose
@@ -247,7 +240,7 @@ bool NaryReassociatePass::doOneIteration(Function &F) {
//
// This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll.
if (NewSCEV != OldSCEV)
- SeenExprs[OldSCEV].push_back(WeakVH(&*I));
+ SeenExprs[OldSCEV].push_back(WeakTrackingVH(&*I));
}
}
}
@@ -502,7 +495,8 @@ NaryReassociatePass::findClosestMatchingDominator(const SCEV *CandidateExpr,
// future instruction either. Therefore, we pop it out of the stack. This
// optimization makes the algorithm O(n).
while (!Candidates.empty()) {
- // Candidates stores WeakVHs, so a candidate can be nullptr if it's removed
+ // Candidates stores WeakTrackingVHs, so a candidate can be nullptr if it's
+ // removed
// during rewriting.
if (Value *Candidate = Candidates.back()) {
Instruction *CandidateInstruction = cast<Instruction>(Candidate);
diff --git a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 57e6e3d..9d01856 100644
--- a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -17,6 +17,37 @@
/// "A Sparse Algorithm for Predicated Global Value Numbering" from
/// Karthik Gargi.
///
+/// A brief overview of the algorithm: The algorithm is essentially the same as
+/// the standard RPO value numbering algorithm (a good reference is the paper
+/// "SCC based value numbering" by L. Taylor Simpson) with one major difference:
+/// The RPO algorithm proceeds, on every iteration, to process every reachable
+/// block and every instruction in that block. This is because the standard RPO
+/// algorithm does not track what things have the same value number, it only
+/// tracks what the value number of a given operation is (the mapping is
+/// operation -> value number). Thus, when a value number of an operation
+/// changes, it must reprocess everything to ensure all uses of a value number
+/// get updated properly. In contrast, the sparse algorithm we use *also*
+/// tracks what operations have a given value number (IE it also tracks the
+/// reverse mapping from value number -> operations with that value number), so
+/// that it only needs to reprocess the instructions that are affected when
+/// something's value number changes. The vast majority of complexity and code
+/// in this file is devoted to tracking what value numbers could change for what
+/// instructions when various things happen. The rest of the algorithm is
+/// devoted to performing symbolic evaluation, forward propagation, and
+/// simplification of operations based on the value numbers deduced so far.
+///
+/// In order to make the GVN mostly-complete, we use a technique derived from
+/// "Detection of Redundant Expressions: A Complete and Polynomial-time
+/// Algorithm in SSA" by R.R. Pai. The source of incompleteness in most SSA
+/// based GVN algorithms is related to their inability to detect equivalence
+/// between phi of ops (IE phi(a+b, c+d)) and op of phis (phi(a,c) + phi(b, d)).
+/// We resolve this issue by generating the equivalent "phi of ops" form for
+/// each op of phis we see, in a way that only takes polynomial time to resolve.
+///
+/// We also do not perform elimination by using any published algorithm. All
+/// published algorithms are O(Instructions). Instead, we use a technique that
+/// is O(number of operations with the same value number), enabling us to skip
+/// trying to eliminate things that have unique value numbers.
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/NewGVN.h"
@@ -30,7 +61,6 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -40,13 +70,10 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalVariable.h"
@@ -55,24 +82,25 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/PredIteratorCache.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVNExpression.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include <numeric>
#include <unordered_map>
#include <utility>
#include <vector>
using namespace llvm;
using namespace PatternMatch;
using namespace llvm::GVNExpression;
-
+using namespace llvm::VNCoercion;
#define DEBUG_TYPE "newgvn"
STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
@@ -85,8 +113,19 @@ STATISTIC(NumGVNLeaderChanges, "Number of leader changes");
STATISTIC(NumGVNSortedLeaderChanges, "Number of sorted leader changes");
STATISTIC(NumGVNAvoidedSortedLeaderChanges,
"Number of avoided sorted leader changes");
-STATISTIC(NumGVNNotMostDominatingLeader,
- "Number of times a member dominated it's new classes' leader");
+STATISTIC(NumGVNDeadStores, "Number of redundant/dead stores eliminated");
+STATISTIC(NumGVNPHIOfOpsCreated, "Number of PHI of ops created");
+STATISTIC(NumGVNPHIOfOpsEliminations,
+ "Number of things eliminated using PHI of ops");
+DEBUG_COUNTER(VNCounter, "newgvn-vn",
+ "Controls which instructions are value numbered")
+DEBUG_COUNTER(PHIOfOpsCounter, "newgvn-phi",
+ "Controls which instructions we create phi of ops for")
+// Currently store defining access refinement is too slow due to basicaa being
+// egregiously slow. This flag lets us keep it working while we work on this
+// issue.
+static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
+ cl::init(false), cl::Hidden);
//===----------------------------------------------------------------------===//
// GVN Pass
@@ -105,6 +144,79 @@ PHIExpression::~PHIExpression() = default;
}
}
+// Tarjan's SCC finding algorithm with Nuutila's improvements
+// SCCIterator is actually fairly complex for the simple thing we want.
+// It also wants to hand us SCC's that are unrelated to the phi node we ask
+// about, and have us process them there or risk redoing work.
+// Graph traits over a filter iterator also doesn't work that well here.
+// This SCC finder is specialized to walk use-def chains, and only follows
+// instructions,
+// not generic values (arguments, etc).
+struct TarjanSCC {
+
+ TarjanSCC() : Components(1) {}
+
+ void Start(const Instruction *Start) {
+ if (Root.lookup(Start) == 0)
+ FindSCC(Start);
+ }
+
+ const SmallPtrSetImpl<const Value *> &getComponentFor(const Value *V) const {
+ unsigned ComponentID = ValueToComponent.lookup(V);
+
+ assert(ComponentID > 0 &&
+ "Asking for a component for a value we never processed");
+ return Components[ComponentID];
+ }
+
+private:
+ void FindSCC(const Instruction *I) {
+ Root[I] = ++DFSNum;
+ // Store the DFS Number we had before it possibly gets incremented.
+ unsigned int OurDFS = DFSNum;
+ for (auto &Op : I->operands()) {
+ if (auto *InstOp = dyn_cast<Instruction>(Op)) {
+ if (Root.lookup(Op) == 0)
+ FindSCC(InstOp);
+ if (!InComponent.count(Op))
+ Root[I] = std::min(Root.lookup(I), Root.lookup(Op));
+ }
+ }
+ // See if we really were the root of a component, by seeing if we still have
+ // our DFSNumber. If we do, we are the root of the component, and we have
+ // completed a component. If we do not, we are not the root of a component,
+ // and belong on the component stack.
+ if (Root.lookup(I) == OurDFS) {
+ unsigned ComponentID = Components.size();
+ Components.resize(Components.size() + 1);
+ auto &Component = Components.back();
+ Component.insert(I);
+ DEBUG(dbgs() << "Component root is " << *I << "\n");
+ InComponent.insert(I);
+ ValueToComponent[I] = ComponentID;
+ // Pop a component off the stack and label it.
+ while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) {
+ auto *Member = Stack.back();
+ DEBUG(dbgs() << "Component member is " << *Member << "\n");
+ Component.insert(Member);
+ InComponent.insert(Member);
+ ValueToComponent[Member] = ComponentID;
+ Stack.pop_back();
+ }
+ } else {
+ // Part of a component, push to stack
+ Stack.push_back(I);
+ }
+ }
+ unsigned int DFSNum = 1;
+ SmallPtrSet<const Value *, 8> InComponent;
+ DenseMap<const Value *, unsigned int> Root;
+ SmallVector<const Value *, 8> Stack;
+ // Store the components as vector of ptr sets, because we need the topo order
+ // of SCC's, but not individual member order
+ SmallVector<SmallPtrSet<const Value *, 8>, 8> Components;
+ DenseMap<const Value *, unsigned> ValueToComponent;
+};
// Congruence classes represent the set of expressions/instructions
// that are all the same *during some scope in the function*.
// That is, because of the way we perform equality propagation, and
@@ -115,46 +227,166 @@ PHIExpression::~PHIExpression() = default;
// For any Value in the Member set, it is valid to replace any dominated member
// with that Value.
//
-// Every congruence class has a leader, and the leader is used to
-// symbolize instructions in a canonical way (IE every operand of an
-// instruction that is a member of the same congruence class will
-// always be replaced with leader during symbolization).
-// To simplify symbolization, we keep the leader as a constant if class can be
-// proved to be a constant value.
-// Otherwise, the leader is a randomly chosen member of the value set, it does
-// not matter which one is chosen.
-// Each congruence class also has a defining expression,
-// though the expression may be null. If it exists, it can be used for forward
-// propagation and reassociation of values.
-//
-struct CongruenceClass {
- using MemberSet = SmallPtrSet<Value *, 4>;
+// Every congruence class has a leader, and the leader is used to symbolize
+// instructions in a canonical way (IE every operand of an instruction that is a
+// member of the same congruence class will always be replaced with leader
+// during symbolization). To simplify symbolization, we keep the leader as a
+// constant if class can be proved to be a constant value. Otherwise, the
+// leader is the member of the value set with the smallest DFS number. Each
+// congruence class also has a defining expression, though the expression may be
+// null. If it exists, it can be used for forward propagation and reassociation
+// of values.
+
+// For memory, we also track a representative MemoryAccess, and a set of memory
+// members for MemoryPhis (which have no real instructions). Note that for
+// memory, it seems tempting to try to split the memory members into a
+// MemoryCongruenceClass or something. Unfortunately, this does not work
+// easily. The value numbering of a given memory expression depends on the
+// leader of the memory congruence class, and the leader of memory congruence
+// class depends on the value numbering of a given memory expression. This
+// leads to wasted propagation, and in some cases, missed optimization. For
+// example: If we had value numbered two stores together before, but now do not,
+// we move them to a new value congruence class. This in turn will move at
+// least one of the memorydefs to a new memory congruence class, which affects
+// the value numbering of the stores we just value numbered (because the memory
+// congruence class is part of the value number). So while theoretically
+// possible to split them up, it turns out to be *incredibly* complicated to get
+// it to work right, because of the interdependency. While structurally
+// slightly messier, it is algorithmically much simpler and faster to do what we
+// do here, and track them both at once in the same class.
+// Note: The default iterators for this class iterate over values
+class CongruenceClass {
+public:
+ using MemberType = Value;
+ using MemberSet = SmallPtrSet<MemberType *, 4>;
+ using MemoryMemberType = MemoryPhi;
+ using MemoryMemberSet = SmallPtrSet<const MemoryMemberType *, 2>;
+
+ explicit CongruenceClass(unsigned ID) : ID(ID) {}
+ CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
+ : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
+ unsigned getID() const { return ID; }
+ // True if this class has no members left. This is mainly used for assertion
+ // purposes, and for skipping empty classes.
+ bool isDead() const {
+ // If it's both dead from a value perspective, and dead from a memory
+ // perspective, it's really dead.
+ return empty() && memory_empty();
+ }
+ // Leader functions
+ Value *getLeader() const { return RepLeader; }
+ void setLeader(Value *Leader) { RepLeader = Leader; }
+ const std::pair<Value *, unsigned int> &getNextLeader() const {
+ return NextLeader;
+ }
+ // Forget the recorded next-leader candidate. The DFS-number slot is reset to
+ // the largest unsigned value (the same ~0U sentinel the member initializer
+ // uses) so that any later candidate compares smaller.
+ void resetNextLeader() { NextLeader = {nullptr, ~0U}; }
+
+ // Record LeaderPair as the next-leader candidate if its DFS number (the
+ // second element) is strictly smaller than the one currently recorded.
+ void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) {
+   const bool IsEarlier = LeaderPair.second < NextLeader.second;
+   if (IsEarlier)
+     NextLeader = LeaderPair;
+ }
+
+ Value *getStoredValue() const { return RepStoredValue; }
+ void setStoredValue(Value *Leader) { RepStoredValue = Leader; }
+ const MemoryAccess *getMemoryLeader() const { return RepMemoryAccess; }
+ void setMemoryLeader(const MemoryAccess *Leader) { RepMemoryAccess = Leader; }
+
+ // Forward propagation info
+ const Expression *getDefiningExpr() const { return DefiningExpr; }
+
+ // Value member set
+ bool empty() const { return Members.empty(); }
+ unsigned size() const { return Members.size(); }
+ MemberSet::const_iterator begin() const { return Members.begin(); }
+ MemberSet::const_iterator end() const { return Members.end(); }
+ void insert(MemberType *M) { Members.insert(M); }
+ void erase(MemberType *M) { Members.erase(M); }
+ void swap(MemberSet &Other) { Members.swap(Other); }
+
+ // Memory member set
+ bool memory_empty() const { return MemoryMembers.empty(); }
+ unsigned memory_size() const { return MemoryMembers.size(); }
+ MemoryMemberSet::const_iterator memory_begin() const {
+ return MemoryMembers.begin();
+ }
+ MemoryMemberSet::const_iterator memory_end() const {
+ return MemoryMembers.end();
+ }
+ iterator_range<MemoryMemberSet::const_iterator> memory() const {
+ return make_range(memory_begin(), memory_end());
+ }
+ void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); }
+ void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); }
+
+ // Store count
+ unsigned getStoreCount() const { return StoreCount; }
+ void incStoreCount() { ++StoreCount; }
+ void decStoreCount() {
+ assert(StoreCount != 0 && "Store count went negative");
+ --StoreCount;
+ }
+
+ // True if this class contains no stores (StoreCount is zero) and has no MemoryPhi members.
+ bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
+
+ // Return true if two congruence classes are equivalent to each other. This
+ // means that the store count, leader, stored value, memory leader, defining
+ // expression and value member set all match; the ID number is ignored.
+ // A null Other is never equivalent.
+ bool isEquivalentTo(const CongruenceClass *Other) const {
+   if (!Other)
+     return false;
+   if (this == Other)
+     return true;
+
+   if (std::tie(StoreCount, RepLeader, RepStoredValue, RepMemoryAccess) !=
+       std::tie(Other->StoreCount, Other->RepLeader, Other->RepStoredValue,
+                Other->RepMemoryAccess))
+     return false;
+   // Defining expressions match if they are the same object, or if both exist
+   // and compare equal by value.
+   if (DefiningExpr != Other->DefiningExpr)
+     if (!DefiningExpr || !Other->DefiningExpr ||
+         *DefiningExpr != *Other->DefiningExpr)
+       return false;
+   // Compare the member sets independently of iteration order. Both sides
+   // must be consulted: equal sizes plus every one of our members being
+   // present in Other is set equality.
+   if (Members.size() != Other->Members.size())
+     return false;
+   for (MemberType *M : Members)
+     if (!Other->Members.count(M))
+       return false;
+   return true;
+ }
+
+private:
unsigned ID;
// Representative leader.
Value *RepLeader = nullptr;
+ // The most dominating leader after our current leader, because the member set
+ // is not sorted and is expensive to keep sorted all the time.
+ std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
+ // If this is represented by a store, the value of the store.
+ Value *RepStoredValue = nullptr;
+ // If this class contains MemoryDefs or MemoryPhis, this is the leading memory
+ // access.
+ const MemoryAccess *RepMemoryAccess = nullptr;
// Defining Expression.
const Expression *DefiningExpr = nullptr;
// Actual members of this class.
MemberSet Members;
-
- // True if this class has no members left. This is mainly used for assertion
- // purposes, and for skipping empty classes.
- bool Dead = false;
-
+ // This is the set of MemoryPhis that exist in the class. MemoryDefs and
+ // MemoryUses have real instructions representing them, so we only need to
+ // track MemoryPhis here.
+ MemoryMemberSet MemoryMembers;
// Number of stores in this congruence class.
// This is used so we can detect store equivalence changes properly.
int StoreCount = 0;
-
- // The most dominating leader after our current leader, because the member set
- // is not sorted and is expensive to keep sorted all the time.
- std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
-
- explicit CongruenceClass(unsigned ID) : ID(ID) {}
- CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
- : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
};
namespace llvm {
+// Thin wrapper around an Expression reference. Its hash is the wrapped
+// expression's computed hash, and comparing it against another Expression
+// forwards to Expression::exactlyEquals.
+struct ExactEqualsExpression {
+  explicit ExactEqualsExpression(const Expression &Expr) : E(Expr) {}
+
+  hash_code getComputedHash() const { return E.getComputedHash(); }
+
+  bool operator==(const Expression &Other) const {
+    return E.exactlyEquals(Other);
+  }
+
+  // The wrapped expression; held by reference, not owned.
+  const Expression &E;
+};
+
template <> struct DenseMapInfo<const Expression *> {
static const Expression *getEmptyKey() {
auto Val = static_cast<uintptr_t>(-1);
@@ -166,51 +398,144 @@ template <> struct DenseMapInfo<const Expression *> {
Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
return reinterpret_cast<const Expression *>(Val);
}
- static unsigned getHashValue(const Expression *V) {
- return static_cast<unsigned>(V->getHashValue());
+ static unsigned getHashValue(const Expression *E) {
+ return E->getComputedHash();
}
+ static unsigned getHashValue(const ExactEqualsExpression &E) {
+ return E.getComputedHash();
+ }
+ // Compare an exact-equality key against a stored key. The tombstone and
+ // empty sentinels are never dereferenced and never compare equal.
+ static bool isEqual(const ExactEqualsExpression &LHS, const Expression *RHS) {
+   const bool IsSentinel = RHS == getTombstoneKey() || RHS == getEmptyKey();
+   return !IsSentinel && LHS == *RHS;
+ }
+
static bool isEqual(const Expression *LHS, const Expression *RHS) {
if (LHS == RHS)
return true;
if (LHS == getTombstoneKey() || RHS == getTombstoneKey() ||
LHS == getEmptyKey() || RHS == getEmptyKey())
return false;
+ // Compare hashes before equality. This is *not* what the hashtable does,
+ // since it is computing it modulo the number of buckets, whereas we are
+ // using the full hash keyspace. Since the hashes are precomputed, this
+ // check is *much* faster than equality.
+ if (LHS->getComputedHash() != RHS->getComputedHash())
+ return false;
return *LHS == *RHS;
}
};
} // end namespace llvm
-class NewGVN : public FunctionPass {
+namespace {
+class NewGVN {
+ Function &F;
DominatorTree *DT;
- const DataLayout *DL;
const TargetLibraryInfo *TLI;
- AssumptionCache *AC;
AliasAnalysis *AA;
MemorySSA *MSSA;
MemorySSAWalker *MSSAWalker;
- BumpPtrAllocator ExpressionAllocator;
- ArrayRecycler<Value *> ArgRecycler;
+ const DataLayout &DL;
+ std::unique_ptr<PredicateInfo> PredInfo;
+
+ // These are the only two things the create* functions should have
+ // side-effects on due to allocating memory.
+ mutable BumpPtrAllocator ExpressionAllocator;
+ mutable ArrayRecycler<Value *> ArgRecycler;
+ mutable TarjanSCC SCCFinder;
+ const SimplifyQuery SQ;
+
+ // Number of function arguments, used by ranking
+ unsigned int NumFuncArgs;
+
+ // RPOOrdering of basic blocks
+ DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
// Congruence class info.
- CongruenceClass *InitialClass;
+
+ // This class is called INITIAL in the paper. It is the class everything
+ // starts out in, and represents any value. Being an optimistic analysis,
+ // anything in the TOP class has the value TOP, which is indeterminate and
+ // equivalent to everything.
+ CongruenceClass *TOPClass;
std::vector<CongruenceClass *> CongruenceClasses;
unsigned NextCongruenceNum;
// Value Mappings.
DenseMap<Value *, CongruenceClass *> ValueToClass;
DenseMap<Value *, const Expression *> ValueToExpression;
+ // Value PHI handling, used to make equivalence between phi(op, op) and
+ // op(phi, phi).
+ // These mappings just store various data that would normally be part of the
+ // IR.
+ DenseSet<const Instruction *> PHINodeUses;
+ // Map a temporary instruction we created to a parent block.
+ DenseMap<const Value *, BasicBlock *> TempToBlock;
+ // Map between the temporary phis we created and the real instructions they
+ // are known equivalent to.
+ DenseMap<const Value *, PHINode *> RealToTemp;
+ // In order to know when we should re-process instructions that have
+ // phi-of-ops, we track the set of expressions that they needed as
+ // leaders. When we discover new leaders for those expressions, we process the
+ // associated phi-of-op instructions again in case they have changed. The
+ // other way they may change is if they had leaders, and those leaders
+ // disappear. However, at the point they have leaders, there are uses of the
+ // relevant operands in the created phi node, and so they will get reprocessed
+ // through the normal user marking we perform.
+ mutable DenseMap<const Value *, SmallPtrSet<Value *, 2>> AdditionalUsers;
+ DenseMap<const Expression *, SmallPtrSet<Instruction *, 2>>
+ ExpressionToPhiOfOps;
+ // Map from basic block to the temporary operations we created
+ DenseMap<const BasicBlock *, SmallVector<PHINode *, 8>> PHIOfOpsPHIs;
+ // Map from temporary operation to MemoryAccess.
+ DenseMap<const Instruction *, MemoryUseOrDef *> TempToMemory;
+ // Set of all temporary instructions we created.
+ DenseSet<Instruction *> AllTempInstructions;
+
+ // Mapping from predicate info we used to the instructions we used it with.
+ // In order to correctly ensure propagation, we must keep track of what
+ // comparisons we used, so that when the values of the comparisons change, we
+ // propagate the information to the places we used the comparison.
+ mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>>
+ PredicateToUsers;
+ // The same reasoning as PredicateToUsers. When we skip MemoryAccesses for
+ // stores, we no longer can rely solely on the def-use chains of MemorySSA.
+ mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>>
+ MemoryToUsers;
// A table storing which memorydefs/phis represent a memory state provably
// equivalent to another memory state.
// We could use the congruence class machinery, but the MemoryAccess's are
// abstract memory states, so they can only ever be equivalent to each other,
// and not to constants, etc.
- DenseMap<const MemoryAccess *, MemoryAccess *> MemoryAccessEquiv;
-
+ DenseMap<const MemoryAccess *, CongruenceClass *> MemoryAccessToClass;
+
+ // We could, if we wanted, build MemoryPhiExpressions and
+ // MemoryVariableExpressions, etc, and value number them the same way we value
+ // number phi expressions. For the moment, this seems like overkill. They
+ // can only exist in one of three states: they can be TOP (equal to
+ // everything), Equivalent to something else, or unique. Because we do not
+ // create expressions for them, we need to simulate leader change not just
+ // when they change class, but when they change state. Note: We can do the
+ // same thing for phis, and avoid having phi expressions if we wanted. We
+ // should eventually unify in one direction or the other, so this is a little
+ // bit of an experiment in which turns out easier to maintain.
+ enum MemoryPhiState { MPS_Invalid, MPS_TOP, MPS_Equivalent, MPS_Unique };
+ DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
+
+ enum InstCycleState { ICS_Unknown, ICS_CycleFree, ICS_Cycle };
+ mutable DenseMap<const Instruction *, InstCycleState> InstCycleState;
// Expression to class mapping.
using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
ExpressionClassMap ExpressionToClass;
+ // We have a single expression that represents currently DeadExpressions.
+ // For dead expressions we can prove will stay dead, we mark them with
+ // DFS number zero. However, it's possible in the case of phi nodes
+ // for us to assume/prove all arguments are dead during fixpointing.
+ // We use DeadExpression for that case.
+ DeadExpression *SingletonDeadExpression = nullptr;
+
// Which values have changed as a result of leader changes.
SmallPtrSet<Value *, 8> LeaderChanges;
@@ -231,8 +556,6 @@ class NewGVN : public FunctionPass {
BitVector TouchedInstructions;
DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
- DenseMap<const DomTreeNode *, std::pair<unsigned, unsigned>>
- DominatedInstRange;
#ifndef NDEBUG
// Debugging for how many times each block and instruction got processed.
@@ -240,56 +563,47 @@ class NewGVN : public FunctionPass {
#endif
// DFS info.
- DenseMap<const BasicBlock *, std::pair<int, int>> DFSDomMap;
+ // This contains a mapping from Instructions to DFS numbers.
+ // The numbering starts at 1. An instruction with DFS number zero
+ // means that the instruction is dead.
DenseMap<const Value *, unsigned> InstrDFS;
+
+ // This contains the mapping DFS numbers to instructions.
SmallVector<Value *, 32> DFSToInstr;
// Deletion info.
SmallPtrSet<Instruction *, 8> InstructionsToErase;
public:
- static char ID; // Pass identification, replacement for typeid.
- NewGVN() : FunctionPass(ID) {
- initializeNewGVNPass(*PassRegistry::getPassRegistry());
+ NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
+ TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
+ const DataLayout &DL)
+ : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL),
+ PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)), SQ(DL, TLI, DT, AC) {
}
-
- bool runOnFunction(Function &F) override;
- bool runGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
- TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA);
+ bool runGVN();
private:
- // This transformation requires dominator postdominator info.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
-
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-
// Expression handling.
- const Expression *createExpression(Instruction *, const BasicBlock *);
- const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *,
- const BasicBlock *);
- PHIExpression *createPHIExpression(Instruction *);
- const VariableExpression *createVariableExpression(Value *);
- const ConstantExpression *createConstantExpression(Constant *);
- const Expression *createVariableOrConstant(Value *V, const BasicBlock *B);
- const UnknownExpression *createUnknownExpression(Instruction *);
- const StoreExpression *createStoreExpression(StoreInst *, MemoryAccess *,
- const BasicBlock *);
+ const Expression *createExpression(Instruction *) const;
+ const Expression *createBinaryExpression(unsigned, Type *, Value *,
+ Value *) const;
+ PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge,
+ bool &OriginalOpsConstant) const;
+ const DeadExpression *createDeadExpression() const;
+ const VariableExpression *createVariableExpression(Value *) const;
+ const ConstantExpression *createConstantExpression(Constant *) const;
+ const Expression *createVariableOrConstant(Value *V) const;
+ const UnknownExpression *createUnknownExpression(Instruction *) const;
+ const StoreExpression *createStoreExpression(StoreInst *,
+ const MemoryAccess *) const;
LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
- MemoryAccess *, const BasicBlock *);
-
- const CallExpression *createCallExpression(CallInst *, MemoryAccess *,
- const BasicBlock *);
+ const MemoryAccess *) const;
+ const CallExpression *createCallExpression(CallInst *,
+ const MemoryAccess *) const;
const AggregateValueExpression *
- createAggregateValueExpression(Instruction *, const BasicBlock *);
- bool setBasicExpressionInfo(Instruction *, BasicExpression *,
- const BasicBlock *);
+ createAggregateValueExpression(Instruction *) const;
+ bool setBasicExpressionInfo(Instruction *, BasicExpression *) const;
// Congruence class handling.
CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
@@ -298,13 +612,28 @@ private:
return result;
}
+ // Create a new congruence class with no value leader and no defining
+ // expression, and make MA its memory leader.
+ CongruenceClass *createMemoryClass(MemoryAccess *MA) {
+   CongruenceClass *NewClass = createCongruenceClass(nullptr, nullptr);
+   NewClass->setMemoryLeader(MA);
+   return NewClass;
+ }
+ // Return a class whose memory leader is MA: MA's current memory class if MA
+ // already leads it, otherwise a freshly created memory class.
+ CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) {
+   CongruenceClass *Existing = getMemoryClass(MA);
+   if (Existing->getMemoryLeader() == MA)
+     return Existing;
+   return createMemoryClass(MA);
+ }
+
CongruenceClass *createSingletonCongruenceClass(Value *Member) {
CongruenceClass *CClass = createCongruenceClass(Member, nullptr);
- CClass->Members.insert(Member);
+ CClass->insert(Member);
ValueToClass[Member] = CClass;
return CClass;
}
void initializeCongruenceClasses(Function &F);
+ const Expression *makePossiblePhiOfOps(Instruction *,
+ SmallPtrSetImpl<Value *> &);
+ void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue);
// Value number an Instruction or MemoryPhi.
void valueNumberMemoryPhi(MemoryPhi *);
@@ -312,78 +641,128 @@ private:
// Symbolic evaluation.
const Expression *checkSimplificationResults(Expression *, Instruction *,
- Value *);
- const Expression *performSymbolicEvaluation(Value *, const BasicBlock *);
- const Expression *performSymbolicLoadEvaluation(Instruction *,
- const BasicBlock *);
- const Expression *performSymbolicStoreEvaluation(Instruction *,
- const BasicBlock *);
- const Expression *performSymbolicCallEvaluation(Instruction *,
- const BasicBlock *);
- const Expression *performSymbolicPHIEvaluation(Instruction *,
- const BasicBlock *);
- bool setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To);
- const Expression *performSymbolicAggrValueEvaluation(Instruction *,
- const BasicBlock *);
+ Value *) const;
+ const Expression *performSymbolicEvaluation(Value *,
+ SmallPtrSetImpl<Value *> &) const;
+ const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
+ Instruction *,
+ MemoryAccess *) const;
+ const Expression *performSymbolicLoadEvaluation(Instruction *) const;
+ const Expression *performSymbolicStoreEvaluation(Instruction *) const;
+ const Expression *performSymbolicCallEvaluation(Instruction *) const;
+ const Expression *performSymbolicPHIEvaluation(Instruction *) const;
+ const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
+ const Expression *performSymbolicCmpEvaluation(Instruction *) const;
+ const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const;
// Congruence finding.
- // Templated to allow them to work both on BB's and BB-edges.
- template <class T>
- Value *lookupOperandLeader(Value *, const User *, const T &) const;
+ bool someEquivalentDominates(const Instruction *, const Instruction *) const;
+ Value *lookupOperandLeader(Value *) const;
void performCongruenceFinding(Instruction *, const Expression *);
- void moveValueToNewCongruenceClass(Instruction *, CongruenceClass *,
- CongruenceClass *);
+ void moveValueToNewCongruenceClass(Instruction *, const Expression *,
+ CongruenceClass *, CongruenceClass *);
+ void moveMemoryToNewCongruenceClass(Instruction *, MemoryAccess *,
+ CongruenceClass *, CongruenceClass *);
+ Value *getNextValueLeader(CongruenceClass *) const;
+ const MemoryAccess *getNextMemoryLeader(CongruenceClass *) const;
+ bool setMemoryClass(const MemoryAccess *From, CongruenceClass *To);
+ CongruenceClass *getMemoryClass(const MemoryAccess *MA) const;
+ const MemoryAccess *lookupMemoryLeader(const MemoryAccess *) const;
+ bool isMemoryAccessTOP(const MemoryAccess *) const;
+
+ // Ranking
+ unsigned int getRank(const Value *) const;
+ bool shouldSwapOperands(const Value *, const Value *) const;
+
// Reachability handling.
void updateReachableEdge(BasicBlock *, BasicBlock *);
void processOutgoingEdges(TerminatorInst *, BasicBlock *);
- bool isOnlyReachableViaThisEdge(const BasicBlockEdge &) const;
- Value *findConditionEquivalence(Value *, BasicBlock *) const;
- MemoryAccess *lookupMemoryAccessEquiv(MemoryAccess *) const;
+ Value *findConditionEquivalence(Value *) const;
// Elimination.
struct ValueDFS;
- void convertDenseToDFSOrdered(CongruenceClass::MemberSet &,
- SmallVectorImpl<ValueDFS> &);
+ void convertClassToDFSOrdered(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &,
+ DenseMap<const Value *, unsigned int> &,
+ SmallPtrSetImpl<Instruction *> &) const;
+ void convertClassToLoadsAndStores(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &) const;
bool eliminateInstructions(Function &);
void replaceInstruction(Instruction *, Value *);
void markInstructionForDeletion(Instruction *);
void deleteInstructionsInBlock(BasicBlock *);
+ Value *findPhiOfOpsLeader(const Expression *E, const BasicBlock *BB) const;
// New instruction creation.
void handleNewInstruction(Instruction *){};
// Various instruction touch utilities
+ template <typename Map, typename KeyType, typename Func>
+ void for_each_found(Map &, const KeyType &, Func);
+ template <typename Map, typename KeyType>
+ void touchAndErase(Map &, const KeyType &);
void markUsersTouched(Value *);
- void markMemoryUsersTouched(MemoryAccess *);
- void markLeaderChangeTouched(CongruenceClass *CC);
+ void markMemoryUsersTouched(const MemoryAccess *);
+ void markMemoryDefTouched(const MemoryAccess *);
+ void markPredicateUsersTouched(Instruction *);
+ void markValueLeaderChangeTouched(CongruenceClass *CC);
+ void markMemoryLeaderChangeTouched(CongruenceClass *CC);
+ void markPhiOfOpsChanged(const Expression *E);
+ void addPredicateUsers(const PredicateBase *, Instruction *) const;
+ void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;
+ void addAdditionalUsers(Value *To, Value *User) const;
+
+ // Main loop of value numbering
+ void iterateTouchedInstructions();
// Utilities.
void cleanupTables();
std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
- void updateProcessedCount(Value *V);
+ void updateProcessedCount(const Value *V);
void verifyMemoryCongruency() const;
- bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;
-};
-
-char NewGVN::ID = 0;
+ void verifyIterationSettled(Function &F);
+ void verifyStoreExpressions() const;
+ bool singleReachablePHIPath(SmallPtrSet<const MemoryAccess *, 8> &,
+ const MemoryAccess *, const MemoryAccess *) const;
+ BasicBlock *getBlockForValue(Value *V) const;
+ void deleteExpression(const Expression *E) const;
+ MemoryUseOrDef *getMemoryAccess(const Instruction *) const;
+ MemoryAccess *getDefiningAccess(const MemoryAccess *) const;
+ MemoryPhi *getMemoryAccess(const BasicBlock *) const;
+ template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
+ unsigned InstrToDFSNum(const Value *V) const {
+ assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
+ return InstrDFS.lookup(V);
+ }
-// createGVNPass - The public interface to this file.
-FunctionPass *llvm::createNewGVNPass() { return new NewGVN(); }
+ unsigned InstrToDFSNum(const MemoryAccess *MA) const {
+ return MemoryToDFSNum(MA);
+ }
+ Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; }
+ // Given a MemoryAccess, return the relevant instruction DFS number. Note:
+ // This deliberately takes a value so it can be used with Use's, which will
+ // auto-convert to Value's but not to MemoryAccess's.
+ unsigned MemoryToDFSNum(const Value *MA) const {
+   assert(isa<MemoryAccess>(MA) &&
+          "This should not be used with instructions");
+   // A MemoryUseOrDef is numbered through the instruction it is attached to.
+   if (const auto *UseOrDef = dyn_cast<MemoryUseOrDef>(MA))
+     return InstrToDFSNum(UseOrDef->getMemoryInst());
+   // Any other access is looked up in InstrDFS directly.
+   return InstrDFS.lookup(MA);
+ }
+ bool isCycleFree(const Instruction *) const;
+ bool isBackedge(BasicBlock *From, BasicBlock *To) const;
+ // Debug counter info. When verifying, we have to reset the value numbering
+ // debug counter to the same state it started in to get the same results.
+ std::pair<int, int> StartingVNCounter;
+};
+} // end anonymous namespace
template <typename T>
static bool equalsLoadStoreHelper(const T &LHS, const Expression &RHS) { // Shared equality check used by the load/store expression equals() overrides.
- if ((!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS)) ||
- !LHS.BasicExpression::equals(RHS)) {
+ if (!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS)) // RHS must itself be a load or a store expression.
return false;
- } else if (const auto *L = dyn_cast<LoadExpression>(&RHS)) {
- if (LHS.getDefiningAccess() != L->getDefiningAccess())
- return false;
- } else if (const auto *S = dyn_cast<StoreExpression>(&RHS)) {
- if (LHS.getDefiningAccess() != S->getDefiningAccess())
- return false;
- }
- return true;
+ return LHS.MemoryExpression::equals(RHS); // Qualified call: always uses the MemoryExpression comparison, not T's own override.
}
bool LoadExpression::equals(const Expression &Other) const {
@@ -391,7 +770,22 @@ bool LoadExpression::equals(const Expression &Other) const {
}
bool StoreExpression::equals(const Expression &Other) const {
- return equalsLoadStoreHelper(*this, Other);
+ if (!equalsLoadStoreHelper(*this, Other))
+ return false;
+ // Make sure that store vs store includes the value operand.
+ if (const auto *S = dyn_cast<StoreExpression>(&Other))
+ if (getStoredValue() != S->getStoredValue())
+ return false;
+ return true;
+}
+
+// Determine if the edge From->To is a backedge. A self-edge always counts;
+// otherwise the edge is a backedge when From's RPOOrdering entry is not
+// smaller than To's.
+bool NewGVN::isBackedge(BasicBlock *From, BasicBlock *To) const {
+  if (From == To)
+    return true;
+  const unsigned FromRPO = RPOOrdering.lookup(DT->getNode(From));
+  const unsigned ToRPO = RPOOrdering.lookup(DT->getNode(To));
+  return FromRPO >= ToRPO;
}
#ifndef NDEBUG
@@ -400,17 +794,45 @@ static std::string getBlockName(const BasicBlock *B) {
}
#endif
-INITIALIZE_PASS_BEGIN(NewGVN, "newgvn", "Global Value Numbering", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(NewGVN, "newgvn", "Global Value Numbering", false, false)
+// Get a MemoryAccess for an instruction, fake or real. MemorySSA is asked
+// first; if it has no access for I, the TempToMemory map is consulted.
+MemoryUseOrDef *NewGVN::getMemoryAccess(const Instruction *I) const {
+  if (auto *Real = MSSA->getMemoryAccess(I))
+    return Real;
+  return TempToMemory.lookup(I);
+}
-PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
- BasicBlock *PHIBlock = I->getParent();
+// Get a MemoryPhi for a basic block. These are all real, so this simply forwards to MemorySSA.
+MemoryPhi *NewGVN::getMemoryAccess(const BasicBlock *BB) const {
+ return MSSA->getMemoryAccess(BB);
+}
+
+// Get the basic block from an instruction/memory value. Instructions that
+// have no parent block are looked up in TempToBlock; any other value is
+// expected to be a MemoryPhi.
+BasicBlock *NewGVN::getBlockForValue(Value *V) const {
+  if (auto *I = dyn_cast<Instruction>(V)) {
+    // Instructions that sit in a block answer directly.
+    if (auto *RealParent = I->getParent())
+      return RealParent;
+    auto *Parent = TempToBlock.lookup(V);
+    assert(Parent && "Every fake instruction should have a block");
+    return Parent;
+  }
+
+  auto *MP = dyn_cast<MemoryPhi>(V);
+  assert(MP && "Should have been an instruction or a MemoryPhi");
+  return MP->getBlock();
+}
+
+// Delete a definitely dead expression, so it can be reused by the expression
+// allocator. Some of these are not in creation functions, so we have to accept
+// const versions. E is expected to be a BasicExpression.
+void NewGVN::deleteExpression(const Expression *E) const {
+  assert(isa<BasicExpression>(E));
+  // Constness is cast away only to hand the operand storage back to the
+  // recycler.
+  auto *Basic = const_cast<BasicExpression *>(cast<BasicExpression>(E));
+  Basic->deallocateOperands(ArgRecycler);
+  ExpressionAllocator.Deallocate(E);
+}
+PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
+ bool &OriginalOpsConstant) const {
+ BasicBlock *PHIBlock = getBlockForValue(I);
auto *PN = cast<PHINode>(I);
auto *E =
new (ExpressionAllocator) PHIExpression(PN->getNumOperands(), PHIBlock);
@@ -419,28 +841,47 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
E->setType(I->getType());
E->setOpcode(I->getOpcode());
- auto ReachablePhiArg = [&](const Use &U) {
- return ReachableBlocks.count(PN->getIncomingBlock(U));
- };
-
- // Filter out unreachable operands
- auto Filtered = make_filter_range(PN->operands(), ReachablePhiArg);
-
+ // NewGVN assumes the operands of a PHI node are in a consistent order across
+ // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix
+ // this in LLVM at some point we don't want GVN to find wrong congruences.
+ // Therefore, here we sort uses in predecessor order.
+ // We're sorting the values by pointer. In theory this might be a cause of
+ // non-determinism, but here we don't rely on the ordering for anything
+ // significant, e.g. we don't create new instructions based on it so we're
+ // fine.
+ SmallVector<const Use *, 4> PHIOperands;
+ for (const Use &U : PN->operands())
+ PHIOperands.push_back(&U);
+ std::sort(PHIOperands.begin(), PHIOperands.end(),
+ [&](const Use *U1, const Use *U2) {
+ return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2);
+ });
+
+ // Filter out unreachable phi operands.
+ auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) {
+ if (*U == PN)
+ return false;
+ if (!ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock}))
+ return false;
+ // Things in TOPClass are equivalent to everything.
+ if (ValueToClass.lookup(*U) == TOPClass)
+ return false;
+ return lookupOperandLeader(*U) != PN;
+ });
std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
- [&](const Use &U) -> Value * {
- // Don't try to transform self-defined phis.
- if (U == PN)
- return PN;
- const BasicBlockEdge BBE(PN->getIncomingBlock(U), PHIBlock);
- return lookupOperandLeader(U, I, BBE);
+ [&](const Use *U) -> Value * {
+ auto *BB = PN->getIncomingBlock(*U);
+ HasBackedge = HasBackedge || isBackedge(BB, PHIBlock);
+ OriginalOpsConstant =
+ OriginalOpsConstant && isa<Constant>(*U);
+ return lookupOperandLeader(*U);
});
return E;
}
// Set basic expression info (Arguments, type, opcode) for Expression
// E from Instruction I in block B.
-bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
- const BasicBlock *B) {
+bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const {
bool AllConstant = true;
if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
E->setType(GEP->getSourceElementType());
@@ -452,8 +893,8 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
// Transform the operand array into an operand leader array, and keep track of
// whether all members are constant.
std::transform(I->op_begin(), I->op_end(), op_inserter(E), [&](Value *O) {
- auto Operand = lookupOperandLeader(O, I, B);
- AllConstant &= isa<Constant>(Operand);
+ auto Operand = lookupOperandLeader(O);
+ AllConstant = AllConstant && isa<Constant>(Operand);
return Operand;
});
@@ -461,8 +902,8 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
}
const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
- Value *Arg1, Value *Arg2,
- const BasicBlock *B) {
+ Value *Arg1,
+ Value *Arg2) const {
auto *E = new (ExpressionAllocator) BasicExpression(2);
E->setType(T);
@@ -473,14 +914,13 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
// of their operands get the same value number by sorting the operand value
// numbers. Since all commutative instructions have two operands it is more
// efficient to sort by hand rather than using, say, std::sort.
- if (Arg1 > Arg2)
+ if (shouldSwapOperands(Arg1, Arg2))
std::swap(Arg1, Arg2);
}
- E->op_push_back(lookupOperandLeader(Arg1, nullptr, B));
- E->op_push_back(lookupOperandLeader(Arg2, nullptr, B));
+ E->op_push_back(lookupOperandLeader(Arg1));
+ E->op_push_back(lookupOperandLeader(Arg2));
- Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), *DL, TLI,
- DT, AC);
+ Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), SQ);
if (const Expression *SimplifiedE = checkSimplificationResults(E, nullptr, V))
return SimplifiedE;
return E;
@@ -492,7 +932,8 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
// TODO: Once finished, this should not take an Instruction, we only
// use it for printing.
const Expression *NewGVN::checkSimplificationResults(Expression *E,
- Instruction *I, Value *V) {
+ Instruction *I,
+ Value *V) const {
if (!V)
return nullptr;
if (auto *C = dyn_cast<Constant>(V)) {
@@ -502,40 +943,40 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
NumGVNOpsSimplified++;
assert(isa<BasicExpression>(E) &&
"We should always have had a basic expression here");
-
- cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
+ deleteExpression(E);
return createConstantExpression(C);
} else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
if (I)
DEBUG(dbgs() << "Simplified " << *I << " to "
<< " variable " << *V << "\n");
- cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
+ deleteExpression(E);
return createVariableExpression(V);
}
CongruenceClass *CC = ValueToClass.lookup(V);
- if (CC && CC->DefiningExpr) {
+ if (CC && CC->getDefiningExpr()) {
+ // If we simplified to something else, we need to communicate
+ // that we're users of the value we simplified to.
+ if (I != V) {
+ // Don't add temporary instructions to the user lists.
+ if (!AllTempInstructions.count(I))
+ addAdditionalUsers(V, I);
+ }
+
if (I)
DEBUG(dbgs() << "Simplified " << *I << " to "
- << " expression " << *V << "\n");
+ << " expression " << *CC->getDefiningExpr() << "\n");
NumGVNOpsSimplified++;
- assert(isa<BasicExpression>(E) &&
- "We should always have had a basic expression here");
- cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
- return CC->DefiningExpr;
+ deleteExpression(E);
+ return CC->getDefiningExpr();
}
return nullptr;
}
-const Expression *NewGVN::createExpression(Instruction *I,
- const BasicBlock *B) {
-
+const Expression *NewGVN::createExpression(Instruction *I) const {
auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
- bool AllConstant = setBasicExpressionInfo(I, E, B);
+ bool AllConstant = setBasicExpressionInfo(I, E);
if (I->isCommutative()) {
// Ensure that commutative instructions that only differ by a permutation
@@ -543,7 +984,7 @@ const Expression *NewGVN::createExpression(Instruction *I,
// numbers. Since all commutative instructions have two operands it is more
// efficient to sort by hand rather than using, say, std::sort.
assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
- if (E->getOperand(0) > E->getOperand(1))
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1)))
E->swapOperands(0, 1);
}
@@ -559,48 +1000,43 @@ const Expression *NewGVN::createExpression(Instruction *I,
// Sort the operand value numbers so x<y and y>x get the same value
// number.
CmpInst::Predicate Predicate = CI->getPredicate();
- if (E->getOperand(0) > E->getOperand(1)) {
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) {
E->swapOperands(0, 1);
Predicate = CmpInst::getSwappedPredicate(Predicate);
}
E->setOpcode((CI->getOpcode() << 8) | Predicate);
// TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands
- // TODO: Since we noop bitcasts, we may need to check types before
- // simplifying, so that we don't end up simplifying based on a wrong
- // type assumption. We should clean this up so we can use constants of the
- // wrong type
-
assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() &&
"Wrong types on cmp instruction");
- if ((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
- E->getOperand(1)->getType() == I->getOperand(1)->getType())) {
- Value *V = SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1),
- *DL, TLI, DT, AC);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- }
+ assert((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
+ E->getOperand(1)->getType() == I->getOperand(1)->getType()));
+ Value *V =
+ SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
} else if (isa<SelectInst>(I)) {
if (isa<Constant>(E->getOperand(0)) ||
- (E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
- E->getOperand(2)->getType() == I->getOperand(2)->getType())) {
+ E->getOperand(0) == E->getOperand(1)) {
+ assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
+ E->getOperand(2)->getType() == I->getOperand(2)->getType());
Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1),
- E->getOperand(2), *DL, TLI, DT, AC);
+ E->getOperand(2), SQ);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
}
} else if (I->isBinaryOp()) {
- Value *V = SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1),
- *DL, TLI, DT, AC);
+ Value *V =
+ SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1), SQ);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
} else if (auto *BI = dyn_cast<BitCastInst>(I)) {
- Value *V = SimplifyInstruction(BI, *DL, TLI, DT, AC);
+ Value *V =
+ SimplifyCastInst(BI->getOpcode(), BI->getOperand(0), BI->getType(), SQ);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
} else if (isa<GetElementPtrInst>(I)) {
- Value *V = SimplifyGEPInst(E->getType(),
- ArrayRef<Value *>(E->op_begin(), E->op_end()),
- *DL, TLI, DT, AC);
+ Value *V = SimplifyGEPInst(
+ E->getType(), ArrayRef<Value *>(E->op_begin(), E->op_end()), SQ);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
} else if (AllConstant) {
@@ -615,7 +1051,7 @@ const Expression *NewGVN::createExpression(Instruction *I,
for (Value *Arg : E->operands())
C.emplace_back(cast<Constant>(Arg));
- if (Value *V = ConstantFoldInstOperands(I, C, *DL, TLI))
+ if (Value *V = ConstantFoldInstOperands(I, C, DL, TLI))
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
}
@@ -623,18 +1059,18 @@ const Expression *NewGVN::createExpression(Instruction *I,
}
const AggregateValueExpression *
-NewGVN::createAggregateValueExpression(Instruction *I, const BasicBlock *B) {
+NewGVN::createAggregateValueExpression(Instruction *I) const {
if (auto *II = dyn_cast<InsertValueInst>(I)) {
auto *E = new (ExpressionAllocator)
AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
- setBasicExpressionInfo(I, E, B);
+ setBasicExpressionInfo(I, E);
E->allocateIntOperands(ExpressionAllocator);
std::copy(II->idx_begin(), II->idx_end(), int_op_inserter(E));
return E;
} else if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
auto *E = new (ExpressionAllocator)
AggregateValueExpression(I->getNumOperands(), EI->getNumIndices());
- setBasicExpressionInfo(EI, E, B);
+ setBasicExpressionInfo(EI, E);
E->allocateIntOperands(ExpressionAllocator);
std::copy(EI->idx_begin(), EI->idx_end(), int_op_inserter(E));
return E;
@@ -642,67 +1078,120 @@ NewGVN::createAggregateValueExpression(Instruction *I, const BasicBlock *B) {
llvm_unreachable("Unhandled type of aggregate value operation");
}
-const VariableExpression *NewGVN::createVariableExpression(Value *V) {
+const DeadExpression *NewGVN::createDeadExpression() const {
+ // DeadExpression has no arguments and all DeadExpression's are the same,
+ // so we only need one of them.
+ return SingletonDeadExpression;
+}
+
+const VariableExpression *NewGVN::createVariableExpression(Value *V) const {
auto *E = new (ExpressionAllocator) VariableExpression(V);
E->setOpcode(V->getValueID());
return E;
}
-const Expression *NewGVN::createVariableOrConstant(Value *V,
- const BasicBlock *B) {
- auto Leader = lookupOperandLeader(V, nullptr, B);
- if (auto *C = dyn_cast<Constant>(Leader))
+const Expression *NewGVN::createVariableOrConstant(Value *V) const {
+ if (auto *C = dyn_cast<Constant>(V))
return createConstantExpression(C);
- return createVariableExpression(Leader);
+ return createVariableExpression(V);
}
-const ConstantExpression *NewGVN::createConstantExpression(Constant *C) {
+const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const {
auto *E = new (ExpressionAllocator) ConstantExpression(C);
E->setOpcode(C->getValueID());
return E;
}
-const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) {
+const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const {
auto *E = new (ExpressionAllocator) UnknownExpression(I);
E->setOpcode(I->getOpcode());
return E;
}
-const CallExpression *NewGVN::createCallExpression(CallInst *CI,
- MemoryAccess *HV,
- const BasicBlock *B) {
+const CallExpression *
+NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {
// FIXME: Add operand bundles for calls.
auto *E =
- new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, HV);
- setBasicExpressionInfo(CI, E, B);
+ new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
+ setBasicExpressionInfo(CI, E);
return E;
}
+// Return true if some equivalent of instruction Inst dominates instruction U.
+bool NewGVN::someEquivalentDominates(const Instruction *Inst,
+ const Instruction *U) const {
+ auto *CC = ValueToClass.lookup(Inst);
+ // This must be an instruction because we are only called from phi nodes
+ // in the case that the value it needs to check against is an instruction.
+
+ // The most likely candidates for dominance are the leader and the next leader.
+ // The leader or nextleader will dominate in all cases where there is an
+ // equivalent that is higher up in the dom tree.
+ // We can't *only* check them, however, because the
+ // dominator tree could have an infinite number of non-dominating siblings
+ // with instructions that are in the right congruence class.
+ // A
+ // B C D E F G
+ // |
+ // H
+ // Instruction U could be in H, with equivalents in every other sibling.
+ // Depending on the rpo order picked, the leader could be the equivalent in
+ // any of these siblings.
+ if (!CC)
+ return false;
+ if (DT->dominates(cast<Instruction>(CC->getLeader()), U))
+ return true;
+ if (CC->getNextLeader().first &&
+ DT->dominates(cast<Instruction>(CC->getNextLeader().first), U))
+ return true;
+ return llvm::any_of(*CC, [&](const Value *Member) {
+ return Member != CC->getLeader() &&
+ DT->dominates(cast<Instruction>(Member), U);
+ });
+}
+
// See if we have a congruence class and leader for this operand, and if so,
// return it. Otherwise, return the operand itself.
-template <class T>
-Value *NewGVN::lookupOperandLeader(Value *V, const User *U, const T &B) const {
+Value *NewGVN::lookupOperandLeader(Value *V) const {
CongruenceClass *CC = ValueToClass.lookup(V);
- if (CC && (CC != InitialClass))
- return CC->RepLeader;
+ if (CC) {
+ // Everything in TOP is represented by undef, as it can be any value.
+ // We do have to make sure we get the type right though, so we can't set the
+ // RepLeader to undef.
+ if (CC == TOPClass)
+ return UndefValue::get(V->getType());
+ return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ }
+
return V;
}
-MemoryAccess *NewGVN::lookupMemoryAccessEquiv(MemoryAccess *MA) const {
- MemoryAccess *Result = MemoryAccessEquiv.lookup(MA);
- return Result ? Result : MA;
+const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
+ auto *CC = getMemoryClass(MA);
+ assert(CC->getMemoryLeader() &&
+ "Every MemoryAccess should be mapped to a congruence class with a "
+ "representative memory access");
+ return CC->getMemoryLeader();
+}
+
+// Return true if the MemoryAccess is really equivalent to everything. This is
+// equivalent to the lattice value "TOP" in most lattices. This is the initial
+// state of all MemoryAccesses.
+bool NewGVN::isMemoryAccessTOP(const MemoryAccess *MA) const {
+ return getMemoryClass(MA) == TOPClass;
}
LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
- LoadInst *LI, MemoryAccess *DA,
- const BasicBlock *B) {
- auto *E = new (ExpressionAllocator) LoadExpression(1, LI, DA);
+ LoadInst *LI,
+ const MemoryAccess *MA) const {
+ auto *E =
+ new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
E->allocateOperands(ArgRecycler, ExpressionAllocator);
E->setType(LoadType);
// Give store and loads same opcode so they value number together.
E->setOpcode(0);
- E->op_push_back(lookupOperandLeader(PointerOp, LI, B));
+ E->op_push_back(PointerOp);
if (LI)
E->setAlignment(LI->getAlignment());
@@ -712,17 +1201,17 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
return E;
}
-const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
- MemoryAccess *DA,
- const BasicBlock *B) {
- auto *E =
- new (ExpressionAllocator) StoreExpression(SI->getNumOperands(), SI, DA);
+const StoreExpression *
+NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const {
+ auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
+ auto *E = new (ExpressionAllocator)
+ StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
E->allocateOperands(ArgRecycler, ExpressionAllocator);
E->setType(SI->getValueOperand()->getType());
// Give store and loads same opcode so they value number together.
E->setOpcode(0);
- E->op_push_back(lookupOperandLeader(SI->getPointerOperand(), SI, B));
+ E->op_push_back(lookupOperandLeader(SI->getPointerOperand()));
// TODO: Value number heap versions. We may be able to discover
// things alias analysis can't on it's own (IE that a store and a
@@ -730,44 +1219,136 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
return E;
}
-// Utility function to check whether the congruence class has a member other
-// than the given instruction.
-bool hasMemberOtherThanUs(const CongruenceClass *CC, Instruction *I) {
- // Either it has more than one store, in which case it must contain something
- // other than us (because it's indexed by value), or if it only has one store
- // right now, that member should not be us.
- return CC->StoreCount > 1 || CC->Members.count(I) == 0;
-}
-
-const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I,
- const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
// Unlike loads, we never try to eliminate stores, so we do not check if they
// are simple and avoid value numbering them.
auto *SI = cast<StoreInst>(I);
- MemoryAccess *StoreAccess = MSSA->getMemoryAccess(SI);
- // See if we are defined by a previous store expression, it already has a
- // value, and it's the same value as our current store. FIXME: Right now, we
- // only do this for simple stores, we should expand to cover memcpys, etc.
+ auto *StoreAccess = getMemoryAccess(SI);
+ // Get the expression, if any, for the RHS of the MemoryDef.
+ const MemoryAccess *StoreRHS = StoreAccess->getDefiningAccess();
+ if (EnableStoreRefinement)
+ StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess);
+ // If we bypassed the use-def chains, make sure we add a use.
+ if (StoreRHS != StoreAccess->getDefiningAccess())
+ addMemoryUsers(StoreRHS, StoreAccess);
+ StoreRHS = lookupMemoryLeader(StoreRHS);
+ // If we are defined by ourselves, use the live on entry def.
+ if (StoreRHS == StoreAccess)
+ StoreRHS = MSSA->getLiveOnEntryDef();
+
if (SI->isSimple()) {
- // Get the expression, if any, for the RHS of the MemoryDef.
- MemoryAccess *StoreRHS = lookupMemoryAccessEquiv(
- cast<MemoryDef>(StoreAccess)->getDefiningAccess());
- const Expression *OldStore = createStoreExpression(SI, StoreRHS, B);
- CongruenceClass *CC = ExpressionToClass.lookup(OldStore);
- // Basically, check if the congruence class the store is in is defined by a
- // store that isn't us, and has the same value. MemorySSA takes care of
- // ensuring the store has the same memory state as us already.
- if (CC && CC->DefiningExpr && isa<StoreExpression>(CC->DefiningExpr) &&
- CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B) &&
- hasMemberOtherThanUs(CC, I))
- return createStoreExpression(SI, StoreRHS, B);
+ // See if we are defined by a previous store expression, it already has a
+ // value, and it's the same value as our current store. FIXME: Right now, we
+ // only do this for simple stores, we should expand to cover memcpys, etc.
+ const auto *LastStore = createStoreExpression(SI, StoreRHS);
+ const auto *LastCC = ExpressionToClass.lookup(LastStore);
+ // We really want to check whether the expression we matched was a store. No
+ // easy way to do that. However, we can check that the class we found has a
+ // store, which, assuming the value numbering state is not corrupt, is
+ // sufficient, because we must also be equivalent to that store's expression
+ // for it to be in the same class as the load.
+ if (LastCC && LastCC->getStoredValue() == LastStore->getStoredValue())
+ return LastStore;
+ // Also check if our value operand is defined by a load of the same memory
+ // location, and the memory state is the same as it was then (otherwise, it
+ // could have been overwritten later. See test32 in
+ // transforms/DeadStoreElimination/simple.ll).
+ if (auto *LI = dyn_cast<LoadInst>(LastStore->getStoredValue()))
+ if ((lookupOperandLeader(LI->getPointerOperand()) ==
+ LastStore->getOperand(0)) &&
+ (lookupMemoryLeader(getMemoryAccess(LI)->getDefiningAccess()) ==
+ StoreRHS))
+ return LastStore;
+ deleteExpression(LastStore);
+ }
+
+ // If the store is not equivalent to anything, value number it as a store that
+ // produces a unique memory state (instead of using its MemoryUse, we use
+ // its MemoryDef).
+ return createStoreExpression(SI, StoreAccess);
+}
+
+// See if we can extract the value of a loaded pointer from a load, a store, or
+// a memory instruction.
+const Expression *
+NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
+ LoadInst *LI, Instruction *DepInst,
+ MemoryAccess *DefiningAccess) const {
+ assert((!LI || LI->isSimple()) && "Not a simple load");
+ if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ // Also don't need to coerce if they are the same type, we will just
+ // propagate.
+ if (LI->isAtomic() > DepSI->isAtomic() ||
+ LoadType == DepSI->getValueOperand()->getType())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingStore(LoadType, LoadPtr, DepSI, DL);
+ if (Offset >= 0) {
+ if (auto *C = dyn_cast<Constant>(
+ lookupOperandLeader(DepSI->getValueOperand()))) {
+ DEBUG(dbgs() << "Coercing load from store " << *DepSI << " to constant "
+ << *C << "\n");
+ return createConstantExpression(
+ getConstantStoreValueForLoad(C, Offset, LoadType, DL));
+ }
+ }
+
+ } else if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (LI->isAtomic() > DepLI->isAtomic())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL);
+ if (Offset >= 0) {
+ // We can coerce a constant load into a load
+ if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
+ if (auto *PossibleConstant =
+ getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
+ DEBUG(dbgs() << "Coercing load from load " << *LI << " to constant "
+ << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+
+ } else if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
+ int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL);
+ if (Offset >= 0) {
+ if (auto *PossibleConstant =
+ getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) {
+ DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
+ << " to constant " << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+ }
+
+ // All of the below are only true if the loaded pointer is produced
+ // by the dependent instruction.
+ if (LoadPtr != lookupOperandLeader(DepInst) &&
+ !AA->isMustAlias(LoadPtr, DepInst))
+ return nullptr;
+ // If this load really doesn't depend on anything, then we must be loading an
+ // undef value. This can happen when loading for a fresh allocation with no
+ // intervening stores, for example. Note that this is only true in the case
+ // that the result of the allocation is pointer equal to the load ptr.
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load occurs right after a lifetime begin,
+ // then the loaded value is undefined.
+ else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load follows a calloc (which zero initializes memory),
+ // then the loaded value is zero
+ else if (isCallocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(Constant::getNullValue(LoadType));
}
- return createStoreExpression(SI, StoreAccess, B);
+ return nullptr;
}
-const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
- const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
auto *LI = cast<LoadInst>(I);
// We can eliminate in favor of non-simple loads, but we won't be able to
@@ -775,12 +1356,13 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
if (!LI->isSimple())
return nullptr;
- Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand(), I, B);
+ Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand());
// Load of undef is undef.
if (isa<UndefValue>(LoadAddressLeader))
return createConstantExpression(UndefValue::get(LI->getType()));
-
- MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(I);
+ MemoryAccess *OriginalAccess = getMemoryAccess(I);
+ MemoryAccess *DefiningAccess =
+ MSSAWalker->getClobberingMemoryAccess(OriginalAccess);
if (!MSSA->isLiveOnEntryDef(DefiningAccess)) {
if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) {
@@ -788,88 +1370,263 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
// If the defining instruction is not reachable, replace with undef.
if (!ReachableBlocks.count(DefiningInst->getParent()))
return createConstantExpression(UndefValue::get(LI->getType()));
+ // This will handle stores and memory insts. We only do it if the
+ // defining access has a different type, or it is a pointer produced by
+ // certain memory operations that cause the memory to have a fixed value
+ // (IE things like calloc).
+ if (const auto *CoercionResult =
+ performSymbolicLoadCoercion(LI->getType(), LoadAddressLeader, LI,
+ DefiningInst, DefiningAccess))
+ return CoercionResult;
}
}
- const Expression *E =
- createLoadExpression(LI->getType(), LI->getPointerOperand(), LI,
- lookupMemoryAccessEquiv(DefiningAccess), B);
+ const Expression *E = createLoadExpression(LI->getType(), LoadAddressLeader,
+ LI, DefiningAccess);
return E;
}
+const Expression *
+NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
+ auto *PI = PredInfo->getPredicateInfoFor(I);
+ if (!PI)
+ return nullptr;
+
+ DEBUG(dbgs() << "Found predicate info from instruction !\n");
+
+ auto *PWC = dyn_cast<PredicateWithCondition>(PI);
+ if (!PWC)
+ return nullptr;
+
+ auto *CopyOf = I->getOperand(0);
+ auto *Cond = PWC->Condition;
+
+ // If this is a copy of the condition, it must be either true or false depending
+ // on the predicate info type and edge
+ if (CopyOf == Cond) {
+ // We should not need to add predicate users because the predicate info is
+ // already a use of this operand.
+ if (isa<PredicateAssume>(PI))
+ return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
+ if (auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+ if (PBranch->TrueEdge)
+ return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
+ return createConstantExpression(ConstantInt::getFalse(Cond->getType()));
+ }
+ if (auto *PSwitch = dyn_cast<PredicateSwitch>(PI))
+ return createConstantExpression(cast<Constant>(PSwitch->CaseValue));
+ }
+
+ // Not a copy of the condition, so see what the predicates tell us about this
+ // value. First, though, we check to make sure the value is actually a copy
+ // of one of the condition operands. It's possible, in certain cases, for it
+ // to be a copy of a predicateinfo copy. In particular, if two branch
+ // operations use the same condition, and one branch dominates the other, we
+ // will end up with a copy of a copy. This is currently a small deficiency in
+ // predicateinfo. What will end up happening here is that we will value
+ // number both copies the same anyway.
+
+ // Everything below relies on the condition being a comparison.
+ auto *Cmp = dyn_cast<CmpInst>(Cond);
+ if (!Cmp)
+ return nullptr;
+
+ if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
+ DEBUG(dbgs() << "Copy is not of any condition operands!\n");
+ return nullptr;
+ }
+ Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
+ Value *SecondOp = lookupOperandLeader(Cmp->getOperand(1));
+ bool SwappedOps = false;
+ // Sort the ops
+ if (shouldSwapOperands(FirstOp, SecondOp)) {
+ std::swap(FirstOp, SecondOp);
+ SwappedOps = true;
+ }
+ CmpInst::Predicate Predicate =
+ SwappedOps ? Cmp->getSwappedPredicate() : Cmp->getPredicate();
+
+ if (isa<PredicateAssume>(PI)) {
+ // If the comparison is true when the operands are equal, then we know the
+ // operands are equal, because assumes must always be true.
+ if (CmpInst::isTrueWhenEqual(Predicate)) {
+ addPredicateUsers(PI, I);
+ addAdditionalUsers(Cmp->getOperand(0), I);
+ return createVariableOrConstant(FirstOp);
+ }
+ }
+ if (const auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+ // If we are *not* a copy of the comparison, we may be equal to the other
+ // operand when the predicate implies something about equality of
+ // operations. In particular, if the comparison is true/false when the
+ // operands are equal, and we are on the right edge, we know this operation
+ // is equal to something.
+ if ((PBranch->TrueEdge && Predicate == CmpInst::ICMP_EQ) ||
+ (!PBranch->TrueEdge && Predicate == CmpInst::ICMP_NE)) {
+ addPredicateUsers(PI, I);
+ addAdditionalUsers(Cmp->getOperand(0), I);
+ return createVariableOrConstant(FirstOp);
+ }
+ // Handle the special case of floating point.
+ if (((PBranch->TrueEdge && Predicate == CmpInst::FCMP_OEQ) ||
+ (!PBranch->TrueEdge && Predicate == CmpInst::FCMP_UNE)) &&
+ isa<ConstantFP>(FirstOp) && !cast<ConstantFP>(FirstOp)->isZero()) {
+ addPredicateUsers(PI, I);
+ addAdditionalUsers(Cmp->getOperand(0), I);
+ return createConstantExpression(cast<Constant>(FirstOp));
+ }
+ }
+ return nullptr;
+}
+
// Evaluate read only and pure calls, and create an expression result.
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I,
- const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
auto *CI = cast<CallInst>(I);
- if (AA->doesNotAccessMemory(CI))
- return createCallExpression(CI, nullptr, B);
- if (AA->onlyReadsMemory(CI)) {
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ // Intrinsics with the returned attribute are copies of arguments.
+ if (auto *ReturnedValue = II->getReturnedArgOperand()) {
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy)
+ if (const auto *Result = performSymbolicPredicateInfoEvaluation(I))
+ return Result;
+ return createVariableOrConstant(ReturnedValue);
+ }
+ }
+ if (AA->doesNotAccessMemory(CI)) {
+ return createCallExpression(CI, TOPClass->getMemoryLeader());
+ } else if (AA->onlyReadsMemory(CI)) {
MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(CI);
- return createCallExpression(CI, lookupMemoryAccessEquiv(DefiningAccess), B);
+ return createCallExpression(CI, DefiningAccess);
}
return nullptr;
}
-// Update the memory access equivalence table to say that From is equal to To,
+// Retrieve the memory class for a given MemoryAccess.
+CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const {
+
+ auto *Result = MemoryAccessToClass.lookup(MA);
+ assert(Result && "Should have found memory class");
+ return Result;
+}
+
+// Update the MemoryAccess equivalence table to say that From belongs to NewClass,
// and return true if this is different from what already existed in the table.
-bool NewGVN::setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To) {
- DEBUG(dbgs() << "Setting " << *From << " equivalent to ");
- if (!To)
- DEBUG(dbgs() << "itself");
- else
- DEBUG(dbgs() << *To);
- DEBUG(dbgs() << "\n");
- auto LookupResult = MemoryAccessEquiv.find(From);
+bool NewGVN::setMemoryClass(const MemoryAccess *From,
+ CongruenceClass *NewClass) {
+ assert(NewClass &&
+ "Every MemoryAccess should be getting mapped to a non-null class");
+ DEBUG(dbgs() << "Setting " << *From);
+ DEBUG(dbgs() << " equivalent to congruence class ");
+ DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader ");
+ DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
+
+ auto LookupResult = MemoryAccessToClass.find(From);
bool Changed = false;
// If it's already in the table, see if the value changed.
- if (LookupResult != MemoryAccessEquiv.end()) {
- if (To && LookupResult->second != To) {
+ if (LookupResult != MemoryAccessToClass.end()) {
+ auto *OldClass = LookupResult->second;
+ if (OldClass != NewClass) {
+ // If this is a phi, we have to handle memory member updates.
+ if (auto *MP = dyn_cast<MemoryPhi>(From)) {
+ OldClass->memory_erase(MP);
+ NewClass->memory_insert(MP);
+ // This may have killed the class if it had no non-memory members
+ if (OldClass->getMemoryLeader() == From) {
+ if (OldClass->definesNoMemory()) {
+ OldClass->setMemoryLeader(nullptr);
+ } else {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of a memory member " << *From
+ << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ }
+ }
+ }
// It wasn't equivalent before, and now it is.
- LookupResult->second = To;
- Changed = true;
- } else if (!To) {
- // It used to be equivalent to something, and now it's not.
- MemoryAccessEquiv.erase(LookupResult);
+ LookupResult->second = NewClass;
Changed = true;
}
- } else {
- assert(!To &&
- "Memory equivalence should never change from nothing to something");
}
return Changed;
}
+
+// Determine if an instruction is cycle-free. That means the values in the
+// instruction don't depend on any expressions that can change value as a
+// result of the instruction. For example, a non-cycle-free instruction would
+// be v = phi(0, v+1).
+bool NewGVN::isCycleFree(const Instruction *I) const {
+  // Answer from the cache when this instruction has already been classified.
+  auto State = InstCycleState.lookup(I);
+  if (State != ICS_Unknown)
+    return State != ICS_Cycle;
+
+  // Otherwise run SCC finding on the instruction and see what kind of SCC it
+  // ends up in.
+  SCCFinder.Start(I);
+  auto &Component = SCCFinder.getComponentFor(I);
+
+  // A singleton SCC is cycle-free by definition.
+  if (Component.size() == 1) {
+    InstCycleState.insert({I, ICS_CycleFree});
+    return true;
+  }
+
+  // A larger SCC is cycle-free only if it is made up *only* of phi nodes (as
+  // they do not compute anything, they are copies).
+  const bool OnlyPhis = llvm::all_of(
+      Component, [](const Value *V) { return isa<PHINode>(V); });
+  State = OnlyPhis ? ICS_CycleFree : ICS_Cycle;
+  // Cache the verdict for every phi member of the component.
+  for (auto *Member : Component)
+    if (auto *Phi = dyn_cast<PHINode>(Member))
+      InstCycleState.insert({Phi, State});
+  return State != ICS_Cycle;
+}
+
// Evaluate PHI nodes symbolically, and create an expression result.
-const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
- const BasicBlock *B) {
- auto *E = cast<PHIExpression>(createPHIExpression(I));
+const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
+ // True if one of the incoming phi edges is a backedge.
+ bool HasBackedge = false;
+ // AllConstant tracks whether all of the *original* phi operands are
+ // constants. This is really shorthand for "this phi cannot cycle": a forward
+ // change in the value of the phi is guaranteed not to later change the value
+ // of the phi. IE it can't be v = phi(undef, v+1)
+ bool AllConstant = true;
+ auto *E =
+ cast<PHIExpression>(createPHIExpression(I, HasBackedge, AllConstant));
// We match the semantics of SimplifyPhiNode from InstructionSimplify here.
-
- // See if all arguaments are the same.
+ // See if all arguments are the same.
// We track if any were undef because they need special handling.
bool HasUndef = false;
- auto Filtered = make_filter_range(E->operands(), [&](const Value *Arg) {
- if (Arg == I)
- return false;
+ auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) {
if (isa<UndefValue>(Arg)) {
HasUndef = true;
return false;
}
return true;
});
- // If we are left with no operands, it's undef
+ // If we are left with no operands, it's dead.
if (Filtered.begin() == Filtered.end()) {
- DEBUG(dbgs() << "Simplified PHI node " << *I << " to undef"
- << "\n");
- E->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
- return createConstantExpression(UndefValue::get(I->getType()));
+ // If it has undef at this point, it means there are no-non-undef arguments,
+ // and thus, the value of the phi node must be undef.
+ if (HasUndef) {
+ DEBUG(dbgs() << "PHI Node " << *I
+ << " has no non-undef arguments, valuing it as undef\n");
+ return createConstantExpression(UndefValue::get(I->getType()));
+ }
+
+ DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
+ deleteExpression(E);
+ return createDeadExpression();
}
+ unsigned NumOps = 0;
Value *AllSameValue = *(Filtered.begin());
++Filtered.begin();
// Can't use std::equal here, sadly, because filter.begin moves.
- if (llvm::all_of(Filtered, [AllSameValue](const Value *V) {
- return V == AllSameValue;
+ if (llvm::all_of(Filtered, [&](Value *Arg) {
+ ++NumOps;
+ return Arg == AllSameValue;
})) {
// In LLVM's non-standard representation of phi nodes, it's possible to have
// phi nodes with cycles (IE dependent on other phis that are .... dependent
@@ -881,27 +1638,38 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
// We also special case undef, so that if we have an undef, we can't use the
// common value unless it dominates the phi block.
if (HasUndef) {
+ // If we have undef and at least one other value, this is really a
+ // multivalued phi, and we need to know if it's cycle free in order to
+ // evaluate whether we can ignore the undef. The other parts of this are
+ // just shortcuts. If there is no backedge, or all operands are
+ // constants, or all operands are ignored but the undef, it also must be
+ // cycle free.
+ if (!AllConstant && HasBackedge && NumOps > 0 &&
+ !isa<UndefValue>(AllSameValue) && !isCycleFree(I))
+ return E;
+
// Only have to check for instructions
if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
- if (!DT->dominates(AllSameInst, I))
+ if (!someEquivalentDominates(AllSameInst, I))
return E;
}
-
+ // Can't simplify to something that comes later in the iteration.
+ // Otherwise, when and if it changes congruence class, we will never catch
+ // up. We will always be a class behind it.
+ if (isa<Instruction>(AllSameValue) &&
+ InstrToDFSNum(AllSameValue) > InstrToDFSNum(I))
+ return E;
NumGVNPhisAllSame++;
DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
<< "\n");
- E->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
- if (auto *C = dyn_cast<Constant>(AllSameValue))
- return createConstantExpression(C);
- return createVariableExpression(AllSameValue);
+ deleteExpression(E);
+ return createVariableOrConstant(AllSameValue);
}
return E;
}
const Expression *
-NewGVN::performSymbolicAggrValueEvaluation(Instruction *I,
- const BasicBlock *B) {
+NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const {
if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
@@ -931,19 +1699,140 @@ NewGVN::performSymbolicAggrValueEvaluation(Instruction *I,
// expression.
assert(II->getNumArgOperands() == 2 &&
"Expect two args for recognised intrinsics.");
- return createBinaryExpression(Opcode, EI->getType(),
- II->getArgOperand(0),
- II->getArgOperand(1), B);
+ return createBinaryExpression(
+ Opcode, EI->getType(), II->getArgOperand(0), II->getArgOperand(1));
}
}
}
- return createAggregateValueExpression(I, B);
+ return createAggregateValueExpression(I);
+}
+// Symbolically evaluate the comparison instruction I. In order, this tries:
+// an assume attached to the comparison itself, equality of the two operand
+// leaders, and branch predicates attached to the operands. If none of these
+// decides the result, it falls back to createExpression(I).
+const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
+  // CI is dereferenced unconditionally below, so use the asserting cast<>
+  // instead of dyn_cast<>, which would hand back null for a non-compare.
+  auto *CI = cast<CmpInst>(I);
+  // See if our operands are equal to those of a previous predicate, and if so,
+  // if it implies true or false.
+  auto Op0 = lookupOperandLeader(CI->getOperand(0));
+  auto Op1 = lookupOperandLeader(CI->getOperand(1));
+  auto OurPredicate = CI->getPredicate();
+  if (shouldSwapOperands(Op0, Op1)) {
+    std::swap(Op0, Op1);
+    OurPredicate = CI->getSwappedPredicate();
+  }
+
+  // Avoid processing the same info twice
+  const PredicateBase *LastPredInfo = nullptr;
+  // See if we know something about the comparison itself, like it is the target
+  // of an assume.
+  auto *CmpPI = PredInfo->getPredicateInfoFor(I);
+  if (dyn_cast_or_null<PredicateAssume>(CmpPI))
+    return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+
+  // See if we know something just from the operands themselves.
+  if (Op0 == Op1) {
+    // This condition does not depend on predicates, no need to add users
+    if (CI->isTrueWhenEqual())
+      return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+    else if (CI->isFalseWhenEqual())
+      return createConstantExpression(ConstantInt::getFalse(CI->getType()));
+  }
+
+  // NOTE: Because we are comparing both operands here and below, and using
+  // previous comparisons, we rely on the fact that predicateinfo knows to mark
+  // comparisons that use renamed operands as users of the earlier comparisons.
+  // It is *not* enough to just mark predicateinfo renamed operands as users of
+  // the earlier comparisons, because the *other* operand may have changed in a
+  // previous iteration.
+  // Example:
+  // icmp slt %a, %b
+  // %b.0 = ssa.copy(%b)
+  // false branch:
+  // icmp slt %c, %b.0
+
+  // %c and %a may start out equal, and thus, the code below will say the second
+  // icmp is false. %c may become equal to something else, and in that case the
+  // second icmp *must* be reexamined, but would not if only the renamed
+  // operands are considered users of the icmp.
+
+  // *Currently* we only check one level of comparisons back, and only mark one
+  // level back as touched when changes happen. If you modify this code to look
+  // back farther through comparisons, you *must* mark the appropriate
+  // comparisons as users in PredicateInfo.cpp, or you will cause bugs.
+
+  // See if our operands have predicate info, so that we may be able to derive
+  // something from a previous comparison.
+  for (const auto &Op : CI->operands()) {
+    auto *PI = PredInfo->getPredicateInfoFor(Op);
+    if (const auto *PBranch = dyn_cast_or_null<PredicateBranch>(PI)) {
+      if (PI == LastPredInfo)
+        continue;
+      LastPredInfo = PI;
+
+      // TODO: Along the false edge, we may know more things too, like icmp of
+      // same operands is false.
+      // TODO: We only handle actual comparison conditions below, not and/or.
+      auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition);
+      if (!BranchCond)
+        continue;
+      auto *BranchOp0 = lookupOperandLeader(BranchCond->getOperand(0));
+      auto *BranchOp1 = lookupOperandLeader(BranchCond->getOperand(1));
+      auto BranchPredicate = BranchCond->getPredicate();
+      if (shouldSwapOperands(BranchOp0, BranchOp1)) {
+        std::swap(BranchOp0, BranchOp1);
+        BranchPredicate = BranchCond->getSwappedPredicate();
+      }
+      if (BranchOp0 == Op0 && BranchOp1 == Op1) {
+        if (PBranch->TrueEdge) {
+          // If we know the previous predicate is true and we are in the true
+          // edge then we may be implied true or false.
+          if (CmpInst::isImpliedTrueByMatchingCmp(BranchPredicate,
+                                                  OurPredicate)) {
+            addPredicateUsers(PI, I);
+            return createConstantExpression(
+                ConstantInt::getTrue(CI->getType()));
+          }
+
+          if (CmpInst::isImpliedFalseByMatchingCmp(BranchPredicate,
+                                                   OurPredicate)) {
+            addPredicateUsers(PI, I);
+            return createConstantExpression(
+                ConstantInt::getFalse(CI->getType()));
+          }
+
+        } else {
+          // Just handle the ne and eq cases, where if we have the same
+          // operands, we may know something.
+          if (BranchPredicate == OurPredicate) {
+            addPredicateUsers(PI, I);
+            // Same predicate, same ops, we know it was false, so this is false.
+            return createConstantExpression(
+                ConstantInt::getFalse(CI->getType()));
+          } else if (BranchPredicate ==
+                     CmpInst::getInversePredicate(OurPredicate)) {
+            addPredicateUsers(PI, I);
+            // Inverse predicate, we know the other was false, so this is true.
+            return createConstantExpression(
+                ConstantInt::getTrue(CI->getType()));
+          }
+        }
+      }
+    }
+  }
+  // Create expression will take care of simplifyCmpInst
+  return createExpression(I);
+}
+
+// Return true if V is a value that will always be available (IE can
+// be placed anywhere) in the function: an Argument or a Constant.
+// We don't do globals here because they are often worse to put in place.
+// TODO: Separate cost from availability
+static bool alwaysAvailable(Value *V) {
+  if (isa<Argument>(V))
+    return true;
+  return isa<Constant>(V);
}
// Substitute and symbolize the value before value numbering.
-const Expression *NewGVN::performSymbolicEvaluation(Value *V,
- const BasicBlock *B) {
+const Expression *
+NewGVN::performSymbolicEvaluation(Value *V,
+ SmallPtrSetImpl<Value *> &Visited) const {
const Expression *E = nullptr;
if (auto *C = dyn_cast<Constant>(V))
E = createConstantExpression(C);
@@ -957,24 +1846,27 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
switch (I->getOpcode()) {
case Instruction::ExtractValue:
case Instruction::InsertValue:
- E = performSymbolicAggrValueEvaluation(I, B);
+ E = performSymbolicAggrValueEvaluation(I);
break;
case Instruction::PHI:
- E = performSymbolicPHIEvaluation(I, B);
+ E = performSymbolicPHIEvaluation(I);
break;
case Instruction::Call:
- E = performSymbolicCallEvaluation(I, B);
+ E = performSymbolicCallEvaluation(I);
break;
case Instruction::Store:
- E = performSymbolicStoreEvaluation(I, B);
+ E = performSymbolicStoreEvaluation(I);
break;
case Instruction::Load:
- E = performSymbolicLoadEvaluation(I, B);
+ E = performSymbolicLoadEvaluation(I);
break;
case Instruction::BitCast: {
- E = createExpression(I, B);
+ E = createExpression(I);
+ } break;
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ E = performSymbolicCmpEvaluation(I);
} break;
-
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -993,8 +1885,6 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- case Instruction::ICmp:
- case Instruction::FCmp:
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
@@ -1011,7 +1901,7 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
case Instruction::InsertElement:
case Instruction::ShuffleVector:
case Instruction::GetElementPtr:
- E = createExpression(I, B);
+ E = createExpression(I);
break;
default:
return nullptr;
@@ -1020,147 +1910,308 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
return E;
}
-// There is an edge from 'Src' to 'Dst'. Return true if every path from
-// the entry block to 'Dst' passes via this edge. In particular 'Dst'
-// must not be reachable via another edge from 'Src'.
-bool NewGVN::isOnlyReachableViaThisEdge(const BasicBlockEdge &E) const {
+// Find the container stored under Key in M (using find_as) and invoke F on
+// each of its elements. Does nothing when Key has no entry.
+template <typename Map, typename KeyType, typename Func>
+void NewGVN::for_each_found(Map &M, const KeyType &Key, Func F) {
+  const auto Found = M.find_as(Key);
+  if (Found == M.end())
+    return;
+  for (typename Map::mapped_type::value_type Element : Found->second)
+    F(Element);
+}
+
+// Find the container of values/instructions stored under Key in M, set the
+// TouchedInstructions bit for each element's DFS number, and then erase the
+// entry from the map. Does nothing when Key has no entry.
+template <typename Map, typename KeyType>
+void NewGVN::touchAndErase(Map &M, const KeyType &Key) {
+  const auto Found = M.find_as(Key);
+  if (Found == M.end())
+    return;
+  for (const typename Map::mapped_type::value_type Element : Found->second)
+    TouchedInstructions.set(InstrToDFSNum(Element));
+  M.erase(Found);
+}
- // While in theory it is interesting to consider the case in which Dst has
- // more than one predecessor, because Dst might be part of a loop which is
- // only reachable from Src, in practice it is pointless since at the time
- // GVN runs all such loops have preheaders, which means that Dst will have
- // been changed to have only one predecessor, namely Src.
- const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
- const BasicBlock *Src = E.getStart();
- assert((!Pred || Pred == Src) && "No edge between these basic blocks!");
- (void)Src;
- return Pred != nullptr;
+// Record User as an extra user of To in AdditionalUsers. Only instructions are
+// tracked; any other kind of value is ignored.
+void NewGVN::addAdditionalUsers(Value *To, Value *User) const {
+  if (!isa<Instruction>(To))
+    return;
+  AdditionalUsers[To].insert(User);
}
void NewGVN::markUsersTouched(Value *V) {
// Now mark the users as touched.
for (auto *User : V->users()) {
assert(isa<Instruction>(User) && "Use of value not within an instruction?");
- TouchedInstructions.set(InstrDFS[User]);
+ TouchedInstructions.set(InstrToDFSNum(User));
}
+ touchAndErase(AdditionalUsers, V); // Also touch the extra users recorded for V, then drop that entry.
}
-void NewGVN::markMemoryUsersTouched(MemoryAccess *MA) {
- for (auto U : MA->users()) {
- if (auto *MUD = dyn_cast<MemoryUseOrDef>(U))
- TouchedInstructions.set(InstrDFS[MUD->getMemoryInst()]);
- else
- TouchedInstructions.set(InstrDFS[U]);
- }
+// Record U as an additional user of the memory access To in MemoryToUsers.
+void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
+  DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
+  auto &UsersOfTo = MemoryToUsers[To];
+  UsersOfTo.insert(U);
+}
+
+// Set the TouchedInstructions bit at the DFS number of the memory access MA.
+void NewGVN::markMemoryDefTouched(const MemoryAccess *MA) {
+  const auto DFSNum = MemoryToDFSNum(MA);
+  TouchedInstructions.set(DFSNum);
+}
+
+// Touch everything that uses the memory access MA: its direct users, plus any
+// extra users recorded in MemoryToUsers (that entry is erased afterwards).
+// A MemoryUse is skipped entirely.
+void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
+  if (!isa<MemoryUse>(MA)) {
+    for (auto MemUser : MA->users())
+      TouchedInstructions.set(MemoryToDFSNum(MemUser));
+    touchAndErase(MemoryToUsers, MA);
+  }
+}
+
+// Add I to the set of users of a given predicate. The users are keyed in
+// PredicateToUsers by the predicate's Condition; both branch and assume
+// predicates are recorded.
+void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {
+  // Don't add temporary instructions to the user lists.
+  if (AllTempInstructions.count(I))
+    return;
+
+  if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
+    PredicateToUsers[PBranch->Condition].insert(I);
+  // This branch must test for PredicateAssume: a second PredicateBranch test
+  // can never succeed here, which left assume predicates unrecorded.
+  else if (auto *PAssume = dyn_cast<PredicateAssume>(PB))
+    PredicateToUsers[PAssume->Condition].insert(I);
+}
+
+// Touch all the instructions recorded in PredicateToUsers for I, then erase that entry.
+void NewGVN::markPredicateUsersTouched(Instruction *I) {
+ touchAndErase(PredicateToUsers, I);
+}
+
+// After a memory leader change, mark every memory member of CC as touched
+// (via markMemoryDefTouched).
+void NewGVN::markMemoryLeaderChangeTouched(CongruenceClass *CC) {
+  for (auto MemoryMember : CC->memory()) {
+    markMemoryDefTouched(MemoryMember);
+  }
}
// Touch the instructions that need to be updated after a congruence class has a
// leader change, and mark changed values.
-void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) {
- for (auto M : CC->Members) {
+void NewGVN::markValueLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : *CC) {
if (auto *I = dyn_cast<Instruction>(M))
- TouchedInstructions.set(InstrDFS[I]);
+ TouchedInstructions.set(InstrToDFSNum(I)); // Non-instruction members are skipped by the dyn_cast above.
LeaderChanges.insert(M);
}
}
+// Given a range of things that have instruction DFS numbers, return the member
+// of the range with the smallest DFS number. On ties the earliest member in
+// iteration order wins. The result is null when the range is empty or when no
+// member has a DFS number below ~0U.
+template <class T, class Range>
+T *NewGVN::getMinDFSOfRange(const Range &R) const {
+  T *Best = nullptr;
+  unsigned BestNum = ~0U;
+  for (const auto Candidate : R) {
+    const auto CandidateNum = InstrToDFSNum(Candidate);
+    if (CandidateNum >= BestNum)
+      continue;
+    Best = Candidate;
+    BestNum = CandidateNum;
+  }
+  return Best;
+}
+
+// Return the MemoryAccess that should become the next memory leader of
+// congruence class CC, under the assumption that the current leader is going
+// to disappear. When the class still contains stores, a store's MemoryAccess
+// is chosen; otherwise one of the class's memory members is.
+const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
+  // TODO: If this ends up too slow, we can maintain a next memory leader like
+  // we do for regular leaders.
+  // Make sure there will be a leader to find.
+  assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
+  if (CC->getStoreCount() > 0) {
+    // Shortcut: the cached next value leader is itself a store.
+    auto *CachedStore = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first);
+    if (CachedStore)
+      return getMemoryAccess(CachedStore);
+    // Otherwise, find the store with the minimum DFS number.
+    auto IsStore = [&](const Value *Member) { return isa<StoreInst>(Member); };
+    auto *EarliestStore =
+        getMinDFSOfRange<Value>(make_filter_range(*CC, IsStore));
+    return getMemoryAccess(cast<StoreInst>(EarliestStore));
+  }
+  assert(CC->getStoreCount() == 0);
+
+  // Given our assertion, hitting this part must mean
+  // !CC->memory_empty()
+  const bool HasSingleMemoryMember = CC->memory_size() == 1;
+  if (HasSingleMemoryMember)
+    return *CC->memory_begin();
+  return getMinDFSOfRange<const MemoryPhi>(CC->memory());
+}
+
+// Return the next value leader of congruence class CC, under the assumption
+// that the current leader is going away. This should end up being the next
+// most dominating member.
+Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const {
+  // We don't need to sort members if there is only 1, and we don't care about
+  // sorting the TOP class because everything either gets out of it or is
+  // unreachable.
+  if (CC->size() == 1 || CC == TOPClass)
+    return *(CC->begin());
+
+  // Use the cached next leader when the class has one.
+  if (auto *CachedLeader = CC->getNextLeader().first) {
+    ++NumGVNAvoidedSortedLeaderChanges;
+    return CachedLeader;
+  }
+
+  ++NumGVNSortedLeaderChanges;
+  // NOTE: If this ends up too slow, we can maintain a dual structure for
+  // member testing/insertion, or keep things mostly sorted, and sort only
+  // here, or use SparseBitVector or ....
+  return getMinDFSOfRange<Value>(*CC);
+}
+
+// Move a MemoryAccess, currently in OldClass, to NewClass, including updates to
+// the memory members, etc for the move.
+//
+// The invariants of this function are:
+//
+// - I must be moving to NewClass from OldClass
+// - The StoreCount of OldClass and NewClass is expected to have been updated
+// for I already if it is a store.
+// - The OldClass memory leader has not been updated yet if I was the leader.
+void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
+ MemoryAccess *InstMA,
+ CongruenceClass *OldClass,
+ CongruenceClass *NewClass) {
+ // If the leader is I, and we had a representative MemoryAccess, it should
+ // be the MemoryAccess of OldClass.
+ assert((!InstMA || !OldClass->getMemoryLeader() ||
+ OldClass->getLeader() != I ||
+ MemoryAccessToClass.lookup(OldClass->getMemoryLeader()) ==
+ MemoryAccessToClass.lookup(InstMA)) &&
+ "Representative MemoryAccess mismatch");
+ // First, see what happens to the new class
+ if (!NewClass->getMemoryLeader()) {
+ // Should be a new class, or a store becoming a leader of a new class.
+ assert(NewClass->size() == 1 ||
+ (isa<StoreInst>(I) && NewClass->getStoreCount() == 1));
+ NewClass->setMemoryLeader(InstMA);
+ // Mark it touched. NOTE(review): this originally said "if we didn't just create a singleton", but the call below is unconditional.
+ DEBUG(dbgs() << "Memory class leader change for class " << NewClass->getID()
+ << " due to new memory instruction becoming leader\n");
+ markMemoryLeaderChangeTouched(NewClass);
+ }
+ setMemoryClass(InstMA, NewClass);
+ // Now, fixup the old class if necessary
+ if (OldClass->getMemoryLeader() == InstMA) {
+ if (!OldClass->definesNoMemory()) {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of old leader " << *InstMA << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ } else
+ OldClass->setMemoryLeader(nullptr);
+ }
+}
+
// Move a value, currently in OldClass, to be part of NewClass
-// Update OldClass for the move (including changing leaders, etc)
-void NewGVN::moveValueToNewCongruenceClass(Instruction *I,
+// Update OldClass and NewClass for the move (including changing leaders, etc).
+void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
CongruenceClass *OldClass,
CongruenceClass *NewClass) {
- DEBUG(dbgs() << "New congruence class for " << I << " is " << NewClass->ID
- << "\n");
-
- if (I == OldClass->NextLeader.first)
- OldClass->NextLeader = {nullptr, ~0U};
-
- // It's possible, though unlikely, for us to discover equivalences such
- // that the current leader does not dominate the old one.
- // This statistic tracks how often this happens.
- // We assert on phi nodes when this happens, currently, for debugging, because
- // we want to make sure we name phi node cycles properly.
- if (isa<Instruction>(NewClass->RepLeader) && NewClass->RepLeader &&
- I != NewClass->RepLeader &&
- DT->properlyDominates(
- I->getParent(),
- cast<Instruction>(NewClass->RepLeader)->getParent())) {
- ++NumGVNNotMostDominatingLeader;
- assert(!isa<PHINode>(I) &&
- "New class for instruction should not be dominated by instruction");
- }
-
- if (NewClass->RepLeader != I) {
- auto DFSNum = InstrDFS.lookup(I);
- if (DFSNum < NewClass->NextLeader.second)
- NewClass->NextLeader = {I, DFSNum};
- }
-
- OldClass->Members.erase(I);
- NewClass->Members.insert(I);
- if (isa<StoreInst>(I)) {
- --OldClass->StoreCount;
- assert(OldClass->StoreCount >= 0);
- ++NewClass->StoreCount;
- assert(NewClass->StoreCount > 0);
+ if (I == OldClass->getNextLeader().first)
+ OldClass->resetNextLeader();
+
+ OldClass->erase(I);
+ NewClass->insert(I);
+
+ if (NewClass->getLeader() != I)
+ NewClass->addPossibleNextLeader({I, InstrToDFSNum(I)});
+ // Handle our special casing of stores.
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ OldClass->decStoreCount();
+ // Okay, so when do we want to make a store a leader of a class?
+ // If we have a store defined by an earlier load, we want the earlier load
+ // to lead the class.
+ // If we have a store defined by something else, we want the store to lead
+ // the class so everything else gets the "something else" as a value.
+ // If we have a store as the single member of the class, we want the store
+ // as the leader
+ if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
+ // If it's a store expression we are using, it means we are not equivalent
+ // to something earlier.
+ if (auto *SE = dyn_cast<StoreExpression>(E)) {
+ NewClass->setStoredValue(SE->getStoredValue());
+ markValueLeaderChangeTouched(NewClass);
+ // Shift the new class leader to be the store
+ DEBUG(dbgs() << "Changing leader of congruence class "
+ << NewClass->getID() << " from " << *NewClass->getLeader()
+ << " to " << *SI << " because store joined class\n");
+ // If we changed the leader, we have to mark it changed because we don't
+ // know what it will do to symbolic evaluation.
+ NewClass->setLeader(SI);
+ }
+ // We rely on the code below handling the MemoryAccess change.
+ }
+ NewClass->incStoreCount();
}
+ // Only a MemoryDef gets its memory class moved here: the MemoryAccess of I
+ // is narrowed to MemoryDef below, so a memory use (or an instruction with
+ // no MemoryAccess) leaves InstMA null and skips the memory-class update.
+ auto *InstMA = dyn_cast_or_null<MemoryDef>(getMemoryAccess(I));
+ if (InstMA)
+ moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass);
ValueToClass[I] = NewClass;
// See if we destroyed the class or need to swap leaders.
- if (OldClass->Members.empty() && OldClass != InitialClass) {
- if (OldClass->DefiningExpr) {
- OldClass->Dead = true;
- DEBUG(dbgs() << "Erasing expression " << OldClass->DefiningExpr
+ if (OldClass->empty() && OldClass != TOPClass) {
+ if (OldClass->getDefiningExpr()) {
+ DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
<< " from table\n");
- ExpressionToClass.erase(OldClass->DefiningExpr);
+ // We erase it as an exact expression to make sure we don't just erase an
+ // equivalent one.
+ auto Iter = ExpressionToClass.find_as(
+ ExactEqualsExpression(*OldClass->getDefiningExpr()));
+ if (Iter != ExpressionToClass.end())
+ ExpressionToClass.erase(Iter);
+#ifdef EXPENSIVE_CHECKS
+ assert(
+ (*OldClass->getDefiningExpr() != *E || ExpressionToClass.lookup(E)) &&
+ "We erased the expression we just inserted, which should not happen");
+#endif
}
- } else if (OldClass->RepLeader == I) {
+ } else if (OldClass->getLeader() == I) {
// When the leader changes, the value numbering of
// everything may change due to symbolization changes, so we need to
// reprocess.
- DEBUG(dbgs() << "Leader change!\n");
+ DEBUG(dbgs() << "Value class leader change for class " << OldClass->getID()
+ << "\n");
++NumGVNLeaderChanges;
- // We don't need to sort members if there is only 1, and we don't care about
- // sorting the initial class because everything either gets out of it or is
- // unreachable.
- if (OldClass->Members.size() == 1 || OldClass == InitialClass) {
- OldClass->RepLeader = *(OldClass->Members.begin());
- } else if (OldClass->NextLeader.first) {
- ++NumGVNAvoidedSortedLeaderChanges;
- OldClass->RepLeader = OldClass->NextLeader.first;
- OldClass->NextLeader = {nullptr, ~0U};
- } else {
- ++NumGVNSortedLeaderChanges;
- // TODO: If this ends up to slow, we can maintain a dual structure for
- // member testing/insertion, or keep things mostly sorted, and sort only
- // here, or ....
- std::pair<Value *, unsigned> MinDFS = {nullptr, ~0U};
- for (const auto X : OldClass->Members) {
- auto DFSNum = InstrDFS.lookup(X);
- if (DFSNum < MinDFS.second)
- MinDFS = {X, DFSNum};
- }
- OldClass->RepLeader = MinDFS.first;
+ // Destroy the stored value if there are no more stores to represent it.
+ // Note that this is basically clean up for the expression removal that
+ // happens below. If we remove stores from a class, we may leave it as a
+ // class of equivalent memory phis.
+ if (OldClass->getStoreCount() == 0) {
+ if (OldClass->getStoredValue())
+ OldClass->setStoredValue(nullptr);
}
- markLeaderChangeTouched(OldClass);
+ OldClass->setLeader(getNextValueLeader(OldClass));
+ OldClass->resetNextLeader();
+ markValueLeaderChangeTouched(OldClass);
}
}
+// For a given expression, touch the phi-of-ops instructions recorded for it in
+// ExpressionToPhiOfOps (looked up as an exact-equality key), then erase that entry.
+void NewGVN::markPhiOfOpsChanged(const Expression *E) {
+ touchAndErase(ExpressionToPhiOfOps, ExactEqualsExpression(*E));
+}
+
// Perform congruence finding on a given value numbering expression.
void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
- ValueToExpression[I] = E;
// This is guaranteed to return something, since it will at least find
- // INITIAL.
+ // TOP.
- CongruenceClass *IClass = ValueToClass[I];
+ CongruenceClass *IClass = ValueToClass.lookup(I);
assert(IClass && "Should have found a IClass");
// Dead classes should have been eliminated from the mapping.
- assert(!IClass->Dead && "Found a dead class");
+ assert(!IClass->isDead() && "Found a dead class");
- CongruenceClass *EClass;
+ CongruenceClass *EClass = nullptr;
if (const auto *VE = dyn_cast<VariableExpression>(E)) {
- EClass = ValueToClass[VE->getVariableValue()];
- } else {
+ EClass = ValueToClass.lookup(VE->getVariableValue());
+ } else if (isa<DeadExpression>(E)) {
+ EClass = TOPClass;
+ }
+ if (!EClass) {
auto lookupResult = ExpressionToClass.insert({E, nullptr});
// If it's not in the value table, create a new congruence class.
@@ -1171,80 +2222,73 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
// Constants and variables should always be made the leader.
if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
- NewClass->RepLeader = CE->getConstantValue();
+ NewClass->setLeader(CE->getConstantValue());
} else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
StoreInst *SI = SE->getStoreInst();
- NewClass->RepLeader =
- lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
+ NewClass->setLeader(SI);
+ NewClass->setStoredValue(SE->getStoredValue());
+ // The RepMemoryAccess field will be filled in properly by the
+ // moveValueToNewCongruenceClass call.
} else {
- NewClass->RepLeader = I;
+ NewClass->setLeader(I);
}
assert(!isa<VariableExpression>(E) &&
"VariableExpression should have been handled already");
EClass = NewClass;
DEBUG(dbgs() << "Created new congruence class for " << *I
- << " using expression " << *E << " at " << NewClass->ID
- << " and leader " << *(NewClass->RepLeader) << "\n");
- DEBUG(dbgs() << "Hash value was " << E->getHashValue() << "\n");
+ << " using expression " << *E << " at " << NewClass->getID()
+ << " and leader " << *(NewClass->getLeader()));
+ if (NewClass->getStoredValue())
+ DEBUG(dbgs() << " and stored value " << *(NewClass->getStoredValue()));
+ DEBUG(dbgs() << "\n");
} else {
EClass = lookupResult.first->second;
if (isa<ConstantExpression>(E))
- assert(isa<Constant>(EClass->RepLeader) &&
+ assert((isa<Constant>(EClass->getLeader()) ||
+ (EClass->getStoredValue() &&
+ isa<Constant>(EClass->getStoredValue()))) &&
"Any class with a constant expression should have a "
"constant leader");
assert(EClass && "Somehow don't have an eclass");
- assert(!EClass->Dead && "We accidentally looked up a dead class");
+ assert(!EClass->isDead() && "We accidentally looked up a dead class");
}
}
bool ClassChanged = IClass != EClass;
bool LeaderChanged = LeaderChanges.erase(I);
if (ClassChanged || LeaderChanged) {
- DEBUG(dbgs() << "Found class " << EClass->ID << " for expression " << E
+ DEBUG(dbgs() << "New class " << EClass->getID() << " for expression " << *E
<< "\n");
+ if (ClassChanged) {
+ moveValueToNewCongruenceClass(I, E, IClass, EClass);
+ markPhiOfOpsChanged(E);
+ }
- if (ClassChanged)
- moveValueToNewCongruenceClass(I, IClass, EClass);
markUsersTouched(I);
- if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) {
- // If this is a MemoryDef, we need to update the equivalence table. If
- // we determined the expression is congruent to a different memory
- // state, use that different memory state. If we determined it didn't,
- // we update that as well. Right now, we only support store
- // expressions.
- if (!isa<MemoryUse>(MA) && isa<StoreExpression>(E) &&
- EClass->Members.size() != 1) {
- auto *DefAccess = cast<StoreExpression>(E)->getDefiningAccess();
- setMemoryAccessEquivTo(MA, DefAccess != MA ? DefAccess : nullptr);
- } else {
- setMemoryAccessEquivTo(MA, nullptr);
- }
+ if (MemoryAccess *MA = getMemoryAccess(I))
markMemoryUsersTouched(MA);
- }
- } else if (auto *SI = dyn_cast<StoreInst>(I)) {
- // There is, sadly, one complicating thing for stores. Stores do not
- // produce values, only consume them. However, in order to make loads and
- // stores value number the same, we ignore the value operand of the store.
- // But the value operand will still be the leader of our class, and thus, it
- // may change. Because the store is a use, the store will get reprocessed,
- // but nothing will change about it, and so nothing above will catch it
- // (since the class will not change). In order to make sure everything ends
- // up okay, we need to recheck the leader of the class. Since stores of
- // different values value number differently due to different memorydefs, we
- // are guaranteed the leader is always the same between stores in the same
- // class.
- DEBUG(dbgs() << "Checking store leader\n");
- auto ProperLeader =
- lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
- if (EClass->RepLeader != ProperLeader) {
- DEBUG(dbgs() << "Store leader changed, fixing\n");
- EClass->RepLeader = ProperLeader;
- markLeaderChangeTouched(EClass);
- markMemoryUsersTouched(MSSA->getMemoryAccess(SI));
+ if (auto *CI = dyn_cast<CmpInst>(I))
+ markPredicateUsersTouched(CI);
+ }
+ // If we changed the class of the store, we want to ensure nothing finds the
+ // old store expression. In particular, loads do not compare against stored
+ // value, so they will find old store expressions (and associated class
+ // mappings) if we leave them in the table.
+ if (ClassChanged && isa<StoreInst>(I)) {
+ auto *OldE = ValueToExpression.lookup(I);
+ // It could just be that the old class died. We don't want to erase it if we
+ // just moved classes.
+ if (OldE && isa<StoreExpression>(OldE) && *E != *OldE) {
+ // Erase this as an exact expression to ensure we don't erase expressions
+ // equivalent to it.
+ auto Iter = ExpressionToClass.find_as(ExactEqualsExpression(*OldE));
+ if (Iter != ExpressionToClass.end())
+ ExpressionToClass.erase(Iter);
}
}
+ ValueToExpression[I] = E;
}
// Process the fact that Edge (from, to) is reachable, including marking
@@ -1266,25 +2310,26 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
// impact predicates. Otherwise, only mark the phi nodes as touched, as
// they are the only thing that depend on new edges. Anything using their
// values will get propagated to if necessary.
- if (MemoryAccess *MemPhi = MSSA->getMemoryAccess(To))
- TouchedInstructions.set(InstrDFS[MemPhi]);
+ if (MemoryAccess *MemPhi = getMemoryAccess(To))
+ TouchedInstructions.set(InstrToDFSNum(MemPhi));
auto BI = To->begin();
while (isa<PHINode>(BI)) {
- TouchedInstructions.set(InstrDFS[&*BI]);
+ TouchedInstructions.set(InstrToDFSNum(&*BI));
++BI;
}
+ for_each_found(PHIOfOpsPHIs, To, [&](const PHINode *I) {
+ TouchedInstructions.set(InstrToDFSNum(I));
+ });
}
}
}
// Given a predicate condition (from a switch, cmp, or whatever) and a block,
// see if we know some constant value for it already.
-Value *NewGVN::findConditionEquivalence(Value *Cond, BasicBlock *B) const {
- auto Result = lookupOperandLeader(Cond, nullptr, B);
- if (isa<Constant>(Result))
- return Result;
- return nullptr;
+Value *NewGVN::findConditionEquivalence(Value *Cond) const {
+ auto Result = lookupOperandLeader(Cond);
+ return isa<Constant>(Result) ? Result : nullptr;
}
// Process the outgoing edges of a block for reachability.
@@ -1293,10 +2338,10 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
BranchInst *BR;
if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) {
Value *Cond = BR->getCondition();
- Value *CondEvaluated = findConditionEquivalence(Cond, B);
+ Value *CondEvaluated = findConditionEquivalence(Cond);
if (!CondEvaluated) {
if (auto *I = dyn_cast<Instruction>(Cond)) {
- const Expression *E = createExpression(I, B);
+ const Expression *E = createExpression(I);
if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
CondEvaluated = CE->getConstantValue();
}
@@ -1329,13 +2374,13 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
Value *SwitchCond = SI->getCondition();
- Value *CondEvaluated = findConditionEquivalence(SwitchCond, B);
+ Value *CondEvaluated = findConditionEquivalence(SwitchCond);
// See if we were able to turn this switch statement into a constant.
if (CondEvaluated && isa<ConstantInt>(CondEvaluated)) {
auto *CondVal = cast<ConstantInt>(CondEvaluated);
// We should be able to get case value for this.
- auto CaseVal = SI->findCaseValue(CondVal);
- if (CaseVal.getCaseSuccessor() == SI->getDefaultDest()) {
+ auto Case = *SI->findCaseValue(CondVal);
+ if (Case.getCaseSuccessor() == SI->getDefaultDest()) {
// We proved the value is outside of the range of the case.
// We can't do anything other than mark the default dest as reachable,
// and go home.
@@ -1343,7 +2388,7 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
return;
}
// Now get where it goes and mark it reachable.
- BasicBlock *TargetBlock = CaseVal.getCaseSuccessor();
+ BasicBlock *TargetBlock = Case.getCaseSuccessor();
updateReachableEdge(B, TargetBlock);
} else {
for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
@@ -1361,45 +2406,215 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
}
// This also may be a memory defining terminator, in which case, set it
- // equivalent to nothing.
- if (MemoryAccess *MA = MSSA->getMemoryAccess(TI))
- setMemoryAccessEquivTo(MA, nullptr);
+ // equivalent only to itself.
+ //
+ auto *MA = getMemoryAccess(TI);
+ if (MA && !isa<MemoryUse>(MA)) {
+ auto *CC = ensureLeaderOfMemoryClass(MA);
+ if (setMemoryClass(MA, CC))
+ markMemoryUsersTouched(MA);
+ }
}
}
-// The algorithm initially places the values of the routine in the INITIAL
-// congruence
-// class. The leader of INITIAL is the undetermined value `TOP`.
-// When the algorithm has finished, values still in INITIAL are unreachable.
+void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB,
+ Instruction *ExistingValue) {
+ InstrDFS[Op] = InstrToDFSNum(ExistingValue);
+ AllTempInstructions.insert(Op);
+ PHIOfOpsPHIs[BB].push_back(Op);
+ TempToBlock[Op] = BB;
+ RealToTemp[ExistingValue] = Op;
+}
+
+static bool okayForPHIOfOps(const Instruction *I) {
+ return isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<CmpInst>(I) ||
+ isa<LoadInst>(I);
+}
+
+// When we see an instruction that is an op of phis, generate the equivalent phi
+// of ops form.
+const Expression *
+NewGVN::makePossiblePhiOfOps(Instruction *I,
+ SmallPtrSetImpl<Value *> &Visited) {
+ if (!okayForPHIOfOps(I))
+ return nullptr;
+
+ if (!Visited.insert(I).second)
+ return nullptr;
+ // For now, we require the instruction be cycle free because we don't
+ // *always* create a phi of ops for instructions that could be done as phi
+ // of ops, we only do it if we think it is useful. If we did do it all the
+ // time, we could remove the cycle free check.
+ if (!isCycleFree(I))
+ return nullptr;
+
+ unsigned IDFSNum = InstrToDFSNum(I);
+ SmallPtrSet<const Value *, 8> ProcessedPHIs;
+ // TODO: We don't do phi translation on memory accesses because it's
+ // complicated. For a load, we'd need to be able to simulate a new memoryuse,
+ // which we don't have a good way of doing ATM.
+ auto *MemAccess = getMemoryAccess(I);
+ // If the memory operation is defined by a memory operation in this block that
+ // isn't a MemoryPhi, transforming the pointer backwards through a scalar phi
+ // can't help, as it would still be killed by that memory operation.
+ if (MemAccess && !isa<MemoryPhi>(MemAccess->getDefiningAccess()) &&
+ MemAccess->getDefiningAccess()->getBlock() == I->getParent())
+ return nullptr;
+
+ // Convert op of phis to phi of ops
+ for (auto &Op : I->operands()) {
+ // TODO: We can't handle expressions that must be recursively translated
+ // IE
+ // a = phi (b, c)
+ // f = use a
+ // g = f + phi of something
+ // To properly make a phi of ops for g, we'd have to properly translate and
+ // use the instruction for f. We should add this by splitting out the
+ // instruction creation we do below.
+ if (isa<Instruction>(Op) && PHINodeUses.count(cast<Instruction>(Op)))
+ return nullptr;
+ if (!isa<PHINode>(Op))
+ continue;
+ auto *OpPHI = cast<PHINode>(Op);
+ // No point in doing this for one-operand phis.
+ if (OpPHI->getNumOperands() == 1)
+ continue;
+ if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
+ return nullptr;
+ SmallVector<std::pair<Value *, BasicBlock *>, 4> Ops;
+ auto *PHIBlock = getBlockForValue(OpPHI);
+ for (auto PredBB : OpPHI->blocks()) {
+ Value *FoundVal = nullptr;
+ // We could just skip unreachable edges entirely but it's tricky to do
+ // with rewriting existing phi nodes.
+ if (ReachableEdges.count({PredBB, PHIBlock})) {
+ // Clone the instruction, create an expression from it, and see if we
+ // have a leader.
+ Instruction *ValueOp = I->clone();
+ if (MemAccess)
+ TempToMemory.insert({ValueOp, MemAccess});
+
+ for (auto &Op : ValueOp->operands()) {
+ Op = Op->DoPHITranslation(PHIBlock, PredBB);
+ // When this operand changes, it could change whether there is a
+ // leader for us or not.
+ addAdditionalUsers(Op, I);
+ }
+ // Make sure it's marked as a temporary instruction.
+ AllTempInstructions.insert(ValueOp);
+ // and make sure anything that tries to add its DFS number is
+ // redirected to the instruction we are making a phi of ops
+ // for.
+ InstrDFS.insert({ValueOp, IDFSNum});
+ const Expression *E = performSymbolicEvaluation(ValueOp, Visited);
+ InstrDFS.erase(ValueOp);
+ AllTempInstructions.erase(ValueOp);
+ ValueOp->deleteValue();
+ if (MemAccess)
+ TempToMemory.erase(ValueOp);
+ if (!E)
+ return nullptr;
+ FoundVal = findPhiOfOpsLeader(E, PredBB);
+ if (!FoundVal) {
+ ExpressionToPhiOfOps[E].insert(I);
+ return nullptr;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(FoundVal))
+ FoundVal = SI->getValueOperand();
+ } else {
+ DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
+ << getBlockName(PredBB)
+ << " because the block is unreachable\n");
+ FoundVal = UndefValue::get(I->getType());
+ }
+
+ Ops.push_back({FoundVal, PredBB});
+ DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
+ << getBlockName(PredBB) << "\n");
+ }
+ auto *ValuePHI = RealToTemp.lookup(I);
+ bool NewPHI = false;
+ if (!ValuePHI) {
+ ValuePHI = PHINode::Create(I->getType(), OpPHI->getNumOperands());
+ addPhiOfOps(ValuePHI, PHIBlock, I);
+ NewPHI = true;
+ NumGVNPHIOfOpsCreated++;
+ }
+ if (NewPHI) {
+ for (auto PHIOp : Ops)
+ ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
+ } else {
+ unsigned int i = 0;
+ for (auto PHIOp : Ops) {
+ ValuePHI->setIncomingValue(i, PHIOp.first);
+ ValuePHI->setIncomingBlock(i, PHIOp.second);
+ ++i;
+ }
+ }
+
+ DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
+ << "\n");
+ return performSymbolicEvaluation(ValuePHI, Visited);
+ }
+ return nullptr;
+}
+
+// The algorithm initially places the values of the routine in the TOP
+// congruence class. The leader of TOP is the undetermined value `undef`.
+// When the algorithm has finished, values still in TOP are unreachable.
void NewGVN::initializeCongruenceClasses(Function &F) {
- // FIXME now i can't remember why this is 2
- NextCongruenceNum = 2;
- // Initialize all other instructions to be in INITIAL class.
- CongruenceClass::MemberSet InitialValues;
- InitialClass = createCongruenceClass(nullptr, nullptr);
- for (auto &B : F) {
- if (auto *MP = MSSA->getMemoryAccess(&B))
- MemoryAccessEquiv.insert({MP, MSSA->getLiveOnEntryDef()});
-
- for (auto &I : B) {
- InitialValues.insert(&I);
- ValueToClass[&I] = InitialClass;
- // All memory accesses are equivalent to live on entry to start. They must
- // be initialized to something so that initial changes are noticed. For
- // the maximal answer, we initialize them all to be the same as
- // liveOnEntry. Note that to save time, we only initialize the
- // MemoryDef's for stores and all MemoryPhis to be equal. Right now, no
- // other expression can generate a memory equivalence. If we start
- // handling memcpy/etc, we can expand this.
- if (isa<StoreInst>(&I)) {
- MemoryAccessEquiv.insert(
- {MSSA->getMemoryAccess(&I), MSSA->getLiveOnEntryDef()});
- ++InitialClass->StoreCount;
- assert(InitialClass->StoreCount > 0);
+ NextCongruenceNum = 0;
+
+ // Note that even though we use the live on entry def as a representative
+ // MemoryAccess, it is *not* the same as the actual live on entry def. We
+ // have no real equivalent to undef for MemoryAccesses, and so we really
+ // should be checking whether the MemoryAccess is top if we want to know if it
+ // is equivalent to everything. Otherwise, what this really signifies is that
+ // the access "reaches all the way back to the beginning of the function".
+
+ // Initialize all other instructions to be in TOP class.
+ TOPClass = createCongruenceClass(nullptr, nullptr);
+ TOPClass->setMemoryLeader(MSSA->getLiveOnEntryDef());
+ // The live on entry def gets put into its own class
+ MemoryAccessToClass[MSSA->getLiveOnEntryDef()] =
+ createMemoryClass(MSSA->getLiveOnEntryDef());
+
+ for (auto DTN : nodes(DT)) {
+ BasicBlock *BB = DTN->getBlock();
+ // All MemoryAccesses are equivalent to live on entry to start. They must
+ // be initialized to something so that initial changes are noticed. For
+ // the maximal answer, we initialize them all to be the same as
+ // liveOnEntry.
+ auto *MemoryBlockDefs = MSSA->getBlockDefs(BB);
+ if (MemoryBlockDefs)
+ for (const auto &Def : *MemoryBlockDefs) {
+ MemoryAccessToClass[&Def] = TOPClass;
+ auto *MD = dyn_cast<MemoryDef>(&Def);
+ // Insert the memory phis into the member list.
+ if (!MD) {
+ const MemoryPhi *MP = cast<MemoryPhi>(&Def);
+ TOPClass->memory_insert(MP);
+ MemoryPhiState.insert({MP, MPS_TOP});
+ }
+
+ if (MD && isa<StoreInst>(MD->getMemoryInst()))
+ TOPClass->incStoreCount();
}
+ for (auto &I : *BB) {
+ // TODO: Move to helper
+ if (isa<PHINode>(&I))
+ for (auto *U : I.users())
+ if (auto *UInst = dyn_cast<Instruction>(U))
+ if (InstrToDFSNum(UInst) != 0 && okayForPHIOfOps(UInst))
+ PHINodeUses.insert(UInst);
+ // Don't insert void terminators into the class. We don't value number
+ // them, and they just end up sitting in TOP.
+ if (isa<TerminatorInst>(I) && I.getType()->isVoidTy())
+ continue;
+ TOPClass->insert(&I);
+ ValueToClass[&I] = TOPClass;
}
}
- InitialClass->Members.swap(InitialValues);
// Initialize arguments to be in their own unique congruence classes
for (auto &FA : F.args())
@@ -1408,45 +2623,79 @@ void NewGVN::initializeCongruenceClasses(Function &F) {
void NewGVN::cleanupTables() {
for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
- DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->ID << " has "
- << CongruenceClasses[i]->Members.size() << " members\n");
+ DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
+ << " has " << CongruenceClasses[i]->size() << " members\n");
// Make sure we delete the congruence class (probably worth switching to
// a unique_ptr at some point).
delete CongruenceClasses[i];
CongruenceClasses[i] = nullptr;
}
+ // Destroy the value expressions
+ SmallVector<Instruction *, 8> TempInst(AllTempInstructions.begin(),
+ AllTempInstructions.end());
+ AllTempInstructions.clear();
+
+ // We have to drop all references for everything first, so there are no uses
+ // left as we delete them.
+ for (auto *I : TempInst) {
+ I->dropAllReferences();
+ }
+
+ while (!TempInst.empty()) {
+ auto *I = TempInst.back();
+ TempInst.pop_back();
+ I->deleteValue();
+ }
+
ValueToClass.clear();
ArgRecycler.clear(ExpressionAllocator);
ExpressionAllocator.Reset();
CongruenceClasses.clear();
ExpressionToClass.clear();
ValueToExpression.clear();
+ RealToTemp.clear();
+ AdditionalUsers.clear();
+ ExpressionToPhiOfOps.clear();
+ TempToBlock.clear();
+ TempToMemory.clear();
+ PHIOfOpsPHIs.clear();
ReachableBlocks.clear();
ReachableEdges.clear();
#ifndef NDEBUG
ProcessedCount.clear();
#endif
- DFSDomMap.clear();
InstrDFS.clear();
InstructionsToErase.clear();
-
DFSToInstr.clear();
BlockInstRange.clear();
TouchedInstructions.clear();
- DominatedInstRange.clear();
- MemoryAccessEquiv.clear();
+ MemoryAccessToClass.clear();
+ PredicateToUsers.clear();
+ MemoryToUsers.clear();
}
+// Assign local DFS number mapping to instructions, and leave space for Value
+// PHI's.
std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
unsigned Start) {
unsigned End = Start;
- if (MemoryAccess *MemPhi = MSSA->getMemoryAccess(B)) {
+ if (MemoryAccess *MemPhi = getMemoryAccess(B)) {
InstrDFS[MemPhi] = End++;
DFSToInstr.emplace_back(MemPhi);
}
+ // Then the real block goes next.
for (auto &I : *B) {
+ // There's no need to call isInstructionTriviallyDead more than once on
+ // an instruction. Therefore, once we know that an instruction is dead
+ // we change its DFS number so that it doesn't get value numbered.
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ InstrDFS[&I] = 0;
+ DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
+ markInstructionForDeletion(&I);
+ continue;
+ }
InstrDFS[&I] = End++;
DFSToInstr.emplace_back(&I);
}
@@ -1457,12 +2706,12 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
return std::make_pair(Start, End);
}
-void NewGVN::updateProcessedCount(Value *V) {
+void NewGVN::updateProcessedCount(const Value *V) {
#ifndef NDEBUG
if (ProcessedCount.count(V) == 0) {
ProcessedCount.insert({V, 1});
} else {
- ProcessedCount[V] += 1;
+ ++ProcessedCount[V];
assert(ProcessedCount[V] < 100 &&
"Seem to have processed the same Value a lot");
}
@@ -1471,27 +2720,35 @@ void NewGVN::updateProcessedCount(Value *V) {
// Evaluate MemoryPhi nodes symbolically, just like PHI nodes
void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
// If all the arguments are the same, the MemoryPhi has the same value as the
- // argument.
- // Filter out unreachable blocks from our operands.
+ // argument. Filter out unreachable blocks and self phis from our operands.
+ // TODO: We could do cycle-checking on the memory phis to allow valueizing for
+ // self-phi checking.
+ const BasicBlock *PHIBlock = MP->getBlock();
auto Filtered = make_filter_range(MP->operands(), [&](const Use &U) {
- return ReachableBlocks.count(MP->getIncomingBlock(U));
+ return cast<MemoryAccess>(U) != MP &&
+ !isMemoryAccessTOP(cast<MemoryAccess>(U)) &&
+ ReachableEdges.count({MP->getIncomingBlock(U), PHIBlock});
});
-
- assert(Filtered.begin() != Filtered.end() &&
- "We should not be processing a MemoryPhi in a completely "
- "unreachable block");
+ // If all that is left is nothing, our memoryphi is undef. We keep it as
+ // TOPClass. Note: The only case this should happen is if we have at
+ // least one self-argument.
+ if (Filtered.begin() == Filtered.end()) {
+ if (setMemoryClass(MP, TOPClass))
+ markMemoryUsersTouched(MP);
+ return;
+ }
// Transform the remaining operands into operand leaders.
// FIXME: mapped_iterator should have a range version.
auto LookupFunc = [&](const Use &U) {
- return lookupMemoryAccessEquiv(cast<MemoryAccess>(U));
+ return lookupMemoryLeader(cast<MemoryAccess>(U));
};
auto MappedBegin = map_iterator(Filtered.begin(), LookupFunc);
auto MappedEnd = map_iterator(Filtered.end(), LookupFunc);
// and now check if all the elements are equal.
// Sadly, we can't use std::equal since these are random access iterators.
- MemoryAccess *AllSameValue = *MappedBegin;
+ const auto *AllSameValue = *MappedBegin;
++MappedBegin;
bool AllEqual = std::all_of(
MappedBegin, MappedEnd,
@@ -1501,8 +2758,18 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue << "\n");
else
DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
-
- if (setMemoryAccessEquivTo(MP, AllEqual ? AllSameValue : nullptr))
+ // If it's equal to something, it's in that class. Otherwise, it has to be in
+ // a class where it is the leader (other things may be equivalent to it, but
+ // it needs to start off in its own class, which means it must have been the
+ // leader, and it can't have stopped being the leader because it was never
+ // removed).
+ CongruenceClass *CC =
+ AllEqual ? getMemoryClass(AllSameValue) : ensureLeaderOfMemoryClass(MP);
+ auto OldState = MemoryPhiState.lookup(MP);
+ assert(OldState != MPS_Invalid && "Invalid memory phi state");
+ auto NewState = AllEqual ? MPS_Equivalent : MPS_Unique;
+ MemoryPhiState[MP] = NewState;
+ if (setMemoryClass(MP, CC) || OldState != NewState)
markMemoryUsersTouched(MP);
}
@@ -1510,13 +2777,23 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
// congruence finding, and updating mappings.
void NewGVN::valueNumberInstruction(Instruction *I) {
DEBUG(dbgs() << "Processing instruction " << *I << "\n");
- if (isInstructionTriviallyDead(I, TLI)) {
- DEBUG(dbgs() << "Skipping unused instruction\n");
- markInstructionForDeletion(I);
- return;
- }
if (!I->isTerminator()) {
- const auto *Symbolized = performSymbolicEvaluation(I, I->getParent());
+ const Expression *Symbolized = nullptr;
+ SmallPtrSet<Value *, 2> Visited;
+ if (DebugCounter::shouldExecute(VNCounter)) {
+ Symbolized = performSymbolicEvaluation(I, Visited);
+ // Make a phi of ops if necessary
+ if (Symbolized && !isa<ConstantExpression>(Symbolized) &&
+ !isa<VariableExpression>(Symbolized) && PHINodeUses.count(I)) {
+ auto *PHIE = makePossiblePhiOfOps(I, Visited);
+ if (PHIE)
+ Symbolized = PHIE;
+ }
+
+ } else {
+ // Mark the instruction as unused so we don't value number it again.
+ InstrDFS[I] = 0;
+ }
// If we couldn't come up with a symbolic expression, use the unknown
// expression
if (Symbolized == nullptr)
@@ -1524,7 +2801,8 @@ void NewGVN::valueNumberInstruction(Instruction *I) {
performCongruenceFinding(I, Symbolized);
} else {
// Handle terminators that return values. All of them produce values we
- // don't currently understand.
+ // don't currently understand. We don't place non-value producing
+ // terminators in a class.
if (!I->getType()->isVoidTy()) {
auto *Symbolized = createUnknownExpression(I);
performCongruenceFinding(I, Symbolized);
@@ -1535,76 +2813,126 @@ void NewGVN::valueNumberInstruction(Instruction *I) {
// Check if there is a path, using single or equal argument phi nodes, from
// First to Second.
-bool NewGVN::singleReachablePHIPath(const MemoryAccess *First,
- const MemoryAccess *Second) const {
+bool NewGVN::singleReachablePHIPath(
+ SmallPtrSet<const MemoryAccess *, 8> &Visited, const MemoryAccess *First,
+ const MemoryAccess *Second) const {
if (First == Second)
return true;
-
- if (auto *FirstDef = dyn_cast<MemoryUseOrDef>(First)) {
- auto *DefAccess = FirstDef->getDefiningAccess();
- return singleReachablePHIPath(DefAccess, Second);
- } else {
- auto *MP = cast<MemoryPhi>(First);
- auto ReachableOperandPred = [&](const Use &U) {
- return ReachableBlocks.count(MP->getIncomingBlock(U));
- };
- auto FilteredPhiArgs =
- make_filter_range(MP->operands(), ReachableOperandPred);
- SmallVector<const Value *, 32> OperandList;
- std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
- std::back_inserter(OperandList));
- bool Okay = OperandList.size() == 1;
- if (!Okay)
- Okay = std::equal(OperandList.begin(), OperandList.end(),
- OperandList.begin());
- if (Okay)
- return singleReachablePHIPath(cast<MemoryAccess>(OperandList[0]), Second);
+ if (MSSA->isLiveOnEntryDef(First))
return false;
+
+ // This is not perfect, but as we're just verifying here, we can live with
+ // the loss of precision. The real solution would be that of doing strongly
+ // connected component finding in this routine, and it's probably not worth
+ // the complexity for the time being. So, we just keep a set of visited
+ // MemoryAccess and return true when we hit a cycle.
+ if (Visited.count(First))
+ return true;
+ Visited.insert(First);
+
+ const auto *EndDef = First;
+ for (auto *ChainDef : optimized_def_chain(First)) {
+ if (ChainDef == Second)
+ return true;
+ if (MSSA->isLiveOnEntryDef(ChainDef))
+ return false;
+ EndDef = ChainDef;
}
+ auto *MP = cast<MemoryPhi>(EndDef);
+ auto ReachableOperandPred = [&](const Use &U) {
+ return ReachableEdges.count({MP->getIncomingBlock(U), MP->getBlock()});
+ };
+ auto FilteredPhiArgs =
+ make_filter_range(MP->operands(), ReachableOperandPred);
+ SmallVector<const Value *, 32> OperandList;
+ std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
+ std::back_inserter(OperandList));
+ bool Okay = OperandList.size() == 1;
+ if (!Okay)
+ Okay =
+ std::equal(OperandList.begin(), OperandList.end(), OperandList.begin());
+ if (Okay)
+ return singleReachablePHIPath(Visited, cast<MemoryAccess>(OperandList[0]),
+ Second);
+ return false;
}
// Verify that the memory equivalence table makes sense relative to the
// congruence classes. Note that this checking is not perfect, and is currently
-// subject to very rare false negatives. It is only useful for testing/debugging.
+// subject to very rare false negatives. It is only useful for
+// testing/debugging.
void NewGVN::verifyMemoryCongruency() const {
- // Anything equivalent in the memory access table should be in the same
+#ifndef NDEBUG
+ // Verify that the memory table equivalence and memory member set match
+ for (const auto *CC : CongruenceClasses) {
+ if (CC == TOPClass || CC->isDead())
+ continue;
+ if (CC->getStoreCount() != 0) {
+ assert((CC->getStoredValue() || !isa<StoreInst>(CC->getLeader())) &&
+ "Any class with a store as a leader should have a "
+ "representative stored value");
+ assert(CC->getMemoryLeader() &&
+ "Any congruence class with a store should have a "
+ "representative access");
+ }
+
+ if (CC->getMemoryLeader())
+ assert(MemoryAccessToClass.lookup(CC->getMemoryLeader()) == CC &&
+ "Representative MemoryAccess does not appear to be reverse "
+ "mapped properly");
+ for (auto M : CC->memory())
+ assert(MemoryAccessToClass.lookup(M) == CC &&
+ "Memory member does not appear to be reverse mapped properly");
+ }
+
+ // Anything equivalent in the MemoryAccess table should be in the same
// congruence class.
// Filter out the unreachable and trivially dead entries, because they may
// never have been updated if the instructions were not processed.
auto ReachableAccessPred =
- [&](const std::pair<const MemoryAccess *, MemoryAccess *> Pair) {
+ [&](const std::pair<const MemoryAccess *, CongruenceClass *> Pair) {
bool Result = ReachableBlocks.count(Pair.first->getBlock());
- if (!Result)
+ if (!Result || MSSA->isLiveOnEntryDef(Pair.first) ||
+ MemoryToDFSNum(Pair.first) == 0)
return false;
if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
return !isInstructionTriviallyDead(MemDef->getMemoryInst());
+
+ // We could have phi nodes whose operands are all trivially dead,
+ // so we don't process them.
+ if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) {
+ for (auto &U : MemPHI->incoming_values()) {
+ if (Instruction *I = dyn_cast<Instruction>(U.get())) {
+ if (!isInstructionTriviallyDead(I))
+ return true;
+ }
+ }
+ return false;
+ }
+
return true;
};
- auto Filtered = make_filter_range(MemoryAccessEquiv, ReachableAccessPred);
+ auto Filtered = make_filter_range(MemoryAccessToClass, ReachableAccessPred);
for (auto KV : Filtered) {
- assert(KV.first != KV.second &&
- "We added a useless equivalence to the memory equivalence table");
- // Unreachable instructions may not have changed because we never process
- // them.
- if (!ReachableBlocks.count(KV.first->getBlock()))
- continue;
if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
- auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second);
- if (FirstMUD && SecondMUD)
- assert((singleReachablePHIPath(FirstMUD, SecondMUD) ||
- ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
- ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
- "The instructions for these memory operations should have "
- "been in the same congruence class or reachable through"
- "a single argument phi");
+ auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second->getMemoryLeader());
+ if (FirstMUD && SecondMUD) {
+ SmallPtrSet<const MemoryAccess *, 8> VisitedMAS;
+ assert((singleReachablePHIPath(VisitedMAS, FirstMUD, SecondMUD) ||
+ ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
+ ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
+ "The instructions for these memory operations should have "
+ "been in the same congruence class or reachable through"
+ "a single argument phi");
+ }
} else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
-
// We can only sanely verify that MemoryDefs in the operand list all have
// the same class.
auto ReachableOperandPred = [&](const Use &U) {
- return ReachableBlocks.count(FirstMP->getIncomingBlock(U)) &&
+ return ReachableEdges.count(
+ {FirstMP->getIncomingBlock(U), FirstMP->getBlock()}) &&
isa<MemoryDef>(U);
};
@@ -1622,35 +2950,179 @@ void NewGVN::verifyMemoryCongruency() const {
"All MemoryPhi arguments should be in the same class");
}
}
+#endif
+}
+
+// Verify that the sparse propagation we did actually found the maximal
+// fixpoint. We do this by storing the value to class mapping, touching all
+// instructions, and redoing the iteration to see if anything changed.
+void NewGVN::verifyIterationSettled(Function &F) {
+#ifndef NDEBUG
+ DEBUG(dbgs() << "Beginning iteration verification\n");
+ if (DebugCounter::isCounterSet(VNCounter))
+ DebugCounter::setCounterValue(VNCounter, StartingVNCounter);
+
+ // Note that we have to store the actual classes, as we may change existing
+ // classes during iteration. This is because our memory iteration propagation
+ // is not perfect, and so may waste a little work. But it should generate
+ // exactly the same congruence classes we have now, with different IDs.
+ std::map<const Value *, CongruenceClass> BeforeIteration;
+
+ for (auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ BeforeIteration.insert({KV.first, *KV.second});
+ }
+
+ TouchedInstructions.set();
+ TouchedInstructions.reset(0);
+ iterateTouchedInstructions();
+ DenseSet<std::pair<const CongruenceClass *, const CongruenceClass *>>
+ EqualClasses;
+ for (const auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ // We could sink these uses, but I think this adds a bit of clarity here as
+ // to what we are comparing.
+ auto *BeforeCC = &BeforeIteration.find(KV.first)->second;
+ auto *AfterCC = KV.second;
+ // Note that the classes can't change at this point, so we memoize the set
+ // that are equal.
+ if (!EqualClasses.count({BeforeCC, AfterCC})) {
+ assert(BeforeCC->isEquivalentTo(AfterCC) &&
+ "Value number changed after main loop completed!");
+ EqualClasses.insert({BeforeCC, AfterCC});
+ }
+ }
+#endif
+}
+
+// Verify that for each store expression in the expression to class mapping,
+// only the latest appears, and multiple ones do not appear.
+// Because loads do not use the stored value when doing equality with stores,
+// if we don't erase the old store expressions from the table, a load can find
+// a no-longer valid StoreExpression.
+void NewGVN::verifyStoreExpressions() const {
+#ifndef NDEBUG
+ // This is the only use of this, and it's not worth defining a complicated
+ // densemapinfo hash/equality function for it.
+ std::set<
+ std::pair<const Value *,
+ std::tuple<const Value *, const CongruenceClass *, Value *>>>
+ StoreExpressionSet;
+ for (const auto &KV : ExpressionToClass) {
+ if (auto *SE = dyn_cast<StoreExpression>(KV.first)) {
+ // Make sure a version that will conflict with loads is not already there
+ auto Res = StoreExpressionSet.insert(
+ {SE->getOperand(0), std::make_tuple(SE->getMemoryLeader(), KV.second,
+ SE->getStoredValue())});
+ bool Okay = Res.second;
+ // It's okay to have the same expression already in there if it is
+ // identical in nature.
+ // This can happen when the leader of the stored value changes over time.
+ if (!Okay)
+ Okay = (std::get<1>(Res.first->second) == KV.second) &&
+ (lookupOperandLeader(std::get<2>(Res.first->second)) ==
+ lookupOperandLeader(SE->getStoredValue()));
+ assert(Okay && "Stored expression conflict exists in expression table");
+ auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst());
+ assert(ValueExpr && ValueExpr->equals(*SE) &&
+ "StoreExpression in ExpressionToClass is not latest "
+ "StoreExpression for value");
+ }
+ }
+#endif
+}
+
+// This is the main value numbering loop, it iterates over the initial touched
+// instruction set, propagating value numbers, marking things touched, etc,
+// until the set of touched instructions is completely empty.
+void NewGVN::iterateTouchedInstructions() {
+ unsigned int Iterations = 0;
+ // Figure out where TouchedInstructions starts.
+ int FirstInstr = TouchedInstructions.find_first();
+ // Nothing set, nothing to iterate, just return.
+ if (FirstInstr == -1)
+ return;
+ const BasicBlock *LastBlock = getBlockForValue(InstrFromDFSNum(FirstInstr));
+ while (TouchedInstructions.any()) {
+ ++Iterations;
+ // Walk through all the instructions in all the blocks in RPO.
+ // TODO: As we hit a new block, we should push and pop equalities into a
+ // table lookupOperandLeader can use, to catch things PredicateInfo
+ // might miss, like edge-only equivalences.
+ for (unsigned InstrNum : TouchedInstructions.set_bits()) {
+
+ // This instruction was found to be dead. We don't bother looking
+ // at it again.
+ if (InstrNum == 0) {
+ TouchedInstructions.reset(InstrNum);
+ continue;
+ }
+
+ Value *V = InstrFromDFSNum(InstrNum);
+ const BasicBlock *CurrBlock = getBlockForValue(V);
+
+ // If we hit a new block, do reachability processing.
+ if (CurrBlock != LastBlock) {
+ LastBlock = CurrBlock;
+ bool BlockReachable = ReachableBlocks.count(CurrBlock);
+ const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
+
+ // If it's not reachable, erase any touched instructions and move on.
+ if (!BlockReachable) {
+ TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
+ DEBUG(dbgs() << "Skipping instructions in block "
+ << getBlockName(CurrBlock)
+ << " because it is unreachable\n");
+ continue;
+ }
+ updateProcessedCount(CurrBlock);
+ }
+ // Reset after processing (because we may mark ourselves as touched when
+ // we propagate equalities).
+ TouchedInstructions.reset(InstrNum);
+
+ if (auto *MP = dyn_cast<MemoryPhi>(V)) {
+ DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
+ valueNumberMemoryPhi(MP);
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ valueNumberInstruction(I);
+ } else {
+ llvm_unreachable("Should have been a MemoryPhi or Instruction");
+ }
+ updateProcessedCount(V);
+ }
+ }
+ NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
}
// This is the main transformation entry point.
-bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
- TargetLibraryInfo *_TLI, AliasAnalysis *_AA,
- MemorySSA *_MSSA) {
+bool NewGVN::runGVN() {
+ if (DebugCounter::isCounterSet(VNCounter))
+ StartingVNCounter = DebugCounter::getCounterValue(VNCounter);
bool Changed = false;
- DT = _DT;
- AC = _AC;
- TLI = _TLI;
- AA = _AA;
- MSSA = _MSSA;
- DL = &F.getParent()->getDataLayout();
+ NumFuncArgs = F.arg_size();
MSSAWalker = MSSA->getWalker();
+ SingletonDeadExpression = new (ExpressionAllocator) DeadExpression();
// Count number of instructions for sizing of hash tables, and come
// up with a global dfs numbering for instructions.
unsigned ICount = 1;
// Add an empty instruction to account for the fact that we start at 1
DFSToInstr.emplace_back(nullptr);
- // Note: We want RPO traversal of the blocks, which is not quite the same as
- // dominator tree order, particularly with regard whether backedges get
- // visited first or second, given a block with multiple successors.
+ // Note: We want ideal RPO traversal of the blocks, which is not quite the
+ // same as dominator tree order, particularly with regard whether backedges
+ // get visited first or second, given a block with multiple successors.
// If we visit in the wrong order, we will end up performing N times as many
// iterations.
// The dominator tree does guarantee that, for a given dom tree node, it's
// parent must occur before it in the RPO ordering. Thus, we only need to sort
// the siblings.
- DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
ReversePostOrderTraversal<Function *> RPOT(&F);
unsigned Counter = 0;
for (auto &B : RPOT) {
@@ -1663,33 +3135,21 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
auto *Node = DT->getNode(B);
if (Node->getChildren().size() > 1)
std::sort(Node->begin(), Node->end(),
- [&RPOOrdering](const DomTreeNode *A, const DomTreeNode *B) {
+ [&](const DomTreeNode *A, const DomTreeNode *B) {
return RPOOrdering[A] < RPOOrdering[B];
});
}
// Now a standard depth first ordering of the domtree is equivalent to RPO.
- auto DFI = df_begin(DT->getRootNode());
- for (auto DFE = df_end(DT->getRootNode()); DFI != DFE; ++DFI) {
- BasicBlock *B = DFI->getBlock();
+ for (auto DTN : depth_first(DT->getRootNode())) {
+ BasicBlock *B = DTN->getBlock();
const auto &BlockRange = assignDFSNumbers(B, ICount);
BlockInstRange.insert({B, BlockRange});
ICount += BlockRange.second - BlockRange.first;
}
-
- // Handle forward unreachable blocks and figure out which blocks
- // have single preds.
- for (auto &B : F) {
- // Assign numbers to unreachable blocks.
- if (!DFI.nodeVisited(DT->getNode(&B))) {
- const auto &BlockRange = assignDFSNumbers(&B, ICount);
- BlockInstRange.insert({&B, BlockRange});
- ICount += BlockRange.second - BlockRange.first;
- }
- }
+ initializeCongruenceClasses(F);
TouchedInstructions.resize(ICount);
- DominatedInstRange.reserve(F.size());
// Ensure we don't end up resizing the expressionToClass map, as
// that can be quite expensive. At most, we have one expression per
// instruction.
@@ -1698,65 +3158,15 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
// Initialize the touched instructions to include the entry block.
const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock());
TouchedInstructions.set(InstRange.first, InstRange.second);
+ DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
+ << " marked reachable\n");
ReachableBlocks.insert(&F.getEntryBlock());
- initializeCongruenceClasses(F);
-
- unsigned int Iterations = 0;
- // We start out in the entry block.
- BasicBlock *LastBlock = &F.getEntryBlock();
- while (TouchedInstructions.any()) {
- ++Iterations;
- // Walk through all the instructions in all the blocks in RPO.
- for (int InstrNum = TouchedInstructions.find_first(); InstrNum != -1;
- InstrNum = TouchedInstructions.find_next(InstrNum)) {
- assert(InstrNum != 0 && "Bit 0 should never be set, something touched an "
- "instruction not in the lookup table");
- Value *V = DFSToInstr[InstrNum];
- BasicBlock *CurrBlock = nullptr;
-
- if (auto *I = dyn_cast<Instruction>(V))
- CurrBlock = I->getParent();
- else if (auto *MP = dyn_cast<MemoryPhi>(V))
- CurrBlock = MP->getBlock();
- else
- llvm_unreachable("DFSToInstr gave us an unknown type of instruction");
-
- // If we hit a new block, do reachability processing.
- if (CurrBlock != LastBlock) {
- LastBlock = CurrBlock;
- bool BlockReachable = ReachableBlocks.count(CurrBlock);
- const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
-
- // If it's not reachable, erase any touched instructions and move on.
- if (!BlockReachable) {
- TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
- DEBUG(dbgs() << "Skipping instructions in block "
- << getBlockName(CurrBlock)
- << " because it is unreachable\n");
- continue;
- }
- updateProcessedCount(CurrBlock);
- }
-
- if (auto *MP = dyn_cast<MemoryPhi>(V)) {
- DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
- valueNumberMemoryPhi(MP);
- } else if (auto *I = dyn_cast<Instruction>(V)) {
- valueNumberInstruction(I);
- } else {
- llvm_unreachable("Should have been a MemoryPhi or Instruction");
- }
- updateProcessedCount(V);
- // Reset after processing (because we may mark ourselves as touched when
- // we propagate equalities).
- TouchedInstructions.reset(InstrNum);
- }
- }
- NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
-#ifndef NDEBUG
+ iterateTouchedInstructions();
verifyMemoryCongruency();
-#endif
+ verifyIterationSettled(F);
+ verifyStoreExpressions();
+
Changed |= eliminateInstructions(F);
// Delete all instructions marked for deletion.
@@ -1764,7 +3174,8 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
if (!ToErase->use_empty())
ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType()));
- ToErase->eraseFromParent();
+ if (ToErase->getParent())
+ ToErase->eraseFromParent();
}
// Delete all unreachable blocks.
@@ -1783,59 +3194,15 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
return Changed;
}
-bool NewGVN::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
- return runGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
- &getAnalysis<AAResultsWrapperPass>().getAAResults(),
- &getAnalysis<MemorySSAWrapperPass>().getMSSA());
-}
-
-PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
- NewGVN Impl;
-
- // Apparently the order in which we get these results matter for
- // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
- // the same order here, just in case.
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
- bool Changed = Impl.runGVN(F, &DT, &AC, &TLI, &AA, &MSSA);
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-// Return true if V is a value that will always be available (IE can
-// be placed anywhere) in the function. We don't do globals here
-// because they are often worse to put in place.
-// TODO: Separate cost from availability
-static bool alwaysAvailable(Value *V) {
- return isa<Constant>(V) || isa<Argument>(V);
-}
-
-// Get the basic block from an instruction/value.
-static BasicBlock *getBlockForValue(Value *V) {
- if (auto *I = dyn_cast<Instruction>(V))
- return I->getParent();
- return nullptr;
-}
-
struct NewGVN::ValueDFS {
int DFSIn = 0;
int DFSOut = 0;
int LocalNum = 0;
- // Only one of these will be set.
- Value *Val = nullptr;
+ // Only one of Def and U will be set.
+ // The bool in the Def tells us whether the Def is the stored value of a
+ // store.
+ PointerIntPair<Value *, 1, bool> Def;
Use *U = nullptr;
-
bool operator<(const ValueDFS &Other) const {
// It's not enough that any given field be less than - we have sets
// of fields that need to be evaluated together to give a proper ordering.
@@ -1875,89 +3242,163 @@ struct NewGVN::ValueDFS {
// but .val and .u.
// It does not matter what order we replace these operands in.
// You will always end up with the same IR, and this is guaranteed.
- return std::tie(DFSIn, DFSOut, LocalNum, Val, U) <
- std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Val,
+ return std::tie(DFSIn, DFSOut, LocalNum, Def, U) <
+ std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Def,
Other.U);
}
};
-void NewGVN::convertDenseToDFSOrdered(
- CongruenceClass::MemberSet &Dense,
- SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
+// This function converts the set of members for a congruence class from
+// values to sets of defs and uses with associated DFS info. The total number
+// of reachable uses for each value is stored in UseCounts, and instructions
+// that seem dead (have no non-dead uses) are stored in
+// ProbablyDead.
+void NewGVN::convertClassToDFSOrdered(
+ const CongruenceClass &Dense, SmallVectorImpl<ValueDFS> &DFSOrderedSet,
+ DenseMap<const Value *, unsigned int> &UseCounts,
+ SmallPtrSetImpl<Instruction *> &ProbablyDead) const {
for (auto D : Dense) {
// First add the value.
BasicBlock *BB = getBlockForValue(D);
// Constants are handled prior to ever calling this function, so
// we should only be left with instructions as members.
assert(BB && "Should have figured out a basic block for value");
- ValueDFS VD;
-
- std::pair<int, int> DFSPair = DFSDomMap[BB];
- assert(DFSPair.first != -1 && DFSPair.second != -1 && "Invalid DFS Pair");
- VD.DFSIn = DFSPair.first;
- VD.DFSOut = DFSPair.second;
- VD.Val = D;
- // If it's an instruction, use the real local dfs number.
- if (auto *I = dyn_cast<Instruction>(D))
- VD.LocalNum = InstrDFS[I];
- else
- llvm_unreachable("Should have been an instruction");
-
- DFSOrderedSet.emplace_back(VD);
+ ValueDFS VDDef;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VDDef.DFSIn = DomNode->getDFSNumIn();
+ VDDef.DFSOut = DomNode->getDFSNumOut();
+ // If it's a store, use the leader of the value operand, if it's always
+ // available, or the value operand. TODO: We could do dominance checks to
+ // find a dominating leader, but not worth it ATM.
+ if (auto *SI = dyn_cast<StoreInst>(D)) {
+ auto Leader = lookupOperandLeader(SI->getValueOperand());
+ if (alwaysAvailable(Leader)) {
+ VDDef.Def.setPointer(Leader);
+ } else {
+ VDDef.Def.setPointer(SI->getValueOperand());
+ VDDef.Def.setInt(true);
+ }
+ } else {
+ VDDef.Def.setPointer(D);
+ }
+ assert(isa<Instruction>(D) &&
+ "The dense set member should always be an instruction");
+ Instruction *Def = cast<Instruction>(D);
+ VDDef.LocalNum = InstrToDFSNum(D);
+ DFSOrderedSet.push_back(VDDef);
+ // If there is a phi node equivalent, add it
+ if (auto *PN = RealToTemp.lookup(Def)) {
+ auto *PHIE =
+ dyn_cast_or_null<PHIExpression>(ValueToExpression.lookup(Def));
+ if (PHIE) {
+ VDDef.Def.setInt(false);
+ VDDef.Def.setPointer(PN);
+ VDDef.LocalNum = 0;
+ DFSOrderedSet.push_back(VDDef);
+ }
+ }
- // Now add the users.
- for (auto &U : D->uses()) {
+ unsigned int UseCount = 0;
+ // Now add the uses.
+ for (auto &U : Def->uses()) {
if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- ValueDFS VD;
+ // Don't try to replace into dead uses
+ if (InstructionsToErase.count(I))
+ continue;
+ ValueDFS VDUse;
// Put the phi node uses in the incoming block.
BasicBlock *IBlock;
if (auto *P = dyn_cast<PHINode>(I)) {
IBlock = P->getIncomingBlock(U);
// Make phi node users appear last in the incoming block
// they are from.
- VD.LocalNum = InstrDFS.size() + 1;
+ VDUse.LocalNum = InstrDFS.size() + 1;
} else {
- IBlock = I->getParent();
- VD.LocalNum = InstrDFS[I];
+ IBlock = getBlockForValue(I);
+ VDUse.LocalNum = InstrToDFSNum(I);
}
- std::pair<int, int> DFSPair = DFSDomMap[IBlock];
- VD.DFSIn = DFSPair.first;
- VD.DFSOut = DFSPair.second;
- VD.U = &U;
- DFSOrderedSet.emplace_back(VD);
+
+ // Skip uses in unreachable blocks, as we're going
+ // to delete them.
+ if (ReachableBlocks.count(IBlock) == 0)
+ continue;
+
+ DomTreeNode *DomNode = DT->getNode(IBlock);
+ VDUse.DFSIn = DomNode->getDFSNumIn();
+ VDUse.DFSOut = DomNode->getDFSNumOut();
+ VDUse.U = &U;
+ ++UseCount;
+ DFSOrderedSet.emplace_back(VDUse);
}
}
+
+ // If there are no uses, it's probably dead (but it may have side-effects,
+ // so not definitely dead). Otherwise, store the number of uses so we can
+ // track if it becomes dead later.
+ if (UseCount == 0)
+ ProbablyDead.insert(Def);
+ else
+ UseCounts[Def] = UseCount;
}
}
-static void patchReplacementInstruction(Instruction *I, Value *Repl) {
- // Patch the replacement so that it is not more restrictive than the value
- // being replaced.
- auto *Op = dyn_cast<BinaryOperator>(I);
- auto *ReplOp = dyn_cast<BinaryOperator>(Repl);
+// This function converts the set of members for a congruence class from values,
+// to the set of defs for loads and stores, with associated DFS info.
+void NewGVN::convertClassToLoadsAndStores(
+ const CongruenceClass &Dense,
+ SmallVectorImpl<ValueDFS> &LoadsAndStores) const {
+ for (auto D : Dense) {
+ if (!isa<LoadInst>(D) && !isa<StoreInst>(D))
+ continue;
- if (Op && ReplOp)
- ReplOp->andIRFlags(Op);
+ BasicBlock *BB = getBlockForValue(D);
+ ValueDFS VD;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.Def.setPointer(D);
- if (auto *ReplInst = dyn_cast<Instruction>(Repl)) {
- // FIXME: If both the original and replacement value are part of the
- // same control-flow region (meaning that the execution of one
- // guarentees the executation of the other), then we can combine the
- // noalias scopes here and do better than the general conservative
- // answer used in combineMetadata().
+ // If it's an instruction, use the real local dfs number.
+ if (auto *I = dyn_cast<Instruction>(D))
+ VD.LocalNum = InstrToDFSNum(I);
+ else
+ llvm_unreachable("Should have been an instruction");
- // In general, GVN unifies expressions over different control-flow
- // regions, and so we need a conservative combination of the noalias
- // scopes.
- unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias, LLVMContext::MD_range,
- LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
- LLVMContext::MD_invariant_group};
- combineMetadata(ReplInst, I, KnownIDs);
+ LoadsAndStores.emplace_back(VD);
}
}
+static void patchReplacementInstruction(Instruction *I, Value *Repl) {
+ auto *ReplInst = dyn_cast<Instruction>(Repl);
+ if (!ReplInst)
+ return;
+
+ // Patch the replacement so that it is not more restrictive than the value
+ // being replaced.
+ // Note that if 'I' is a load being replaced by some operation,
+ // for example, by an arithmetic operation, then andIRFlags()
+ // would just erase all math flags from the original arithmetic
+ // operation, which is clearly not wanted and not needed.
+ if (!isa<LoadInst>(I))
+ ReplInst->andIRFlags(I);
+
+ // FIXME: If both the original and replacement value are part of the
+ // same control-flow region (meaning that the execution of one
+ // guarantees the execution of the other), then we can combine the
+ // noalias scopes here and do better than the general conservative
+ // answer used in combineMetadata().
+
+ // In general, GVN unifies expressions over different control-flow
+ // regions, and so we need a conservative combination of the noalias
+ // scopes.
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(ReplInst, I, KnownIDs);
+}
+
static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
patchReplacementInstruction(I, Repl);
I->replaceAllUsesWith(Repl);
@@ -1967,10 +3408,6 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
++NumGVNBlocksDeleted;
- // Check to see if there are non-terminating instructions to delete.
- if (isa<TerminatorInst>(BB->begin()))
- return;
-
// Delete the instructions backwards, as it has a reduced likelihood of having
// to update as many def-use and use-def chains. Start after the terminator.
auto StartPoint = BB->rbegin();
@@ -1987,6 +3424,11 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
Inst.eraseFromParent();
++NumGVNInstrDeleted;
}
+ // Now insert something that simplifycfg will turn into an unreachable.
+ Type *Int8Ty = Type::getInt8Ty(BB->getContext());
+ new StoreInst(UndefValue::get(Int8Ty),
+ Constant::getNullValue(Int8Ty->getPointerTo()),
+ BB->getTerminator());
}
void NewGVN::markInstructionForDeletion(Instruction *I) {
@@ -2042,6 +3484,37 @@ private:
};
}
+// Given a value and a basic block we are trying to see if it is available in,
+// see if the value has a leader available in that block.
+Value *NewGVN::findPhiOfOpsLeader(const Expression *E,
+ const BasicBlock *BB) const {
+ // It would already be constant if we could make it constant
+ if (auto *CE = dyn_cast<ConstantExpression>(E))
+ return CE->getConstantValue();
+ if (auto *VE = dyn_cast<VariableExpression>(E))
+ return VE->getVariableValue();
+
+ auto *CC = ExpressionToClass.lookup(E);
+ if (!CC)
+ return nullptr;
+ if (alwaysAvailable(CC->getLeader()))
+ return CC->getLeader();
+
+ for (auto Member : *CC) {
+ auto *MemberInst = dyn_cast<Instruction>(Member);
+ // Anything that isn't an instruction is always available.
+ if (!MemberInst)
+ return Member;
+ // If we are looking for something in the same block as the member, it must
+ // be a leader because this function is looking for operands for a phi node.
+ if (MemberInst->getParent() == BB ||
+ DT->dominates(MemberInst->getParent(), BB)) {
+ return Member;
+ }
+ }
+ return nullptr;
+}
+
bool NewGVN::eliminateInstructions(Function &F) {
// This is a non-standard eliminator. The normal way to eliminate is
// to walk the dominator tree in order, keeping track of available
@@ -2072,73 +3545,91 @@ bool NewGVN::eliminateInstructions(Function &F) {
// DFS numbers are updated, we compute some ourselves.
DT->updateDFSNumbers();
- for (auto &B : F) {
- if (!ReachableBlocks.count(&B)) {
- for (const auto S : successors(&B)) {
- for (auto II = S->begin(); isa<PHINode>(II); ++II) {
- auto &Phi = cast<PHINode>(*II);
- DEBUG(dbgs() << "Replacing incoming value of " << *II << " for block "
- << getBlockName(&B)
- << " with undef due to it being unreachable\n");
- for (auto &Operand : Phi.incoming_values())
- if (Phi.getIncomingBlock(Operand) == &B)
- Operand.set(UndefValue::get(Phi.getType()));
- }
+ // Go through all of our phi nodes, and kill the arguments associated with
+ // unreachable edges.
+ auto ReplaceUnreachablePHIArgs = [&](PHINode &PHI, BasicBlock *BB) {
+ for (auto &Operand : PHI.incoming_values())
+ if (!ReachableEdges.count({PHI.getIncomingBlock(Operand), BB})) {
+ DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block "
+ << getBlockName(PHI.getIncomingBlock(Operand))
+ << " with undef due to it being unreachable\n");
+ Operand.set(UndefValue::get(PHI.getType()));
}
+ };
+ SmallPtrSet<BasicBlock *, 8> BlocksWithPhis;
+ for (auto &B : F)
+ if ((!B.empty() && isa<PHINode>(*B.begin())) ||
+ (PHIOfOpsPHIs.find(&B) != PHIOfOpsPHIs.end()))
+ BlocksWithPhis.insert(&B);
+ DenseMap<const BasicBlock *, unsigned> ReachablePredCount;
+ for (auto KV : ReachableEdges)
+ ReachablePredCount[KV.getEnd()]++;
+ for (auto *BB : BlocksWithPhis)
+ // TODO: It would be faster to use getNumIncomingBlocks() on a phi node in
+ // the block and subtract the pred count, but it's more complicated.
+ if (ReachablePredCount.lookup(BB) !=
+ unsigned(std::distance(pred_begin(BB), pred_end(BB)))) {
+ for (auto II = BB->begin(); isa<PHINode>(II); ++II) {
+ auto &PHI = cast<PHINode>(*II);
+ ReplaceUnreachablePHIArgs(PHI, BB);
+ }
+ for_each_found(PHIOfOpsPHIs, BB, [&](PHINode *PHI) {
+ ReplaceUnreachablePHIArgs(*PHI, BB);
+ });
}
- DomTreeNode *Node = DT->getNode(&B);
- if (Node)
- DFSDomMap[&B] = {Node->getDFSNumIn(), Node->getDFSNumOut()};
- }
- for (CongruenceClass *CC : CongruenceClasses) {
- // FIXME: We should eventually be able to replace everything still
- // in the initial class with undef, as they should be unreachable.
- // Right now, initial still contains some things we skip value
- // numbering of (UNREACHABLE's, for example).
- if (CC == InitialClass || CC->Dead)
+ // Map to store the use counts
+ DenseMap<const Value *, unsigned int> UseCounts;
+ for (auto *CC : reverse(CongruenceClasses)) {
+ DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() << "\n");
+ // Track the equivalent store info so we can decide whether to try
+ // dead store elimination.
+ SmallVector<ValueDFS, 8> PossibleDeadStores;
+ SmallPtrSet<Instruction *, 8> ProbablyDead;
+ if (CC->isDead() || CC->empty())
continue;
- assert(CC->RepLeader && "We should have had a leader");
+ // Everything still in the TOP class is unreachable or dead.
+ if (CC == TOPClass) {
+ for (auto M : *CC) {
+ auto *VTE = ValueToExpression.lookup(M);
+ if (VTE && isa<DeadExpression>(VTE))
+ markInstructionForDeletion(cast<Instruction>(M));
+ assert((!ReachableBlocks.count(cast<Instruction>(M)->getParent()) ||
+ InstructionsToErase.count(cast<Instruction>(M))) &&
+ "Everything in TOP should be unreachable or dead at this "
+ "point");
+ }
+ continue;
+ }
+ assert(CC->getLeader() && "We should have had a leader");
// If this is a leader that is always available, and it's a
// constant or has no equivalences, just replace everything with
// it. We then update the congruence class with whatever members
// are left.
- if (alwaysAvailable(CC->RepLeader)) {
- SmallPtrSet<Value *, 4> MembersLeft;
- for (auto M : CC->Members) {
-
+ Value *Leader =
+ CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ if (alwaysAvailable(Leader)) {
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto M : *CC) {
Value *Member = M;
-
// Void things have no uses we can replace.
- if (Member == CC->RepLeader || Member->getType()->isVoidTy()) {
+ if (Member == Leader || !isa<Instruction>(Member) ||
+ Member->getType()->isVoidTy()) {
MembersLeft.insert(Member);
continue;
}
-
- DEBUG(dbgs() << "Found replacement " << *(CC->RepLeader) << " for "
- << *Member << "\n");
- // Due to equality propagation, these may not always be
- // instructions, they may be real values. We don't really
- // care about trying to replace the non-instructions.
- if (auto *I = dyn_cast<Instruction>(Member)) {
- assert(CC->RepLeader != I &&
- "About to accidentally remove our leader");
- replaceInstruction(I, CC->RepLeader);
- AnythingReplaced = true;
-
- continue;
- } else {
- MembersLeft.insert(I);
- }
+ DEBUG(dbgs() << "Found replacement " << *(Leader) << " for " << *Member
+ << "\n");
+ auto *I = cast<Instruction>(Member);
+ assert(Leader != I && "About to accidentally remove our leader");
+ replaceInstruction(I, Leader);
+ AnythingReplaced = true;
}
- CC->Members.swap(MembersLeft);
-
+ CC->swap(MembersLeft);
} else {
- DEBUG(dbgs() << "Eliminating in congruence class " << CC->ID << "\n");
// If this is a singleton, we can skip it.
- if (CC->Members.size() != 1) {
-
+ if (CC->size() != 1 || RealToTemp.lookup(Leader)) {
// This is a stack because equality replacement/etc may place
// constants in the middle of the member list, and we want to use
// those constant values in preference to the current leader, over
@@ -2147,23 +3638,34 @@ bool NewGVN::eliminateInstructions(Function &F) {
// Convert the members to DFS ordered sets and then merge them.
SmallVector<ValueDFS, 8> DFSOrderedSet;
- convertDenseToDFSOrdered(CC->Members, DFSOrderedSet);
+ convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead);
// Sort the whole thing.
std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
-
for (auto &VD : DFSOrderedSet) {
int MemberDFSIn = VD.DFSIn;
int MemberDFSOut = VD.DFSOut;
- Value *Member = VD.Val;
- Use *MemberUse = VD.U;
-
- if (Member) {
- // We ignore void things because we can't get a value from them.
- // FIXME: We could actually use this to kill dead stores that are
- // dominated by equivalent earlier stores.
- if (Member->getType()->isVoidTy())
- continue;
+ Value *Def = VD.Def.getPointer();
+ bool FromStore = VD.Def.getInt();
+ Use *U = VD.U;
+ // We ignore void things because we can't get a value from them.
+ if (Def && Def->getType()->isVoidTy())
+ continue;
+ auto *DefInst = dyn_cast_or_null<Instruction>(Def);
+ if (DefInst && AllTempInstructions.count(DefInst)) {
+ auto *PN = cast<PHINode>(DefInst);
+
+ // If this is a value phi and that's the expression we used,
+ // insert it into the program and remove it from the temp
+ // instruction list.
+ AllTempInstructions.erase(PN);
+ auto *DefBlock = getBlockForValue(Def);
+ DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def
+ << " into block "
+ << getBlockName(getBlockForValue(Def)) << "\n");
+ PN->insertBefore(&DefBlock->front());
+ Def = PN;
+ NumGVNPHIOfOpsEliminations++;
}
if (EliminationStack.empty()) {
@@ -2189,69 +3691,251 @@ bool NewGVN::eliminateInstructions(Function &F) {
// start using, we also push.
// Otherwise, we walk along, processing members who are
// dominated by this scope, and eliminate them.
- bool ShouldPush =
- Member && (EliminationStack.empty() || isa<Constant>(Member));
+ bool ShouldPush = Def && EliminationStack.empty();
bool OutOfScope =
!EliminationStack.isInScope(MemberDFSIn, MemberDFSOut);
if (OutOfScope || ShouldPush) {
// Sync to our current scope.
EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
- ShouldPush |= Member && EliminationStack.empty();
+ bool ShouldPush = Def && EliminationStack.empty();
if (ShouldPush) {
- EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
+ EliminationStack.push_back(Def, MemberDFSIn, MemberDFSOut);
+ }
+ }
+
+ // Skip the Def's, we only want to eliminate on their uses. But mark
+ // dominated defs as dead.
+ if (Def) {
+ // For anything in this case, what and how we value number
+ // guarantees that any side-effects that would have occurred (ie
+ // throwing, etc) can be proven to either still occur (because it's
+ // dominated by something that has the same side-effects), or never
+ // occur. Otherwise, we would not have been able to prove it value
+ // equivalent to something else. For these things, we can just mark
+ // it all dead. Note that this is different from the "ProbablyDead"
+ // set, which may not be dominated by anything, and thus, are only
+ // easy to prove dead if they are also side-effect free. Note that
+ // because stores are put in terms of the stored value, we skip
+ // stored values here. If the stored value is really dead, it will
+ // still be marked for deletion when we process it in its own class.
+ if (!EliminationStack.empty() && Def != EliminationStack.back() &&
+ isa<Instruction>(Def) && !FromStore)
+ markInstructionForDeletion(cast<Instruction>(Def));
+ continue;
+ }
+ // At this point, we know it is a Use we are trying to possibly
+ // replace.
+
+ assert(isa<Instruction>(U->get()) &&
+ "Current def should have been an instruction");
+ assert(isa<Instruction>(U->getUser()) &&
+ "Current user should have been an instruction");
+
+ // If the thing we are replacing into is already marked to be dead,
+ // this use is dead. Note that this is true regardless of whether
+ // we have anything dominating the use or not. We do this here
+ // because we are already walking all the uses anyway.
+ Instruction *InstUse = cast<Instruction>(U->getUser());
+ if (InstructionsToErase.count(InstUse)) {
+ auto &UseCount = UseCounts[U->get()];
+ if (--UseCount == 0) {
+ ProbablyDead.insert(cast<Instruction>(U->get()));
}
}
// If we get to this point, and the stack is empty we must have a use
- // with nothing we can use to eliminate it, just skip it.
+ // with nothing we can use to eliminate this use, so just skip it.
if (EliminationStack.empty())
continue;
- // Skip the Value's, we only want to eliminate on their uses.
- if (Member)
- continue;
- Value *Result = EliminationStack.back();
+ Value *DominatingLeader = EliminationStack.back();
+
+ auto *II = dyn_cast<IntrinsicInst>(DominatingLeader);
+ if (II && II->getIntrinsicID() == Intrinsic::ssa_copy)
+ DominatingLeader = II->getOperand(0);
// Don't replace our existing users with ourselves.
- if (MemberUse->get() == Result)
+ if (U->get() == DominatingLeader)
continue;
-
- DEBUG(dbgs() << "Found replacement " << *Result << " for "
- << *MemberUse->get() << " in " << *(MemberUse->getUser())
- << "\n");
+ DEBUG(dbgs() << "Found replacement " << *DominatingLeader << " for "
+ << *U->get() << " in " << *(U->getUser()) << "\n");
// If we replaced something in an instruction, handle the patching of
- // metadata.
- if (auto *ReplacedInst = dyn_cast<Instruction>(MemberUse->get()))
- patchReplacementInstruction(ReplacedInst, Result);
-
- assert(isa<Instruction>(MemberUse->getUser()));
- MemberUse->set(Result);
+ // metadata. Skip this if we are replacing predicateinfo with its
+ // original operand, as we already know we can just drop it.
+ auto *ReplacedInst = cast<Instruction>(U->get());
+ auto *PI = PredInfo->getPredicateInfoFor(ReplacedInst);
+ if (!PI || DominatingLeader != PI->OriginalOp)
+ patchReplacementInstruction(ReplacedInst, DominatingLeader);
+ U->set(DominatingLeader);
+ // This is now a use of the dominating leader, which means if the
+ // dominating leader was dead, it's now live!
+ auto &LeaderUseCount = UseCounts[DominatingLeader];
+ // It's about to be alive again.
+ if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
+ ProbablyDead.erase(cast<Instruction>(DominatingLeader));
+ if (LeaderUseCount == 0 && II)
+ ProbablyDead.insert(II);
+ ++LeaderUseCount;
AnythingReplaced = true;
}
}
}
+ // At this point, anything still in the ProbablyDead set is actually dead if
+ // it would be trivially dead.
+ for (auto *I : ProbablyDead)
+ if (wouldInstructionBeTriviallyDead(I))
+ markInstructionForDeletion(I);
+
// Cleanup the congruence class.
- SmallPtrSet<Value *, 4> MembersLeft;
- for (Value *Member : CC->Members) {
- if (Member->getType()->isVoidTy()) {
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto *Member : *CC)
+ if (!isa<Instruction>(Member) ||
+ !InstructionsToErase.count(cast<Instruction>(Member)))
MembersLeft.insert(Member);
- continue;
- }
-
- if (auto *MemberInst = dyn_cast<Instruction>(Member)) {
- if (isInstructionTriviallyDead(MemberInst)) {
- // TODO: Don't mark loads of undefs.
- markInstructionForDeletion(MemberInst);
- continue;
+ CC->swap(MembersLeft);
+
+ // If we have possible dead stores to look at, try to eliminate them.
+ if (CC->getStoreCount() > 0) {
+ convertClassToLoadsAndStores(*CC, PossibleDeadStores);
+ std::sort(PossibleDeadStores.begin(), PossibleDeadStores.end());
+ ValueDFSStack EliminationStack;
+ for (auto &VD : PossibleDeadStores) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Instruction *Member = cast<Instruction>(VD.Def.getPointer());
+ if (EliminationStack.empty() ||
+ !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut)) {
+ // Sync to our current scope.
+ EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
+ if (EliminationStack.empty()) {
+ EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
+ continue;
+ }
}
+ // We already did load elimination, so nothing to do here.
+ if (isa<LoadInst>(Member))
+ continue;
+ assert(!EliminationStack.empty());
+ Instruction *Leader = cast<Instruction>(EliminationStack.back());
+ (void)Leader;
+ assert(DT->dominates(Leader->getParent(), Member->getParent()));
+ // Member is dominated by Leader, and thus dead
+ DEBUG(dbgs() << "Marking dead store " << *Member
+ << " that is dominated by " << *Leader << "\n");
+ markInstructionForDeletion(Member);
+ CC->erase(Member);
+ ++NumGVNDeadStores;
}
- MembersLeft.insert(Member);
}
- CC->Members.swap(MembersLeft);
}
-
return AnythingReplaced;
}
+
+// Computes a global rank for V so that operands can be placed in a canonical
+// order. Note that rank alone is not necessarily enough for a complete
+// ordering, as plain constants all share rank 0. However, generally, we will
+// simplify an operation with all constants so that it doesn't matter what
+// order they appear in.
+unsigned int NewGVN::getRank(const Value *V) const {
+ // Rank order: plain constants (0), then undef (1), then constantexprs (2).
+ // Undef is itself a constant, so it has to be tested before the generic
+ // Constant check; the ConstantExpr test is likewise performed first.
+ if (isa<ConstantExpr>(V))
+ return 2;
+ if (isa<UndefValue>(V))
+ return 1;
+ if (isa<Constant>(V))
+ return 0;
+ else if (auto *A = dyn_cast<Argument>(V))
+ return 3 + A->getArgNo(); // Arguments start at rank 3, ordered by position.
+
+ // Need to shift the instruction DFS number by 4 + NumFuncArgs (see the
+ // return below) to account for the constant and argument ranking above.
+ unsigned Result = InstrToDFSNum(V);
+ if (Result > 0)
+ return 4 + NumFuncArgs + Result;
+ // Unreachable or something else, just return a really large number.
+ return ~0;
+}
+
+// Returns true when the operands A and B of a commutative operation should
+// have their order swapped when canonicalizing.
+bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
+ // Because we only care about a total ordering, and don't rewrite expressions
+ // in this order, we order by rank, which will give a strict weak ordering to
+ // everything but constants, and then we break ties by pointer address.
+ return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
+}
+
+namespace {
+class NewGVNLegacyPass : public FunctionPass { // FunctionPass wrapper around NewGVN.
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ NewGVNLegacyPass() : FunctionPass(ID) {
+ initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override; // Defined out of line below.
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>(); // Required analyses: each is fetched in runOnFunction.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>(); // Declared as preserved by this pass.
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // namespace
+
+bool NewGVNLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F)) // Bail out with false when skipFunction(F) says so.
+ return false;
+ return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<MemorySSAWrapperPass>().getMSSA(),
+ F.getParent()->getDataLayout())
+ .runGVN(); // The bool produced by runGVN() is returned unchanged.
+}
+
+INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) // Only preserved, not required, in getAnalysisUsage.
+INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false,
+ false)
+
+char NewGVNLegacyPass::ID = 0; // Definition of the static ID declared in the class.
+
+// createNewGVNPass - The public interface to this file.
+FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); }
+
+PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
+ // Apparently the order in which we get these results matters for
+ // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
+ // the same order here, just in case.
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ bool Changed =
+ NewGVN(F, &DT, &AC, &TLI, &AA, &MSSA, F.getParent()->getDataLayout())
+ .runGVN();
+ if (!Changed) // runGVN() reported no change: report every analysis as preserved.
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA; // Otherwise only the analyses marked below are preserved.
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 1a7ddc9..1bfecea 100644
--- a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -66,7 +66,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
// Add attribute "readnone" so that backend can use a native sqrt instruction
// for this call. Insert a FP compare instruction and a conditional branch
// at the end of CurrBB.
- Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+ Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
CurrBB.getTerminator()->eraseFromParent();
Builder.SetInsertPoint(&CurrBB);
Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
@@ -98,14 +98,14 @@ static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI,
// Skip if function either has local linkage or is not a known library
// function.
- LibFunc::Func LibFunc;
+ LibFunc LF;
if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
- !TLI->getLibFunc(CalledFunc->getName(), LibFunc))
+ !TLI->getLibFunc(CalledFunc->getName(), LF))
continue;
- switch (LibFunc) {
- case LibFunc::sqrtf:
- case LibFunc::sqrt:
+ switch (LF) {
+ case LibFunc_sqrtf:
+ case LibFunc_sqrt:
if (TTI->haveFastSqrt(Call->getType()) &&
optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
break;
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
index 65c814d..e235e5eb 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -35,6 +35,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
@@ -106,11 +107,12 @@ XorOpnd::XorOpnd(Value *V) {
I->getOpcode() == Instruction::And)) {
Value *V0 = I->getOperand(0);
Value *V1 = I->getOperand(1);
- if (isa<ConstantInt>(V0))
+ const APInt *C;
+ if (match(V0, PatternMatch::m_APInt(C)))
std::swap(V0, V1);
- if (ConstantInt *C = dyn_cast<ConstantInt>(V1)) {
- ConstPart = C->getValue();
+ if (match(V1, PatternMatch::m_APInt(C))) {
+ ConstPart = *C;
SymbolicPart = V0;
isOr = (I->getOpcode() == Instruction::Or);
return;
@@ -119,7 +121,7 @@ XorOpnd::XorOpnd(Value *V) {
// view the operand as "V | 0"
SymbolicPart = V;
- ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth());
+ ConstPart = APInt::getNullValue(V->getType()->getScalarSizeInBits());
isOr = true;
}
@@ -955,8 +957,8 @@ static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
/// Scan backwards and forwards among values with the same rank as element i
/// to see if X exists. If X does not exist, return i. This is useful when
/// scanning for 'x' when we see '-x' because they both get the same rank.
-static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i,
- Value *X) {
+static unsigned FindInOperandList(const SmallVectorImpl<ValueEntry> &Ops,
+ unsigned i, Value *X) {
unsigned XRank = Ops[i].Rank;
unsigned e = Ops.size();
for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) {
@@ -982,7 +984,7 @@ static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i,
/// Emit a tree of add instructions, summing Ops together
/// and returning the result. Insert the tree before I.
static Value *EmitAddTreeOfValues(Instruction *I,
- SmallVectorImpl<WeakVH> &Ops){
+ SmallVectorImpl<WeakTrackingVH> &Ops) {
if (Ops.size() == 1) return Ops.back();
Value *V1 = Ops.back();
@@ -1069,8 +1071,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
///
/// Ops is the top-level list of add operands we're trying to factor.
static void FindSingleUseMultiplyFactors(Value *V,
- SmallVectorImpl<Value*> &Factors,
- const SmallVectorImpl<ValueEntry> &Ops) {
+ SmallVectorImpl<Value*> &Factors) {
BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
if (!BO) {
Factors.push_back(V);
@@ -1078,8 +1079,8 @@ static void FindSingleUseMultiplyFactors(Value *V,
}
// Otherwise, add the LHS and RHS to the list of factors.
- FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops);
- FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops);
+ FindSingleUseMultiplyFactors(BO->getOperand(1), Factors);
+ FindSingleUseMultiplyFactors(BO->getOperand(0), Factors);
}
/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
@@ -1135,20 +1136,19 @@ static Value *OptimizeAndOrXor(unsigned Opcode,
/// instruction. There are two special cases: 1) if the constant operand is 0,
/// it will return NULL. 2) if the constant is ~0, the symbolic operand will
/// be returned.
-static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
+static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
const APInt &ConstOpnd) {
- if (ConstOpnd != 0) {
- if (!ConstOpnd.isAllOnesValue()) {
- LLVMContext &Ctx = Opnd->getType()->getContext();
- Instruction *I;
- I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd),
- "and.ra", InsertBefore);
- I->setDebugLoc(InsertBefore->getDebugLoc());
- return I;
- }
+ if (ConstOpnd.isNullValue())
+ return nullptr;
+
+ if (ConstOpnd.isAllOnesValue())
return Opnd;
- }
- return nullptr;
+
+ Instruction *I = BinaryOperator::CreateAnd(
+ Opnd, ConstantInt::get(Opnd->getType(), ConstOpnd), "and.ra",
+ InsertBefore);
+ I->setDebugLoc(InsertBefore->getDebugLoc());
+ return I;
}
// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
@@ -1164,24 +1164,24 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
// = ((x | c1) ^ c1) ^ (c1 ^ c2)
// = (x & ~c1) ^ (c1 ^ c2)
// It is useful only when c1 == c2.
- if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) {
- if (!Opnd1->getValue()->hasOneUse())
- return false;
+ if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isNullValue())
+ return false;
- const APInt &C1 = Opnd1->getConstPart();
- if (C1 != ConstOpnd)
- return false;
+ if (!Opnd1->getValue()->hasOneUse())
+ return false;
- Value *X = Opnd1->getSymbolicPart();
- Res = createAndInstr(I, X, ~C1);
- // ConstOpnd was C2, now C1 ^ C2.
- ConstOpnd ^= C1;
+ const APInt &C1 = Opnd1->getConstPart();
+ if (C1 != ConstOpnd)
+ return false;
- if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
- RedoInsts.insert(T);
- return true;
- }
- return false;
+ Value *X = Opnd1->getSymbolicPart();
+ Res = createAndInstr(I, X, ~C1);
+ // ConstOpnd was C2, now C1 ^ C2.
+ ConstOpnd ^= C1;
+
+ if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+ RedoInsts.insert(T);
+ return true;
}
@@ -1222,8 +1222,8 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
APInt C3((~C1) ^ C2);
// Do not increase code size!
- if (C3 != 0 && !C3.isAllOnesValue()) {
- int NewInstNum = ConstOpnd != 0 ? 1 : 2;
+ if (!C3.isNullValue() && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
if (NewInstNum > DeadInstNum)
return false;
}
@@ -1239,8 +1239,8 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
APInt C3 = C1 ^ C2;
// Do not increase code size
- if (C3 != 0 && !C3.isAllOnesValue()) {
- int NewInstNum = ConstOpnd != 0 ? 1 : 2;
+ if (!C3.isNullValue() && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
if (NewInstNum > DeadInstNum)
return false;
}
@@ -1280,17 +1280,20 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,
SmallVector<XorOpnd, 8> Opnds;
SmallVector<XorOpnd*, 8> OpndPtrs;
Type *Ty = Ops[0].Op->getType();
- APInt ConstOpnd(Ty->getIntegerBitWidth(), 0);
+ APInt ConstOpnd(Ty->getScalarSizeInBits(), 0);
// Step 1: Convert ValueEntry to XorOpnd
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
Value *V = Ops[i].Op;
- if (!isa<ConstantInt>(V)) {
+ const APInt *C;
+ // TODO: Support non-splat vectors.
+ if (match(V, PatternMatch::m_APInt(C))) {
+ ConstOpnd ^= *C;
+ } else {
XorOpnd O(V);
O.setSymbolicRank(getRank(O.getSymbolicPart()));
Opnds.push_back(O);
- } else
- ConstOpnd ^= cast<ConstantInt>(V)->getValue();
+ }
}
// NOTE: From this point on, do *NOT* add/delete element to/from "Opnds".
@@ -1328,7 +1331,8 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,
Value *CV;
// Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
- if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
+ if (!ConstOpnd.isNullValue() &&
+ CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
Changed = true;
if (CV)
*CurrOpnd = XorOpnd(CV);
@@ -1370,17 +1374,17 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,
ValueEntry VE(getRank(O.getValue()), O.getValue());
Ops.push_back(VE);
}
- if (ConstOpnd != 0) {
- Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd);
+ if (!ConstOpnd.isNullValue()) {
+ Value *C = ConstantInt::get(Ty, ConstOpnd);
ValueEntry VE(getRank(C), C);
Ops.push_back(VE);
}
- int Sz = Ops.size();
+ unsigned Sz = Ops.size();
if (Sz == 1)
return Ops.back().Op;
- else if (Sz == 0) {
- assert(ConstOpnd == 0);
- return ConstantInt::get(Ty->getContext(), ConstOpnd);
+ if (Sz == 0) {
+ assert(ConstOpnd.isNullValue());
+ return ConstantInt::get(Ty, ConstOpnd);
}
}
@@ -1499,7 +1503,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
// Compute all of the factors of this added value.
SmallVector<Value*, 8> Factors;
- FindSingleUseMultiplyFactors(BOp, Factors, Ops);
+ FindSingleUseMultiplyFactors(BOp, Factors);
assert(Factors.size() > 1 && "Bad linearize!");
// Add one to FactorOccurrences for each unique factor in this op.
@@ -1560,7 +1564,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
: BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);
- SmallVector<WeakVH, 4> NewMulOps;
+ SmallVector<WeakTrackingVH, 4> NewMulOps;
for (unsigned i = 0; i != Ops.size(); ++i) {
// Only try to remove factors from expressions we're allowed to.
BinaryOperator *BOp =
@@ -1583,7 +1587,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
}
// No need for extra uses anymore.
- delete DummyInst;
+ DummyInst->deleteValue();
unsigned NumAddedValues = NewMulOps.size();
Value *V = EmitAddTreeOfValues(I, NewMulOps);
@@ -1628,8 +1632,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
///
/// \returns Whether any factors have a power greater than one.
-bool ReassociatePass::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
- SmallVectorImpl<Factor> &Factors) {
+static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+ SmallVectorImpl<Factor> &Factors) {
// FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
// Compute the sum of powers of simplifiable factors.
unsigned FactorPowerSum = 0;
@@ -1890,6 +1894,8 @@ void ReassociatePass::EraseInst(Instruction *I) {
Op = Op->user_back();
RedoInsts.insert(Op);
}
+
+ MadeChange = true;
}
// Canonicalize expressions of the following form:
@@ -1923,7 +1929,7 @@ Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) {
// User must be a binary operator with one or more uses.
Instruction *User = I->user_back();
- if (!isa<BinaryOperator>(User) || !User->hasNUsesOrMore(1))
+ if (!isa<BinaryOperator>(User) || User->use_empty())
return nullptr;
unsigned UserOpcode = User->getOpcode();
@@ -1935,6 +1941,12 @@ Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) {
if (!User->isCommutative() && User->getOperand(1) != I)
return nullptr;
+ // Don't canonicalize x + (-Constant * y) -> x - (Constant * y), if the
+ // resulting subtract will be broken up later. This can get us into an
+ // infinite loop during reassociation.
+ if (UserOpcode == Instruction::FAdd && ShouldBreakUpSubtract(User))
+ return nullptr;
+
// Change the sign of the constant.
APFloat Val = CF->getValueAPF();
Val.changeSign();
@@ -2000,11 +2012,6 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
if (I->isCommutative())
canonicalizeOperands(I);
- // TODO: We should optimize vector Xor instructions, but they are
- // currently unsupported.
- if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor)
- return;
-
// Don't optimize floating point instructions that don't have unsafe algebra.
if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra())
return;
@@ -2147,7 +2154,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
if (I->getOpcode() == Instruction::Mul &&
cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
isa<ConstantInt>(Ops.back().Op) &&
- cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
+ cast<ConstantInt>(Ops.back().Op)->isMinusOne()) {
ValueEntry Tmp = Ops.pop_back_val();
Ops.insert(Ops.begin(), Tmp);
} else if (I->getOpcode() == Instruction::FMul &&
@@ -2236,8 +2243,8 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
ValueRankMap.clear();
if (MadeChange) {
- // FIXME: This should also 'preserve the CFG'.
- auto PA = PreservedAnalyses();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
index 615029d..9629568 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -16,7 +16,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -25,6 +24,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <list>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 1de7420..f19d453 100644
--- a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -7,20 +7,19 @@
//
//===----------------------------------------------------------------------===//
//
-// Rewrite an existing set of gc.statepoints such that they make potential
-// relocations performed by the garbage collector explicit in the IR.
+// Rewrite call/invoke instructions so as to make potential relocations
+// performed by the garbage collector explicit in the IR.
//
//===----------------------------------------------------------------------===//
-#include "llvm/Pass.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/ADT/SetOperations.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Dominators.h"
@@ -28,15 +27,16 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -89,10 +89,10 @@ struct RewriteStatepointsForGC : public ModulePass {
Changed |= runOnFunction(F);
if (Changed) {
- // stripNonValidAttributes asserts that shouldRewriteStatepointsIn
+ // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn
// returns true for at least one function in the module. Since at least
// one function changed, we know that the precondition is satisfied.
- stripNonValidAttributes(M);
+ stripNonValidAttributesAndMetadata(M);
}
return Changed;
@@ -105,20 +105,24 @@ struct RewriteStatepointsForGC : public ModulePass {
AU.addRequired<TargetTransformInfoWrapperPass>();
}
- /// The IR fed into RewriteStatepointsForGC may have had attributes implying
- /// dereferenceability that are no longer valid/correct after
- /// RewriteStatepointsForGC has run. This is because semantically, after
+ /// The IR fed into RewriteStatepointsForGC may have had attributes and
+ /// metadata implying dereferenceability that are no longer valid/correct after
+ /// RewriteStatepointsForGC has run. This is because semantically, after
/// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
- /// heap. stripNonValidAttributes (conservatively) restores correctness
- /// by erasing all attributes in the module that externally imply
- /// dereferenceability.
- /// Similar reasoning also applies to the noalias attributes. gc.statepoint
- /// can touch the entire heap including noalias objects.
- void stripNonValidAttributes(Module &M);
-
- // Helpers for stripNonValidAttributes
- void stripNonValidAttributesFromBody(Function &F);
+ /// heap. stripNonValidAttributesAndMetadata (conservatively) restores
+ /// correctness by erasing all attributes in the module that externally imply
+ /// dereferenceability. Similar reasoning also applies to the noalias
+ /// attributes and metadata. gc.statepoint can touch the entire heap including
+ /// noalias objects.
+ void stripNonValidAttributesAndMetadata(Module &M);
+
+ // Helpers for stripNonValidAttributesAndMetadata
+ void stripNonValidAttributesAndMetadataFromBody(Function &F);
void stripNonValidAttributesFromPrototype(Function &F);
+ // Certain metadata on instructions are invalid after running RS4GC.
+ // Optimizations that run after RS4GC can incorrectly use this metadata to
+ // optimize functions. We drop such metadata on the instruction.
+ void stripInvalidMetadataFromInstruction(Instruction &I);
};
} // namespace
@@ -365,6 +369,11 @@ findBaseDefiningValueOfVector(Value *I) {
// for particular sufflevector patterns.
return BaseDefiningValueResult(I, false);
+ // The behavior of getelementptr instructions is the same for vector and
+ // non-vector data types.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ return findBaseDefiningValue(GEP->getPointerOperand());
+
// A PHI or Select is a base defining value. The outer findBasePointer
// algorithm is responsible for constructing a base value for this BDV.
assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
@@ -634,7 +643,7 @@ static BDVState meetBDVStateImpl(const BDVState &LHS, const BDVState &RHS) {
// Values of type BDVState form a lattice, and this function implements the meet
// operation.
-static BDVState meetBDVState(BDVState LHS, BDVState RHS) {
+static BDVState meetBDVState(const BDVState &LHS, const BDVState &RHS) {
BDVState Result = meetBDVStateImpl(LHS, RHS);
assert(Result == meetBDVStateImpl(RHS, LHS) &&
"Math is wrong: meet does not commute!");
@@ -1123,39 +1132,23 @@ normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
// Create new attribute set containing only attributes which can be transferred
// from original call to the safepoint.
-static AttributeSet legalizeCallAttributes(AttributeSet AS) {
- AttributeSet Ret;
-
- for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) {
- unsigned Index = AS.getSlotIndex(Slot);
-
- if (Index == AttributeSet::ReturnIndex ||
- Index == AttributeSet::FunctionIndex) {
-
- for (Attribute Attr : make_range(AS.begin(Slot), AS.end(Slot))) {
-
- // Do not allow certain attributes - just skip them
- // Safepoint can not be read only or read none.
- if (Attr.hasAttribute(Attribute::ReadNone) ||
- Attr.hasAttribute(Attribute::ReadOnly))
- continue;
-
- // These attributes control the generation of the gc.statepoint call /
- // invoke itself; and once the gc.statepoint is in place, they're of no
- // use.
- if (isStatepointDirectiveAttr(Attr))
- continue;
-
- Ret = Ret.addAttributes(
- AS.getContext(), Index,
- AttributeSet::get(AS.getContext(), Index, AttrBuilder(Attr)));
- }
- }
-
- // Just skip parameter attributes for now
- }
-
- return Ret;
+static AttributeList legalizeCallAttributes(AttributeList AL) {
+ if (AL.isEmpty())
+ return AL;
+
+ // Remove the readonly, readnone, and statepoint function attributes.
+ AttrBuilder FnAttrs = AL.getFnAttributes();
+ FnAttrs.removeAttribute(Attribute::ReadNone);
+ FnAttrs.removeAttribute(Attribute::ReadOnly);
+ for (Attribute A : AL.getFnAttributes()) {
+ if (isStatepointDirectiveAttr(A))
+ FnAttrs.remove(A);
+ }
+
+ // Just skip parameter and return attributes for now
+ LLVMContext &Ctx = AL.getContext();
+ return AttributeList::get(Ctx, AttributeList::FunctionIndex,
+ AttributeSet::get(Ctx, FnAttrs));
}
/// Helper function to place all gc relocates necessary for the given
@@ -1299,12 +1292,11 @@ static StringRef getDeoptLowering(CallSite CS) {
const char *DeoptLowering = "deopt-lowering";
if (CS.hasFnAttr(DeoptLowering)) {
// FIXME: CallSite has a *really* confusing interface around attributes
- // with values.
- const AttributeSet &CSAS = CS.getAttributes();
- if (CSAS.hasAttribute(AttributeSet::FunctionIndex,
- DeoptLowering))
- return CSAS.getAttribute(AttributeSet::FunctionIndex,
- DeoptLowering).getValueAsString();
+ // with values.
+ const AttributeList &CSAS = CS.getAttributes();
+ if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering))
+ return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering)
+ .getValueAsString();
Function *F = CS.getCalledFunction();
assert(F && F->hasFnAttribute(DeoptLowering));
return F->getFnAttribute(DeoptLowering).getValueAsString();
@@ -1388,7 +1380,6 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
// Create the statepoint given all the arguments
Instruction *Token = nullptr;
- AttributeSet ReturnAttrs;
if (CS.isCall()) {
CallInst *ToReplace = cast<CallInst>(CS.getInstruction());
CallInst *Call = Builder.CreateGCStatepointCall(
@@ -1399,12 +1390,10 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
Call->setCallingConv(ToReplace->getCallingConv());
// Currently we will fail on parameter attributes and on certain
- // function attributes.
- AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
- // In case if we can handle this set of attributes - set up function attrs
- // directly on statepoint and return attrs later for gc_result intrinsic.
- Call->setAttributes(NewAttrs.getFnAttributes());
- ReturnAttrs = NewAttrs.getRetAttributes();
+ // function attributes. If we can handle this set of attributes, set up
+ // function attrs directly on the statepoint and return attrs later on the
+ // gc_result intrinsic.
+ Call->setAttributes(legalizeCallAttributes(ToReplace->getAttributes()));
Token = Call;
@@ -1427,12 +1416,10 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
Invoke->setCallingConv(ToReplace->getCallingConv());
// Currently we will fail on parameter attributes and on certain
- // function attributes.
- AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
- // In case if we can handle this set of attributes - set up function attrs
- // directly on statepoint and return attrs later for gc_result intrinsic.
- Invoke->setAttributes(NewAttrs.getFnAttributes());
- ReturnAttrs = NewAttrs.getRetAttributes();
+ // function attributes. If we can handle this set of attributes, set up
+ // function attrs directly on the statepoint and return attrs later on the
+ // gc_result intrinsic.
+ Invoke->setAttributes(legalizeCallAttributes(ToReplace->getAttributes()));
Token = Invoke;
@@ -1478,7 +1465,9 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
StringRef Name =
CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name);
- GCResult->setAttributes(CS.getAttributes().getRetAttributes());
+ GCResult->setAttributes(
+ AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex,
+ CS.getAttributes().getRetAttributes()));
// We cannot RAUW or delete CS.getInstruction() because it could be in the
// live set of some other safepoint, in which case that safepoint's
@@ -1615,8 +1604,10 @@ static void relocationViaAlloca(
// Emit alloca for "LiveValue" and record it in "allocaMap" and
// "PromotableAllocas"
+ const DataLayout &DL = F.getParent()->getDataLayout();
auto emitAllocaFor = [&](Value *LiveValue) {
- AllocaInst *Alloca = new AllocaInst(LiveValue->getType(), "",
+ AllocaInst *Alloca = new AllocaInst(LiveValue->getType(),
+ DL.getAllocaAddrSpace(), "",
F.getEntryBlock().getFirstNonPHI());
AllocaMap[LiveValue] = Alloca;
PromotableAllocas.push_back(Alloca);
@@ -1873,7 +1864,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
"non noop cast is found during rematerialization");
Type *SrcTy = CI->getOperand(0)->getType();
- Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy);
+ Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy, CI);
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
// Cost of the address calculation
@@ -1963,7 +1954,7 @@ static void rematerializeLiveValues(CallSite CS,
// to identify the newly generated AlternateRootPhi (.base version of phi)
// and RootOfChain (the original phi node itself) are the same, so that we
// can rematerialize the gep and casts. This is a workaround for the
- // deficieny in the findBasePointer algorithm.
+ // deficiency in the findBasePointer algorithm.
if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi))
continue;
// Now that the phi nodes are proved to be the same, assert that
@@ -2003,7 +1994,7 @@ static void rematerializeLiveValues(CallSite CS,
Instruction *LastClonedValue = nullptr;
Instruction *LastValue = nullptr;
for (Instruction *Instr: ChainToBase) {
- // Only GEP's and casts are suported as we need to be careful to not
+ // Only GEP's and casts are supported as we need to be careful to not
// introduce any new uses of pointers not in the liveset.
// Note that it's fine to introduce new uses of pointers which were
// otherwise not used after this statepoint.
@@ -2107,9 +2098,9 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
// live in the IR. We'll remove all of these when done.
SmallVector<CallInst *, 64> Holders;
- // Insert a dummy call with all of the arguments to the vm_state we'll need
- // for the actual safepoint insertion. This ensures reference arguments in
- // the deopt argument list are considered live through the safepoint (and
+ // Insert a dummy call with all of the deopt operands we'll need for the
+ // actual safepoint insertion as arguments. This ensures reference operands
+ // in the deopt argument list are considered live through the safepoint (and
// thus makes sure they get relocated.)
for (CallSite CS : ToUpdate) {
SmallVector<Value *, 64> DeoptValues;
@@ -2299,12 +2290,11 @@ static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
if (AH.getDereferenceableOrNullBytes(Index))
R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull,
AH.getDereferenceableOrNullBytes(Index)));
- if (AH.doesNotAlias(Index))
+ if (AH.getAttributes().hasAttribute(Index, Attribute::NoAlias))
R.addAttribute(Attribute::NoAlias);
if (!R.empty())
- AH.setAttributes(AH.getAttributes().removeAttributes(
- Ctx, Index, AttributeSet::get(Ctx, Index, R)));
+ AH.setAttributes(AH.getAttributes().removeAttributes(Ctx, Index, R));
}
void
@@ -2313,19 +2303,51 @@ RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) {
for (Argument &A : F.args())
if (isa<PointerType>(A.getType()))
- RemoveNonValidAttrAtIndex(Ctx, F, A.getArgNo() + 1);
+ RemoveNonValidAttrAtIndex(Ctx, F,
+ A.getArgNo() + AttributeList::FirstArgIndex);
if (isa<PointerType>(F.getReturnType()))
- RemoveNonValidAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex);
+ RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex);
+}
+
+void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I) {
+
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ return;
+ // These are the metadata kinds that are still valid on loads and stores
+ // after RS4GC.
+ // The metadata implying dereferenceability and noalias are (conservatively)
+ // dropped. This is because semantically, after RewriteStatepointsForGC runs,
+ // all calls to gc.statepoint "free" the entire heap. Also, gc.statepoint can
+ // touch the entire heap including noalias objects. Note: The reasoning is
+ // same as stripping the dereferenceability and noalias attributes that are
+ // analogous to the metadata counterparts.
+ // We also drop the invariant.load metadata on the load because that metadata
+ // implies the address operand to the load points to memory that is never
+ // changed once it became dereferenceable. This is no longer true after RS4GC.
+ // Similar reasoning applies to invariant.group metadata, which applies to
+ // loads within a group.
+ unsigned ValidMetadataAfterRS4GC[] = {LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_nontemporal,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_align,
+ LLVMContext::MD_type};
+
+ // Drops all metadata on the instruction other than ValidMetadataAfterRS4GC.
+ I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
+
}
-void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
+void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) {
if (F.empty())
return;
LLVMContext &Ctx = F.getContext();
MDBuilder Builder(Ctx);
+
for (Instruction &I : instructions(F)) {
if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
@@ -2346,12 +2368,14 @@ void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA);
}
+ stripInvalidMetadataFromInstruction(I);
+
if (CallSite CS = CallSite(&I)) {
for (int i = 0, e = CS.arg_size(); i != e; i++)
if (isa<PointerType>(CS.getArgument(i)->getType()))
- RemoveNonValidAttrAtIndex(Ctx, CS, i + 1);
+ RemoveNonValidAttrAtIndex(Ctx, CS, i + AttributeList::FirstArgIndex);
if (isa<PointerType>(CS.getType()))
- RemoveNonValidAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex);
+ RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex);
}
}
}
@@ -2370,7 +2394,7 @@ static bool shouldRewriteStatepointsIn(Function &F) {
return false;
}
-void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) {
+void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) {
#ifndef NDEBUG
assert(any_of(M, shouldRewriteStatepointsIn) && "precondition!");
#endif
@@ -2379,7 +2403,7 @@ void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) {
stripNonValidAttributesFromPrototype(F);
for (Function &F : M)
- stripNonValidAttributesFromBody(F);
+ stripNonValidAttributesAndMetadataFromBody(F);
}
bool RewriteStatepointsForGC::runOnFunction(Function &F) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
index ede381c..4822cf7 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -140,6 +140,14 @@ public:
return nullptr;
}
+ /// getBlockAddress - If this is a constant with a BlockAddress value, return
+ /// it, otherwise return null.
+ BlockAddress *getBlockAddress() const {
+ if (isConstant())
+ return dyn_cast<BlockAddress>(getConstant());
+ return nullptr;
+ }
+
void markForcedConstant(Constant *V) {
assert(isUnknown() && "Can't force a defined value!");
Val.setInt(forcedconstant);
@@ -306,20 +314,14 @@ public:
return MRVFunctionsTracked;
}
- void markOverdefined(Value *V) {
- assert(!V->getType()->isStructTy() &&
- "structs should use markAnythingOverdefined");
- markOverdefined(ValueState[V], V);
- }
-
- /// markAnythingOverdefined - Mark the specified value overdefined. This
+ /// markOverdefined - Mark the specified value overdefined. This
/// works with both scalars and structs.
- void markAnythingOverdefined(Value *V) {
+ void markOverdefined(Value *V) {
if (auto *STy = dyn_cast<StructType>(V->getType()))
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
markOverdefined(getStructValueState(V, i), V);
else
- markOverdefined(V);
+ markOverdefined(ValueState[V], V);
}
// isStructLatticeConstant - Return true if all the lattice values
@@ -513,12 +515,8 @@ private:
void visitCmpInst(CmpInst &I);
void visitExtractValueInst(ExtractValueInst &EVI);
void visitInsertValueInst(InsertValueInst &IVI);
- void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); }
- void visitFuncletPadInst(FuncletPadInst &FPI) {
- markAnythingOverdefined(&FPI);
- }
void visitCatchSwitchInst(CatchSwitchInst &CPI) {
- markAnythingOverdefined(&CPI);
+ markOverdefined(&CPI);
visitTerminatorInst(CPI);
}
@@ -537,17 +535,11 @@ private:
void visitResumeInst (TerminatorInst &I) { /*returns void*/ }
void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
void visitFenceInst (FenceInst &I) { /*returns void*/ }
- void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
- markAnythingOverdefined(&I);
- }
- void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); }
- void visitAllocaInst (Instruction &I) { markOverdefined(&I); }
- void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); }
-
void visitInstruction(Instruction &I) {
- // If a new instruction is added to LLVM that we don't handle.
+ // All the instructions we don't do any special handling for just
+ // go to overdefined.
DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
- markAnythingOverdefined(&I); // Just in case
+ markOverdefined(&I);
}
};
@@ -602,14 +594,36 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
return;
}
- Succs[SI->findCaseValue(CI).getSuccessorIndex()] = true;
+ Succs[SI->findCaseValue(CI)->getSuccessorIndex()] = true;
return;
}
- // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
- if (isa<IndirectBrInst>(&TI)) {
- // Just mark all destinations executable!
- Succs.assign(TI.getNumSuccessors(), true);
+ // In case of indirect branch and its address is a blockaddress, we mark
+ // the target as executable.
+ if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
+ // Casts are folded by visitCastInst.
+ LatticeVal IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = IBRValue.getBlockAddress();
+ if (!Addr) { // Overdefined or unknown condition?
+ // All destinations are executable!
+ if (!IBRValue.isUnknown())
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ BasicBlock* T = Addr->getBasicBlock();
+ assert(Addr->getFunction() == T->getParent() &&
+ "Block address of a different function ?");
+ for (unsigned i = 0; i < IBR->getNumSuccessors(); ++i) {
+ // This is the target.
+ if (IBR->getDestination(i) == T) {
+ Succs[i] = true;
+ return;
+ }
+ }
+
+ // If we didn't find our destination in the IBR successor list, then we
+ // have undefined behavior. It's ok to assume no successor is executable.
return;
}
@@ -659,13 +673,21 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
if (!CI)
return !SCValue.isUnknown();
- return SI->findCaseValue(CI).getCaseSuccessor() == To;
+ return SI->findCaseValue(CI)->getCaseSuccessor() == To;
}
- // Just mark all destinations executable!
- // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
- if (isa<IndirectBrInst>(TI))
- return true;
+ // In case of indirect branch and its address is a blockaddress, we mark
+ // the target as executable.
+ if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+ LatticeVal IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = IBRValue.getBlockAddress();
+
+ if (!Addr)
+ return !IBRValue.isUnknown();
+
+ // At this point, the indirectbr is branching on a blockaddress.
+ return Addr->getBasicBlock() == To;
+ }
DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n');
llvm_unreachable("SCCP: Don't know how to handle this terminator!");
@@ -693,7 +715,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// If this PN returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
if (PN.getType()->isStructTy())
- return markAnythingOverdefined(&PN);
+ return markOverdefined(&PN);
if (getValueState(&PN).isOverdefined())
return; // Quick exit
@@ -803,7 +825,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
// If this returns a struct, mark all elements overdefined; we don't track
// structs in structs.
if (EVI.getType()->isStructTy())
- return markAnythingOverdefined(&EVI);
+ return markOverdefined(&EVI);
// If this is extracting from more than one level of struct, we don't know.
if (EVI.getNumIndices() != 1)
@@ -828,7 +850,7 @@ void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
// If this has more than one index, we can't handle it, drive all results to
// overdefined.
if (IVI.getNumIndices() != 1)
- return markAnythingOverdefined(&IVI);
+ return markOverdefined(&IVI);
Value *Aggr = IVI.getAggregateOperand();
unsigned Idx = *IVI.idx_begin();
@@ -857,7 +879,7 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
// If this select returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
if (I.getType()->isStructTy())
- return markAnythingOverdefined(&I);
+ return markOverdefined(&I);
LatticeVal CondValue = getValueState(I.getCondition());
if (CondValue.isUnknown())
@@ -910,9 +932,16 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// Otherwise, one of our operands is overdefined. Try to produce something
// better than overdefined with some tricks.
-
- // If this is an AND or OR with 0 or -1, it doesn't matter that the other
- // operand is overdefined.
+ // If this is 0 / Y, it doesn't matter that the second operand is
+ // overdefined, and we can replace it with zero.
+ if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv)
+ if (V1State.isConstant() && V1State.getConstant()->isNullValue())
+ return markConstant(IV, &I, V1State.getConstant());
+
+ // If this is:
+ // -> AND/MUL with 0
+ // -> OR with -1
+ // it doesn't matter that the other operand is overdefined.
if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Mul ||
I.getOpcode() == Instruction::Or) {
LatticeVal *NonOverdefVal = nullptr;
@@ -934,7 +963,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
} else {
// X or -1 = -1
if (ConstantInt *CI = NonOverdefVal->getConstantInt())
- if (CI->isAllOnesValue())
+ if (CI->isMinusOne())
return markConstant(IV, &I, NonOverdefVal->getConstant());
}
}
@@ -1021,7 +1050,7 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) {
void SCCPSolver::visitLoadInst(LoadInst &I) {
// If this load is of a struct, just mark the result overdefined.
if (I.getType()->isStructTy())
- return markAnythingOverdefined(&I);
+ return markOverdefined(&I);
LatticeVal PtrVal = getValueState(I.getOperand(0));
if (PtrVal.isUnknown()) return; // The pointer is not resolved yet!
@@ -1078,7 +1107,7 @@ CallOverdefined:
// Otherwise, if we have a single return value case, and if the function is
// a declaration, maybe we can constant fold it.
if (F && F->isDeclaration() && !I->getType()->isStructTy() &&
- canConstantFoldCallTo(F)) {
+ canConstantFoldCallTo(CS, F)) {
SmallVector<Constant*, 8> Operands;
for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
@@ -1098,7 +1127,7 @@ CallOverdefined:
// If we can constant fold this, mark the result of the call as a
// constant.
- if (Constant *C = ConstantFoldCall(F, Operands, TLI)) {
+ if (Constant *C = ConstantFoldCall(CS, F, Operands, TLI)) {
// call -> undef.
if (isa<UndefValue>(C))
return;
@@ -1107,7 +1136,7 @@ CallOverdefined:
}
// Otherwise, we don't know anything about this call, mark it overdefined.
- return markAnythingOverdefined(I);
+ return markOverdefined(I);
}
// If this is a local function that doesn't have its address taken, mark its
@@ -1483,6 +1512,31 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
return true;
}
+ if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+ // Indirect branch with no successors? It's ok to assume it branches
+ // to no target.
+ if (IBR->getNumSuccessors() < 1)
+ continue;
+
+ if (!getValueState(IBR->getAddress()).isUnknown())
+ continue;
+
+ // If the input to SCCP is actually branch on undef, fix the undef to
+ // the first successor of the indirect branch.
+ if (isa<UndefValue>(IBR->getAddress())) {
+ IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0)));
+ markEdgeExecutable(&BB, IBR->getSuccessor(0));
+ return true;
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Handle this by forcing the input value to the
+ // branch to the first successor.
+ markForcedConstant(IBR->getAddress(),
+ BlockAddress::get(IBR->getSuccessor(0)));
+ return true;
+ }
+
if (auto *SI = dyn_cast<SwitchInst>(TI)) {
if (!SI->getNumCases() || !getValueState(SI->getCondition()).isUnknown())
continue;
@@ -1490,12 +1544,12 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// If the input to SCCP is actually switch on undef, fix the undef to
// the first constant.
if (isa<UndefValue>(SI->getCondition())) {
- SI->setCondition(SI->case_begin().getCaseValue());
- markEdgeExecutable(&BB, SI->case_begin().getCaseSuccessor());
+ SI->setCondition(SI->case_begin()->getCaseValue());
+ markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor());
return true;
}
- markForcedConstant(SI->getCondition(), SI->case_begin().getCaseValue());
+ markForcedConstant(SI->getCondition(), SI->case_begin()->getCaseValue());
return true;
}
}
@@ -1545,7 +1599,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
// Mark all arguments to the function as being overdefined.
for (Argument &AI : F.args())
- Solver.markAnythingOverdefined(&AI);
+ Solver.markOverdefined(&AI);
// Solve for constants.
bool ResolvedUndefs = true;
@@ -1715,8 +1769,9 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
// arguments and return value aggressively, and can assume it is not called
// unless we see evidence to the contrary.
if (F.hasLocalLinkage()) {
- if (AddressIsTaken(&F))
+ if (F.hasAddressTaken()) {
AddressTakenFunctions.insert(&F);
+ }
else {
Solver.AddArgumentTrackedFunction(&F);
continue;
@@ -1728,14 +1783,15 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
// Assume nothing about the incoming arguments.
for (Argument &AI : F.args())
- Solver.markAnythingOverdefined(&AI);
+ Solver.markOverdefined(&AI);
}
// Loop over global variables. We inform the solver about any internal global
// variables that do not have their 'addresses taken'. If they don't have
// their addresses taken, we can propagate constants through them.
for (GlobalVariable &G : M.globals())
- if (!G.isConstant() && G.hasLocalLinkage() && !AddressIsTaken(&G))
+ if (!G.isConstant() && G.hasLocalLinkage() &&
+ G.hasDefinitiveInitializer() && !AddressIsTaken(&G))
Solver.TrackValueOfGlobalVariable(&G);
// Solve for constants.
@@ -1760,15 +1816,11 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
if (F.isDeclaration())
continue;
- if (Solver.isBlockExecutable(&F.front())) {
+ if (Solver.isBlockExecutable(&F.front()))
for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;
- ++AI) {
- if (AI->use_empty())
- continue;
- if (tryToReplaceWithConstant(Solver, &*AI))
+ ++AI)
+ if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI))
++IPNumArgsElimed;
- }
- }
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
if (!Solver.isBlockExecutable(&*BB)) {
@@ -1817,32 +1869,9 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
if (!I) continue;
bool Folded = ConstantFoldTerminator(I->getParent());
- if (!Folded) {
- // The constant folder may not have been able to fold the terminator
- // if this is a branch or switch on undef. Fold it manually as a
- // branch to the first successor.
-#ifndef NDEBUG
- if (auto *BI = dyn_cast<BranchInst>(I)) {
- assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
- "Branch should be foldable!");
- } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
- assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
- } else {
- llvm_unreachable("Didn't fold away reference to block!");
- }
-#endif
-
- // Make this an uncond branch to the first successor.
- TerminatorInst *TI = I->getParent()->getTerminator();
- BranchInst::Create(TI->getSuccessor(0), TI);
-
- // Remove entries in successor phi nodes to remove edges.
- for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
- TI->getSuccessor(i)->removePredecessor(TI->getParent());
-
- // Remove the old terminator.
- TI->eraseFromParent();
- }
+ assert(Folded &&
+ "Expect TermInst on constantint or blockaddress to be folded");
+ (void) Folded;
}
// Finally, delete the basic block.
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
index bfcb155..b9cee5b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -25,6 +25,7 @@
#include "llvm/Transforms/Scalar/SROA.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
@@ -325,7 +326,7 @@ private:
/// partition.
uint64_t BeginOffset, EndOffset;
- /// \brief The start end end iterators of this partition.
+ /// \brief The start and end iterators of this partition.
iterator SI, SJ;
/// \brief A collection of split slice tails overlapping the partition.
@@ -1251,7 +1252,7 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) {
if (!LI || !LI->isSimple())
return false;
- // Both operands to the select need to be dereferencable, either
+ // Both operands to the select need to be dereferenceable, either
// absolutely (e.g. allocas) or at this point because we can see other
// accesses to it.
if (!isSafeToLoadUnconditionally(TValue, LI->getAlignment(), DL, LI))
@@ -1636,8 +1637,17 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
return cast<PointerType>(NewTy)->getPointerAddressSpace() ==
cast<PointerType>(OldTy)->getPointerAddressSpace();
}
- if (NewTy->isIntegerTy() || OldTy->isIntegerTy())
- return true;
+
+ // We can convert integers to integral pointers, but not to non-integral
+ // pointers.
+ if (OldTy->isIntegerTy())
+ return !DL.isNonIntegralPointerType(NewTy);
+
+ // We can convert integral pointers to integers, but non-integral pointers
+ // need to remain pointers.
+ if (!DL.isNonIntegralPointerType(OldTy))
+ return NewTy->isIntegerTy();
+
return false;
}
@@ -1663,8 +1673,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
// See if we need inttoptr for this type pair. A cast involving both scalars
// and vectors requires an additional bitcast.
- if (OldTy->getScalarType()->isIntegerTy() &&
- NewTy->getScalarType()->isPointerTy()) {
+ if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
// Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
if (OldTy->isVectorTy() && !NewTy->isVectorTy())
return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
@@ -1680,8 +1689,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
// See if we need ptrtoint for this type pair. A cast involving both scalars
// and vectors requires an additional bitcast.
- if (OldTy->getScalarType()->isPointerTy() &&
- NewTy->getScalarType()->isIntegerTy()) {
+ if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
// Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
if (OldTy->isVectorTy() && !NewTy->isVectorTy())
return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
@@ -1825,6 +1833,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// Rank the remaining candidate vector types. This is easy because we know
// they're all integer vectors. We sort by ascending number of elements.
auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+ (void)DL;
assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
"Cannot have vector types of different sizes!");
assert(RHSTy->getElementType()->isIntegerTy() &&
@@ -2185,8 +2194,8 @@ class llvm::sroa::AllocaSliceRewriter
Instruction *OldPtr;
// Track post-rewrite users which are PHI nodes and Selects.
- SmallPtrSetImpl<PHINode *> &PHIUsers;
- SmallPtrSetImpl<SelectInst *> &SelectUsers;
+ SmallSetVector<PHINode *, 8> &PHIUsers;
+ SmallSetVector<SelectInst *, 8> &SelectUsers;
// Utility IR builder, whose name prefix is setup for each visited use, and
// the insertion point is set to point to the user.
@@ -2198,8 +2207,8 @@ public:
uint64_t NewAllocaBeginOffset,
uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
VectorType *PromotableVecTy,
- SmallPtrSetImpl<PHINode *> &PHIUsers,
- SmallPtrSetImpl<SelectInst *> &SelectUsers)
+ SmallSetVector<PHINode *, 8> &PHIUsers,
+ SmallSetVector<SelectInst *, 8> &SelectUsers)
: DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
NewAllocaBeginOffset(NewAllocaBeginOffset),
NewAllocaEndOffset(NewAllocaEndOffset),
@@ -2294,7 +2303,8 @@ private:
#endif
return getAdjustedPtr(IRB, DL, &NewAI,
- APInt(DL.getPointerSizeInBits(), Offset), PointerTy,
+ APInt(DL.getPointerTypeSizeInBits(PointerTy), Offset),
+ PointerTy,
#ifndef NDEBUG
Twine(OldName) + "."
#else
@@ -2369,6 +2379,8 @@ private:
Value *OldOp = LI.getOperand(0);
assert(OldOp == OldPtr);
+ unsigned AS = LI.getPointerAddressSpace();
+
Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
: LI.getType();
const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
@@ -2386,7 +2398,22 @@ private:
LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
LI.isVolatile(), LI.getName());
if (LI.isVolatile())
- NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+
+ // Any !nonnull metadata or !range metadata on the old load is also valid
+ // on the new load. This is true in some cases even when the loads
+ // are different types, for example by mapping !nonnull metadata to
+ // !range metadata by modeling the null pointer constant converted to the
+ // integer type.
+ // FIXME: Add support for range metadata here. Currently the utilities
+ // for this don't propagate range metadata in trivial cases from one
+ // integer load to another, don't handle non-addrspace-0 null pointers
+ // correctly, and don't have any support for mapping ranges as the
+ // integer type becomes winder or narrower.
+ if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
+ copyNonnullMetadata(LI, N, *NewLI);
+
+ // Try to preserve nonnull metadata
V = NewLI;
// If this is an integer load past the end of the slice (which means the
@@ -2401,12 +2428,12 @@ private:
"endian_shift");
}
} else {
- Type *LTy = TargetTy->getPointerTo();
+ Type *LTy = TargetTy->getPointerTo(AS);
LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
getSliceAlign(TargetTy),
LI.isVolatile(), LI.getName());
if (LI.isVolatile())
- NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
V = NewLI;
IsPtrAdjusted = true;
@@ -2429,12 +2456,12 @@ private:
// the computed value, and then replace the placeholder with LI, leaving
// LI only used for this computation.
Value *Placeholder =
- new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
+ new LoadInst(UndefValue::get(LI.getType()->getPointerTo(AS)));
V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
"insert");
LI.replaceAllUsesWith(V);
Placeholder->replaceAllUsesWith(&LI);
- delete Placeholder;
+ Placeholder->deleteValue();
} else {
LI.replaceAllUsesWith(V);
}
@@ -2542,13 +2569,14 @@ private:
NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
SI.isVolatile());
} else {
- Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo());
+ unsigned AS = SI.getPointerAddressSpace();
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
SI.isVolatile());
}
NewSI->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
if (SI.isVolatile())
- NewSI->setAtomic(SI.getOrdering(), SI.getSynchScope());
+ NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
Pass.DeadInsts.insert(&SI);
deleteIfTriviallyDead(OldOp);
@@ -3561,10 +3589,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
int Idx = 0, Size = Offsets.Splits.size();
for (;;) {
auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
- auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ auto AS = LI->getPointerAddressSpace();
+ auto *PartPtrTy = PartTy->getPointerTo(AS);
LoadInst *PLoad = IRB.CreateAlignedLoad(
getAdjustedPtr(IRB, DL, BasePtr,
- APInt(DL.getPointerSizeInBits(), PartOffset),
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
PartPtrTy, BasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
@@ -3616,10 +3645,12 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
auto *PartPtrTy =
PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+ auto AS = SI->getPointerAddressSpace();
StoreInst *PStore = IRB.CreateAlignedStore(
- PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getPointerSizeInBits(), PartOffset),
- PartPtrTy, StoreBasePtr->getName() + "."),
+ PLoad,
+ getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
PStore->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access);
DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
@@ -3688,7 +3719,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
int Idx = 0, Size = Offsets.Splits.size();
for (;;) {
auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
- auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+ auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
// Either lookup a split load or create one.
LoadInst *PLoad;
@@ -3696,20 +3728,23 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
PLoad = (*SplitLoads)[Idx];
} else {
IRB.SetInsertPoint(LI);
+ auto AS = LI->getPointerAddressSpace();
PLoad = IRB.CreateAlignedLoad(
getAdjustedPtr(IRB, DL, LoadBasePtr,
- APInt(DL.getPointerSizeInBits(), PartOffset),
- PartPtrTy, LoadBasePtr->getName() + "."),
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ LoadPartPtrTy, LoadBasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
}
// And store this partition.
IRB.SetInsertPoint(SI);
+ auto AS = SI->getPointerAddressSpace();
StoreInst *PStore = IRB.CreateAlignedStore(
- PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getPointerSizeInBits(), PartOffset),
- PartPtrTy, StoreBasePtr->getName() + "."),
+ PLoad,
+ getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ StorePartPtrTy, StoreBasePtr->getName() + "."),
getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
// Now build a new slice for the alloca.
@@ -3857,7 +3892,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
if (Alignment <= DL.getABITypeAlignment(SliceTy))
Alignment = 0;
NewAI = new AllocaInst(
- SliceTy, nullptr, Alignment,
+ SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment,
AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
++NumNewAllocas;
}
@@ -3871,8 +3906,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// fact scheduled for promotion.
unsigned PPWOldSize = PostPromotionWorklist.size();
unsigned NumUses = 0;
- SmallPtrSet<PHINode *, 8> PHIUsers;
- SmallPtrSet<SelectInst *, 8> SelectUsers;
+ SmallSetVector<PHINode *, 8> PHIUsers;
+ SmallSetVector<SelectInst *, 8> SelectUsers;
AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
P.endOffset(), IsIntegerPromotable, VecTy,
@@ -3888,24 +3923,20 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
}
NumAllocaPartitionUses += NumUses;
- MaxUsesPerAllocaPartition =
- std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition);
+ MaxUsesPerAllocaPartition.updateMax(NumUses);
// Now that we've processed all the slices in the new partition, check if any
// PHIs or Selects would block promotion.
- for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
- E = PHIUsers.end();
- I != E; ++I)
- if (!isSafePHIToSpeculate(**I)) {
+ for (PHINode *PHI : PHIUsers)
+ if (!isSafePHIToSpeculate(*PHI)) {
Promotable = false;
PHIUsers.clear();
SelectUsers.clear();
break;
}
- for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
- E = SelectUsers.end();
- I != E; ++I)
- if (!isSafeSelectToSpeculate(**I)) {
+
+ for (SelectInst *Sel : SelectUsers)
+ if (!isSafeSelectToSpeculate(*Sel)) {
Promotable = false;
PHIUsers.clear();
SelectUsers.clear();
@@ -4009,8 +4040,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
}
NumAllocaPartitions += NumPartitions;
- MaxPartitionsPerAlloca =
- std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);
+ MaxPartitionsPerAlloca.updateMax(NumPartitions);
// Migrate debug information from the old alloca to the new alloca(s)
// and the individual partitions.
@@ -4184,7 +4214,7 @@ bool SROA::promoteAllocas(Function &F) {
NumPromoted += PromotableAllocas.size();
DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
- PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
+ PromoteMemToReg(PromotableAllocas, *DT, AC);
PromotableAllocas.clear();
return true;
}
@@ -4234,9 +4264,8 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
if (!Changed)
return PreservedAnalyses::all();
- // FIXME: Even when promoting allocas we should preserve some abstract set of
- // CFG-specific analyses.
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
index afe7483..ce6f93e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -20,11 +20,12 @@
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
-#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
-#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
using namespace llvm;
@@ -43,13 +44,15 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeDSELegacyPassPass(Registry);
initializeGuardWideningLegacyPassPass(Registry);
initializeGVNLegacyPassPass(Registry);
- initializeNewGVNPass(Registry);
+ initializeNewGVNLegacyPassPass(Registry);
initializeEarlyCSELegacyPassPass(Registry);
initializeEarlyCSEMemSSALegacyPassPass(Registry);
initializeGVNHoistLegacyPassPass(Registry);
+ initializeGVNSinkLegacyPassPass(Registry);
initializeFlattenCFGPassPass(Registry);
initializeInductiveRangeCheckEliminationPass(Registry);
initializeIndVarSimplifyLegacyPassPass(Registry);
+ initializeInferAddressSpacesPass(Registry);
initializeJumpThreadingPass(Registry);
initializeLegacyLICMPassPass(Registry);
initializeLegacyLoopSinkPassPass(Registry);
@@ -58,6 +61,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopAccessLegacyAnalysisPass(Registry);
initializeLoopInstSimplifyLegacyPassPass(Registry);
initializeLoopInterchangePass(Registry);
+ initializeLoopPredicationLegacyPassPass(Registry);
initializeLoopRotateLegacyPassPass(Registry);
initializeLoopStrengthReducePass(Registry);
initializeLoopRerollPass(Registry);
@@ -79,13 +83,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeIPSCCPLegacyPassPass(Registry);
initializeSROALegacyPassPass(Registry);
initializeCFGSimplifyPassPass(Registry);
+ initializeLateCFGSimplifyPassPass(Registry);
initializeStructurizeCFGPass(Registry);
+ initializeSimpleLoopUnswitchLegacyPassPass(Registry);
initializeSinkingLegacyPassPass(Registry);
initializeTailCallElimPass(Registry);
initializeSeparateConstOffsetFromGEPPass(Registry);
initializeSpeculativeExecutionLegacyPassPass(Registry);
initializeStraightLineStrengthReducePass(Registry);
- initializeLoadCombinePass(Registry);
initializePlaceBackedgeSafepointsImplPass(Registry);
initializePlaceSafepointsPass(Registry);
initializeFloat2IntLegacyPassPass(Registry);
@@ -115,6 +120,10 @@ void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createCFGSimplificationPass());
}
+void LLVMAddLateCFGSimplificationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLateCFGSimplificationPass());
+}
+
void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createDeadStoreEliminationPass());
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 39969e2..d11855f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -14,12 +14,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
@@ -520,12 +520,25 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
unsigned NumElems = VT->getNumElements();
unsigned NumIndices = GEPI.getNumIndices();
- Scatterer Base = scatter(&GEPI, GEPI.getOperand(0));
+ // The base pointer might be scalar even if it's a vector GEP. In those cases,
+ // splat the pointer into a vector value, and scatter that vector.
+ Value *Op0 = GEPI.getOperand(0);
+ if (!Op0->getType()->isVectorTy())
+ Op0 = Builder.CreateVectorSplat(NumElems, Op0);
+ Scatterer Base = scatter(&GEPI, Op0);
SmallVector<Scatterer, 8> Ops;
Ops.resize(NumIndices);
- for (unsigned I = 0; I < NumIndices; ++I)
- Ops[I] = scatter(&GEPI, GEPI.getOperand(I + 1));
+ for (unsigned I = 0; I < NumIndices; ++I) {
+ Value *Op = GEPI.getOperand(I + 1);
+
+ // The indices might be scalars even if it's a vector GEP. In those cases,
+ // splat the scalar into a vector value, and scatter that vector.
+ if (!Op->getType()->isVectorTy())
+ Op = Builder.CreateVectorSplat(NumElems, Op);
+
+ Ops[I] = scatter(&GEPI, Op);
+ }
ValueVector Res;
Res.resize(NumElems);
diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 4d59453..84675f4 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -156,27 +156,27 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -1138,7 +1138,7 @@ bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
// Add I to DominatingExprs if it's an add/sub that can't sign overflow.
if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) ||
match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
- if (isKnownNotFullPoison(I)) {
+ if (programUndefinedIfFullPoison(I)) {
const SCEV *Key =
SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
DominatingExprs[Key].push_back(I);
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
new file mode 100644
index 0000000..aaab585
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -0,0 +1,808 @@
+//===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTree.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+#define DEBUG_TYPE "simple-loop-unswitch"
+
+using namespace llvm;
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumTrivial, "Number of unswitches that are trivial");
+
+static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
+ Constant &Replacement) {
+ assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
+
+ // Rewrite each use of LIC by an instruction inside L to Replacement; uses by non-instructions or by instructions outside L are left alone.
+ for (auto UI = LIC.use_begin(), UE = LIC.use_end(); UI != UE;) {
+ // Grab the use and step the iterator past it first, so that clobbering the use below cannot disturb the walk over LIC's use list.
+ Use *U = &*UI++;
+ Instruction *UserI = dyn_cast<Instruction>(U->getUser());
+ if (!UserI || !L.contains(UserI))
+ continue;
+
+ // The user is an instruction within the loop body: redirect this operand to the constant.
+ *U = &Replacement;
+ }
+}
+
+/// Update the dominator tree after removing one exiting predecessor of a loop
+/// exit block, by recomputing that block's immediate dominator from the predecessors it still has.
+static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L,
+ DominatorTree &DT) {
+ assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) &&
+ "Cannot have empty predecessors of the loop exit block if we split "
+ "off a block to unswitch!");
+
+ BasicBlock *IDom = *pred_begin(LoopExitBB);
+ // Starting from the first predecessor, fold in each remaining predecessor by
+ // taking the nearest common dominator, until all predecessors are covered or
+ // we reach the loop header. The loop header necessarily dominates all loop
+ // exit blocks in loop simplified form, so we can stop the moment we hit it.
+ for (auto PI = std::next(pred_begin(LoopExitBB)), PE = pred_end(LoopExitBB);
+ PI != PE && IDom != L.getHeader(); ++PI)
+ IDom = DT.findNearestCommonDominator(IDom, *PI);
+
+ DT.changeImmediateDominator(LoopExitBB, IDom);
+}
+
+/// Update the dominator tree after unswitching a particular former exit block.
+///
+/// This handles the full update of the dominator tree after hoisting a block
+/// that previously was an exit block (or split off of an exit block) up to be
+/// reached from the new immediate dominator of the preheader.
+///
+/// The common case is simple -- we just move the unswitched block to have an
+/// immediate dominator of the old preheader. But in complex cases, there may
+/// be other blocks reachable from the unswitched block that are immediately
+/// dominated by some node between the unswitched one and the old preheader.
+/// All of these also need to be hoisted in the dominator tree. We also want to
+/// minimize queries to the dominator tree because each step of this
+/// invalidates any DFS numbers that would make queries fast.
+static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,
+ DominatorTree &DT) {
+ DomTreeNode *OldPHNode = DT[OldPH];
+ DomTreeNode *UnswitchedNode = DT[UnswitchedBB];
+ // If the dominator tree has already been updated for this unswitched node,
+ // we're done. This makes it easier to use this routine if there are multiple
+ // paths to the same unswitched destination.
+ if (UnswitchedNode->getIDom() == OldPHNode)
+ return;
+
+ // First collect the domtree nodes that we are hoisting over, i.e. every node on the idom chain strictly between
+ // the unswitched node and the old preheader. These may have children that need to be hoisted as well.
+ SmallPtrSet<DomTreeNode *, 4> DomChain;
+ for (auto *IDom = UnswitchedNode->getIDom(); IDom != OldPHNode;
+ IDom = IDom->getIDom())
+ DomChain.insert(IDom);
+
+ // The unswitched block ends up immediately dominated by the old preheader --
+ // regardless of whether it is the loop exit block or split off of the loop
+ // exit block.
+ DT.changeImmediateDominator(UnswitchedNode, OldPHNode);
+
+ // For everything that moves up the dominator tree, we need to examine the
+ // dominator frontier to see if it additionally should move up the dominator
+ // tree. This lambda appends the dominator frontier of the given node to the
+ // worklist.
+ //
+ // Note that we don't currently use the IDFCalculator here for two reasons:
+ // 1) It computes dominator tree levels for the entire function on each run
+ // of 'compute'. While this isn't terrible, given that we expect to update
+ // relatively small subtrees of the domtree, it isn't necessarily the right
+ // tradeoff.
+ // 2) The interface doesn't fit this usage well. It doesn't operate in
+ // append-only, and builds several sets that we don't need.
+ //
+ // FIXME: Neither of these issues is a big deal and could be addressed with
+ // some amount of refactoring of IDFCalculator. That would allow us to share
+ // the core logic here (which is solving the same core problem).
+ SmallSetVector<BasicBlock *, 4> Worklist;
+ SmallVector<DomTreeNode *, 4> DomNodes;
+ SmallPtrSet<BasicBlock *, 4> DomSet;
+ auto AppendDomFrontier = [&](DomTreeNode *Node) {
+ assert(DomNodes.empty() && "Must start with no dominator nodes.");
+ assert(DomSet.empty() && "Must start with an empty dominator set.");
+
+ // First flatten this subtree into a sequence of nodes by doing a pre-order
+ // walk.
+ DomNodes.push_back(Node);
+ // We intentionally re-evaluate the size on every iteration because the body appends each visited node's children.
+ // Because this is a tree walk, this cannot add any duplicates.
+ for (int i = 0; i < (int)DomNodes.size(); ++i)
+ DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end());
+
+ // Now create a set of the basic blocks so we can quickly test for
+ // dominated successors. We could in theory use the DFS numbers of the
+ // dominator tree for this, but we want this to remain predictably fast
+ // even while we mutate the dominator tree in ways that would invalidate
+ // the DFS numbering.
+ for (DomTreeNode *InnerN : DomNodes)
+ DomSet.insert(InnerN->getBlock());
+
+ // Now re-walk the nodes, appending every successor of every node that isn't
+ // in the set. Note that we don't append the node itself, even though if it
+ // is a successor it does not strictly dominate itself and thus it would be
+ // part of the dominance frontier. The reason we don't append it is that
+ // the node passed in came *from* the worklist and so it has already been
+ // processed.
+ for (DomTreeNode *InnerN : DomNodes)
+ for (BasicBlock *SuccBB : successors(InnerN->getBlock()))
+ if (!DomSet.count(SuccBB))
+ Worklist.insert(SuccBB);
+
+ // Reset the scratch containers so the asserts at the top hold on the next call.
+ DomNodes.clear();
+ DomSet.clear();
+ };
+
+ // Append the initial dom frontier nodes.
+ AppendDomFrontier(UnswitchedNode);
+
+ // Walk the worklist by index; AppendDomFrontier may insert into it inside the loop, so the size is re-read each time.
+ for (int i = 0; i < (int)Worklist.size(); ++i) {
+ auto *BB = Worklist[i];
+
+ DomTreeNode *Node = DT[BB];
+ assert(!DomChain.count(Node) &&
+ "Cannot be dominated by a block you can reach!");
+
+ // Only blocks whose immediate dominator lies somewhere in the chain we
+ // hoisted over need to move: such a block is now also
+ // reachable from a node hoisted over this chain. Skip all others.
+ if (!DomChain.count(Node->getIDom()))
+ continue;
+
+ DT.changeImmediateDominator(Node, OldPHNode);
+
+ // Now add this node's dominator frontier to the worklist as well.
+ AppendDomFrontier(Node);
+ }
+}
+
+/// Check that all the LCSSA PHI nodes in the loop exit block have loop-invariant
+/// incoming values along the edge from ExitingBB; returns true if they all do (or if ExitBB starts with no PHI).
+static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
+ BasicBlock &ExitBB) {
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // First non-PHI instruction reached: no more PHIs to check, and none failed.
+ return true;
+
+ // If the incoming value for this edge isn't loop invariant the unswitch
+ // won't be trivial.
+ if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
+ return false;
+ }
+ llvm_unreachable("Basic blocks should never be empty!");
+}
+
+/// Rewrite the PHI nodes in an unswitched loop exit basic block.
+///
+/// Requires that the loop exit and unswitched basic block are the same, and
+/// that the exiting block was a unique predecessor of that block. Rewrites the
+/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
+/// PHI nodes from the old preheader that now contains the unswitched
+/// terminator.
+static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ for (Instruction &I : UnswitchedBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // First non-PHI instruction reached: no more PHIs to rewrite.
+ break;
+
+ // When the loop exit is directly unswitched we just need to retarget each
+ // incoming block from OldExitingBB to OldPH. We loop to handle weird cases with repeated
+ // incoming blocks, but expect to typically only have one operand here.
+ for (auto i : seq<int>(0, PN->getNumOperands())) {
+ assert(PN->getIncomingBlock(i) == &OldExitingBB &&
+ "Found incoming block different from unique predecessor!");
+ PN->setIncomingBlock(i, &OldPH);
+ }
+ }
+}
+
+/// Rewrite the PHI nodes in the loop exit basic block and the split off
+/// unswitched block.
+///
+/// Because the exit block remains an exit from the loop, this rewrites the
+/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
+/// nodes into the unswitched basic block to select between the value in the
+/// old preheader and the loop exit.
+static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
+ BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ assert(&ExitBB != &UnswitchedBB &&
+ "Must have different loop exit and unswitched blocks!");
+ Instruction *InsertPt = &*UnswitchedBB.begin();
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // First non-PHI instruction reached: no more PHIs to rewrite.
+ break;
+
+ auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2,
+ PN->getName() + ".split", InsertPt);
+
+ // Walk backwards over the old PHI node's inputs to minimize the cost of
+ // removing each one. We do this loop manually so that every entry removed
+ // from the old PHI for OldExitingBB yields exactly one new incoming entry
+ // from OldPH in the new PHI; a block can appear several times, e.g. once
+ // per case-based edge of a switch.
+ // FIXME: This is really, really gross. It would be much cleaner if LLVM
+ // allowed us to create a single entry for a predecessor block without
+ // having separate entries for each "edge" even though these edges are
+ // required to produce identical results.
+ for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) {
+ if (PN->getIncomingBlock(i) != &OldExitingBB)
+ continue;
+
+ Value *Incoming = PN->removeIncomingValue(i);
+ NewPN->addIncoming(Incoming, &OldPH);
+ }
+
+ // Now replace the old PHI with the new one and wire the old one in as an
+ // input to the new one. The replacement must come first so that the use of PN added below is not itself rewritten.
+ PN->replaceAllUsesWith(NewPN);
+ NewPN->addIncoming(PN, &ExitBB);
+ }
+}
+
+/// Unswitch a trivial branch if the condition is loop invariant.
+///
+/// This routine should only be called when loop code leading to the branch has
+/// been validated as trivial (no side effects). This routine checks if the
+/// condition is invariant and one of the successors is a loop exit. This
+/// allows us to unswitch without duplicating the loop, making it trivial.
+///
+/// Returns true if the branch was unswitched; returns false, leaving the IR untouched, otherwise.
+///
+/// If the branch can be unswitched, this routine splits the preheader and
+/// hoists the branch above that split. Preserves loop simplified form
+/// (splitting the exit block as necessary). It simplifies the branch within
+/// the loop to an unconditional branch but doesn't remove it entirely. Further
+/// cleanup can be done with some simplify-cfg like pass.
+static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
+ LoopInfo &LI) {
+ assert(BI.isConditional() && "Can only unswitch a conditional branch!");
+ DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
+
+ Value *LoopCond = BI.getCondition();
+
+ // Need a trivial loop condition to unswitch.
+ if (!L.isLoopInvariant(LoopCond))
+ return false;
+
+ // FIXME: We should compute this once at the start and update it!
+ SmallVector<BasicBlock *, 16> ExitBlocks;
+ L.getExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+
+ // Check to see if a successor of the branch is guaranteed to
+ // exit through a unique exit block without having any
+ // side-effects. If so, determine the value of Cond that causes
+ // it to do this (CondVal) and the value Cond has on the path that stays in the loop (Replacement).
+ ConstantInt *CondVal = ConstantInt::getTrue(BI.getContext());
+ ConstantInt *Replacement = ConstantInt::getFalse(BI.getContext());
+ int LoopExitSuccIdx = 0;
+ auto *LoopExitBB = BI.getSuccessor(0);
+ if (!ExitBlockSet.count(LoopExitBB)) {
+ std::swap(CondVal, Replacement);
+ LoopExitSuccIdx = 1;
+ LoopExitBB = BI.getSuccessor(1);
+ if (!ExitBlockSet.count(LoopExitBB))
+ return false;
+ }
+ auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx);
+ assert(L.contains(ContinueBB) &&
+ "Cannot have both successors exit and still be in the loop!");
+
+ auto *ParentBB = BI.getParent();
+ if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
+ return false;
+
+ DEBUG(dbgs() << " unswitching trivial branch when: " << *CondVal
+ << " == " << *LoopCond << "\n");
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the conditional branch. We will change the preheader to have a conditional
+ // branch on LoopCond.
+ BasicBlock *OldPH = L.getLoopPreheader();
+ BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI);
+
+ // Now that we have a place to insert the conditional branch, create a place
+ // to branch to: this is the exit block out of the loop that we are
+ // unswitching. We need to split this if there are other loop predecessors.
+ // Because the loop is in simplified form, *any* other predecessor is enough.
+ BasicBlock *UnswitchedBB;
+ if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) {
+ (void)PredBB;
+ assert(PredBB == BI.getParent() &&
+ "A branch's parent isn't a predecessor!");
+ UnswitchedBB = LoopExitBB;
+ } else {
+ UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);
+ }
+
+ // Now splice the branch to gate reaching the new preheader and re-point its
+ // successors.
+ OldPH->getInstList().splice(std::prev(OldPH->end()),
+ BI.getParent()->getInstList(), BI);
+ OldPH->getTerminator()->eraseFromParent();
+ BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
+ BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
+
+ // Create a new unconditional branch that will continue the loop as a new
+ // terminator.
+ BranchInst::Create(ContinueBB, ParentBB);
+
+ // Rewrite the relevant PHI nodes.
+ if (UnswitchedBB == LoopExitBB)
+ rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
+ else
+ rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
+ *ParentBB, *OldPH);
+
+ // Now we need to update the dominator tree.
+ updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
+ // But if we split something off of the loop exit block then we also removed
+ // one of the predecessors for the loop exit block and may need to update its
+ // idom.
+ if (UnswitchedBB != LoopExitBB)
+ updateLoopExitIDom(LoopExitBB, L, DT);
+
+ // Since this is an i1 condition we can also trivially replace uses of it
+ // within the loop with a constant.
+ replaceLoopUsesWithConstant(L, *LoopCond, *Replacement);
+
+ ++NumTrivial;
+ ++NumBranches;
+ return true;
+}
+
+/// Unswitch a trivial switch if the condition is loop invariant.
+///
+/// This routine should only be called when loop code leading to the switch has
+/// been validated as trivial (no side effects). This routine checks if the
+/// condition is invariant and that at least one of the successors is a loop
+/// exit. This allows us to unswitch without duplicating the loop, making it
+/// trivial.
+///
+/// If this routine fails to unswitch the switch it returns false.
+///
+/// If the switch can be unswitched, this routine splits the preheader and
+/// copies the switch above that split. If the default case is one of the
+/// exiting cases, it copies the non-exiting cases and points them at the new
+/// preheader. If the default case is not exiting, it copies the exiting cases
+/// and points the default at the preheader. It preserves loop simplified form
+/// (splitting the exit blocks as necessary). It simplifies the switch within
+/// the loop by removing now-dead cases. If the default case is one of those
+/// unswitched, it replaces its destination with a new basic block containing
+/// only unreachable. Such basic blocks, while technically loop exits, are not
+/// considered for unswitching so this is a stable transform and the same
+/// switch will not be revisited. If after unswitching there is only a single
+/// in-loop successor, the switch is further simplified to an unconditional
+/// branch. Still more cleanup can be done with some simplify-cfg like pass.
+static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
+ LoopInfo &LI) {
+ DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
+ Value *LoopCond = SI.getCondition();
+
+ // If this isn't switching on an invariant condition, we can't unswitch it.
+ if (!L.isLoopInvariant(LoopCond))
+ return false;
+
+ auto *ParentBB = SI.getParent();
+
+ // FIXME: We should compute this once at the start and update it!
+ SmallVector<BasicBlock *, 16> ExitBlocks;
+ L.getExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+
+ SmallVector<int, 4> ExitCaseIndices;
+ for (auto Case : SI.cases()) {
+ auto *SuccBB = Case.getCaseSuccessor();
+ if (ExitBlockSet.count(SuccBB) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))
+ ExitCaseIndices.push_back(Case.getCaseIndex());
+ }
+ BasicBlock *DefaultExitBB = nullptr;
+ if (ExitBlockSet.count(SI.getDefaultDest()) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&
+ !isa<UnreachableInst>(SI.getDefaultDest()->getTerminator()))
+ DefaultExitBB = SI.getDefaultDest();
+ else if (ExitCaseIndices.empty())
+ return false;
+
+ DEBUG(dbgs() << " unswitching trivial cases...\n");
+
+ SmallVector<std::pair<ConstantInt *, BasicBlock *>, 4> ExitCases;
+ ExitCases.reserve(ExitCaseIndices.size());
+ // We walk the case indices backwards so that we remove the last case first
+ // and don't disrupt the earlier indices.
+ for (unsigned Index : reverse(ExitCaseIndices)) {
+ auto CaseI = SI.case_begin() + Index;
+ // Save the value of this case.
+ ExitCases.push_back({CaseI->getCaseValue(), CaseI->getCaseSuccessor()});
+ // Delete the unswitched cases.
+ SI.removeCase(CaseI);
+ }
+
+ // Check if after this all of the remaining cases point at the same
+ // successor.
+ BasicBlock *CommonSuccBB = nullptr;
+ if (SI.getNumCases() > 0 &&
+ std::all_of(std::next(SI.case_begin()), SI.case_end(),
+ [&SI](const SwitchInst::CaseHandle &Case) {
+ return Case.getCaseSuccessor() ==
+ SI.case_begin()->getCaseSuccessor();
+ }))
+ CommonSuccBB = SI.case_begin()->getCaseSuccessor();
+
+ if (DefaultExitBB) {
+ // We can't remove the default edge so replace it with an edge to either
+ // the single common remaining successor (if we have one) or an unreachable
+ // block.
+ if (CommonSuccBB) {
+ SI.setDefaultDest(CommonSuccBB);
+ } else {
+ BasicBlock *UnreachableBB = BasicBlock::Create(
+ ParentBB->getContext(),
+ Twine(ParentBB->getName()) + ".unreachable_default",
+ ParentBB->getParent());
+ new UnreachableInst(ParentBB->getContext(), UnreachableBB);
+ SI.setDefaultDest(UnreachableBB);
+ DT.addNewBlock(UnreachableBB, ParentBB);
+ }
+ } else {
+ // If we're not unswitching the default, we need it to match any cases to
+ // have a common successor or if we have no cases it is the common
+ // successor.
+ if (SI.getNumCases() == 0)
+ CommonSuccBB = SI.getDefaultDest();
+ else if (SI.getDefaultDest() != CommonSuccBB)
+ CommonSuccBB = nullptr;
+ }
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the switch.
+ BasicBlock *OldPH = L.getLoopPreheader();
+ BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI);
+ OldPH->getTerminator()->eraseFromParent();
+
+ // Now add the unswitched switch.
+ auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
+
+ // Rewrite the IR for the unswitched basic blocks. This requires two steps.
+ // First, we split any exit blocks with remaining in-loop predecessors. Then
+ // we update the PHIs in one of two ways depending on if there was a split.
+ // We walk in reverse so that we split in the same order as the cases
+ // appeared. This is purely for convenience of reading the resulting IR, but
+ // it doesn't cost anything really.
+ SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
+ SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
+ // Handle the default exit if necessary.
+ // FIXME: It'd be great if we could merge this with the loop below but LLVM's
+ // ranges aren't quite powerful enough yet.
+ if (DefaultExitBB) {
+ if (pred_empty(DefaultExitBB)) {
+ UnswitchedExitBBs.insert(DefaultExitBB);
+ rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
+ } else {
+ auto *SplitBB =
+ SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
+ *ParentBB, *OldPH);
+ updateLoopExitIDom(DefaultExitBB, L, DT);
+ DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+ }
+ }
+ // Note that we must use a reference in the for loop so that we update the
+ // container.
+ for (auto &CasePair : reverse(ExitCases)) {
+ // Grab a reference to the exit block in the pair so that we can update it.
+ BasicBlock *ExitBB = CasePair.second;
+
+ // If this case is the last edge into the exit block, we can simply reuse it
+ // as it will no longer be a loop exit. No mapping necessary.
+ if (pred_empty(ExitBB)) {
+ // Only rewrite once.
+ if (UnswitchedExitBBs.insert(ExitBB).second)
+ rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
+ continue;
+ }
+
+ // Otherwise we need to split the exit block so that we retain an exit
+ // block from the loop and a target for the unswitched condition.
+ BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB];
+ if (!SplitExitBB) {
+ // If this is the first time we see this, do the split and remember it.
+ SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
+ *ParentBB, *OldPH);
+ updateLoopExitIDom(ExitBB, L, DT);
+ }
+ // Update the case pair to point to the split block.
+ CasePair.second = SplitExitBB;
+ }
+
+ // Now add the unswitched cases. We do this in reverse order as we built them
+ // in reverse order.
+ for (auto CasePair : reverse(ExitCases)) {
+ ConstantInt *CaseVal = CasePair.first;
+ BasicBlock *UnswitchedBB = CasePair.second;
+
+ NewSI->addCase(CaseVal, UnswitchedBB);
+ updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
+ }
+
+ // If the default was unswitched, re-point it and add explicit cases for
+ // entering the loop.
+ if (DefaultExitBB) {
+ NewSI->setDefaultDest(DefaultExitBB);
+ updateDTAfterUnswitch(DefaultExitBB, OldPH, DT);
+
+ // We removed all the exit cases, so we just copy the cases to the
+ // unswitched switch.
+ for (auto Case : SI.cases())
+ NewSI->addCase(Case.getCaseValue(), NewPH);
+ }
+
+ // If we ended up with a common successor for every path through the switch
+ // after unswitching, rewrite it to an unconditional branch to make it easy
+ // to recognize. Otherwise we potentially have to recognize the default case
+ // pointing at unreachable and other complexity.
+ if (CommonSuccBB) {
+ BasicBlock *BB = SI.getParent();
+ SI.eraseFromParent();
+ BranchInst::Create(CommonSuccBB, BB);
+ }
+
+ DT.verifyDomTree();
+ ++NumTrivial;
+ ++NumSwitches;
+ return true;
+}
+
+/// This routine scans the loop to find a branch or switch which occurs before
+/// any side effects occur. These can potentially be unswitched without
+/// duplicating the loop. If a branch or switch is successfully unswitched the
+/// scanning continues to see if subsequent branches or switches have become
+/// trivial. Once all trivial candidates have been unswitched, this routine
+/// returns.
+///
+/// The scan starts at the loop header and follows unconditional branches. It
+/// stops as soon as it meets a block with side effects, a terminator it does
+/// not handle, a constant condition, a failed unswitch attempt, a block
+/// outside the loop, or a block it has already visited.
+///
+/// The return value indicates whether anything was unswitched (and therefore
+/// changed).
+static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
+                                         LoopInfo &LI) {
+  bool Changed = false;
+
+  // If loop header has only one reachable successor we should keep looking for
+  // trivial condition candidates in the successor as well. An alternative is
+  // to constant fold conditions and merge successors into loop header (then we
+  // only need to check header's terminator). The reason for not doing this in
+  // LoopUnswitch pass is that it could potentially break LoopPassManager's
+  // invariants. Folding dead branches could either eliminate the current loop
+  // or make other loops unreachable. LCSSA form might also not be preserved
+  // after deleting branches. The following code keeps traversing loop header's
+  // successors until it finds the trivial condition candidate (condition that
+  // is not a constant). Since unswitching generates branches with constant
+  // conditions, this scenario could be very common in practice.
+  BasicBlock *CurrentBB = L.getHeader();
+  SmallPtrSet<BasicBlock *, 8> Visited;
+  Visited.insert(CurrentBB);
+  do {
+    // Check if there are any side-effecting instructions (e.g. stores, calls,
+    // volatile loads) in the part of the loop that the code *would* execute
+    // without unswitching.
+    if (llvm::any_of(*CurrentBB,
+                     [](Instruction &I) { return I.mayHaveSideEffects(); }))
+      return Changed;
+
+    TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
+
+    if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+      // Don't bother trying to unswitch past a switch with a constant
+      // condition. This should be removed prior to running this pass by
+      // simplify-cfg.
+      if (isa<Constant>(SI->getCondition()))
+        return Changed;
+
+      if (!unswitchTrivialSwitch(L, *SI, DT, LI))
+        // Couldn't unswitch this one so we're done.
+        return Changed;
+
+      // Mark that we managed to unswitch something.
+      Changed = true;
+
+      // If unswitching turned the terminator into an unconditional branch then
+      // we can continue. The unswitching logic specifically works to fold any
+      // cases it can into an unconditional branch to make it easier to
+      // recognize here.
+      auto *BI = dyn_cast<BranchInst>(CurrentBB->getTerminator());
+      if (!BI || BI->isConditional())
+        return Changed;
+
+      // `continue` in a do/while jumps to the loop condition, so the
+      // in-loop and visited checks below still apply to the new block.
+      CurrentBB = BI->getSuccessor(0);
+      continue;
+    }
+
+    auto *BI = dyn_cast<BranchInst>(CurrentTerm);
+    if (!BI)
+      // We do not understand other terminator instructions.
+      return Changed;
+
+    // Don't bother trying to unswitch past an unconditional branch or a branch
+    // with a constant value. These should be removed by simplify-cfg prior to
+    // running this pass.
+    if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+      return Changed;
+
+    // Found a trivial condition candidate: non-foldable conditional branch. If
+    // we fail to unswitch this, we can't do anything else that is trivial.
+    if (!unswitchTrivialBranch(L, *BI, DT, LI))
+      return Changed;
+
+    // Mark that we managed to unswitch something.
+    Changed = true;
+
+    // We unswitched the branch. This should always leave us with an
+    // unconditional branch that we can follow now.
+    BI = cast<BranchInst>(CurrentBB->getTerminator());
+    assert(!BI->isConditional() &&
+           "Cannot form a conditional branch by unswitching!");
+    CurrentBB = BI->getSuccessor(0);
+
+    // When continuing, if we exit the loop or reach a previous visited block,
+    // then we can not reach any trivial condition candidates (unfoldable
+    // branch instructions or switch instructions) and no unswitch can happen.
+  } while (L.contains(CurrentBB) && Visited.insert(CurrentBB).second);
+
+  return Changed;
+}
+
+/// Unswitch control flow predicated on loop invariant conditions.
+///
+/// Hoists every trivial branch or switch (IE, one that does not require
+/// duplicating any part of the loop) out of the loop body. Non-trivial
+/// unswitching by cloning the loop is not implemented yet.
+///
+/// Returns true if the loop was changed. Loops that are not in loop
+/// simplified form are left untouched.
+static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
+                         AssumptionCache &AC) {
+  assert(L.isLCSSAForm(DT) &&
+         "Loops must be in LCSSA form before unswitching.");
+
+  // We need a preheader and dedicated exits, i.e. loop simplified form.
+  if (!L.isLoopSimplifyForm())
+    return false;
+
+  // Trivial unswitching is the only transform performed at the moment, so its
+  // result is the overall result.
+  // FIXME: Add support for non-trivial unswitching by cloning the loop.
+  return unswitchAllTrivialConditions(L, DT, LI);
+}
+
+/// New pass manager entry point: unswitch \p L using the dominator tree, loop
+/// info and assumption cache from \p AR. Reports all analyses as preserved
+/// when nothing changed, and the standard loop-pass set otherwise.
+PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
+                                              LoopStandardAnalysisResults &AR,
+                                              LPMUpdater &U) {
+  // F is only read by the DEBUG output below; the cast silences unused
+  // variable warnings when that output is compiled out.
+  Function &F = *L.getHeader()->getParent();
+  (void)F;
+
+  DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n");
+
+  const bool LoopChanged = unswitchLoop(L, AR.DT, AR.LI, AR.AC);
+  if (LoopChanged) {
+#ifndef NDEBUG
+    // Historically this pass has had issues with the dominator tree so verify
+    // it in asserts builds.
+    AR.DT.verifyDomTree();
+#endif
+    return getLoopPassPreservedAnalyses();
+  }
+
+  return PreservedAnalyses::all();
+}
+
+namespace {
+
+class SimpleLoopUnswitchLegacyPass : public LoopPass { // LoopPass wrapper; runOnLoop is declared here and defined out of line.
+public:
+ static char ID; // Pass ID, replacement for typeid; handed to the LoopPass base in the constructor.
+
+ explicit SimpleLoopUnswitchLegacyPass() : LoopPass(ID) { // Also runs this pass's initializer against the global PassRegistry.
+ initializeSimpleLoopUnswitchLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override { // Declares the analyses this pass needs.
+ AU.addRequired<AssumptionCacheTracker>(); // Explicitly required on top of what getLoopAnalysisUsage adds.
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+/// Legacy pass manager entry point: fetch the dominator tree, loop info and
+/// assumption cache for the function containing \p L and unswitch the loop.
+/// Returns true if the loop was changed; skipped loops are never changed.
+bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipLoop(L))
+    return false;
+
+  Function &F = *L->getHeader()->getParent();
+  DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L << "\n");
+
+  DominatorTree &DomTree =
+      getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  LoopInfo &Loops = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  AssumptionCache &Assumptions =
+      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+  const bool MadeChange = unswitchLoop(*L, DomTree, Loops, Assumptions);
+
+#ifndef NDEBUG
+  // Historically this pass has had issues with the dominator tree so verify it
+  // in asserts builds.
+  DomTree.verifyDomTree();
+#endif
+  return MadeChange;
+}
+
+char SimpleLoopUnswitchLegacyPass::ID = 0; // Out-of-line definition of the static pass ID declared in the class.
+INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
+ "Simple unswitch loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) // NOTE(review): getAnalysisUsage does not explicitly require TTI; confirm this dependency is needed.
+INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
+ "Simple unswitch loops", false, false)
+
+// Public factory for the legacy pass manager version of the pass. The caller
+// receives a newly allocated pass object.
+Pass *llvm::createSimpleLoopUnswitchLegacyPass() {
+  Pass *LegacyPass = new SimpleLoopUnswitchLegacyPass();
+  return LegacyPass;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index f2723bd..8754c71 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -130,7 +130,8 @@ static bool mergeEmptyReturnBlocks(Function &F) {
/// iterating until no more changes are made.
static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
AssumptionCache *AC,
- unsigned BonusInstThreshold) {
+ unsigned BonusInstThreshold,
+ bool LateSimplifyCFG) {
bool Changed = false;
bool LocalChange = true;
@@ -145,7 +146,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
// Loop over all of the basic blocks and remove them if they are unneeded.
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
- if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders)) {
+ if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders, LateSimplifyCFG)) {
LocalChange = true;
++NumSimpl;
}
@@ -156,10 +157,12 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
}
static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
- AssumptionCache *AC, int BonusInstThreshold) {
+ AssumptionCache *AC, int BonusInstThreshold,
+ bool LateSimplifyCFG) {
bool EverChanged = removeUnreachableBlocks(F);
EverChanged |= mergeEmptyReturnBlocks(F);
- EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
+ EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold,
+ LateSimplifyCFG);
// If neither pass changed anything, we're done.
if (!EverChanged) return false;
@@ -173,7 +176,8 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
return true;
do {
- EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
+ EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold,
+ LateSimplifyCFG);
EverChanged |= removeUnreachableBlocks(F);
} while (EverChanged);
@@ -181,17 +185,19 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
}
SimplifyCFGPass::SimplifyCFGPass()
- : BonusInstThreshold(UserBonusInstThreshold) {}
+ : BonusInstThreshold(UserBonusInstThreshold),
+ LateSimplifyCFG(true) {}
-SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold)
- : BonusInstThreshold(BonusInstThreshold) {}
+SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold, bool LateSimplifyCFG)
+ : BonusInstThreshold(BonusInstThreshold),
+ LateSimplifyCFG(LateSimplifyCFG) {}
PreservedAnalyses SimplifyCFGPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
- if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold))
+ if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold, LateSimplifyCFG))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
@@ -199,16 +205,17 @@ PreservedAnalyses SimplifyCFGPass::run(Function &F,
}
namespace {
-struct CFGSimplifyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
+struct BaseCFGSimplifyPass : public FunctionPass {
unsigned BonusInstThreshold;
std::function<bool(const Function &)> PredicateFtor;
+ bool LateSimplifyCFG;
- CFGSimplifyPass(int T = -1,
- std::function<bool(const Function &)> Ftor = nullptr)
- : FunctionPass(ID), PredicateFtor(std::move(Ftor)) {
+ BaseCFGSimplifyPass(int T, bool LateSimplifyCFG,
+ std::function<bool(const Function &)> Ftor,
+ char &ID)
+ : FunctionPass(ID), PredicateFtor(std::move(Ftor)),
+ LateSimplifyCFG(LateSimplifyCFG) {
BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
- initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override {
if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F)))
@@ -218,7 +225,7 @@ struct CFGSimplifyPass : public FunctionPass {
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold);
+ return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold, LateSimplifyCFG);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -227,6 +234,26 @@ struct CFGSimplifyPass : public FunctionPass {
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
+
+/// Legacy CFG simplification pass. Forwards to BaseCFGSimplifyPass with the
+/// LateSimplifyCFG flag set to false.
+struct CFGSimplifyPass : public BaseCFGSimplifyPass {
+  static char ID; // Pass identification, replacement for typeid
+
+  /// \p T and \p Ftor are passed through to BaseCFGSimplifyPass unchanged.
+  CFGSimplifyPass(int T = -1,
+                  std::function<bool(const Function &)> Ftor = nullptr)
+      // Ftor is a by-value sink parameter: move it into the base instead of
+      // copying the std::function a second time.
+      : BaseCFGSimplifyPass(T, false, std::move(Ftor), ID) {
+    initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+  }
+};
+
+/// Legacy "late" CFG simplification pass. Forwards to BaseCFGSimplifyPass with
+/// the LateSimplifyCFG flag set to true.
+struct LateCFGSimplifyPass : public BaseCFGSimplifyPass {
+  static char ID; // Pass identification, replacement for typeid
+
+  /// \p T and \p Ftor are passed through to BaseCFGSimplifyPass unchanged.
+  LateCFGSimplifyPass(int T = -1,
+                      std::function<bool(const Function &)> Ftor = nullptr)
+      // Ftor is a by-value sink parameter: move it into the base instead of
+      // copying the std::function a second time.
+      : BaseCFGSimplifyPass(T, true, std::move(Ftor), ID) {
+    initializeLateCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+  }
+};
}
char CFGSimplifyPass::ID = 0;
@@ -237,9 +264,24 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
false)
+char LateCFGSimplifyPass::ID = 0; // Out-of-line definition of the static pass ID declared in LateCFGSimplifyPass.
+INITIALIZE_PASS_BEGIN(LateCFGSimplifyPass, "latesimplifycfg",
+ "Simplify the CFG more aggressively", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) // Same two dependencies as the "simplifycfg" pass uses its analyses in runOnFunction: TTI and the assumption cache.
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(LateCFGSimplifyPass, "latesimplifycfg",
+ "Simplify the CFG more aggressively", false, false)
+
// Public interface to the CFGSimplification pass
FunctionPass *
llvm::createCFGSimplificationPass(int Threshold,
- std::function<bool(const Function &)> Ftor) {
+ std::function<bool(const Function &)> Ftor) {
return new CFGSimplifyPass(Threshold, std::move(Ftor));
}
+
+// Public interface to the LateCFGSimplification pass. Threshold and Ftor are
+// handed to the LateCFGSimplifyPass constructor; the caller receives a newly
+// allocated pass object.
+FunctionPass *
+llvm::createLateCFGSimplificationPass(int Threshold,
+                                      std::function<bool(const Function &)> Ftor) {
+  FunctionPass *LatePass =
+      new LateCFGSimplifyPass(Threshold, std::move(Ftor));
+  return LatePass;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
index c3f14a0..5210f16 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -114,7 +114,7 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
// We cannot sink a load across a critical edge - there may be stores in
// other code paths.
- if (!isSafeToSpeculativelyExecute(Inst))
+ if (isa<LoadInst>(Inst))
return false;
// We don't want to sink across a critical edge if we don't dominate the
@@ -164,13 +164,14 @@ static bool SinkInstruction(Instruction *Inst,
// Instructions can only be sunk if all their uses are in blocks
// dominated by one of the successors.
- // Look at all the postdominators and see if we can sink it in one.
+ // Look at all the dominated blocks and see if we can sink it in one.
DomTreeNode *DTN = DT.getNode(Inst->getParent());
for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end();
I != E && SuccToSinkTo == nullptr; ++I) {
BasicBlock *Candidate = (*I)->getBlock();
- if ((*I)->getIDom()->getBlock() == Inst->getParent() &&
- IsAcceptableTarget(Inst, Candidate, DT, LI))
+ // A node always immediate-dominates its children on the dominator
+ // tree.
+ if (IsAcceptableTarget(Inst, Candidate, DT, LI))
SuccToSinkTo = Candidate;
}
@@ -262,9 +263,8 @@ PreservedAnalyses SinkingPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!iterativelySinkInstructions(F, DT, LI, AA))
return PreservedAnalyses::all();
- auto PA = PreservedAnalyses();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LoopAnalysis>();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 2be3f5c..8b8d659 100644
--- a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -693,7 +693,7 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) {
UnlinkedInst->setOperand(I, nullptr);
RecursivelyDeleteTriviallyDeadInstructions(Op);
}
- delete UnlinkedInst;
+ UnlinkedInst->deleteValue();
}
bool Ret = !UnlinkedInstructions.empty();
UnlinkedInstructions.clear();
diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 49ce026..0cccb41 100644
--- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -7,7 +7,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SCCIterator.h"
@@ -20,6 +19,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
@@ -329,7 +329,7 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
Loops[Exit] = N->getEntry();
} else {
- // Test for sucessors as back edge
+ // Test for successors as back edge
BasicBlock *BB = N->getNodeAs<BasicBlock>();
BranchInst *Term = cast<BranchInst>(BB->getTerminator());
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index a6b9fee..90c5c24 100644
--- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -51,13 +51,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
@@ -69,6 +68,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
@@ -76,6 +76,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -90,16 +91,10 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced");
/// If it contains any dynamic allocas, returns false.
static bool canTRE(Function &F) {
// Because of PR962, we don't TRE dynamic allocas.
- for (auto &BB : F) {
- for (auto &I : BB) {
- if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
- if (!AI->isStaticAlloca())
- return false;
- }
- }
- }
-
- return true;
+ return llvm::all_of(instructions(F), [](Instruction &I) {
+ auto *AI = dyn_cast<AllocaInst>(&I);
+ return !AI || AI->isStaticAlloca();
+ });
}
namespace {
@@ -321,7 +316,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls) {
/// instruction from after the call to before the call, assuming that all
/// instructions between the call and this instruction are movable.
///
-static bool canMoveAboveCall(Instruction *I, CallInst *CI) {
+static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
// FIXME: We can move load/store/call/free instructions above the call if the
// call does not mod/ref the memory location being processed.
if (I->mayHaveSideEffects()) // This also handles volatile loads.
@@ -332,10 +327,10 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI) {
if (CI->mayHaveSideEffects()) {
// Non-volatile loads may be moved above a call with side effects if it
// does not write to memory and the load provably won't trap.
- // FIXME: Writes to memory only matter if they may alias the pointer
+ // Writes to memory only matter if they may alias the pointer
// being loaded from.
const DataLayout &DL = L->getModule()->getDataLayout();
- if (CI->mayWriteToMemory() ||
+ if ((AA->getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod) ||
!isSafeToLoadUnconditionally(L->getPointerOperand(),
L->getAlignment(), DL, L))
return false;
@@ -496,7 +491,7 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
BasicBlock *&OldEntry,
bool &TailCallsAreMarkedTail,
SmallVectorImpl<PHINode *> &ArgumentPHIs,
- bool CannotTailCallElimCallsMarkedTail) {
+ AliasAnalysis *AA) {
// If we are introducing accumulator recursion to eliminate operations after
// the call instruction that are both associative and commutative, the initial
// value for the accumulator is placed in this variable. If this value is set
@@ -516,7 +511,8 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
// Check that this is the case now.
BasicBlock::iterator BBI(CI);
for (++BBI; &*BBI != Ret; ++BBI) {
- if (canMoveAboveCall(&*BBI, CI)) continue;
+ if (canMoveAboveCall(&*BBI, CI, AA))
+ continue;
// If we can't move the instruction above the call, it might be because it
// is an associative and commutative operation that could be transformed
@@ -675,12 +671,17 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret,
bool &TailCallsAreMarkedTail,
SmallVectorImpl<PHINode *> &ArgumentPHIs,
bool CannotTailCallElimCallsMarkedTail,
- const TargetTransformInfo *TTI) {
+ const TargetTransformInfo *TTI,
+ AliasAnalysis *AA) {
bool Change = false;
+ // Make sure this block is a trivial return block.
+ assert(BB->getFirstNonPHIOrDbg() == Ret &&
+ "Trying to fold non-trivial return block");
+
// If the return block contains nothing but the return and PHI's,
// there might be an opportunity to duplicate the return in its
- // predecessors and perform TRC there. Look for predecessors that end
+ // predecessors and perform TRE there. Look for predecessors that end
// in unconditional branch and recursive call(s).
SmallVector<BranchInst*, 8> UncondBranchPreds;
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
@@ -707,8 +708,7 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret,
BB->eraseFromParent();
eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs,
- CannotTailCallElimCallsMarkedTail);
+ ArgumentPHIs, AA);
++NumRetDuped;
Change = true;
}
@@ -721,17 +721,18 @@ static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
bool &TailCallsAreMarkedTail,
SmallVectorImpl<PHINode *> &ArgumentPHIs,
bool CannotTailCallElimCallsMarkedTail,
- const TargetTransformInfo *TTI) {
+ const TargetTransformInfo *TTI,
+ AliasAnalysis *AA) {
CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI);
if (!CI)
return false;
return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs,
- CannotTailCallElimCallsMarkedTail);
+ ArgumentPHIs, AA);
}
-static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI) {
+static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
+ AliasAnalysis *AA) {
if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
return false;
@@ -766,11 +767,11 @@ static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI)
if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
bool Change =
processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs, !CanTRETailMarkedCall, TTI);
+ ArgumentPHIs, !CanTRETailMarkedCall, TTI, AA);
if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
- Change =
- foldReturnAndProcessPred(BB, Ret, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs, !CanTRETailMarkedCall, TTI);
+ Change = foldReturnAndProcessPred(BB, Ret, OldEntry,
+ TailCallsAreMarkedTail, ArgumentPHIs,
+ !CanTRETailMarkedCall, TTI, AA);
MadeChange |= Change;
}
}
@@ -800,6 +801,7 @@ struct TailCallElim : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
@@ -808,7 +810,8 @@ struct TailCallElim : public FunctionPass {
return false;
return eliminateTailRecursion(
- F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F));
+ F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults());
}
};
}
@@ -829,8 +832,9 @@ PreservedAnalyses TailCallElimPass::run(Function &F,
FunctionAnalysisManager &AM) {
TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
- bool Changed = eliminateTailRecursion(F, &TTI);
+ bool Changed = eliminateTailRecursion(F, &TTI, &AA);
if (!Changed)
return PreservedAnalyses::all();
OpenPOWER on IntegriCloud