summaryrefslogtreecommitdiffstats
path: root/contrib/llvm/lib/Analysis/InlineCost.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/lib/Analysis/InlineCost.cpp')
-rw-r--r--contrib/llvm/lib/Analysis/InlineCost.cpp467
1 files changed, 278 insertions, 189 deletions
diff --git a/contrib/llvm/lib/Analysis/InlineCost.cpp b/contrib/llvm/lib/Analysis/InlineCost.cpp
index 4109049..3569366 100644
--- a/contrib/llvm/lib/Analysis/InlineCost.cpp
+++ b/contrib/llvm/lib/Analysis/InlineCost.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
@@ -48,11 +49,16 @@ static cl::opt<int> HintThreshold(
"inlinehint-threshold", cl::Hidden, cl::init(325),
cl::desc("Threshold for inlining functions with inline hint"));
+static cl::opt<int>
+ ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
+ cl::init(45),
+ cl::desc("Threshold for inlining cold callsites"));
+
// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
- "inlinecold-threshold", cl::Hidden, cl::init(225),
+ "inlinecold-threshold", cl::Hidden, cl::init(45),
cl::desc("Threshold for inlining functions with cold attribute"));
static cl::opt<int>
@@ -60,6 +66,12 @@ static cl::opt<int>
cl::ZeroOrMore,
cl::desc("Threshold for hot callsites "));
+static cl::opt<int> ColdCallSiteRelFreq(
+ "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
+ cl::desc("Maxmimum block frequency, expressed as a percentage of caller's "
+ "entry frequency, for a callsite to be cold in the absence of "
+ "profile information."));
+
namespace {
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
@@ -72,12 +84,18 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// Getter for the cache of @llvm.assume intrinsics.
std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
+ /// Getter for BlockFrequencyInfo
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
+
/// Profile summary information.
ProfileSummaryInfo *PSI;
/// The called function.
Function &F;
+ // Cache the DataLayout since we use it a lot.
+ const DataLayout &DL;
+
/// The candidate callsite being analyzed. Please do not use this to do
/// analysis in the caller function; we want the inline cost query to be
/// easily cacheable. Instead, use the cover function paramHasAttr.
@@ -133,9 +151,11 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
void disableSROA(Value *V);
void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
int InstructionCost);
- bool isGEPOffsetConstant(GetElementPtrInst &GEP);
+ bool isGEPFree(GetElementPtrInst &GEP);
bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
bool simplifyCallSite(Function *F, CallSite CS);
+ template <typename Callable>
+ bool simplifyInstruction(Instruction &I, Callable Evaluate);
ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
/// Return true if the given argument to the function being considered for
@@ -158,6 +178,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// Return true if size growth is allowed when inlining the callee at CS.
bool allowSizeGrowth(CallSite CS);
+ /// Return true if \p CS is a cold callsite.
+ bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);
+
// Custom analysis routines.
bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
@@ -202,9 +225,11 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
public:
CallAnalyzer(const TargetTransformInfo &TTI,
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
ProfileSummaryInfo *PSI, Function &Callee, CallSite CSArg,
const InlineParams &Params)
- : TTI(TTI), GetAssumptionCache(GetAssumptionCache), PSI(PSI), F(Callee),
+ : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
+ PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()),
CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
ExposesReturnsTwice(false), HasDynamicAlloca(false),
@@ -286,23 +311,11 @@ void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
SROACostSavings += InstructionCost;
}
-/// \brief Check whether a GEP's indices are all constant.
-///
-/// Respects any simplified values known during the analysis of this callsite.
-bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
- for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
- if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
- return false;
-
- return true;
-}
-
/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
- const DataLayout &DL = F.getParent()->getDataLayout();
unsigned IntPtrWidth = DL.getPointerSizeInBits();
assert(IntPtrWidth == Offset.getBitWidth());
@@ -331,13 +344,27 @@ bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
return true;
}
+/// \brief Use TTI to check whether a GEP is free.
+///
+/// Respects any simplified values known during the analysis of this callsite.
+bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
+ SmallVector<Value *, 4> Indices;
+ for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
+ if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
+ Indices.push_back(SimpleOp);
+ else
+ Indices.push_back(*I);
+ return TargetTransformInfo::TCC_Free ==
+ TTI.getGEPCost(GEP.getSourceElementType(), GEP.getPointerOperand(),
+ Indices);
+}
+
bool CallAnalyzer::visitAlloca(AllocaInst &I) {
// Check whether inlining will turn a dynamic alloca into a static
// alloca and handle that case.
if (I.isArrayAllocation()) {
Constant *Size = SimplifiedValues.lookup(I.getArraySize());
if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
- const DataLayout &DL = F.getParent()->getDataLayout();
Type *Ty = I.getAllocatedType();
AllocatedSize = SaturatingMultiplyAdd(
AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
@@ -347,7 +374,6 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
// Accumulate the allocated size.
if (I.isStaticAlloca()) {
- const DataLayout &DL = F.getParent()->getDataLayout();
Type *Ty = I.getAllocatedType();
AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
}
@@ -396,7 +422,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
// Non-constant GEPs aren't folded, and disable SROA.
if (SROACandidate)
disableSROA(CostIt);
- return false;
+ return isGEPFree(I);
}
// Add the result as a new mapping to Base + Offset.
@@ -411,7 +437,15 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
}
}
- if (isGEPOffsetConstant(I)) {
+ // Lambda to check whether a GEP's indices are all constant.
+ auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
+ for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
+ if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
+ return false;
+ return true;
+ };
+
+ if (IsGEPOffsetConstant(I)) {
if (SROACandidate)
SROAArgValues[&I] = SROAArg;
@@ -422,19 +456,36 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
// Variable GEPs will require math and will disable SROA.
if (SROACandidate)
disableSROA(CostIt);
- return false;
+ return isGEPFree(I);
+}
+
+/// Simplify \p I if its operands are constants and update SimplifiedValues.
+/// \p Evaluate is a callable specific to instruction type that evaluates the
+/// instruction when all the operands are constants.
+template <typename Callable>
+bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
+ SmallVector<Constant *, 2> COps;
+ for (Value *Op : I.operands()) {
+ Constant *COp = dyn_cast<Constant>(Op);
+ if (!COp)
+ COp = SimplifiedValues.lookup(Op);
+ if (!COp)
+ return false;
+ COps.push_back(COp);
+ }
+ auto *C = Evaluate(COps);
+ if (!C)
+ return false;
+ SimplifiedValues[&I] = C;
+ return true;
}
bool CallAnalyzer::visitBitCast(BitCastInst &I) {
// Propagate constants through bitcasts.
- Constant *COp = dyn_cast<Constant>(I.getOperand(0));
- if (!COp)
- COp = SimplifiedValues.lookup(I.getOperand(0));
- if (COp)
- if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
- SimplifiedValues[&I] = C;
- return true;
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getBitCast(COps[0], I.getType());
+ }))
+ return true;
// Track base/offsets through casts
std::pair<Value *, APInt> BaseAndOffset =
@@ -455,19 +506,14 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) {
bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
// Propagate constants through ptrtoint.
- Constant *COp = dyn_cast<Constant>(I.getOperand(0));
- if (!COp)
- COp = SimplifiedValues.lookup(I.getOperand(0));
- if (COp)
- if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
- SimplifiedValues[&I] = C;
- return true;
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getPtrToInt(COps[0], I.getType());
+ }))
+ return true;
// Track base/offset pairs when converted to a plain integer provided the
// integer is large enough to represent the pointer.
unsigned IntegerSize = I.getType()->getScalarSizeInBits();
- const DataLayout &DL = F.getParent()->getDataLayout();
if (IntegerSize >= DL.getPointerSizeInBits()) {
std::pair<Value *, APInt> BaseAndOffset =
ConstantOffsetPtrs.lookup(I.getOperand(0));
@@ -492,20 +538,15 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
// Propagate constants through ptrtoint.
- Constant *COp = dyn_cast<Constant>(I.getOperand(0));
- if (!COp)
- COp = SimplifiedValues.lookup(I.getOperand(0));
- if (COp)
- if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
- SimplifiedValues[&I] = C;
- return true;
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getIntToPtr(COps[0], I.getType());
+ }))
+ return true;
// Track base/offset pairs when round-tripped through a pointer without
// modifications provided the integer is not too large.
Value *Op = I.getOperand(0);
unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
- const DataLayout &DL = F.getParent()->getDataLayout();
if (IntegerSize <= DL.getPointerSizeInBits()) {
std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
if (BaseAndOffset.first)
@@ -523,14 +564,10 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
bool CallAnalyzer::visitCastInst(CastInst &I) {
// Propagate constants through ptrtoint.
- Constant *COp = dyn_cast<Constant>(I.getOperand(0));
- if (!COp)
- COp = SimplifiedValues.lookup(I.getOperand(0));
- if (COp)
- if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
- SimplifiedValues[&I] = C;
- return true;
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
+ }))
+ return true;
// Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
disableSROA(I.getOperand(0));
@@ -540,16 +577,10 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
Value *Operand = I.getOperand(0);
- Constant *COp = dyn_cast<Constant>(Operand);
- if (!COp)
- COp = SimplifiedValues.lookup(Operand);
- if (COp) {
- const DataLayout &DL = F.getParent()->getDataLayout();
- if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
- SimplifiedValues[&I] = C;
- return true;
- }
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantFoldInstOperands(&I, COps[0], DL);
+ }))
+ return true;
// Disable any SROA on the argument to arbitrary unary operators.
disableSROA(Operand);
@@ -558,8 +589,7 @@ bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
}
bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
- unsigned ArgNo = A->getArgNo();
- return CandidateCS.paramHasAttr(ArgNo + 1, Attr);
+ return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
}
bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
@@ -610,6 +640,26 @@ bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
return true;
}
+bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
+ // If global profile summary is available, then callsite's coldness is
+ // determined based on that.
+ if (PSI->hasProfileSummary())
+ return PSI->isColdCallSite(CS, CallerBFI);
+ if (!CallerBFI)
+ return false;
+
+ // In the absence of global profile summary, determine if the callsite is cold
+ // relative to caller's entry. We could potentially cache the computation of
+ // scaled entry frequency, but the added complexity is not worth it unless
+ // this scaling shows up high in the profiles.
+ const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
+ auto CallSiteBB = CS.getInstruction()->getParent();
+ auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
+ auto CallerEntryFreq =
+ CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
+ return CallSiteFreq < CallerEntryFreq * ColdProb;
+}
+
void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
// If no size growth is allowed for this inlining, set Threshold to 0.
if (!allowSizeGrowth(CS)) {
@@ -642,17 +692,34 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
if (Callee.hasFnAttribute(Attribute::InlineHint))
Threshold = MaxIfValid(Threshold, Params.HintThreshold);
if (PSI) {
- uint64_t TotalWeight;
- if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) &&
- PSI->isHotCount(TotalWeight)) {
- Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold);
- } else if (PSI->isFunctionEntryHot(&Callee)) {
- // If callsite hotness can not be determined, we may still know
- // that the callee is hot and treat it as a weaker hint for threshold
- // increase.
- Threshold = MaxIfValid(Threshold, Params.HintThreshold);
- } else if (PSI->isFunctionEntryCold(&Callee)) {
- Threshold = MinIfValid(Threshold, Params.ColdThreshold);
+ BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
+ // FIXME: After switching to the new passmanager, simplify the logic below
+ // by checking only the callsite hotness/coldness. The check for CallerBFI
+ // exists only because we do not have BFI available with the old PM.
+ //
+ // Use callee's hotness information only if we have no way of determining
+ // callsite's hotness information. Callsite hotness can be determined if
+ // sample profile is used (which adds hotness metadata to calls) or if
+ // caller's BlockFrequencyInfo is available.
+ if (CallerBFI || PSI->hasSampleProfile()) {
+ if (PSI->isHotCallSite(CS, CallerBFI)) {
+ DEBUG(dbgs() << "Hot callsite.\n");
+ Threshold = Params.HotCallSiteThreshold.getValue();
+ } else if (isColdCallSite(CS, CallerBFI)) {
+ DEBUG(dbgs() << "Cold callsite.\n");
+ Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
+ }
+ } else {
+ if (PSI->isFunctionEntryHot(&Callee)) {
+ DEBUG(dbgs() << "Hot callee.\n");
+ // If callsite hotness can not be determined, we may still know
+ // that the callee is hot and treat it as a weaker hint for threshold
+ // increase.
+ Threshold = MaxIfValid(Threshold, Params.HintThreshold);
+ } else if (PSI->isFunctionEntryCold(&Callee)) {
+ DEBUG(dbgs() << "Cold callee.\n");
+ Threshold = MinIfValid(Threshold, Params.ColdThreshold);
+ }
}
}
}
@@ -665,20 +732,10 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
// First try to handle simplified comparisons.
- if (!isa<Constant>(LHS))
- if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
- LHS = SimpleLHS;
- if (!isa<Constant>(RHS))
- if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
- RHS = SimpleRHS;
- if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
- if (Constant *CRHS = dyn_cast<Constant>(RHS))
- if (Constant *C =
- ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
- SimplifiedValues[&I] = C;
- return true;
- }
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
+ }))
+ return true;
if (I.getOpcode() == Instruction::FCmp)
return false;
@@ -756,24 +813,18 @@ bool CallAnalyzer::visitSub(BinaryOperator &I) {
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- const DataLayout &DL = F.getParent()->getDataLayout();
- if (!isa<Constant>(LHS))
- if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
- LHS = SimpleLHS;
- if (!isa<Constant>(RHS))
- if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
- RHS = SimpleRHS;
- Value *SimpleV = nullptr;
- if (auto FI = dyn_cast<FPMathOperator>(&I))
- SimpleV =
- SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
- else
- SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+ auto Evaluate = [&](SmallVectorImpl<Constant *> &COps) {
+ Value *SimpleV = nullptr;
+ if (auto FI = dyn_cast<FPMathOperator>(&I))
+ SimpleV = SimplifyFPBinOp(I.getOpcode(), COps[0], COps[1],
+ FI->getFastMathFlags(), DL);
+ else
+ SimpleV = SimplifyBinOp(I.getOpcode(), COps[0], COps[1], DL);
+ return dyn_cast_or_null<Constant>(SimpleV);
+ };
- if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
- SimplifiedValues[&I] = C;
+ if (simplifyInstruction(I, Evaluate))
return true;
- }
// Disable any SROA on arguments to arbitrary, unsimplified binary operators.
disableSROA(LHS);
@@ -814,13 +865,10 @@ bool CallAnalyzer::visitStore(StoreInst &I) {
bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
// Constant folding for extract value is trivial.
- Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
- if (!C)
- C = SimplifiedValues.lookup(I.getAggregateOperand());
- if (C) {
- SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getExtractValue(COps[0], I.getIndices());
+ }))
return true;
- }
// SROA can look through these but give them a cost.
return false;
@@ -828,17 +876,12 @@ bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
// Constant folding for insert value is trivial.
- Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
- if (!AggC)
- AggC = SimplifiedValues.lookup(I.getAggregateOperand());
- Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
- if (!InsertedC)
- InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
- if (AggC && InsertedC) {
- SimplifiedValues[&I] =
- ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices());
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
+ /*InsertedValueOperand*/ COps[1],
+ I.getIndices());
+ }))
return true;
- }
// SROA can look through these but give them a cost.
return false;
@@ -855,7 +898,7 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
// because we have to continually rebuild the argument list even when no
// simplifications can be performed. Until that is fixed with remapping
// inside of instsimplify, directly constant fold calls here.
- if (!canConstantFoldCallTo(F))
+ if (!canConstantFoldCallTo(CS, F))
return false;
// Try to re-map the arguments to constants.
@@ -871,7 +914,7 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
ConstantArgs.push_back(C);
}
- if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
+ if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
SimplifiedValues[CS.getInstruction()] = C;
return true;
}
@@ -959,7 +1002,8 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
// out. Pretend to inline the function, with a custom threshold.
auto IndirectCallParams = Params;
IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
- CallAnalyzer CA(TTI, GetAssumptionCache, PSI, *F, CS, IndirectCallParams);
+ CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, *F, CS,
+ IndirectCallParams);
if (CA.analyzeCall(CS)) {
// We were able to inline the indirect call! Subtract the cost from the
// threshold to get the bonus we want to apply, but don't go below zero.
@@ -995,22 +1039,74 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
if (isa<ConstantInt>(V))
return true;
- // Otherwise, we need to accumulate a cost proportional to the number of
- // distinct successor blocks. This fan-out in the CFG cannot be represented
- // for free even if we can represent the core switch as a jumptable that
- // takes a single instruction.
+ // Assume the most general case where the switch is lowered into
+ // either a jump table, bit test, or a balanced binary tree consisting of
+ // case clusters without merging adjacent clusters with the same
+ // destination. We do not consider the switches that are lowered with a mix
+ // of jump table/bit test/binary search tree. The cost of the switch is
+ // proportional to the size of the tree or the size of jump table range.
//
// NB: We convert large switches which are just used to initialize large phi
// nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
// inlining those. It will prevent inlining in cases where the optimization
// does not (yet) fire.
- SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
- SuccessorBlocks.insert(SI.getDefaultDest());
- for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
- SuccessorBlocks.insert(I.getCaseSuccessor());
- // Add cost corresponding to the number of distinct destinations. The first
- // we model as free because of fallthrough.
- Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
+
+ // Maximum valid cost increased in this function.
+ int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
+
+ // Exit early for a large switch, assuming one case needs at least one
+ // instruction.
+ // FIXME: This is not true for a bit test, but ignore such case for now to
+ // save compile-time.
+ int64_t CostLowerBound =
+ std::min((int64_t)CostUpperBound,
+ (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
+
+ if (CostLowerBound > Threshold) {
+ Cost = CostLowerBound;
+ return false;
+ }
+
+ unsigned JumpTableSize = 0;
+ unsigned NumCaseCluster =
+ TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
+
+ // If suitable for a jump table, consider the cost for the table size and
+ // branch to destination.
+ if (JumpTableSize) {
+ int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
+ 4 * InlineConstants::InstrCost;
+
+ Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
+ return false;
+ }
+
+ // Considering forming a binary search, we should find the number of nodes
+ // which is same as the number of comparisons when lowered. For a given
+ // number of clusters, n, we can define a recursive function, f(n), to find
+ // the number of nodes in the tree. The recursion is :
+ // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
+ // and f(n) = n, when n <= 3.
+ // This will lead a binary tree where the leaf should be either f(2) or f(3)
+ // when n > 3. So, the number of comparisons from leaves should be n, while
+ // the number of non-leaf should be :
+ // 2^(log2(n) - 1) - 1
+ // = 2^log2(n) * 2^-1 - 1
+ // = n / 2 - 1.
+ // Considering comparisons from leaf and non-leaf nodes, we can estimate the
+ // number of comparisons in a simple closed form :
+ // n + n / 2 - 1 = n * 3 / 2 - 1
+ if (NumCaseCluster <= 3) {
+ // Suppose a comparison includes one compare and one conditional branch.
+ Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
+ return false;
+ }
+
+ int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
+ int64_t SwitchCost =
+ ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
+
+ Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
return false;
}
@@ -1098,19 +1194,10 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
// is expensive or the function has the "use-soft-float" attribute, this may
// eventually become a library call. Treat the cost as such.
if (I->getType()->isFloatingPointTy()) {
- bool hasSoftFloatAttr = false;
-
// If the function has the "use-soft-float" attribute, mark it as
// expensive.
- if (F.hasFnAttribute("use-soft-float")) {
- Attribute Attr = F.getFnAttribute("use-soft-float");
- StringRef Val = Attr.getValueAsString();
- if (Val == "true")
- hasSoftFloatAttr = true;
- }
-
if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
- hasSoftFloatAttr)
+ (F.getFnAttribute("use-soft-float").getValueAsString() == "true"))
Cost += InlineConstants::CallPenalty;
}
@@ -1155,7 +1242,6 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
if (!V->getType()->isPointerTy())
return nullptr;
- const DataLayout &DL = F.getParent()->getDataLayout();
unsigned IntPtrWidth = DL.getPointerSizeInBits();
APInt Offset = APInt::getNullValue(IntPtrWidth);
@@ -1212,7 +1298,6 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
FiftyPercentVectorBonus = 3 * Threshold / 2;
TenPercentVectorBonus = 3 * Threshold / 4;
- const DataLayout &DL = F.getParent()->getDataLayout();
// Track whether the post-inlining function would have more than one basic
// block. A single basic block is often intended for inlining. Balloon the
@@ -1225,36 +1310,10 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
// the rest of the function body.
Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
- // Give out bonuses per argument, as the instructions setting them up will
- // be gone after inlining.
- for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
- if (CS.isByValArgument(I)) {
- // We approximate the number of loads and stores needed by dividing the
- // size of the byval type by the target's pointer size.
- PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
- unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
- unsigned PointerSize = DL.getPointerSizeInBits();
- // Ceiling division.
- unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
+ // Give out bonuses for the callsite, as the instructions setting them up
+ // will be gone after inlining.
+ Cost -= getCallsiteCost(CS, DL);
- // If it generates more than 8 stores it is likely to be expanded as an
- // inline memcpy so we take that as an upper bound. Otherwise we assume
- // one load and one store per word copied.
- // FIXME: The maxStoresPerMemcpy setting from the target should be used
- // here instead of a magic number of 8, but it's not available via
- // DataLayout.
- NumStores = std::min(NumStores, 8U);
-
- Cost -= 2 * NumStores * InlineConstants::InstrCost;
- } else {
- // For non-byval arguments subtract off one instruction per call
- // argument.
- Cost -= InlineConstants::InstrCost;
- }
- }
- // The call instruction also disappears after inlining.
- Cost -= InlineConstants::InstrCost + InlineConstants::CallPenalty;
-
// If there is only one call of the function, and it has internal linkage,
// the cost of inlining it drops dramatically.
bool OnlyOneCallAndLocalLinkage =
@@ -1371,7 +1430,7 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
Value *Cond = SI->getCondition();
if (ConstantInt *SimpleCond =
dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
- BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
+ BBWorklist.insert(SI->findCaseValue(SimpleCond)->getCaseSuccessor());
continue;
}
}
@@ -1430,13 +1489,6 @@ LLVM_DUMP_METHOD void CallAnalyzer::dump() {
}
#endif
-/// \brief Test that two functions either have or have not the given attribute
-/// at the same time.
-template <typename AttrKind>
-static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
- return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
-}
-
/// \brief Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
@@ -1446,18 +1498,52 @@ static bool functionsHaveCompatibleAttributes(Function *Caller,
AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}
+int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
+ int Cost = 0;
+ for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
+ if (CS.isByValArgument(I)) {
+ // We approximate the number of loads and stores needed by dividing the
+ // size of the byval type by the target's pointer size.
+ PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
+ unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
+ unsigned PointerSize = DL.getPointerSizeInBits();
+ // Ceiling division.
+ unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
+
+ // If it generates more than 8 stores it is likely to be expanded as an
+ // inline memcpy so we take that as an upper bound. Otherwise we assume
+ // one load and one store per word copied.
+ // FIXME: The maxStoresPerMemcpy setting from the target should be used
+ // here instead of a magic number of 8, but it's not available via
+ // DataLayout.
+ NumStores = std::min(NumStores, 8U);
+
+ Cost += 2 * NumStores * InlineConstants::InstrCost;
+ } else {
+ // For non-byval arguments subtract off one instruction per call
+ // argument.
+ Cost += InlineConstants::InstrCost;
+ }
+ }
+ // The call instruction also disappears after inlining.
+ Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
+ return Cost;
+}
+
InlineCost llvm::getInlineCost(
CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
ProfileSummaryInfo *PSI) {
return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
- GetAssumptionCache, PSI);
+ GetAssumptionCache, GetBFI, PSI);
}
InlineCost llvm::getInlineCost(
CallSite CS, Function *Callee, const InlineParams &Params,
TargetTransformInfo &CalleeTTI,
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
ProfileSummaryInfo *PSI) {
// Cannot inline indirect calls.
@@ -1492,7 +1578,8 @@ InlineCost llvm::getInlineCost(
DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
<< "...\n");
- CallAnalyzer CA(CalleeTTI, GetAssumptionCache, PSI, *Callee, CS, Params);
+ CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, *Callee, CS,
+ Params);
bool ShouldInline = CA.analyzeCall(CS);
DEBUG(CA.dump());
@@ -1565,7 +1652,9 @@ InlineParams llvm::getInlineParams(int Threshold) {
// Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
Params.HotCallSiteThreshold = HotCallSiteThreshold;
- // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
+ // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
+ Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
+
// Set the OptMinSizeThreshold and OptSizeThreshold params only if the
// -inlinehint-threshold commandline option is not explicitly given. If that
// option is present, then its value applies even for callees with size and
OpenPOWER on IntegriCloud