Diffstat (limited to 'contrib/llvm/lib/Analysis/InlineCost.cpp')
-rw-r--r--   contrib/llvm/lib/Analysis/InlineCost.cpp   467
1 file changed, 278 insertions, 189 deletions
diff --git a/contrib/llvm/lib/Analysis/InlineCost.cpp b/contrib/llvm/lib/Analysis/InlineCost.cpp
index 4109049..3569366 100644
--- a/contrib/llvm/lib/Analysis/InlineCost.cpp
+++ b/contrib/llvm/lib/Analysis/InlineCost.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
@@ -48,11 +49,16 @@ static cl::opt<int> HintThreshold(
     "inlinehint-threshold", cl::Hidden, cl::init(325),
     cl::desc("Threshold for inlining functions with inline hint"));
 
+static cl::opt<int>
+    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
+                          cl::init(45),
+                          cl::desc("Threshold for inlining cold callsites"));
+
 // We introduce this threshold to help performance of instrumentation based
 // PGO before we actually hook up inliner with analysis passes such as BPI and
 // BFI.
 static cl::opt<int> ColdThreshold(
-    "inlinecold-threshold", cl::Hidden, cl::init(225),
+    "inlinecold-threshold", cl::Hidden, cl::init(45),
     cl::desc("Threshold for inlining functions with cold attribute"));
 
 static cl::opt<int>
@@ -60,6 +66,12 @@ static cl::opt<int>
                          cl::ZeroOrMore,
                          cl::desc("Threshold for hot callsites "));
 
+static cl::opt<int> ColdCallSiteRelFreq(
+    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
+    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
+             "entry frequency, for a callsite to be cold in the absence of "
+             "profile information."));
+
 namespace {
 
 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
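Two things to note about the new knobs before the analyzer changes below: -inline-cold-callsite-threshold (default 45) applies to callsites that are proven cold, and the preexisting -inlinecold-threshold, which applies to callees carrying the cold attribute, drops from 225 to the same 45 so both forms of coldness are penalized alike. -cold-callsite-rel-freq matters only when no profile summary is available: with its default of 2, a callsite counts as cold when its block runs less than 2% as often as the caller's entry block.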
@@ -72,12 +84,18 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   /// Getter for the cache of @llvm.assume intrinsics.
   std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
 
+  /// Getter for BlockFrequencyInfo
+  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
+
   /// Profile summary information.
   ProfileSummaryInfo *PSI;
 
   /// The called function.
   Function &F;
 
+  // Cache the DataLayout since we use it a lot.
+  const DataLayout &DL;
+
   /// The candidate callsite being analyzed. Please do not use this to do
   /// analysis in the caller function; we want the inline cost query to be
   /// easily cacheable. Instead, use the cover function paramHasAttr.
@@ -133,9 +151,11 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   void disableSROA(Value *V);
   void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                           int InstructionCost);
-  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
+  bool isGEPFree(GetElementPtrInst &GEP);
   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
   bool simplifyCallSite(Function *F, CallSite CS);
+  template <typename Callable>
+  bool simplifyInstruction(Instruction &I, Callable Evaluate);
   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
 
   /// Return true if the given argument to the function being considered for
@@ -158,6 +178,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   /// Return true if size growth is allowed when inlining the callee at CS.
   bool allowSizeGrowth(CallSite CS);
 
+  /// Return true if \p CS is a cold callsite.
+  bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);
+
   // Custom analysis routines.
   bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
@@ -202,9 +225,11 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
 public:
   CallAnalyzer(const TargetTransformInfo &TTI,
                std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
                ProfileSummaryInfo *PSI, Function &Callee, CallSite CSArg,
                const InlineParams &Params)
-      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), PSI(PSI), F(Callee),
+      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
+        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()),
         CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
         Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
@@ -286,23 +311,11 @@ void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
   SROACostSavings += InstructionCost;
 }
 
-/// \brief Check whether a GEP's indices are all constant.
-///
-/// Respects any simplified values known during the analysis of this callsite.
-bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
-  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
-    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
-      return false;
-
-  return true;
-}
-
 /// \brief Accumulate a constant GEP offset into an APInt if possible.
 ///
 /// Returns false if unable to compute the offset for any reason. Respects any
 /// simplified values known during the analysis of this callsite.
 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
-  const DataLayout &DL = F.getParent()->getDataLayout();
   unsigned IntPtrWidth = DL.getPointerSizeInBits();
   assert(IntPtrWidth == Offset.getBitWidth());
 
@@ -331,13 +344,27 @@ bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
   return true;
 }
 
+/// \brief Use TTI to check whether a GEP is free.
+///
+/// Respects any simplified values known during the analysis of this callsite.
+bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
+  SmallVector<Value *, 4> Indices;
+  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
+    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
+      Indices.push_back(SimpleOp);
+    else
+      Indices.push_back(*I);
+  return TargetTransformInfo::TCC_Free ==
+         TTI.getGEPCost(GEP.getSourceElementType(), GEP.getPointerOperand(),
+                        Indices);
+}
+
 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
   // Check whether inlining will turn a dynamic alloca into a static
   // alloca and handle that case.
   if (I.isArrayAllocation()) {
     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
-      const DataLayout &DL = F.getParent()->getDataLayout();
       Type *Ty = I.getAllocatedType();
       AllocatedSize = SaturatingMultiplyAdd(
           AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
@@ -347,7 +374,6 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
 
   // Accumulate the allocated size.
   if (I.isStaticAlloca()) {
-    const DataLayout &DL = F.getParent()->getDataLayout();
     Type *Ty = I.getAllocatedType();
     AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
   }
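Replacing isGEPOffsetConstant with isGEPFree is a behavioral change, not a rename. Previously, any GEP with a non-constant index was unconditionally charged (the visitor returned false); now the target gets a say. Indices that this analysis has already proven constant are substituted from SimplifiedValues before the query, so TTI.getGEPCost sees the most-folded operand list, and only a verdict other than TCC_Free makes the GEP count against the threshold, as the two call sites in visitGetElementPtr below show.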
@@ -396,7 +422,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
         // Non-constant GEPs aren't folded, and disable SROA.
         if (SROACandidate)
           disableSROA(CostIt);
-        return false;
+        return isGEPFree(I);
       }
 
       // Add the result as a new mapping to Base + Offset.
@@ -411,7 +437,15 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
     }
   }
 
-  if (isGEPOffsetConstant(I)) {
+  // Lambda to check whether a GEP's indices are all constant.
+  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
+    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
+      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
+        return false;
+    return true;
+  };
+
+  if (IsGEPOffsetConstant(I)) {
     if (SROACandidate)
       SROAArgValues[&I] = SROAArg;
 
@@ -422,19 +456,36 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
   // Variable GEPs will require math and will disable SROA.
   if (SROACandidate)
     disableSROA(CostIt);
-  return false;
+  return isGEPFree(I);
+}
+
+/// Simplify \p I if its operands are constants and update SimplifiedValues.
+/// \p Evaluate is a callable specific to instruction type that evaluates the
+/// instruction when all the operands are constants.
+template <typename Callable>
+bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
+  SmallVector<Constant *, 2> COps;
+  for (Value *Op : I.operands()) {
+    Constant *COp = dyn_cast<Constant>(Op);
+    if (!COp)
+      COp = SimplifiedValues.lookup(Op);
+    if (!COp)
+      return false;
+    COps.push_back(COp);
+  }
+  auto *C = Evaluate(COps);
+  if (!C)
+    return false;
+  SimplifiedValues[&I] = C;
+  return true;
 }
 
 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
   // Propagate constants through bitcasts.
-  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
-  if (!COp)
-    COp = SimplifiedValues.lookup(I.getOperand(0));
-  if (COp)
-    if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getBitCast(COps[0], I.getType());
+      }))
+    return true;
 
   // Track base/offsets through casts
   std::pair<Value *, APInt> BaseAndOffset =
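The simplifyInstruction template factors out a pattern that previously appeared, hand-expanded, in half a dozen visitors: collect a Constant for each operand, consulting SimplifiedValues for operands folded earlier in the walk; give up if any operand is still unknown; otherwise let the per-instruction Evaluate callable fold the operation and cache a non-null result. visitBitCast above is the simplest client, and the ptrtoint, inttoptr, cast, unary, compare, binary-operator, extractvalue and insertvalue visitors in the rest of the diff are mechanical conversions to the same shape.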
@@ -455,19 +506,14 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) {
 
 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
   // Propagate constants through ptrtoint.
-  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
-  if (!COp)
-    COp = SimplifiedValues.lookup(I.getOperand(0));
-  if (COp)
-    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getPtrToInt(COps[0], I.getType());
+      }))
+    return true;
 
   // Track base/offset pairs when converted to a plain integer provided the
   // integer is large enough to represent the pointer.
   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
-  const DataLayout &DL = F.getParent()->getDataLayout();
   if (IntegerSize >= DL.getPointerSizeInBits()) {
     std::pair<Value *, APInt> BaseAndOffset =
         ConstantOffsetPtrs.lookup(I.getOperand(0));
@@ -492,20 +538,15 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
 
 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
   // Propagate constants through ptrtoint.
-  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
-  if (!COp)
-    COp = SimplifiedValues.lookup(I.getOperand(0));
-  if (COp)
-    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getIntToPtr(COps[0], I.getType());
+      }))
+    return true;
 
   // Track base/offset pairs when round-tripped through a pointer without
   // modifications provided the integer is not too large.
   Value *Op = I.getOperand(0);
   unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
-  const DataLayout &DL = F.getParent()->getDataLayout();
   if (IntegerSize <= DL.getPointerSizeInBits()) {
     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
     if (BaseAndOffset.first)
@@ -523,14 +564,10 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
 
 bool CallAnalyzer::visitCastInst(CastInst &I) {
   // Propagate constants through ptrtoint.
-  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
-  if (!COp)
-    COp = SimplifiedValues.lookup(I.getOperand(0));
-  if (COp)
-    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
+      }))
+    return true;
 
   // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
   disableSROA(I.getOperand(0));
@@ -540,16 +577,10 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
 
 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
   Value *Operand = I.getOperand(0);
-  Constant *COp = dyn_cast<Constant>(Operand);
-  if (!COp)
-    COp = SimplifiedValues.lookup(Operand);
-  if (COp) {
-    const DataLayout &DL = F.getParent()->getDataLayout();
-    if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
-  }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantFoldInstOperands(&I, COps[0], DL);
+      }))
+    return true;
 
   // Disable any SROA on the argument to arbitrary unary operators.
   disableSROA(Operand);
@@ -558,8 +589,7 @@ bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
 }
 
 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
-  unsigned ArgNo = A->getArgNo();
-  return CandidateCS.paramHasAttr(ArgNo + 1, Attr);
+  return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
 }
 
 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
@@ -610,6 +640,26 @@ bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
   return true;
 }
 
+bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
+  // If global profile summary is available, then callsite's coldness is
+  // determined based on that.
+  if (PSI->hasProfileSummary())
+    return PSI->isColdCallSite(CS, CallerBFI);
+  if (!CallerBFI)
+    return false;
+
+  // In the absence of global profile summary, determine if the callsite is cold
+  // relative to caller's entry. We could potentially cache the computation of
+  // scaled entry frequency, but the added complexity is not worth it unless
+  // this scaling shows up high in the profiles.
+  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
+  auto CallSiteBB = CS.getInstruction()->getParent();
+  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
+  auto CallerEntryFreq =
+      CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
+  return CallSiteFreq < CallerEntryFreq * ColdProb;
+}
+
 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
   // If no size growth is allowed for this inlining, set Threshold to 0.
   if (!allowSizeGrowth(CS)) {
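The final comparison in isColdCallSite is scaled-integer arithmetic on BlockFrequency and BranchProbability objects, but it reduces to a percentage test. A self-contained sketch with the default ColdCallSiteRelFreq of 2 (plain integers stand in for the real LLVM classes; the numbers are invented):

    #include <cstdint>
    #include <iostream>

    int main() {
      const uint64_t ColdCallSiteRelFreq = 2; // percent of entry frequency
      uint64_t CallerEntryFreq = 1000; // frequency of the caller's entry block
      uint64_t CallSiteFreq = 15;      // frequency of the callsite's block

      // Cold iff the callsite runs less than 2% as often as the entry block.
      bool Cold = CallSiteFreq < CallerEntryFreq * ColdCallSiteRelFreq / 100;
      std::cout << (Cold ? "cold\n" : "not cold\n"); // 15 < 20, so "cold"
      return 0;
    }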
@@ -642,17 +692,34 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
   if (Callee.hasFnAttribute(Attribute::InlineHint))
     Threshold = MaxIfValid(Threshold, Params.HintThreshold);
   if (PSI) {
-    uint64_t TotalWeight;
-    if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) &&
-        PSI->isHotCount(TotalWeight)) {
-      Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold);
-    } else if (PSI->isFunctionEntryHot(&Callee)) {
-      // If callsite hotness can not be determined, we may still know
-      // that the callee is hot and treat it as a weaker hint for threshold
-      // increase.
-      Threshold = MaxIfValid(Threshold, Params.HintThreshold);
-    } else if (PSI->isFunctionEntryCold(&Callee)) {
-      Threshold = MinIfValid(Threshold, Params.ColdThreshold);
+    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
+    // FIXME: After switching to the new passmanager, simplify the logic below
+    // by checking only the callsite hotness/coldness. The check for CallerBFI
+    // exists only because we do not have BFI available with the old PM.
+    //
+    // Use callee's hotness information only if we have no way of determining
+    // callsite's hotness information. Callsite hotness can be determined if
+    // sample profile is used (which adds hotness metadata to calls) or if
+    // caller's BlockFrequencyInfo is available.
+    if (CallerBFI || PSI->hasSampleProfile()) {
+      if (PSI->isHotCallSite(CS, CallerBFI)) {
+        DEBUG(dbgs() << "Hot callsite.\n");
+        Threshold = Params.HotCallSiteThreshold.getValue();
+      } else if (isColdCallSite(CS, CallerBFI)) {
+        DEBUG(dbgs() << "Cold callsite.\n");
+        Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
+      }
+    } else {
+      if (PSI->isFunctionEntryHot(&Callee)) {
+        DEBUG(dbgs() << "Hot callee.\n");
+        // If callsite hotness can not be determined, we may still know
+        // that the callee is hot and treat it as a weaker hint for threshold
+        // increase.
+        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
+      } else if (PSI->isFunctionEntryCold(&Callee)) {
+        DEBUG(dbgs() << "Cold callee.\n");
+        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
+      }
     }
   }
 }
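Note the asymmetry in the rewritten updateThreshold logic: a hot callsite assigns Params.HotCallSiteThreshold outright rather than taking a maximum, while a cold callsite clamps downward through MinIfValid. With the stock defaults (a base threshold of 225, HotCallSiteThreshold of 3000, ColdCallSiteThreshold of 45), an ordinary call's threshold becomes 3000 when the callsite is hot and 45 when it is cold. The callee-attribute checks keep their old, weaker semantics and now run only when no source of callsite hotness exists.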
@@ -665,20 +732,10 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
 
 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
   // First try to handle simplified comparisons.
-  if (!isa<Constant>(LHS))
-    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
-      LHS = SimpleLHS;
-  if (!isa<Constant>(RHS))
-    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
-      RHS = SimpleRHS;
-  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
-    if (Constant *CRHS = dyn_cast<Constant>(RHS))
-      if (Constant *C =
-              ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
-        SimplifiedValues[&I] = C;
-        return true;
-      }
-  }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
+      }))
+    return true;
 
   if (I.getOpcode() == Instruction::FCmp)
     return false;
@@ -756,24 +813,18 @@ bool CallAnalyzer::visitSub(BinaryOperator &I) {
 
 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
-  const DataLayout &DL = F.getParent()->getDataLayout();
-  if (!isa<Constant>(LHS))
-    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
-      LHS = SimpleLHS;
-  if (!isa<Constant>(RHS))
-    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
-      RHS = SimpleRHS;
-  Value *SimpleV = nullptr;
-  if (auto FI = dyn_cast<FPMathOperator>(&I))
-    SimpleV =
-        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
-  else
-    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+  auto Evaluate = [&](SmallVectorImpl<Constant *> &COps) {
+    Value *SimpleV = nullptr;
+    if (auto FI = dyn_cast<FPMathOperator>(&I))
+      SimpleV = SimplifyFPBinOp(I.getOpcode(), COps[0], COps[1],
+                                FI->getFastMathFlags(), DL);
+    else
+      SimpleV = SimplifyBinOp(I.getOpcode(), COps[0], COps[1], DL);
+    return dyn_cast_or_null<Constant>(SimpleV);
+  };
 
-  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
-    SimplifiedValues[&I] = C;
+  if (simplifyInstruction(I, Evaluate))
     return true;
-  }
 
   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
   disableSROA(LHS);
@@ -814,13 +865,10 @@ bool CallAnalyzer::visitStore(StoreInst &I) {
 
 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
   // Constant folding for extract value is trivial.
-  Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
-  if (!C)
-    C = SimplifiedValues.lookup(I.getAggregateOperand());
-  if (C) {
-    SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
+      }))
     return true;
-  }
 
   // SROA can look through these but give them a cost.
   return false;
@@ -828,17 +876,12 @@ bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
 
 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
   // Constant folding for insert value is trivial.
-  Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
-  if (!AggC)
-    AggC = SimplifiedValues.lookup(I.getAggregateOperand());
-  Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
-  if (!InsertedC)
-    InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
-  if (AggC && InsertedC) {
-    SimplifiedValues[&I] =
-        ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices());
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
+                                            /*InsertedValueOperand*/ COps[1],
+                                            I.getIndices());
+      }))
     return true;
-  }
 
   // SROA can look through these but give them a cost.
   return false;
 }
 
@@ -855,7 +898,7 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
   // because we have to continually rebuild the argument list even when no
   // simplifications can be performed. Until that is fixed with remapping
   // inside of instsimplify, directly constant fold calls here.
-  if (!canConstantFoldCallTo(F))
+  if (!canConstantFoldCallTo(CS, F))
     return false;
 
   // Try to re-map the arguments to constants.
@@ -871,7 +914,7 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
     ConstantArgs.push_back(C);
   }
 
-  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
+  if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
     SimplifiedValues[CS.getInstruction()] = C;
     return true;
   }
@@ -959,7 +1002,8 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
   // out. Pretend to inline the function, with a custom threshold.
   auto IndirectCallParams = Params;
   IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
-  CallAnalyzer CA(TTI, GetAssumptionCache, PSI, *F, CS, IndirectCallParams);
+  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, *F, CS,
+                  IndirectCallParams);
   if (CA.analyzeCall(CS)) {
     // We were able to inline the indirect call! Subtract the cost from the
     // threshold to get the bonus we want to apply, but don't go below zero.
@@ -995,22 +1039,74 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
   if (isa<ConstantInt>(V))
     return true;
 
-  // Otherwise, we need to accumulate a cost proportional to the number of
-  // distinct successor blocks. This fan-out in the CFG cannot be represented
-  // for free even if we can represent the core switch as a jumptable that
-  // takes a single instruction.
+  // Assume the most general case where the switch is lowered into
+  // either a jump table, bit test, or a balanced binary tree consisting of
+  // case clusters without merging adjacent clusters with the same
+  // destination. We do not consider the switches that are lowered with a mix
+  // of jump table/bit test/binary search tree. The cost of the switch is
+  // proportional to the size of the tree or the size of jump table range.
   //
   // NB: We convert large switches which are just used to initialize large phi
   // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
   // inlining those. It will prevent inlining in cases where the optimization
   // does not (yet) fire.
-  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
-  SuccessorBlocks.insert(SI.getDefaultDest());
-  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
-    SuccessorBlocks.insert(I.getCaseSuccessor());
-  // Add cost corresponding to the number of distinct destinations. The first
-  // we model as free because of fallthrough.
-  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
+
+  // Maximum valid cost increased in this function.
+  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
+
+  // Exit early for a large switch, assuming one case needs at least one
+  // instruction.
+  // FIXME: This is not true for a bit test, but ignore such case for now to
+  // save compile-time.
+  int64_t CostLowerBound =
+      std::min((int64_t)CostUpperBound,
+               (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
+
+  if (CostLowerBound > Threshold) {
+    Cost = CostLowerBound;
+    return false;
+  }
+
+  unsigned JumpTableSize = 0;
+  unsigned NumCaseCluster =
+      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
+
+  // If suitable for a jump table, consider the cost for the table size and
+  // branch to destination.
+  if (JumpTableSize) {
+    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
+                     4 * InlineConstants::InstrCost;
+
+    Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
+    return false;
+  }
+
+  // Considering forming a binary search, we should find the number of nodes
+  // which is the same as the number of comparisons when lowered. For a given
+  // number of clusters, n, we can define a recursive function, f(n), to find
+  // the number of nodes in the tree. The recursion is:
+  //   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
+  //   f(n) = n, when n <= 3.
+  // This will lead to a binary tree where the leaf should be either f(2) or
+  // f(3) when n > 3. So, the number of comparisons from leaves should be n,
+  // while the number of non-leaf nodes should be:
+  //   2^(log2(n) - 1) - 1
+  //   = 2^log2(n) * 2^-1 - 1
+  //   = n / 2 - 1.
+  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
+  // number of comparisons in a simple closed form:
+  //   n + n / 2 - 1 = n * 3 / 2 - 1
+  if (NumCaseCluster <= 3) {
+    // Suppose a comparison includes one compare and one conditional branch.
+    Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
+    return false;
+  }
+
+  int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
+  int64_t SwitchCost =
+      ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
+
+  Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
   return false;
 }
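The new switch model is easy to sanity-check outside LLVM. A standalone sketch of the arithmetic above, assuming InlineConstants::InstrCost is 5 (its value in this tree):

    #include <cstdint>
    #include <iostream>

    // Mirrors the cost computation in visitSwitchInst; illustration only.
    int64_t switchCost(unsigned JumpTableSize, unsigned NumCaseCluster) {
      const int64_t InstrCost = 5; // InlineConstants::InstrCost
      if (JumpTableSize)           // table entries plus ~4 setup instructions
        return (int64_t)JumpTableSize * InstrCost + 4 * InstrCost;
      if (NumCaseCluster <= 3)     // one compare + one branch per cluster
        return (int64_t)NumCaseCluster * 2 * InstrCost;
      // Balanced tree: 3n/2 - 1 compares, each a compare + branch pair.
      int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
      return ExpectedNumberOfCompare * 2 * InstrCost;
    }

    int main() {
      std::cout << switchCost(16, 0) << "\n"; // jump table: (16 + 4) * 5 = 100
      std::cout << switchCost(0, 8) << "\n";  // tree: (3*8/2 - 1) * 2 * 5 = 110
      return 0;
    }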
@@ -1098,19 +1194,10 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
     // is expensive or the function has the "use-soft-float" attribute, this may
     // eventually become a library call. Treat the cost as such.
     if (I->getType()->isFloatingPointTy()) {
-      bool hasSoftFloatAttr = false;
-
       // If the function has the "use-soft-float" attribute, mark it as
       // expensive.
-      if (F.hasFnAttribute("use-soft-float")) {
-        Attribute Attr = F.getFnAttribute("use-soft-float");
-        StringRef Val = Attr.getValueAsString();
-        if (Val == "true")
-          hasSoftFloatAttr = true;
-      }
-
       if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
-          hasSoftFloatAttr)
+          (F.getFnAttribute("use-soft-float").getValueAsString() == "true"))
         Cost += InlineConstants::CallPenalty;
 
@@ -1155,7 +1242,6 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
   if (!V->getType()->isPointerTy())
     return nullptr;
 
-  const DataLayout &DL = F.getParent()->getDataLayout();
   unsigned IntPtrWidth = DL.getPointerSizeInBits();
   APInt Offset = APInt::getNullValue(IntPtrWidth);
 
@@ -1212,7 +1298,6 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
   FiftyPercentVectorBonus = 3 * Threshold / 2;
   TenPercentVectorBonus = 3 * Threshold / 4;
-  const DataLayout &DL = F.getParent()->getDataLayout();
 
   // Track whether the post-inlining function would have more than one basic
   // block. A single basic block is often intended for inlining. Balloon the
@@ -1225,36 +1310,10 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
   // the rest of the function body.
   Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
 
-  // Give out bonuses per argument, as the instructions setting them up will
-  // be gone after inlining.
-  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
-    if (CS.isByValArgument(I)) {
-      // We approximate the number of loads and stores needed by dividing the
-      // size of the byval type by the target's pointer size.
-      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
-      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
-      unsigned PointerSize = DL.getPointerSizeInBits();
-      // Ceiling division.
-      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
+  // Give out bonuses for the callsite, as the instructions setting them up
+  // will be gone after inlining.
+  Cost -= getCallsiteCost(CS, DL);
 
-      // If it generates more than 8 stores it is likely to be expanded as an
-      // inline memcpy so we take that as an upper bound. Otherwise we assume
-      // one load and one store per word copied.
-      // FIXME: The maxStoresPerMemcpy setting from the target should be used
-      // here instead of a magic number of 8, but it's not available via
-      // DataLayout.
-      NumStores = std::min(NumStores, 8U);
-
-      Cost -= 2 * NumStores * InlineConstants::InstrCost;
-    } else {
-      // For non-byval arguments subtract off one instruction per call
-      // argument.
-      Cost -= InlineConstants::InstrCost;
-    }
-  }
-  // The call instruction also disappears after inlining.
-  Cost -= InlineConstants::InstrCost + InlineConstants::CallPenalty;
-
   // If there is only one call of the function, and it has internal linkage,
   // the cost of inlining it drops dramatically.
   bool OnlyOneCallAndLocalLinkage =
@@ -1371,7 +1430,7 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
       Value *Cond = SI->getCondition();
       if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
-        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
+        BBWorklist.insert(SI->findCaseValue(SimpleCond)->getCaseSuccessor());
         continue;
       }
     }
@@ -1430,13 +1489,6 @@ LLVM_DUMP_METHOD void CallAnalyzer::dump() {
 }
 #endif
 
-/// \brief Test that two functions either have or have not the given attribute
-///        at the same time.
-template <typename AttrKind>
-static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
-  return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
-}
-
 /// \brief Test that there are no attribute conflicts between Caller and Callee
 ///        that prevent inlining.
 static bool functionsHaveCompatibleAttributes(Function *Caller,
@@ -1446,18 +1498,52 @@ static bool functionsHaveCompatibleAttributes(Function *Caller,
          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
 }
 
+int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
+  int Cost = 0;
+  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
+    if (CS.isByValArgument(I)) {
+      // We approximate the number of loads and stores needed by dividing the
+      // size of the byval type by the target's pointer size.
+      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
+      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
+      unsigned PointerSize = DL.getPointerSizeInBits();
+      // Ceiling division.
+      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
+
+      // If it generates more than 8 stores it is likely to be expanded as an
+      // inline memcpy so we take that as an upper bound. Otherwise we assume
+      // one load and one store per word copied.
+      // FIXME: The maxStoresPerMemcpy setting from the target should be used
+      // here instead of a magic number of 8, but it's not available via
+      // DataLayout.
+      NumStores = std::min(NumStores, 8U);
+
+      Cost += 2 * NumStores * InlineConstants::InstrCost;
+    } else {
+      // For non-byval arguments subtract off one instruction per call
+      // argument.
+      Cost += InlineConstants::InstrCost;
+    }
+  }
+  // The call instruction also disappears after inlining.
+  Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
+  return Cost;
+}
+
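A worked example of the relocated bonus: with InstrCost = 5 and CallPenalty = 25 (the InlineConstants values in this tree), a call passing one 64-byte byval struct and two scalar arguments on a 64-bit target needs ceil(512/64) = 8 stores, giving 2 * 8 * 5 = 80 for the copy, 2 * 5 = 10 for the scalar arguments, and 5 + 25 = 30 for the call instruction that disappears, so getCallsiteCost returns 120. analyzeCall, shown earlier, now subtracts this single value instead of carrying the per-argument loop inline.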
 InlineCost llvm::getInlineCost(
     CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
     ProfileSummaryInfo *PSI) {
   return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
-                       GetAssumptionCache, PSI);
+                       GetAssumptionCache, GetBFI, PSI);
 }
 
 InlineCost llvm::getInlineCost(
     CallSite CS, Function *Callee, const InlineParams &Params,
     TargetTransformInfo &CalleeTTI,
     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
     ProfileSummaryInfo *PSI) {
 
   // Cannot inline indirect calls.
@@ -1492,7 +1578,8 @@ InlineCost llvm::getInlineCost(
   DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
                      << "...\n");
 
-  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, PSI, *Callee, CS, Params);
+  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, *Callee, CS,
+                  Params);
   bool ShouldInline = CA.analyzeCall(CS);
 
   DEBUG(CA.dump());
@@ -1565,7 +1652,9 @@ InlineParams llvm::getInlineParams(int Threshold) {
   // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
   Params.HotCallSiteThreshold = HotCallSiteThreshold;
 
-  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
+  // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
+  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
+  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
   // -inlinehint-threshold commandline option is not explicitly given. If that
   // option is present, then its value applies even for callees with size and
   // minsize attributes.
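Both public getInlineCost overloads now take an optional BFI getter. A minimal sketch of wiring it up from a new-pass-manager caller (assumes a FunctionAnalysisManager FAM is in scope; a legacy-pass-manager caller, which has no per-function BFI here, would pass None):

    // Hypothetical call, illustrating only the new parameter.
    auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & {
      return FAM.getResult<BlockFrequencyAnalysis>(F);
    };
    InlineCost IC = getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache,
                                  {GetBFI}, PSI);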