Diffstat (limited to 'contrib/llvm/lib/Transforms/IPO')
26 files changed, 4199 insertions, 1817 deletions
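The largest change below ports ArgumentPromotion from the legacy CallGraphSCCPass interface to the new pass manager: the transformation is factored into a standalone doPromotion/promoteArguments path, and ArgumentPromotionPass::run now operates on a LazyCallGraph SCC while the legacy ArgPromotion wrapper is kept for the old pipeline. As a rough illustration of how the ported pass would be driven, here is a minimal sketch of a new-pass-manager pipeline; it is not part of the diff, and the analysis-registration boilerplate and the default-constructed ArgumentPromotionPass are assumptions based on the headers this change touches.

// Minimal sketch (assumption: new-PM headers available at this revision; not part of the patch).
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/ArgumentPromotion.h"

using namespace llvm;

// Run the ported pass over every SCC of a module, bottom-up over the call graph.
static void runArgPromotionOn(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  // Wire up the analysis managers and the proxies between them.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  // ArgumentPromotionPass is a CGSCC pass, so adapt it into a module pipeline.
  ModulePassManager MPM;
  MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass()));
  MPM.run(M, MAM);
}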
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 65b7bad..72bae20 100644 --- a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -29,8 +29,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/ArgumentPromotion.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -38,6 +39,7 @@ #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CFG.h" @@ -51,323 +53,404 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO.h" #include <set> using namespace llvm; #define DEBUG_TYPE "argpromotion" -STATISTIC(NumArgumentsPromoted , "Number of pointer arguments promoted"); +STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted"); STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted"); -STATISTIC(NumByValArgsPromoted , "Number of byval arguments promoted"); -STATISTIC(NumArgumentsDead , "Number of dead pointer args eliminated"); +STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted"); +STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated"); -namespace { - /// ArgPromotion - The 'by reference' to 'by value' argument promotion pass. - /// - struct ArgPromotion : public CallGraphSCCPass { - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - getAAResultsAnalysisUsage(AU); - CallGraphSCCPass::getAnalysisUsage(AU); - } +/// A vector used to hold the indices of a single GEP instruction +typedef std::vector<uint64_t> IndicesVector; - bool runOnSCC(CallGraphSCC &SCC) override; - static char ID; // Pass identification, replacement for typeid - explicit ArgPromotion(unsigned maxElements = 3) - : CallGraphSCCPass(ID), maxElements(maxElements) { - initializeArgPromotionPass(*PassRegistry::getPassRegistry()); - } +/// DoPromotion - This method actually performs the promotion of the specified +/// arguments, and returns the new function. At this point, we know that it's +/// safe to do so. +static Function * +doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, + SmallPtrSetImpl<Argument *> &ByValArgsToTransform, + Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>> + ReplaceCallSite) { - private: + // Start by computing a new prototype for the function, which is the same as + // the old function, but has modified arguments. + FunctionType *FTy = F->getFunctionType(); + std::vector<Type *> Params; - using llvm::Pass::doInitialization; - bool doInitialization(CallGraph &CG) override; - /// The maximum number of elements to expand, or 0 for unlimited. 
- unsigned maxElements; - }; -} + typedef std::set<std::pair<Type *, IndicesVector>> ScalarizeTable; -/// A vector used to hold the indices of a single GEP instruction -typedef std::vector<uint64_t> IndicesVector; + // ScalarizedElements - If we are promoting a pointer that has elements + // accessed out of it, keep track of which elements are accessed so that we + // can add one argument for each. + // + // Arguments that are directly loaded will have a zero element value here, to + // handle cases where there are both a direct load and GEP accesses. + // + std::map<Argument *, ScalarizeTable> ScalarizedElements; -static CallGraphNode * -PromoteArguments(CallGraphNode *CGN, CallGraph &CG, - function_ref<AAResults &(Function &F)> AARGetter, - unsigned MaxElements); -static bool isDenselyPacked(Type *type, const DataLayout &DL); -static bool canPaddingBeAccessed(Argument *Arg); -static bool isSafeToPromoteArgument(Argument *Arg, bool isByVal, AAResults &AAR, - unsigned MaxElements); -static CallGraphNode * -DoPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, - SmallPtrSetImpl<Argument *> &ByValArgsToTransform, CallGraph &CG); + // OriginalLoads - Keep track of a representative load instruction from the + // original function so that we can tell the alias analysis implementation + // what the new GEP/Load instructions we are inserting look like. + // We need to keep the original loads for each argument and the elements + // of the argument that are accessed. + std::map<std::pair<Argument *, IndicesVector>, LoadInst *> OriginalLoads; -char ArgPromotion::ID = 0; -INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion", - "Promote 'by reference' arguments to scalars", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(ArgPromotion, "argpromotion", - "Promote 'by reference' arguments to scalars", false, false) + // Attribute - Keep track of the parameter attributes for the arguments + // that we are *not* promoting. For the ones that we do promote, the parameter + // attributes are lost + SmallVector<AttributeSet, 8> ArgAttrVec; + AttributeList PAL = F->getAttributes(); -Pass *llvm::createArgumentPromotionPass(unsigned maxElements) { - return new ArgPromotion(maxElements); -} + // First, determine the new argument list + unsigned ArgNo = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++ArgNo) { + if (ByValArgsToTransform.count(&*I)) { + // Simple byval argument? Just add all the struct element types. + Type *AgTy = cast<PointerType>(I->getType())->getElementType(); + StructType *STy = cast<StructType>(AgTy); + Params.insert(Params.end(), STy->element_begin(), STy->element_end()); + ArgAttrVec.insert(ArgAttrVec.end(), STy->getNumElements(), + AttributeSet()); + ++NumByValArgsPromoted; + } else if (!ArgsToPromote.count(&*I)) { + // Unchanged argument + Params.push_back(I->getType()); + ArgAttrVec.push_back(PAL.getParamAttributes(ArgNo)); + } else if (I->use_empty()) { + // Dead argument (which are always marked as promotable) + ++NumArgumentsDead; -static bool runImpl(CallGraphSCC &SCC, CallGraph &CG, - function_ref<AAResults &(Function &F)> AARGetter, - unsigned MaxElements) { - bool Changed = false, LocalChange; + // There may be remaining metadata uses of the argument for things like + // llvm.dbg.value. Replace them with undef. 
+ I->replaceAllUsesWith(UndefValue::get(I->getType())); + } else { + // Okay, this is being promoted. This means that the only uses are loads + // or GEPs which are only used by loads - do { // Iterate until we stop promoting from this SCC. - LocalChange = false; - // Attempt to promote arguments from all functions in this SCC. - for (CallGraphNode *OldNode : SCC) { - if (CallGraphNode *NewNode = - PromoteArguments(OldNode, CG, AARGetter, MaxElements)) { - LocalChange = true; - SCC.ReplaceNode(OldNode, NewNode); + // In this table, we will track which indices are loaded from the argument + // (where direct loads are tracked as no indices). + ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; + for (User *U : I->users()) { + Instruction *UI = cast<Instruction>(U); + Type *SrcTy; + if (LoadInst *L = dyn_cast<LoadInst>(UI)) + SrcTy = L->getType(); + else + SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType(); + IndicesVector Indices; + Indices.reserve(UI->getNumOperands() - 1); + // Since loads will only have a single operand, and GEPs only a single + // non-index operand, this will record direct loads without any indices, + // and gep+loads with the GEP indices. + for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end(); + II != IE; ++II) + Indices.push_back(cast<ConstantInt>(*II)->getSExtValue()); + // GEPs with a single 0 index can be merged with direct loads + if (Indices.size() == 1 && Indices.front() == 0) + Indices.clear(); + ArgIndices.insert(std::make_pair(SrcTy, Indices)); + LoadInst *OrigLoad; + if (LoadInst *L = dyn_cast<LoadInst>(UI)) + OrigLoad = L; + else + // Take any load, we will use it only to update Alias Analysis + OrigLoad = cast<LoadInst>(UI->user_back()); + OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad; } - } - Changed |= LocalChange; // Remember that we changed something. - } while (LocalChange); - - return Changed; -} -bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) { - if (skipSCC(SCC)) - return false; + // Add a parameter to the function for each element passed in. + for (const auto &ArgIndex : ArgIndices) { + // not allowed to dereference ->begin() if size() is 0 + Params.push_back(GetElementPtrInst::getIndexedType( + cast<PointerType>(I->getType()->getScalarType())->getElementType(), + ArgIndex.second)); + ArgAttrVec.push_back(AttributeSet()); + assert(Params.back()); + } - // Get the callgraph information that we need to update to reflect our - // changes. - CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); + if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty()) + ++NumArgumentsPromoted; + else + ++NumAggregatesPromoted; + } + } - // We compute dedicated AA results for each function in the SCC as needed. We - // use a lambda referencing external objects so that they live long enough to - // be queried, but we re-use them each time. - Optional<BasicAAResult> BAR; - Optional<AAResults> AAR; - auto AARGetter = [&](Function &F) -> AAResults & { - BAR.emplace(createLegacyPMBasicAAResult(*this, F)); - AAR.emplace(createLegacyPMAAResults(*this, F, *BAR)); - return *AAR; - }; - - return runImpl(SCC, CG, AARGetter, maxElements); -} + Type *RetTy = FTy->getReturnType(); -/// \brief Checks if a type could have padding bytes. -static bool isDenselyPacked(Type *type, const DataLayout &DL) { + // Construct the new function type using the new arguments. + FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg()); - // There is no size information, so be conservative. 
- if (!type->isSized()) - return false; + // Create the new function body and insert it into the module. + Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName()); + NF->copyAttributesFrom(F); - // If the alloc size is not equal to the storage size, then there are padding - // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128. - if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type)) - return false; + // Patch the pointer to LLVM function in debug info descriptor. + NF->setSubprogram(F->getSubprogram()); + F->setSubprogram(nullptr); - if (!isa<CompositeType>(type)) - return true; + DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n" + << "From: " << *F); - // For homogenous sequential types, check for padding within members. - if (SequentialType *seqTy = dyn_cast<SequentialType>(type)) - return isDenselyPacked(seqTy->getElementType(), DL); + // Recompute the parameter attributes list based on the new arguments for + // the function. + NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttributes(), + PAL.getRetAttributes(), ArgAttrVec)); + ArgAttrVec.clear(); - // Check for padding within and between elements of a struct. - StructType *StructTy = cast<StructType>(type); - const StructLayout *Layout = DL.getStructLayout(StructTy); - uint64_t StartPos = 0; - for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) { - Type *ElTy = StructTy->getElementType(i); - if (!isDenselyPacked(ElTy, DL)) - return false; - if (StartPos != Layout->getElementOffsetInBits(i)) - return false; - StartPos += DL.getTypeAllocSizeInBits(ElTy); - } + F->getParent()->getFunctionList().insert(F->getIterator(), NF); + NF->takeName(F); - return true; -} + // Loop over all of the callers of the function, transforming the call sites + // to pass in the loaded pointers. + // + SmallVector<Value *, 16> Args; + while (!F->use_empty()) { + CallSite CS(F->user_back()); + assert(CS.getCalledFunction() == F); + Instruction *Call = CS.getInstruction(); + const AttributeList &CallPAL = CS.getAttributes(); -/// \brief Checks if the padding bytes of an argument could be accessed. -static bool canPaddingBeAccessed(Argument *arg) { + // Loop over the operands, inserting GEP and loads in the caller as + // appropriate. + CallSite::arg_iterator AI = CS.arg_begin(); + ArgNo = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++AI, ++ArgNo) + if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { + Args.push_back(*AI); // Unmodified argument + ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo)); + } else if (ByValArgsToTransform.count(&*I)) { + // Emit a GEP and load for each element of the struct. + Type *AgTy = cast<PointerType>(I->getType())->getElementType(); + StructType *STy = cast<StructType>(AgTy); + Value *Idxs[2] = { + ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr}; + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); + Value *Idx = GetElementPtrInst::Create( + STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i), Call); + // TODO: Tell AA about the new values? + Args.push_back(new LoadInst(Idx, Idx->getName() + ".val", Call)); + ArgAttrVec.push_back(AttributeSet()); + } + } else if (!I->use_empty()) { + // Non-dead argument: insert GEPs and loads as appropriate. + ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; + // Store the Value* version of the indices in here, but declare it now + // for reuse. 
+ std::vector<Value *> Ops; + for (const auto &ArgIndex : ArgIndices) { + Value *V = *AI; + LoadInst *OrigLoad = + OriginalLoads[std::make_pair(&*I, ArgIndex.second)]; + if (!ArgIndex.second.empty()) { + Ops.reserve(ArgIndex.second.size()); + Type *ElTy = V->getType(); + for (auto II : ArgIndex.second) { + // Use i32 to index structs, and i64 for others (pointers/arrays). + // This satisfies GEP constraints. + Type *IdxTy = + (ElTy->isStructTy() ? Type::getInt32Ty(F->getContext()) + : Type::getInt64Ty(F->getContext())); + Ops.push_back(ConstantInt::get(IdxTy, II)); + // Keep track of the type we're currently indexing. + if (auto *ElPTy = dyn_cast<PointerType>(ElTy)) + ElTy = ElPTy->getElementType(); + else + ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(II); + } + // And create a GEP to extract those indices. + V = GetElementPtrInst::Create(ArgIndex.first, V, Ops, + V->getName() + ".idx", Call); + Ops.clear(); + } + // Since we're replacing a load make sure we take the alignment + // of the previous load. + LoadInst *newLoad = new LoadInst(V, V->getName() + ".val", Call); + newLoad->setAlignment(OrigLoad->getAlignment()); + // Transfer the AA info too. + AAMDNodes AAInfo; + OrigLoad->getAAMetadata(AAInfo); + newLoad->setAAMetadata(AAInfo); - assert(arg->hasByValAttr()); + Args.push_back(newLoad); + ArgAttrVec.push_back(AttributeSet()); + } + } - // Track all the pointers to the argument to make sure they are not captured. - SmallPtrSet<Value *, 16> PtrValues; - PtrValues.insert(arg); + // Push any varargs arguments on the list. + for (; AI != CS.arg_end(); ++AI, ++ArgNo) { + Args.push_back(*AI); + ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo)); + } - // Track all of the stores. - SmallVector<StoreInst *, 16> Stores; + SmallVector<OperandBundleDef, 1> OpBundles; + CS.getOperandBundlesAsDefs(OpBundles); - // Scan through the uses recursively to make sure the pointer is always used - // sanely. - SmallVector<Value *, 16> WorkList; - WorkList.insert(WorkList.end(), arg->user_begin(), arg->user_end()); - while (!WorkList.empty()) { - Value *V = WorkList.back(); - WorkList.pop_back(); - if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) { - if (PtrValues.insert(V).second) - WorkList.insert(WorkList.end(), V->user_begin(), V->user_end()); - } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) { - Stores.push_back(Store); - } else if (!isa<LoadInst>(V)) { - return true; + CallSite NewCS; + if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { + NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), + Args, OpBundles, "", Call); + } else { + auto *NewCall = CallInst::Create(NF, Args, OpBundles, "", Call); + NewCall->setTailCallKind(cast<CallInst>(Call)->getTailCallKind()); + NewCS = NewCall; } - } + NewCS.setCallingConv(CS.getCallingConv()); + NewCS.setAttributes( + AttributeList::get(F->getContext(), CallPAL.getFnAttributes(), + CallPAL.getRetAttributes(), ArgAttrVec)); + NewCS->setDebugLoc(Call->getDebugLoc()); + uint64_t W; + if (Call->extractProfTotalWeight(W)) + NewCS->setProfWeight(W); + Args.clear(); + ArgAttrVec.clear(); -// Check to make sure the pointers aren't captured - for (StoreInst *Store : Stores) - if (PtrValues.count(Store->getValueOperand())) - return true; + // Update the callgraph to know that the callsite has been transformed. 
+ if (ReplaceCallSite) + (*ReplaceCallSite)(CS, NewCS); - return false; -} + if (!Call->use_empty()) { + Call->replaceAllUsesWith(NewCS.getInstruction()); + NewCS->takeName(Call); + } -/// PromoteArguments - This method checks the specified function to see if there -/// are any promotable arguments and if it is safe to promote the function (for -/// example, all callers are direct). If safe to promote some arguments, it -/// calls the DoPromotion method. -/// -static CallGraphNode * -PromoteArguments(CallGraphNode *CGN, CallGraph &CG, - function_ref<AAResults &(Function &F)> AARGetter, - unsigned MaxElements) { - Function *F = CGN->getFunction(); + // Finally, remove the old call from the program, reducing the use-count of + // F. + Call->eraseFromParent(); + } - // Make sure that it is local to this module. - if (!F || !F->hasLocalLinkage()) return nullptr; + const DataLayout &DL = F->getParent()->getDataLayout(); - // Don't promote arguments for variadic functions. Adding, removing, or - // changing non-pack parameters can change the classification of pack - // parameters. Frontends encode that classification at the call site in the - // IR, while in the callee the classification is determined dynamically based - // on the number of registers consumed so far. - if (F->isVarArg()) return nullptr; + // Since we have now created the new function, splice the body of the old + // function right into the new function, leaving the old rotting hulk of the + // function empty. + NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); - // First check: see if there are any pointer arguments! If not, quick exit. - SmallVector<Argument*, 16> PointerArgs; - for (Argument &I : F->args()) - if (I.getType()->isPointerTy()) - PointerArgs.push_back(&I); - if (PointerArgs.empty()) return nullptr; + // Loop over the argument list, transferring uses of the old arguments over to + // the new arguments, also transferring over the names as well. + // + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), + I2 = NF->arg_begin(); + I != E; ++I) { + if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { + // If this is an unmodified argument, move the name and users over to the + // new version. + I->replaceAllUsesWith(&*I2); + I2->takeName(&*I); + ++I2; + continue; + } - // Second check: make sure that all callers are direct callers. We can't - // transform functions that have indirect callers. Also see if the function - // is self-recursive. - bool isSelfRecursive = false; - for (Use &U : F->uses()) { - CallSite CS(U.getUser()); - // Must be a direct call. - if (CS.getInstruction() == nullptr || !CS.isCallee(&U)) return nullptr; - - if (CS.getInstruction()->getParent()->getParent() == F) - isSelfRecursive = true; - } - - const DataLayout &DL = F->getParent()->getDataLayout(); + if (ByValArgsToTransform.count(&*I)) { + // In the callee, we create an alloca, and store each of the new incoming + // arguments into the alloca. + Instruction *InsertPt = &NF->begin()->front(); - AAResults &AAR = AARGetter(*F); + // Just add all the struct element types. + Type *AgTy = cast<PointerType>(I->getType())->getElementType(); + Value *TheAlloca = new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr, + I->getParamAlignment(), "", InsertPt); + StructType *STy = cast<StructType>(AgTy); + Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), + nullptr}; - // Check to see which arguments are promotable. If an argument is promotable, - // add it to ArgsToPromote. 
- SmallPtrSet<Argument*, 8> ArgsToPromote; - SmallPtrSet<Argument*, 8> ByValArgsToTransform; - for (Argument *PtrArg : PointerArgs) { - Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType(); + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); + Value *Idx = GetElementPtrInst::Create( + AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i), + InsertPt); + I2->setName(I->getName() + "." + Twine(i)); + new StoreInst(&*I2++, Idx, InsertPt); + } - // Replace sret attribute with noalias. This reduces register pressure by - // avoiding a register copy. - if (PtrArg->hasStructRetAttr()) { - unsigned ArgNo = PtrArg->getArgNo(); - F->setAttributes( - F->getAttributes() - .removeAttribute(F->getContext(), ArgNo + 1, Attribute::StructRet) - .addAttribute(F->getContext(), ArgNo + 1, Attribute::NoAlias)); - for (Use &U : F->uses()) { - CallSite CS(U.getUser()); - CS.setAttributes( - CS.getAttributes() - .removeAttribute(F->getContext(), ArgNo + 1, - Attribute::StructRet) - .addAttribute(F->getContext(), ArgNo + 1, Attribute::NoAlias)); + // Anything that used the arg should now use the alloca. + I->replaceAllUsesWith(TheAlloca); + TheAlloca->takeName(&*I); + + // If the alloca is used in a call, we must clear the tail flag since + // the callee now uses an alloca from the caller. + for (User *U : TheAlloca->users()) { + CallInst *Call = dyn_cast<CallInst>(U); + if (!Call) + continue; + Call->setTailCall(false); } + continue; } - // If this is a byval argument, and if the aggregate type is small, just - // pass the elements, which is always safe, if the passed value is densely - // packed or if we can prove the padding bytes are never accessed. This does - // not apply to inalloca. - bool isSafeToPromote = - PtrArg->hasByValAttr() && - (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg)); - if (isSafeToPromote) { - if (StructType *STy = dyn_cast<StructType>(AgTy)) { - if (MaxElements > 0 && STy->getNumElements() > MaxElements) { - DEBUG(dbgs() << "argpromotion disable promoting argument '" - << PtrArg->getName() << "' because it would require adding more" - << " than " << MaxElements << " arguments to the function.\n"); - continue; - } - - // If all the elements are single-value types, we can promote it. - bool AllSimple = true; - for (const auto *EltTy : STy->elements()) { - if (!EltTy->isSingleValueType()) { - AllSimple = false; - break; - } + if (I->use_empty()) + continue; + + // Otherwise, if we promoted this argument, then all users are load + // instructions (or GEPs with only load users), and all loads should be + // using the new argument that we added. 
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; + + while (!I->use_empty()) { + if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) { + assert(ArgIndices.begin()->second.empty() && + "Load element should sort to front!"); + I2->setName(I->getName() + ".val"); + LI->replaceAllUsesWith(&*I2); + LI->eraseFromParent(); + DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName() + << "' in function '" << F->getName() << "'\n"); + } else { + GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back()); + IndicesVector Operands; + Operands.reserve(GEP->getNumIndices()); + for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end(); + II != IE; ++II) + Operands.push_back(cast<ConstantInt>(*II)->getSExtValue()); + + // GEPs with a single 0 index can be merged with direct loads + if (Operands.size() == 1 && Operands.front() == 0) + Operands.clear(); + + Function::arg_iterator TheArg = I2; + for (ScalarizeTable::iterator It = ArgIndices.begin(); + It->second != Operands; ++It, ++TheArg) { + assert(It != ArgIndices.end() && "GEP not handled??"); } - // Safe to transform, don't even bother trying to "promote" it. - // Passing the elements as a scalar will allow sroa to hack on - // the new alloca we introduce. - if (AllSimple) { - ByValArgsToTransform.insert(PtrArg); - continue; + std::string NewName = I->getName(); + for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + NewName += "." + utostr(Operands[i]); } - } - } + NewName += ".val"; + TheArg->setName(NewName); - // If the argument is a recursive type and we're in a recursive - // function, we could end up infinitely peeling the function argument. - if (isSelfRecursive) { - if (StructType *STy = dyn_cast<StructType>(AgTy)) { - bool RecursiveType = false; - for (const auto *EltTy : STy->elements()) { - if (EltTy == PtrArg->getType()) { - RecursiveType = true; - break; - } + DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName() + << "' of function '" << NF->getName() << "'\n"); + + // All of the uses must be load instructions. Replace them all with + // the argument specified by ArgNo. + while (!GEP->use_empty()) { + LoadInst *L = cast<LoadInst>(GEP->user_back()); + L->replaceAllUsesWith(&*TheArg); + L->eraseFromParent(); } - if (RecursiveType) - continue; + GEP->eraseFromParent(); } } - - // Otherwise, see if we can promote the pointer to its value. - if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR, - MaxElements)) - ArgsToPromote.insert(PtrArg); - } - // No promotable pointer arguments. - if (ArgsToPromote.empty() && ByValArgsToTransform.empty()) - return nullptr; + // Increment I2 past all of the arguments added for this promoted pointer. + std::advance(I2, ArgIndices.size()); + } - return DoPromotion(F, ArgsToPromote, ByValArgsToTransform, CG); + return NF; } /// AllCallersPassInValidPointerForArgument - Return true if we can prove that /// all callees pass in a valid pointer for the specified function argument. -static bool AllCallersPassInValidPointerForArgument(Argument *Arg) { +static bool allCallersPassInValidPointerForArgument(Argument *Arg) { Function *Callee = Arg->getParent(); const DataLayout &DL = Callee->getParent()->getDataLayout(); @@ -390,26 +473,25 @@ static bool AllCallersPassInValidPointerForArgument(Argument *Arg) { /// elements in Prefix is the same as the corresponding elements in Longer. /// /// This means it also returns true when Prefix and Longer are equal! 
-static bool IsPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) { +static bool isPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) { if (Prefix.size() > Longer.size()) return false; return std::equal(Prefix.begin(), Prefix.end(), Longer.begin()); } - /// Checks if Indices, or a prefix of Indices, is in Set. -static bool PrefixIn(const IndicesVector &Indices, +static bool prefixIn(const IndicesVector &Indices, std::set<IndicesVector> &Set) { - std::set<IndicesVector>::iterator Low; - Low = Set.upper_bound(Indices); - if (Low != Set.begin()) - Low--; - // Low is now the last element smaller than or equal to Indices. This means - // it points to a prefix of Indices (possibly Indices itself), if such - // prefix exists. - // - // This load is safe if any prefix of its operands is safe to load. - return Low != Set.end() && IsPrefix(*Low, Indices); + std::set<IndicesVector>::iterator Low; + Low = Set.upper_bound(Indices); + if (Low != Set.begin()) + Low--; + // Low is now the last element smaller than or equal to Indices. This means + // it points to a prefix of Indices (possibly Indices itself), if such + // prefix exists. + // + // This load is safe if any prefix of its operands is safe to load. + return Low != Set.end() && isPrefix(*Low, Indices); } /// Mark the given indices (ToMark) as safe in the given set of indices @@ -417,7 +499,7 @@ static bool PrefixIn(const IndicesVector &Indices, /// is already a prefix of Indices in Safe, Indices are implicitely marked safe /// already. Furthermore, any indices that Indices is itself a prefix of, are /// removed from Safe (since they are implicitely safe because of Indices now). -static void MarkIndicesSafe(const IndicesVector &ToMark, +static void markIndicesSafe(const IndicesVector &ToMark, std::set<IndicesVector> &Safe) { std::set<IndicesVector>::iterator Low; Low = Safe.upper_bound(ToMark); @@ -428,7 +510,7 @@ static void MarkIndicesSafe(const IndicesVector &ToMark, // means it points to a prefix of Indices (possibly Indices itself), if // such prefix exists. if (Low != Safe.end()) { - if (IsPrefix(*Low, ToMark)) + if (isPrefix(*Low, ToMark)) // If there is already a prefix of these indices (or exactly these // indices) marked a safe, don't bother adding these indices return; @@ -441,7 +523,7 @@ static void MarkIndicesSafe(const IndicesVector &ToMark, ++Low; // If there we're a prefix of longer index list(s), remove those std::set<IndicesVector>::iterator End = Safe.end(); - while (Low != End && IsPrefix(ToMark, *Low)) { + while (Low != End && isPrefix(ToMark, *Low)) { std::set<IndicesVector>::iterator Remove = Low; ++Low; Safe.erase(Remove); @@ -486,7 +568,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, GEPIndicesSet ToPromote; // If the pointer is always valid, any load with first index 0 is valid. - if (isByValOrInAlloca || AllCallersPassInValidPointerForArgument(Arg)) + if (isByValOrInAlloca || allCallersPassInValidPointerForArgument(Arg)) SafeToUnconditionallyLoad.insert(IndicesVector(1, 0)); // First, iterate the entry block and mark loads of (geps of) arguments as @@ -512,25 +594,26 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, return false; // Indices checked out, mark them as safe - MarkIndicesSafe(Indices, SafeToUnconditionallyLoad); + markIndicesSafe(Indices, SafeToUnconditionallyLoad); Indices.clear(); } } else if (V == Arg) { // Direct loads are equivalent to a GEP with a single 0 index. 
- MarkIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad); + markIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad); } } // Now, iterate all uses of the argument to see if there are any uses that are // not (GEP+)loads, or any (GEP+)loads that are not safe to promote. - SmallVector<LoadInst*, 16> Loads; + SmallVector<LoadInst *, 16> Loads; IndicesVector Operands; for (Use &U : Arg->uses()) { User *UR = U.getUser(); Operands.clear(); if (LoadInst *LI = dyn_cast<LoadInst>(UR)) { // Don't hack volatile/atomic loads - if (!LI->isSimple()) return false; + if (!LI->isSimple()) + return false; Loads.push_back(LI); // Direct loads are equivalent to a GEP with a zero index and then a load. Operands.push_back(0); @@ -547,30 +630,31 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, } // Ensure that all of the indices are constants. - for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end(); - i != e; ++i) + for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end(); i != e; + ++i) if (ConstantInt *C = dyn_cast<ConstantInt>(*i)) Operands.push_back(C->getSExtValue()); else - return false; // Not a constant operand GEP! + return false; // Not a constant operand GEP! // Ensure that the only users of the GEP are load instructions. for (User *GEPU : GEP->users()) if (LoadInst *LI = dyn_cast<LoadInst>(GEPU)) { // Don't hack volatile/atomic loads - if (!LI->isSimple()) return false; + if (!LI->isSimple()) + return false; Loads.push_back(LI); } else { // Other uses than load? return false; } } else { - return false; // Not a load or a GEP. + return false; // Not a load or a GEP. } // Now, see if it is safe to promote this load / loads of this GEP. Loading // is safe if Operands, or a prefix of Operands, is marked as safe. - if (!PrefixIn(Operands, SafeToUnconditionallyLoad)) + if (!prefixIn(Operands, SafeToUnconditionallyLoad)) return false; // See if we are already promoting a load with these indices. If not, check @@ -579,8 +663,10 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, if (ToPromote.find(Operands) == ToPromote.end()) { if (MaxElements > 0 && ToPromote.size() == MaxElements) { DEBUG(dbgs() << "argpromotion not promoting argument '" - << Arg->getName() << "' because it would require adding more " - << "than " << MaxElements << " arguments to the function.\n"); + << Arg->getName() + << "' because it would require adding more " + << "than " << MaxElements + << " arguments to the function.\n"); // We limit aggregate promotion to only promoting up to a fixed number // of elements of the aggregate. return false; @@ -589,7 +675,8 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, } } - if (Loads.empty()) return true; // No users, this is a dead argument. + if (Loads.empty()) + return true; // No users, this is a dead argument. // Okay, now we know that the argument is only used by load instructions and // it is safe to unconditionally perform all of them. Use alias analysis to @@ -598,7 +685,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, // Because there could be several/many load instructions, remember which // blocks we know to be transparent to the load. 
- df_iterator_default_set<BasicBlock*, 16> TranspBlocks; + df_iterator_default_set<BasicBlock *, 16> TranspBlocks; for (LoadInst *Load : Loads) { // Check to see if the load is invalidated from the start of the block to @@ -607,7 +694,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, MemoryLocation Loc = MemoryLocation::get(Load); if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, MRI_Mod)) - return false; // Pointer is invalidated! + return false; // Pointer is invalidated! // Now check every path from the entry block to the load for transparency. // To do this, we perform a depth first search on the inverse CFG from the @@ -625,416 +712,347 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, return true; } -/// DoPromotion - This method actually performs the promotion of the specified -/// arguments, and returns the new function. At this point, we know that it's -/// safe to do so. -static CallGraphNode * -DoPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, - SmallPtrSetImpl<Argument *> &ByValArgsToTransform, CallGraph &CG) { +/// \brief Checks if a type could have padding bytes. +static bool isDenselyPacked(Type *type, const DataLayout &DL) { - // Start by computing a new prototype for the function, which is the same as - // the old function, but has modified arguments. - FunctionType *FTy = F->getFunctionType(); - std::vector<Type*> Params; + // There is no size information, so be conservative. + if (!type->isSized()) + return false; - typedef std::set<std::pair<Type *, IndicesVector>> ScalarizeTable; + // If the alloc size is not equal to the storage size, then there are padding + // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128. + if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type)) + return false; - // ScalarizedElements - If we are promoting a pointer that has elements - // accessed out of it, keep track of which elements are accessed so that we - // can add one argument for each. - // - // Arguments that are directly loaded will have a zero element value here, to - // handle cases where there are both a direct load and GEP accesses. - // - std::map<Argument*, ScalarizeTable> ScalarizedElements; + if (!isa<CompositeType>(type)) + return true; - // OriginalLoads - Keep track of a representative load instruction from the - // original function so that we can tell the alias analysis implementation - // what the new GEP/Load instructions we are inserting look like. - // We need to keep the original loads for each argument and the elements - // of the argument that are accessed. - std::map<std::pair<Argument*, IndicesVector>, LoadInst*> OriginalLoads; + // For homogenous sequential types, check for padding within members. + if (SequentialType *seqTy = dyn_cast<SequentialType>(type)) + return isDenselyPacked(seqTy->getElementType(), DL); - // Attribute - Keep track of the parameter attributes for the arguments - // that we are *not* promoting. For the ones that we do promote, the parameter - // attributes are lost - SmallVector<AttributeSet, 8> AttributesVec; - const AttributeSet &PAL = F->getAttributes(); + // Check for padding within and between elements of a struct. 
+ StructType *StructTy = cast<StructType>(type); + const StructLayout *Layout = DL.getStructLayout(StructTy); + uint64_t StartPos = 0; + for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) { + Type *ElTy = StructTy->getElementType(i); + if (!isDenselyPacked(ElTy, DL)) + return false; + if (StartPos != Layout->getElementOffsetInBits(i)) + return false; + StartPos += DL.getTypeAllocSizeInBits(ElTy); + } - // Add any return attributes. - if (PAL.hasAttributes(AttributeSet::ReturnIndex)) - AttributesVec.push_back(AttributeSet::get(F->getContext(), - PAL.getRetAttributes())); + return true; +} - // First, determine the new argument list - unsigned ArgIndex = 1; - for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; - ++I, ++ArgIndex) { - if (ByValArgsToTransform.count(&*I)) { - // Simple byval argument? Just add all the struct element types. - Type *AgTy = cast<PointerType>(I->getType())->getElementType(); - StructType *STy = cast<StructType>(AgTy); - Params.insert(Params.end(), STy->element_begin(), STy->element_end()); - ++NumByValArgsPromoted; - } else if (!ArgsToPromote.count(&*I)) { - // Unchanged argument - Params.push_back(I->getType()); - AttributeSet attrs = PAL.getParamAttributes(ArgIndex); - if (attrs.hasAttributes(ArgIndex)) { - AttrBuilder B(attrs, ArgIndex); - AttributesVec. - push_back(AttributeSet::get(F->getContext(), Params.size(), B)); - } - } else if (I->use_empty()) { - // Dead argument (which are always marked as promotable) - ++NumArgumentsDead; - } else { - // Okay, this is being promoted. This means that the only uses are loads - // or GEPs which are only used by loads +/// \brief Checks if the padding bytes of an argument could be accessed. +static bool canPaddingBeAccessed(Argument *arg) { - // In this table, we will track which indices are loaded from the argument - // (where direct loads are tracked as no indices). - ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; - for (User *U : I->users()) { - Instruction *UI = cast<Instruction>(U); - Type *SrcTy; - if (LoadInst *L = dyn_cast<LoadInst>(UI)) - SrcTy = L->getType(); - else - SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType(); - IndicesVector Indices; - Indices.reserve(UI->getNumOperands() - 1); - // Since loads will only have a single operand, and GEPs only a single - // non-index operand, this will record direct loads without any indices, - // and gep+loads with the GEP indices. - for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end(); - II != IE; ++II) - Indices.push_back(cast<ConstantInt>(*II)->getSExtValue()); - // GEPs with a single 0 index can be merged with direct loads - if (Indices.size() == 1 && Indices.front() == 0) - Indices.clear(); - ArgIndices.insert(std::make_pair(SrcTy, Indices)); - LoadInst *OrigLoad; - if (LoadInst *L = dyn_cast<LoadInst>(UI)) - OrigLoad = L; - else - // Take any load, we will use it only to update Alias Analysis - OrigLoad = cast<LoadInst>(UI->user_back()); - OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad; - } + assert(arg->hasByValAttr()); - // Add a parameter to the function for each element passed in. - for (const auto &ArgIndex : ArgIndices) { - // not allowed to dereference ->begin() if size() is 0 - Params.push_back(GetElementPtrInst::getIndexedType( - cast<PointerType>(I->getType()->getScalarType())->getElementType(), - ArgIndex.second)); - assert(Params.back()); - } + // Track all the pointers to the argument to make sure they are not captured. 
+ SmallPtrSet<Value *, 16> PtrValues; + PtrValues.insert(arg); - if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty()) - ++NumArgumentsPromoted; - else - ++NumAggregatesPromoted; + // Track all of the stores. + SmallVector<StoreInst *, 16> Stores; + + // Scan through the uses recursively to make sure the pointer is always used + // sanely. + SmallVector<Value *, 16> WorkList; + WorkList.insert(WorkList.end(), arg->user_begin(), arg->user_end()); + while (!WorkList.empty()) { + Value *V = WorkList.back(); + WorkList.pop_back(); + if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) { + if (PtrValues.insert(V).second) + WorkList.insert(WorkList.end(), V->user_begin(), V->user_end()); + } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) { + Stores.push_back(Store); + } else if (!isa<LoadInst>(V)) { + return true; } } - // Add any function attributes. - if (PAL.hasAttributes(AttributeSet::FunctionIndex)) - AttributesVec.push_back(AttributeSet::get(FTy->getContext(), - PAL.getFnAttributes())); + // Check to make sure the pointers aren't captured + for (StoreInst *Store : Stores) + if (PtrValues.count(Store->getValueOperand())) + return true; - Type *RetTy = FTy->getReturnType(); + return false; +} - // Construct the new function type using the new arguments. - FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg()); +/// PromoteArguments - This method checks the specified function to see if there +/// are any promotable arguments and if it is safe to promote the function (for +/// example, all callers are direct). If safe to promote some arguments, it +/// calls the DoPromotion method. +/// +static Function * +promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter, + unsigned MaxElements, + Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>> + ReplaceCallSite) { + // Make sure that it is local to this module. + if (!F->hasLocalLinkage()) + return nullptr; - // Create the new function body and insert it into the module. - Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName()); - NF->copyAttributesFrom(F); + // Don't promote arguments for variadic functions. Adding, removing, or + // changing non-pack parameters can change the classification of pack + // parameters. Frontends encode that classification at the call site in the + // IR, while in the callee the classification is determined dynamically based + // on the number of registers consumed so far. + if (F->isVarArg()) + return nullptr; - // Patch the pointer to LLVM function in debug info descriptor. - NF->setSubprogram(F->getSubprogram()); - F->setSubprogram(nullptr); + // First check: see if there are any pointer arguments! If not, quick exit. + SmallVector<Argument *, 16> PointerArgs; + for (Argument &I : F->args()) + if (I.getType()->isPointerTy()) + PointerArgs.push_back(&I); + if (PointerArgs.empty()) + return nullptr; - DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n" - << "From: " << *F); - - // Recompute the parameter attributes list based on the new arguments for - // the function. - NF->setAttributes(AttributeSet::get(F->getContext(), AttributesVec)); - AttributesVec.clear(); + // Second check: make sure that all callers are direct callers. We can't + // transform functions that have indirect callers. Also see if the function + // is self-recursive. + bool isSelfRecursive = false; + for (Use &U : F->uses()) { + CallSite CS(U.getUser()); + // Must be a direct call. 
+ if (CS.getInstruction() == nullptr || !CS.isCallee(&U)) + return nullptr; - F->getParent()->getFunctionList().insert(F->getIterator(), NF); - NF->takeName(F); + if (CS.getInstruction()->getParent()->getParent() == F) + isSelfRecursive = true; + } - // Get a new callgraph node for NF. - CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF); + const DataLayout &DL = F->getParent()->getDataLayout(); - // Loop over all of the callers of the function, transforming the call sites - // to pass in the loaded pointers. - // - SmallVector<Value*, 16> Args; - while (!F->use_empty()) { - CallSite CS(F->user_back()); - assert(CS.getCalledFunction() == F); - Instruction *Call = CS.getInstruction(); - const AttributeSet &CallPAL = CS.getAttributes(); + AAResults &AAR = AARGetter(*F); - // Add any return attributes. - if (CallPAL.hasAttributes(AttributeSet::ReturnIndex)) - AttributesVec.push_back(AttributeSet::get(F->getContext(), - CallPAL.getRetAttributes())); + // Check to see which arguments are promotable. If an argument is promotable, + // add it to ArgsToPromote. + SmallPtrSet<Argument *, 8> ArgsToPromote; + SmallPtrSet<Argument *, 8> ByValArgsToTransform; + for (Argument *PtrArg : PointerArgs) { + Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType(); - // Loop over the operands, inserting GEP and loads in the caller as - // appropriate. - CallSite::arg_iterator AI = CS.arg_begin(); - ArgIndex = 1; - for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I, ++AI, ++ArgIndex) - if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { - Args.push_back(*AI); // Unmodified argument + // Replace sret attribute with noalias. This reduces register pressure by + // avoiding a register copy. + if (PtrArg->hasStructRetAttr()) { + unsigned ArgNo = PtrArg->getArgNo(); + F->removeParamAttr(ArgNo, Attribute::StructRet); + F->addParamAttr(ArgNo, Attribute::NoAlias); + for (Use &U : F->uses()) { + CallSite CS(U.getUser()); + CS.removeParamAttr(ArgNo, Attribute::StructRet); + CS.addParamAttr(ArgNo, Attribute::NoAlias); + } + } - if (CallPAL.hasAttributes(ArgIndex)) { - AttrBuilder B(CallPAL, ArgIndex); - AttributesVec. - push_back(AttributeSet::get(F->getContext(), Args.size(), B)); - } - } else if (ByValArgsToTransform.count(&*I)) { - // Emit a GEP and load for each element of the struct. - Type *AgTy = cast<PointerType>(I->getType())->getElementType(); - StructType *STy = cast<StructType>(AgTy); - Value *Idxs[2] = { - ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr }; - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); - Value *Idx = GetElementPtrInst::Create( - STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i), Call); - // TODO: Tell AA about the new values? - Args.push_back(new LoadInst(Idx, Idx->getName()+".val", Call)); + // If this is a byval argument, and if the aggregate type is small, just + // pass the elements, which is always safe, if the passed value is densely + // packed or if we can prove the padding bytes are never accessed. This does + // not apply to inalloca. 
+ bool isSafeToPromote = + PtrArg->hasByValAttr() && + (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg)); + if (isSafeToPromote) { + if (StructType *STy = dyn_cast<StructType>(AgTy)) { + if (MaxElements > 0 && STy->getNumElements() > MaxElements) { + DEBUG(dbgs() << "argpromotion disable promoting argument '" + << PtrArg->getName() + << "' because it would require adding more" + << " than " << MaxElements + << " arguments to the function.\n"); + continue; } - } else if (!I->use_empty()) { - // Non-dead argument: insert GEPs and loads as appropriate. - ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; - // Store the Value* version of the indices in here, but declare it now - // for reuse. - std::vector<Value*> Ops; - for (const auto &ArgIndex : ArgIndices) { - Value *V = *AI; - LoadInst *OrigLoad = - OriginalLoads[std::make_pair(&*I, ArgIndex.second)]; - if (!ArgIndex.second.empty()) { - Ops.reserve(ArgIndex.second.size()); - Type *ElTy = V->getType(); - for (unsigned long II : ArgIndex.second) { - // Use i32 to index structs, and i64 for others (pointers/arrays). - // This satisfies GEP constraints. - Type *IdxTy = (ElTy->isStructTy() ? - Type::getInt32Ty(F->getContext()) : - Type::getInt64Ty(F->getContext())); - Ops.push_back(ConstantInt::get(IdxTy, II)); - // Keep track of the type we're currently indexing. - if (auto *ElPTy = dyn_cast<PointerType>(ElTy)) - ElTy = ElPTy->getElementType(); - else - ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(II); - } - // And create a GEP to extract those indices. - V = GetElementPtrInst::Create(ArgIndex.first, V, Ops, - V->getName() + ".idx", Call); - Ops.clear(); + + // If all the elements are single-value types, we can promote it. + bool AllSimple = true; + for (const auto *EltTy : STy->elements()) { + if (!EltTy->isSingleValueType()) { + AllSimple = false; + break; } - // Since we're replacing a load make sure we take the alignment - // of the previous load. - LoadInst *newLoad = new LoadInst(V, V->getName()+".val", Call); - newLoad->setAlignment(OrigLoad->getAlignment()); - // Transfer the AA info too. - AAMDNodes AAInfo; - OrigLoad->getAAMetadata(AAInfo); - newLoad->setAAMetadata(AAInfo); + } - Args.push_back(newLoad); + // Safe to transform, don't even bother trying to "promote" it. + // Passing the elements as a scalar will allow sroa to hack on + // the new alloca we introduce. + if (AllSimple) { + ByValArgsToTransform.insert(PtrArg); + continue; } } + } - // Push any varargs arguments on the list. - for (; AI != CS.arg_end(); ++AI, ++ArgIndex) { - Args.push_back(*AI); - if (CallPAL.hasAttributes(ArgIndex)) { - AttrBuilder B(CallPAL, ArgIndex); - AttributesVec. - push_back(AttributeSet::get(F->getContext(), Args.size(), B)); + // If the argument is a recursive type and we're in a recursive + // function, we could end up infinitely peeling the function argument. + if (isSelfRecursive) { + if (StructType *STy = dyn_cast<StructType>(AgTy)) { + bool RecursiveType = false; + for (const auto *EltTy : STy->elements()) { + if (EltTy == PtrArg->getType()) { + RecursiveType = true; + break; + } + } + if (RecursiveType) + continue; } } - // Add any function attributes. - if (CallPAL.hasAttributes(AttributeSet::FunctionIndex)) - AttributesVec.push_back(AttributeSet::get(Call->getContext(), - CallPAL.getFnAttributes())); + // Otherwise, see if we can promote the pointer to its value. 
+ if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR, + MaxElements)) + ArgsToPromote.insert(PtrArg); + } - SmallVector<OperandBundleDef, 1> OpBundles; - CS.getOperandBundlesAsDefs(OpBundles); + // No promotable pointer arguments. + if (ArgsToPromote.empty() && ByValArgsToTransform.empty()) + return nullptr; - Instruction *New; - if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { - New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), - Args, OpBundles, "", Call); - cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); - cast<InvokeInst>(New)->setAttributes(AttributeSet::get(II->getContext(), - AttributesVec)); - } else { - New = CallInst::Create(NF, Args, OpBundles, "", Call); - cast<CallInst>(New)->setCallingConv(CS.getCallingConv()); - cast<CallInst>(New)->setAttributes(AttributeSet::get(New->getContext(), - AttributesVec)); - cast<CallInst>(New)->setTailCallKind( - cast<CallInst>(Call)->getTailCallKind()); - } - New->setDebugLoc(Call->getDebugLoc()); - Args.clear(); - AttributesVec.clear(); + return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite); +} - // Update the callgraph to know that the callsite has been transformed. - CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()]; - CalleeNode->replaceCallEdge(CS, CallSite(New), NF_CGN); +PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C, + CGSCCAnalysisManager &AM, + LazyCallGraph &CG, + CGSCCUpdateResult &UR) { + bool Changed = false, LocalChange; - if (!Call->use_empty()) { - Call->replaceAllUsesWith(New); - New->takeName(Call); + // Iterate until we stop promoting from this SCC. + do { + LocalChange = false; + + for (LazyCallGraph::Node &N : C) { + Function &OldF = N.getFunction(); + + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager(); + // FIXME: This lambda must only be used with this function. We should + // skip the lambda and just get the AA results directly. + auto AARGetter = [&](Function &F) -> AAResults & { + assert(&F == &OldF && "Called with an unexpected function!"); + return FAM.getResult<AAManager>(F); + }; + + Function *NewF = promoteArguments(&OldF, AARGetter, 3u, None); + if (!NewF) + continue; + LocalChange = true; + + // Directly substitute the functions in the call graph. Note that this + // requires the old function to be completely dead and completely + // replaced by the new function. It does no call graph updates, it merely + // swaps out the particular function mapped to a particular node in the + // graph. + C.getOuterRefSCC().replaceNodeFunction(N, *NewF); + OldF.eraseFromParent(); } - // Finally, remove the old call from the program, reducing the use-count of - // F. - Call->eraseFromParent(); - } + Changed |= LocalChange; + } while (LocalChange); - // Since we have now created the new function, splice the body of the old - // function right into the new function, leaving the old rotting hulk of the - // function empty. - NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); + if (!Changed) + return PreservedAnalyses::all(); - // Loop over the argument list, transferring uses of the old arguments over to - // the new arguments, also transferring over the names as well. - // - for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), - I2 = NF->arg_begin(); I != E; ++I) { - if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { - // If this is an unmodified argument, move the name and users over to the - // new version. 
- I->replaceAllUsesWith(&*I2); - I2->takeName(&*I); - ++I2; - continue; - } - - if (ByValArgsToTransform.count(&*I)) { - // In the callee, we create an alloca, and store each of the new incoming - // arguments into the alloca. - Instruction *InsertPt = &NF->begin()->front(); + return PreservedAnalyses::none(); +} - // Just add all the struct element types. - Type *AgTy = cast<PointerType>(I->getType())->getElementType(); - Value *TheAlloca = new AllocaInst(AgTy, nullptr, "", InsertPt); - StructType *STy = cast<StructType>(AgTy); - Value *Idxs[2] = { - ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr }; +namespace { +/// ArgPromotion - The 'by reference' to 'by value' argument promotion pass. +/// +struct ArgPromotion : public CallGraphSCCPass { + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + getAAResultsAnalysisUsage(AU); + CallGraphSCCPass::getAnalysisUsage(AU); + } - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); - Value *Idx = GetElementPtrInst::Create( - AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i), - InsertPt); - I2->setName(I->getName()+"."+Twine(i)); - new StoreInst(&*I2++, Idx, InsertPt); - } + bool runOnSCC(CallGraphSCC &SCC) override; + static char ID; // Pass identification, replacement for typeid + explicit ArgPromotion(unsigned MaxElements = 3) + : CallGraphSCCPass(ID), MaxElements(MaxElements) { + initializeArgPromotionPass(*PassRegistry::getPassRegistry()); + } - // Anything that used the arg should now use the alloca. - I->replaceAllUsesWith(TheAlloca); - TheAlloca->takeName(&*I); +private: + using llvm::Pass::doInitialization; + bool doInitialization(CallGraph &CG) override; + /// The maximum number of elements to expand, or 0 for unlimited. + unsigned MaxElements; +}; +} - // If the alloca is used in a call, we must clear the tail flag since - // the callee now uses an alloca from the caller. - for (User *U : TheAlloca->users()) { - CallInst *Call = dyn_cast<CallInst>(U); - if (!Call) - continue; - Call->setTailCall(false); - } - continue; - } +char ArgPromotion::ID = 0; +INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion", + "Promote 'by reference' arguments to scalars", false, + false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(ArgPromotion, "argpromotion", + "Promote 'by reference' arguments to scalars", false, false) - if (I->use_empty()) - continue; +Pass *llvm::createArgumentPromotionPass(unsigned MaxElements) { + return new ArgPromotion(MaxElements); +} - // Otherwise, if we promoted this argument, then all users are load - // instructions (or GEPs with only load users), and all loads should be - // using the new argument that we added. 
- ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; +bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) { + if (skipSCC(SCC)) + return false; - while (!I->use_empty()) { - if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) { - assert(ArgIndices.begin()->second.empty() && - "Load element should sort to front!"); - I2->setName(I->getName()+".val"); - LI->replaceAllUsesWith(&*I2); - LI->eraseFromParent(); - DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName() - << "' in function '" << F->getName() << "'\n"); - } else { - GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back()); - IndicesVector Operands; - Operands.reserve(GEP->getNumIndices()); - for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end(); - II != IE; ++II) - Operands.push_back(cast<ConstantInt>(*II)->getSExtValue()); + // Get the callgraph information that we need to update to reflect our + // changes. + CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); - // GEPs with a single 0 index can be merged with direct loads - if (Operands.size() == 1 && Operands.front() == 0) - Operands.clear(); + LegacyAARGetter AARGetter(*this); - Function::arg_iterator TheArg = I2; - for (ScalarizeTable::iterator It = ArgIndices.begin(); - It->second != Operands; ++It, ++TheArg) { - assert(It != ArgIndices.end() && "GEP not handled??"); - } + bool Changed = false, LocalChange; - std::string NewName = I->getName(); - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - NewName += "." + utostr(Operands[i]); - } - NewName += ".val"; - TheArg->setName(NewName); + // Iterate until we stop promoting from this SCC. + do { + LocalChange = false; + // Attempt to promote arguments from all functions in this SCC. + for (CallGraphNode *OldNode : SCC) { + Function *OldF = OldNode->getFunction(); + if (!OldF) + continue; + + auto ReplaceCallSite = [&](CallSite OldCS, CallSite NewCS) { + Function *Caller = OldCS.getInstruction()->getParent()->getParent(); + CallGraphNode *NewCalleeNode = + CG.getOrInsertFunction(NewCS.getCalledFunction()); + CallGraphNode *CallerNode = CG[Caller]; + CallerNode->replaceCallEdge(OldCS, NewCS, NewCalleeNode); + }; + + if (Function *NewF = promoteArguments(OldF, AARGetter, MaxElements, + {ReplaceCallSite})) { + LocalChange = true; - DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName() - << "' of function '" << NF->getName() << "'\n"); + // Update the call graph for the newly promoted function. + CallGraphNode *NewNode = CG.getOrInsertFunction(NewF); + NewNode->stealCalledFunctionsFrom(OldNode); + if (OldNode->getNumReferences() == 0) + delete CG.removeFunctionFromModule(OldNode); + else + OldF->setLinkage(Function::ExternalLinkage); - // All of the uses must be load instructions. Replace them all with - // the argument specified by ArgNo. - while (!GEP->use_empty()) { - LoadInst *L = cast<LoadInst>(GEP->user_back()); - L->replaceAllUsesWith(&*TheArg); - L->eraseFromParent(); - } - GEP->eraseFromParent(); + // And updat ethe SCC we're iterating as well. + SCC.ReplaceNode(OldNode, NewNode); } } + // Remember that we changed something. + Changed |= LocalChange; + } while (LocalChange); - // Increment I2 past all of the arguments added for this promoted pointer. - std::advance(I2, ArgIndices.size()); - } - - NF_CGN->stealCalledFunctionsFrom(CG[F]); - - // Now that the old function is dead, delete it. If there is a dangling - // reference to the CallgraphNode, just leave the dead function around for - // someone else to nuke. 
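A note on the ReplaceCallSite parameter used in the legacy runOnSCC above: the promotion itself stays pass-manager agnostic, and only the legacy pass supplies a callback that patches CallGraph edges (the new pass manager passes None). A reduced sketch of that seam, with Handle and rewriteCallSite as illustrative names only:

#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h" // llvm::function_ref

using Handle = int; // stand-in for llvm::CallSite
using ReplaceFn = llvm::function_ref<void(Handle OldCS, Handle NewCS)>;

static void rewriteCallSite(Handle OldCS, Handle NewCS,
                            llvm::Optional<ReplaceFn> ReplaceCallSite) {
  // ... the replacement call instruction is built here ...
  if (ReplaceCallSite)
    (*ReplaceCallSite)(OldCS, NewCS); // legacy call-graph bookkeeping only
}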
- CallGraphNode *CGN = CG[F]; - if (CGN->getNumReferences() == 0) - delete CG.removeFunctionFromModule(CGN); - else - F->setLinkage(Function::ExternalLinkage); - - return NF_CGN; + return Changed; } bool ArgPromotion::doInitialization(CallGraph &CG) { diff --git a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp index d75ed20..62b5a9c 100644 --- a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp @@ -60,6 +60,23 @@ static bool IsBetterCanonical(const GlobalVariable &A, return A.hasGlobalUnnamedAddr(); } +static bool hasMetadataOtherThanDebugLoc(const GlobalVariable *GV) { + SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; + GV->getAllMetadata(MDs); + for (const auto &V : MDs) + if (V.first != LLVMContext::MD_dbg) + return true; + return false; +} + +static void copyDebugLocMetadata(const GlobalVariable *From, + GlobalVariable *To) { + SmallVector<DIGlobalVariableExpression *, 1> MDs; + From->getDebugInfo(MDs); + for (auto MD : MDs) + To->addDebugInfo(MD); +} + static unsigned getAlignment(GlobalVariable *GV) { unsigned Align = GV->getAlignment(); if (Align) @@ -113,6 +130,10 @@ static bool mergeConstants(Module &M) { if (GV->isWeakForLinker()) continue; + // Don't touch globals with metadata other then !dbg. + if (hasMetadataOtherThanDebugLoc(GV)) + continue; + Constant *Init = GV->getInitializer(); // Check to see if the initializer is already known. @@ -155,6 +176,9 @@ static bool mergeConstants(Module &M) { if (!Slot->hasGlobalUnnamedAddr() && !GV->hasGlobalUnnamedAddr()) continue; + if (hasMetadataOtherThanDebugLoc(GV)) + continue; + if (!GV->hasGlobalUnnamedAddr()) Slot->setUnnamedAddr(GlobalValue::UnnamedAddr::None); @@ -178,6 +202,8 @@ static bool mergeConstants(Module &M) { getAlignment(Replacements[i].second))); } + copyDebugLocMetadata(Replacements[i].first, Replacements[i].second); + // Eliminate any uses of the dead global. Replacements[i].first->replaceAllUsesWith(Replacements[i].second); diff --git a/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp index ba2e60d..d94aa5d 100644 --- a/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp +++ b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -95,11 +95,25 @@ void CrossDSOCFI::buildCFICheck(Module &M) { } } + NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions"); + if (CfiFunctionsMD) { + for (auto Func : CfiFunctionsMD->operands()) { + assert(Func->getNumOperands() >= 2); + for (unsigned I = 2; I < Func->getNumOperands(); ++I) + if (ConstantInt *TypeId = + extractNumericTypeId(cast<MDNode>(Func->getOperand(I).get()))) + TypeIds.insert(TypeId->getZExtValue()); + } + } + LLVMContext &Ctx = M.getContext(); Constant *C = M.getOrInsertFunction( "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx), - Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx), nullptr); + Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx)); Function *F = dyn_cast<Function>(C); + // Take over the existing function. The frontend emits a weak stub so that the + // linker knows about the symbol; this pass replaces the function body. 
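To summarize the ConstantMerge changes above in one place: duplicates are canonicalized through a map keyed by their initializer, globals carrying any non-!dbg metadata are now skipped, and !dbg attachments are copied onto the surviving copy. A greatly simplified plain-C++ sketch (FakeGlobal and mergeConstantsSketch are illustrative, not the pass's real data structures):

#include <map>
#include <string>
#include <vector>

struct FakeGlobal {
  std::string Init;        // stands in for the Constant* initializer
  bool HasNonDbgMetadata;
  FakeGlobal *ReplacedBy = nullptr;
};

void mergeConstantsSketch(std::vector<FakeGlobal *> &Globals) {
  std::map<std::string, FakeGlobal *> Canonical;
  for (FakeGlobal *GV : Globals) {
    if (GV->HasNonDbgMetadata)
      continue;                          // mirrors hasMetadataOtherThanDebugLoc()
    auto It = Canonical.insert({GV->Init, GV});
    if (!It.second)
      GV->ReplacedBy = It.first->second; // fold the duplicate into the canonical copy
  }
}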
+ F->deleteBody(); F->setAlignment(4096); auto args = F->arg_begin(); Value &CallSiteTypeId = *(args++); @@ -117,7 +131,7 @@ void CrossDSOCFI::buildCFICheck(Module &M) { IRBuilder<> IRBFail(TrapBB); Constant *CFICheckFailFn = M.getOrInsertFunction( "__cfi_check_fail", Type::getVoidTy(Ctx), Type::getInt8PtrTy(Ctx), - Type::getInt8PtrTy(Ctx), nullptr); + Type::getInt8PtrTy(Ctx)); IRBFail.CreateCall(CFICheckFailFn, {&CFICheckFailData, &Addr}); IRBFail.CreateBr(ExitBB); diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index 1a5ed46..8e26849 100644 --- a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -166,41 +166,40 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { Args.assign(CS.arg_begin(), CS.arg_begin() + NumArgs); // Drop any attributes that were on the vararg arguments. - AttributeSet PAL = CS.getAttributes(); - if (!PAL.isEmpty() && PAL.getSlotIndex(PAL.getNumSlots() - 1) > NumArgs) { - SmallVector<AttributeSet, 8> AttributesVec; - for (unsigned i = 0; PAL.getSlotIndex(i) <= NumArgs; ++i) - AttributesVec.push_back(PAL.getSlotAttributes(i)); - if (PAL.hasAttributes(AttributeSet::FunctionIndex)) - AttributesVec.push_back(AttributeSet::get(Fn.getContext(), - PAL.getFnAttributes())); - PAL = AttributeSet::get(Fn.getContext(), AttributesVec); + AttributeList PAL = CS.getAttributes(); + if (!PAL.isEmpty()) { + SmallVector<AttributeSet, 8> ArgAttrs; + for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo) + ArgAttrs.push_back(PAL.getParamAttributes(ArgNo)); + PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttributes(), + PAL.getRetAttributes(), ArgAttrs); } SmallVector<OperandBundleDef, 1> OpBundles; CS.getOperandBundlesAsDefs(OpBundles); - Instruction *New; + CallSite NewCS; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { - New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), - Args, OpBundles, "", Call); - cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); - cast<InvokeInst>(New)->setAttributes(PAL); + NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), + Args, OpBundles, "", Call); } else { - New = CallInst::Create(NF, Args, OpBundles, "", Call); - cast<CallInst>(New)->setCallingConv(CS.getCallingConv()); - cast<CallInst>(New)->setAttributes(PAL); - cast<CallInst>(New)->setTailCallKind( - cast<CallInst>(Call)->getTailCallKind()); + NewCS = CallInst::Create(NF, Args, OpBundles, "", Call); + cast<CallInst>(NewCS.getInstruction()) + ->setTailCallKind(cast<CallInst>(Call)->getTailCallKind()); } - New->setDebugLoc(Call->getDebugLoc()); + NewCS.setCallingConv(CS.getCallingConv()); + NewCS.setAttributes(PAL); + NewCS->setDebugLoc(Call->getDebugLoc()); + uint64_t W; + if (Call->extractProfTotalWeight(W)) + NewCS->setProfWeight(W); Args.clear(); if (!Call->use_empty()) - Call->replaceAllUsesWith(New); + Call->replaceAllUsesWith(NewCS.getInstruction()); - New->takeName(Call); + NewCS->takeName(Call); // Finally, remove the old call from the program, reducing the use-count of // F. @@ -681,8 +680,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { bool HasLiveReturnedArg = false; // Set up to build a new list of parameter attributes. 
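The attribute rework that follows (here and in RemoveDeadStuffFromFunction below) repeatedly uses the same pattern: collect per-parameter AttributeSets positionally, then recombine them with the function and return attribute sets in a single AttributeList::get call. Pulled out into one helper for readability (rebuildAttributes is an illustrative name; the individual calls appear verbatim in the hunks):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/LLVMContext.h"

static llvm::AttributeList rebuildAttributes(llvm::LLVMContext &Ctx,
                                             const llvm::AttributeList &Old,
                                             unsigned NumKeptArgs) {
  llvm::SmallVector<llvm::AttributeSet, 8> ArgAttrs;
  for (unsigned ArgNo = 0; ArgNo < NumKeptArgs; ++ArgNo)
    ArgAttrs.push_back(Old.getParamAttributes(ArgNo));
  // Function, return, and per-argument attributes are recombined in one shot;
  // attributes of dropped arguments simply never make it into ArgAttrs.
  return llvm::AttributeList::get(Ctx, Old.getFnAttributes(),
                                  Old.getRetAttributes(), ArgAttrs);
}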
- SmallVector<AttributeSet, 8> AttributesVec; - const AttributeSet &PAL = F->getAttributes(); + SmallVector<AttributeSet, 8> ArgAttrVec; + const AttributeList &PAL = F->getAttributes(); // Remember which arguments are still alive. SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false); @@ -696,16 +695,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { if (LiveValues.erase(Arg)) { Params.push_back(I->getType()); ArgAlive[i] = true; - - // Get the original parameter attributes (skipping the first one, that is - // for the return value. - if (PAL.hasAttributes(i + 1)) { - AttrBuilder B(PAL, i + 1); - if (B.contains(Attribute::Returned)) - HasLiveReturnedArg = true; - AttributesVec. - push_back(AttributeSet::get(F->getContext(), Params.size(), B)); - } + ArgAttrVec.push_back(PAL.getParamAttributes(i)); + HasLiveReturnedArg |= PAL.hasParamAttribute(i, Attribute::Returned); } else { ++NumArgumentsEliminated; DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument " << i @@ -779,30 +770,24 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { assert(NRetTy && "No new return type found?"); // The existing function return attributes. - AttributeSet RAttrs = PAL.getRetAttributes(); + AttrBuilder RAttrs(PAL.getRetAttributes()); // Remove any incompatible attributes, but only if we removed all return // values. Otherwise, ensure that we don't have any conflicting attributes // here. Currently, this should not be possible, but special handling might be // required when new return value attributes are added. if (NRetTy->isVoidTy()) - RAttrs = RAttrs.removeAttributes(NRetTy->getContext(), - AttributeSet::ReturnIndex, - AttributeFuncs::typeIncompatible(NRetTy)); + RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy)); else - assert(!AttrBuilder(RAttrs, AttributeSet::ReturnIndex). - overlaps(AttributeFuncs::typeIncompatible(NRetTy)) && + assert(!RAttrs.overlaps(AttributeFuncs::typeIncompatible(NRetTy)) && "Return attributes no longer compatible?"); - if (RAttrs.hasAttributes(AttributeSet::ReturnIndex)) - AttributesVec.push_back(AttributeSet::get(NRetTy->getContext(), RAttrs)); - - if (PAL.hasAttributes(AttributeSet::FunctionIndex)) - AttributesVec.push_back(AttributeSet::get(F->getContext(), - PAL.getFnAttributes())); + AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); // Reconstruct the AttributesList based on the vector we constructed. - AttributeSet NewPAL = AttributeSet::get(F->getContext(), AttributesVec); + assert(ArgAttrVec.size() == Params.size()); + AttributeList NewPAL = AttributeList::get( + F->getContext(), PAL.getFnAttributes(), RetAttrs, ArgAttrVec); // Create the new function type based on the recomputed parameters. FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg()); @@ -829,18 +814,14 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { CallSite CS(F->user_back()); Instruction *Call = CS.getInstruction(); - AttributesVec.clear(); - const AttributeSet &CallPAL = CS.getAttributes(); - - // The call return attributes. - AttributeSet RAttrs = CallPAL.getRetAttributes(); + ArgAttrVec.clear(); + const AttributeList &CallPAL = CS.getAttributes(); - // Adjust in case the function was changed to return void. 
- RAttrs = RAttrs.removeAttributes(NRetTy->getContext(), - AttributeSet::ReturnIndex, - AttributeFuncs::typeIncompatible(NF->getReturnType())); - if (RAttrs.hasAttributes(AttributeSet::ReturnIndex)) - AttributesVec.push_back(AttributeSet::get(NF->getContext(), RAttrs)); + // Adjust the call return attributes in case the function was changed to + // return void. + AttrBuilder RAttrs(CallPAL.getRetAttributes()); + RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy)); + AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); // Declare these outside of the loops, so we can reuse them for the second // loop, which loops the varargs. @@ -852,57 +833,55 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { if (ArgAlive[i]) { Args.push_back(*I); // Get original parameter attributes, but skip return attributes. - if (CallPAL.hasAttributes(i + 1)) { - AttrBuilder B(CallPAL, i + 1); + AttributeSet Attrs = CallPAL.getParamAttributes(i); + if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) { // If the return type has changed, then get rid of 'returned' on the // call site. The alternative is to make all 'returned' attributes on // call sites keep the return value alive just like 'returned' - // attributes on function declaration but it's less clearly a win - // and this is not an expected case anyway - if (NRetTy != RetTy && B.contains(Attribute::Returned)) - B.removeAttribute(Attribute::Returned); - AttributesVec. - push_back(AttributeSet::get(F->getContext(), Args.size(), B)); + // attributes on function declaration but it's less clearly a win and + // this is not an expected case anyway + ArgAttrVec.push_back(AttributeSet::get( + F->getContext(), + AttrBuilder(Attrs).removeAttribute(Attribute::Returned))); + } else { + // Otherwise, use the original attributes. + ArgAttrVec.push_back(Attrs); } } // Push any varargs arguments on the list. Don't forget their attributes. for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) { Args.push_back(*I); - if (CallPAL.hasAttributes(i + 1)) { - AttrBuilder B(CallPAL, i + 1); - AttributesVec. - push_back(AttributeSet::get(F->getContext(), Args.size(), B)); - } + ArgAttrVec.push_back(CallPAL.getParamAttributes(i)); } - if (CallPAL.hasAttributes(AttributeSet::FunctionIndex)) - AttributesVec.push_back(AttributeSet::get(Call->getContext(), - CallPAL.getFnAttributes())); - // Reconstruct the AttributesList based on the vector we constructed. 
- AttributeSet NewCallPAL = AttributeSet::get(F->getContext(), AttributesVec); + assert(ArgAttrVec.size() == Args.size()); + AttributeList NewCallPAL = AttributeList::get( + F->getContext(), CallPAL.getFnAttributes(), RetAttrs, ArgAttrVec); SmallVector<OperandBundleDef, 1> OpBundles; CS.getOperandBundlesAsDefs(OpBundles); - Instruction *New; + CallSite NewCS; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { - New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), - Args, OpBundles, "", Call->getParent()); - cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); - cast<InvokeInst>(New)->setAttributes(NewCallPAL); + NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), + Args, OpBundles, "", Call->getParent()); } else { - New = CallInst::Create(NF, Args, OpBundles, "", Call); - cast<CallInst>(New)->setCallingConv(CS.getCallingConv()); - cast<CallInst>(New)->setAttributes(NewCallPAL); - cast<CallInst>(New)->setTailCallKind( - cast<CallInst>(Call)->getTailCallKind()); + NewCS = CallInst::Create(NF, Args, OpBundles, "", Call); + cast<CallInst>(NewCS.getInstruction()) + ->setTailCallKind(cast<CallInst>(Call)->getTailCallKind()); } - New->setDebugLoc(Call->getDebugLoc()); - + NewCS.setCallingConv(CS.getCallingConv()); + NewCS.setAttributes(NewCallPAL); + NewCS->setDebugLoc(Call->getDebugLoc()); + uint64_t W; + if (Call->extractProfTotalWeight(W)) + NewCS->setProfWeight(W); Args.clear(); + ArgAttrVec.clear(); + Instruction *New = NewCS.getInstruction(); if (!Call->use_empty()) { if (New->getType() == Call->getType()) { // Return type not changed? Just replace users then. diff --git a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp index 98c4b17..ecff88c 100644 --- a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp @@ -17,9 +17,9 @@ #include "llvm/ADT/Statistic.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Module.h" +#include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/GlobalStatus.h" -#include "llvm/Pass.h" using namespace llvm; #define DEBUG_TYPE "elim-avail-extern" diff --git a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp index 479fd18..d1147f7 100644 --- a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp @@ -11,13 +11,13 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" #include "llvm/ADT/SetVector.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include "llvm/Transforms/IPO.h" #include <algorithm> using namespace llvm; @@ -53,18 +53,18 @@ static void makeVisible(GlobalValue &GV, bool Delete) { } namespace { - /// @brief A pass to extract specific functions and their dependencies. + /// @brief A pass to extract specific global values and their dependencies. class GVExtractorPass : public ModulePass { SetVector<GlobalValue *> Named; bool deleteStuff; public: static char ID; // Pass identification, replacement for typeid - /// FunctionExtractorPass - If deleteFn is true, this pass deletes as the - /// specified function. Otherwise, it deletes as much of the module as - /// possible, except for the function specified. 
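Stepping back from the hunk above: whenever DeadArgumentElimination replaces a call or invoke, the new CallSite-based code copies the same small set of properties — calling convention, attributes, debug location, and the total profile weight. Collected into one helper for reference (copyCallProperties is an illustrative name; every call in it is taken from the new code above):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Instruction.h"
#include <cstdint>

static void copyCallProperties(llvm::CallSite OldCS, llvm::CallSite NewCS,
                               llvm::AttributeList NewPAL) {
  llvm::Instruction *Old = OldCS.getInstruction();
  NewCS.setCallingConv(OldCS.getCallingConv());
  NewCS.setAttributes(NewPAL);
  NewCS->setDebugLoc(Old->getDebugLoc());
  uint64_t W;
  if (Old->extractProfTotalWeight(W)) // keep profile counts on the new call
    NewCS->setProfWeight(W);
}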
- /// - explicit GVExtractorPass(std::vector<GlobalValue*>& GVs, bool deleteS = true) + /// If deleteS is true, this pass deletes the specified global values. + /// Otherwise, it deletes as much of the module as possible, except for the + /// global values specified. + explicit GVExtractorPass(std::vector<GlobalValue*> &GVs, + bool deleteS = true) : ModulePass(ID), Named(GVs.begin(), GVs.end()), deleteStuff(deleteS) {} bool runOnModule(Module &M) override { diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 402a665..813a4b6 100644 --- a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -14,7 +14,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/FunctionAttrs.h" -#include "llvm/Transforms/IPO.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" @@ -34,7 +33,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/IPO.h" using namespace llvm; #define DEBUG_TYPE "functionattrs" @@ -49,31 +48,35 @@ STATISTIC(NumNoAlias, "Number of function returns marked noalias"); STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull"); STATISTIC(NumNoRecurse, "Number of functions marked as norecurse"); -namespace { -typedef SmallSetVector<Function *, 8> SCCNodeSet; -} +// FIXME: This is disabled by default to avoid exposing security vulnerabilities +// in C/C++ code compiled by clang: +// http://lists.llvm.org/pipermail/cfe-dev/2017-January/052066.html +static cl::opt<bool> EnableNonnullArgPropagation( + "enable-nonnull-arg-prop", cl::Hidden, + cl::desc("Try to propagate nonnull argument attributes from callsites to " + "caller functions.")); namespace { -/// The three kinds of memory access relevant to 'readonly' and -/// 'readnone' attributes. -enum MemoryAccessKind { - MAK_ReadNone = 0, - MAK_ReadOnly = 1, - MAK_MayWrite = 2 -}; +typedef SmallSetVector<Function *, 8> SCCNodeSet; } -static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR, +/// Returns the memory access attribute for function F using AAR for AA results, +/// where SCCNodes is the current SCC. +/// +/// If ThisBody is true, this function may examine the function body and will +/// return a result pertaining to this copy of the function. If it is false, the +/// result will be based only on AA results for the function declaration; it +/// will be assumed that some other (perhaps less optimized) version of the +/// function may be selected at link time. +static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, + AAResults &AAR, const SCCNodeSet &SCCNodes) { FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F); if (MRB == FMRB_DoesNotAccessMemory) // Already perfect! return MAK_ReadNone; - // Non-exact function definitions may not be selected at link time, and an - // alternative version that writes to memory may be selected. See the comment - // on GlobalValue::isDefinitionExact for more details. - if (!F.hasExactDefinition()) { + if (!ThisBody) { if (AliasAnalysis::onlyReadsMemory(MRB)) return MAK_ReadOnly; @@ -172,9 +175,14 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR, return ReadsMemory ? 
MAK_ReadOnly : MAK_ReadNone; } +MemoryAccessKind llvm::computeFunctionBodyMemoryAccess(Function &F, + AAResults &AAR) { + return checkFunctionMemoryAccess(F, /*ThisBody=*/true, AAR, {}); +} + /// Deduce readonly/readnone attributes for the SCC. template <typename AARGetterT> -static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) { +static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { // Check if any of the functions in the SCC read or write memory. If they // write memory then they can't be marked readnone or readonly. bool ReadsMemory = false; @@ -182,7 +190,11 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) { // Call the callable parameter to look up AA results for this function. AAResults &AAR = AARGetter(*F); - switch (checkFunctionMemoryAccess(*F, AAR, SCCNodes)) { + // Non-exact function definitions may not be selected at link time, and an + // alternative version that writes to memory may be selected. See the + // comment on GlobalValue::isDefinitionExact for more details. + switch (checkFunctionMemoryAccess(*F, F->hasExactDefinition(), + AAR, SCCNodes)) { case MAK_MayWrite: return false; case MAK_ReadOnly: @@ -209,15 +221,11 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) { MadeChange = true; // Clear out any existing attributes. - AttrBuilder B; - B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone); - F->removeAttributes( - AttributeSet::FunctionIndex, - AttributeSet::get(F->getContext(), AttributeSet::FunctionIndex, B)); + F->removeFnAttr(Attribute::ReadOnly); + F->removeFnAttr(Attribute::ReadNone); // Add in the new attribute. - F->addAttribute(AttributeSet::FunctionIndex, - ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone); + F->addFnAttr(ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone); if (ReadsMemory) ++NumReadOnly; @@ -482,9 +490,6 @@ determinePointerReadAttrs(Argument *A, static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) { bool Changed = false; - AttrBuilder B; - B.addAttribute(Attribute::Returned); - // Check each function in turn, determining if an argument is always returned. for (Function *F : SCCNodes) { // We can infer and propagate function attributes only when we know that the @@ -522,7 +527,7 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) { if (Value *RetArg = FindRetArg()) { auto *A = cast<Argument>(RetArg); - A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B)); + A->addAttr(Attribute::Returned); ++NumReturned; Changed = true; } @@ -531,15 +536,55 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) { return Changed; } +/// If a callsite has arguments that are also arguments to the parent function, +/// try to propagate attributes from the callsite's arguments to the parent's +/// arguments. This may be important because inlining can cause information loss +/// when attribute knowledge disappears with the inlined call. +static bool addArgumentAttrsFromCallsites(Function &F) { + if (!EnableNonnullArgPropagation) + return false; + + bool Changed = false; + + // For an argument attribute to transfer from a callsite to the parent, the + // call must be guaranteed to execute every time the parent is called. + // Conservatively, just check for calls in the entry block that are guaranteed + // to execute. 
+ // TODO: This could be enhanced by testing if the callsite post-dominates the + // entry block or by doing simple forward walks or backward walks to the + // callsite. + BasicBlock &Entry = F.getEntryBlock(); + for (Instruction &I : Entry) { + if (auto CS = CallSite(&I)) { + if (auto *CalledFunc = CS.getCalledFunction()) { + for (auto &CSArg : CalledFunc->args()) { + if (!CSArg.hasNonNullAttr()) + continue; + + // If the non-null callsite argument operand is an argument to 'F' + // (the caller) and the call is guaranteed to execute, then the value + // must be non-null throughout 'F'. + auto *FArg = dyn_cast<Argument>(CS.getArgOperand(CSArg.getArgNo())); + if (FArg && !FArg->hasNonNullAttr()) { + FArg->addAttr(Attribute::NonNull); + Changed = true; + } + } + } + } + if (!isGuaranteedToTransferExecutionToSuccessor(&I)) + break; + } + + return Changed; +} + /// Deduce nocapture attributes for the SCC. static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { bool Changed = false; ArgumentGraph AG; - AttrBuilder B; - B.addAttribute(Attribute::NoCapture); - // Check each function in turn, determining which pointer arguments are not // captured. for (Function *F : SCCNodes) { @@ -549,6 +594,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (!F->hasExactDefinition()) continue; + Changed |= addArgumentAttrsFromCallsites(*F); + // Functions that are readonly (or readnone) and nounwind and don't return // a value can't capture arguments. Don't analyze them. if (F->onlyReadsMemory() && F->doesNotThrow() && @@ -556,7 +603,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E; ++A) { if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) { - A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B)); + A->addAttr(Attribute::NoCapture); ++NumNoCapture; Changed = true; } @@ -575,8 +622,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (!Tracker.Captured) { if (Tracker.Uses.empty()) { // If it's trivially not captured, mark it nocapture now. - A->addAttr( - AttributeSet::get(F->getContext(), A->getArgNo() + 1, B)); + A->addAttr(Attribute::NoCapture); ++NumNoCapture; Changed = true; } else { @@ -602,9 +648,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { Self.insert(&*A); Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self); if (R != Attribute::None) { - AttrBuilder B; - B.addAttribute(R); - A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); + A->addAttr(R); Changed = true; R == Attribute::ReadOnly ? 
++NumReadOnlyArg : ++NumReadNoneArg; } @@ -629,7 +673,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (ArgumentSCC[0]->Uses.size() == 1 && ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) { Argument *A = ArgumentSCC[0]->Definition; - A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); + A->addAttr(Attribute::NoCapture); ++NumNoCapture; Changed = true; } @@ -671,7 +715,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; - A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); + A->addAttr(Attribute::NoCapture); ++NumNoCapture; Changed = true; } @@ -702,14 +746,12 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { } if (ReadAttr != Attribute::None) { - AttrBuilder B, R; - B.addAttribute(ReadAttr); - R.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone); for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; // Clear out existing readonly/readnone attributes - A->removeAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, R)); - A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); + A->removeAttr(Attribute::ReadOnly); + A->removeAttr(Attribute::ReadNone); + A->addAttr(ReadAttr); ReadAttr == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; Changed = true; } @@ -769,7 +811,7 @@ static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) { case Instruction::Call: case Instruction::Invoke: { CallSite CS(RVI); - if (CS.paramHasAttr(0, Attribute::NoAlias)) + if (CS.hasRetAttr(Attribute::NoAlias)) break; if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction())) break; @@ -792,7 +834,7 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) { // pointers. for (Function *F : SCCNodes) { // Already noalias. - if (F->doesNotAlias(0)) + if (F->returnDoesNotAlias()) continue; // We can infer and propagate function attributes only when we know that the @@ -812,10 +854,11 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) { bool MadeChange = false; for (Function *F : SCCNodes) { - if (F->doesNotAlias(0) || !F->getReturnType()->isPointerTy()) + if (F->returnDoesNotAlias() || + !F->getReturnType()->isPointerTy()) continue; - F->setDoesNotAlias(0); + F->setReturnDoesNotAlias(); ++NumNoAlias; MadeChange = true; } @@ -905,7 +948,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) { // pointers. for (Function *F : SCCNodes) { // Already nonnull. 
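The addArgumentAttrsFromCallsites logic introduced above (off by default, behind -enable-nonnull-arg-prop) is easiest to see at the source level. A conceptual C-style example, not taken from the patch:

// callee() promises (via nonnull) that its pointer parameter is never null.
void callee(int *P) __attribute__((nonnull));

int caller(int *P) {
  callee(P);   // sits in the entry block and is guaranteed to execute
  return *P;   // so after the pass, caller's own P is marked nonnull too
}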
- if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Attribute::NonNull)) continue; @@ -926,7 +969,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) { // Mark the function eagerly since we may discover a function // which prevents us from speculating about the entire SCC DEBUG(dbgs() << "Eagerly marking " << F->getName() << " as nonnull\n"); - F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); + F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); ++NumNonNullReturn; MadeChange = true; } @@ -939,13 +982,13 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) { if (SCCReturnsNonNull) { for (Function *F : SCCNodes) { - if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Attribute::NonNull) || !F->getReturnType()->isPointerTy()) continue; DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n"); - F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); + F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); ++NumNonNullReturn; MadeChange = true; } @@ -1144,6 +1187,10 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) { SCCNodes.insert(F); } + // Skip it if the SCC only contains optnone functions. + if (SCCNodes.empty()) + return Changed; + Changed |= addArgumentReturnedAttrs(SCCNodes); Changed |= addReadAttrs(SCCNodes, AARGetter); Changed |= addArgumentAttrs(SCCNodes); @@ -1163,19 +1210,7 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) { bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) { if (skipSCC(SCC)) return false; - - // We compute dedicated AA results for each function in the SCC as needed. We - // use a lambda referencing external objects so that they live long enough to - // be queried, but we re-use them each time. - Optional<BasicAAResult> BAR; - Optional<AAResults> AAR; - auto AARGetter = [&](Function &F) -> AAResults & { - BAR.emplace(createLegacyPMBasicAAResult(*this, F)); - AAR.emplace(createLegacyPMAAResults(*this, F, *BAR)); - return *AAR; - }; - - return runImpl(SCC, AARGetter); + return runImpl(SCC, LegacyAARGetter(*this)); } namespace { @@ -1275,16 +1310,9 @@ PreservedAnalyses ReversePostOrderFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) { auto &CG = AM.getResult<CallGraphAnalysis>(M); - bool Changed = deduceFunctionAttributeInRPO(M, CG); - - // CallGraphAnalysis holds AssertingVH and must be invalidated eagerly so - // that other passes don't delete stuff from under it. - // FIXME: We need to invalidate this to avoid PR28400. Is there a better - // solution? 
- AM.invalidate<CallGraphAnalysis>(M); - - if (!Changed) + if (!deduceFunctionAttributeInRPO(M, CG)) return PreservedAnalyses::all(); + PreservedAnalyses PA; PA.preserve<CallGraphAnalysis>(); return PA; diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp index 6b32f6c..233a36d 100644 --- a/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/Triple.h" +#include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/AutoUpgrade.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/IntrinsicInst.h" @@ -25,7 +26,6 @@ #include "llvm/IRReader/IRReader.h" #include "llvm/Linker/Linker.h" #include "llvm/Object/IRObjectFile.h" -#include "llvm/Object/ModuleSummaryIndexObjectFile.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/SourceMgr.h" @@ -64,6 +64,12 @@ static cl::opt<float> ImportHotMultiplier( "import-hot-multiplier", cl::init(3.0), cl::Hidden, cl::value_desc("x"), cl::desc("Multiply the `import-instr-limit` threshold for hot callsites")); +static cl::opt<float> ImportCriticalMultiplier( + "import-critical-multiplier", cl::init(100.0), cl::Hidden, + cl::value_desc("x"), + cl::desc( + "Multiply the `import-instr-limit` threshold for critical callsites")); + // FIXME: This multiplier was not really tuned up. static cl::opt<float> ImportColdMultiplier( "import-cold-multiplier", cl::init(0), cl::Hidden, cl::value_desc("N"), @@ -75,12 +81,6 @@ static cl::opt<bool> PrintImports("print-imports", cl::init(false), cl::Hidden, static cl::opt<bool> ComputeDead("compute-dead", cl::init(true), cl::Hidden, cl::desc("Compute dead symbols")); -// Temporary allows the function import pass to disable always linking -// referenced discardable symbols. -static cl::opt<bool> - DontForceImportReferencedDiscardableSymbols("disable-force-link-odr", - cl::init(false), cl::Hidden); - static cl::opt<bool> EnableImportMetadata( "enable-import-metadata", cl::init( #if !defined(NDEBUG) @@ -123,8 +123,8 @@ namespace { /// - [insert you fancy metric here] static const GlobalValueSummary * selectCallee(const ModuleSummaryIndex &Index, - const GlobalValueSummaryList &CalleeSummaryList, - unsigned Threshold) { + ArrayRef<std::unique_ptr<GlobalValueSummary>> CalleeSummaryList, + unsigned Threshold, StringRef CallerModulePath) { auto It = llvm::find_if( CalleeSummaryList, [&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) { @@ -145,6 +145,21 @@ selectCallee(const ModuleSummaryIndex &Index, auto *Summary = cast<FunctionSummary>(GVSummary); + // If this is a local function, make sure we import the copy + // in the caller's module. The only time a local function can + // share an entry in the index is if there is a local with the same name + // in another module that had the same source file name (in a different + // directory), where each was compiled in their own directory so there + // was not distinguishing path. + // However, do the import from another module if there is only one + // entry in the list - in that case this must be a reference due + // to indirect call profile data, since a function pointer can point to + // a local in another module. 
+ if (GlobalValue::isLocalLinkage(Summary->linkage()) && + CalleeSummaryList.size() > 1 && + Summary->modulePath() != CallerModulePath) + return false; + if (Summary->instCount() > Threshold) return false; @@ -159,17 +174,6 @@ selectCallee(const ModuleSummaryIndex &Index, return cast<GlobalValueSummary>(It->get()); } -/// Return the summary for the function \p GUID that fits the \p Threshold, or -/// null if there's no match. -static const GlobalValueSummary *selectCallee(GlobalValue::GUID GUID, - unsigned Threshold, - const ModuleSummaryIndex &Index) { - auto CalleeSummaryList = Index.findGlobalValueSummaryList(GUID); - if (CalleeSummaryList == Index.end()) - return nullptr; // This function does not have a summary - return selectCallee(Index, CalleeSummaryList->second, Threshold); -} - using EdgeInfo = std::tuple<const FunctionSummary *, unsigned /* Threshold */, GlobalValue::GUID>; @@ -183,10 +187,23 @@ static void computeImportForFunction( FunctionImporter::ImportMapTy &ImportList, StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) { for (auto &Edge : Summary.calls()) { - auto GUID = Edge.first.getGUID(); - DEBUG(dbgs() << " edge -> " << GUID << " Threshold:" << Threshold << "\n"); + ValueInfo VI = Edge.first; + DEBUG(dbgs() << " edge -> " << VI.getGUID() << " Threshold:" << Threshold + << "\n"); + + if (VI.getSummaryList().empty()) { + // For SamplePGO, the indirect call targets for local functions will + // have its original name annotated in profile. We try to find the + // corresponding PGOFuncName as the GUID. + auto GUID = Index.getGUIDFromOriginalID(VI.getGUID()); + if (GUID == 0) + continue; + VI = Index.getValueInfo(GUID); + if (!VI) + continue; + } - if (DefinedGVSummaries.count(GUID)) { + if (DefinedGVSummaries.count(VI.getGUID())) { DEBUG(dbgs() << "ignored! Target already in destination module.\n"); continue; } @@ -196,13 +213,16 @@ static void computeImportForFunction( return ImportHotMultiplier; if (Hotness == CalleeInfo::HotnessType::Cold) return ImportColdMultiplier; + if (Hotness == CalleeInfo::HotnessType::Critical) + return ImportCriticalMultiplier; return 1.0; }; const auto NewThreshold = Threshold * GetBonusMultiplier(Edge.second.Hotness); - auto *CalleeSummary = selectCallee(GUID, NewThreshold, Index); + auto *CalleeSummary = selectCallee(Index, VI.getSummaryList(), NewThreshold, + Summary.modulePath()); if (!CalleeSummary) { DEBUG(dbgs() << "ignored! No qualifying callee with summary found.\n"); continue; @@ -234,7 +254,7 @@ static void computeImportForFunction( const auto AdjThreshold = GetAdjustedThreshold(Threshold, IsHotCallsite); auto ExportModulePath = ResolvedCalleeSummary->modulePath(); - auto &ProcessedThreshold = ImportList[ExportModulePath][GUID]; + auto &ProcessedThreshold = ImportList[ExportModulePath][VI.getGUID()]; /// Since the traversal of the call graph is DFS, we can revisit a function /// a second time with a higher threshold. In this case, it is added back to /// the worklist with the new threshold. @@ -250,7 +270,7 @@ static void computeImportForFunction( // Make exports in the source module. if (ExportLists) { auto &ExportList = (*ExportLists)[ExportModulePath]; - ExportList.insert(GUID); + ExportList.insert(VI.getGUID()); if (!PreviouslyImported) { // This is the first time this function was exported from its source // module, so mark all functions and globals it references as exported @@ -270,7 +290,7 @@ static void computeImportForFunction( } // Insert the newly imported function to the worklist. 
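For reference, the threshold scaling used by computeImportForFunction above — including the new critical bucket — boils down to multiplying the base instruction limit by a hotness-dependent factor. A sketch using the pass's command-line defaults (scaledThreshold and the local enum are illustrative only):

enum class Hotness { Unknown, Cold, Hot, Critical };

static unsigned scaledThreshold(unsigned BaseThreshold, Hotness H) {
  float Multiplier = 1.0f;                // default: no bonus
  if (H == Hotness::Hot)
    Multiplier = 3.0f;                    // -import-hot-multiplier
  else if (H == Hotness::Critical)
    Multiplier = 100.0f;                  // -import-critical-multiplier
  else if (H == Hotness::Cold)
    Multiplier = 0.0f;                    // -import-cold-multiplier
  return static_cast<unsigned>(BaseThreshold * Multiplier);
}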
- Worklist.emplace_back(ResolvedCalleeSummary, AdjThreshold, GUID); + Worklist.emplace_back(ResolvedCalleeSummary, AdjThreshold, VI.getGUID()); } } @@ -280,8 +300,7 @@ static void computeImportForFunction( static void ComputeImportForModule( const GVSummaryMapTy &DefinedGVSummaries, const ModuleSummaryIndex &Index, FunctionImporter::ImportMapTy &ImportList, - StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr, - const DenseSet<GlobalValue::GUID> *DeadSymbols = nullptr) { + StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) { // Worklist contains the list of function imported in this module, for which // we will analyse the callees and may import further down the callgraph. SmallVector<EdgeInfo, 128> Worklist; @@ -289,7 +308,7 @@ static void ComputeImportForModule( // Populate the worklist with the import for the functions in the current // module for (auto &GVSummary : DefinedGVSummaries) { - if (DeadSymbols && DeadSymbols->count(GVSummary.first)) { + if (!Index.isGlobalValueLive(GVSummary.second)) { DEBUG(dbgs() << "Ignores Dead GUID: " << GVSummary.first << "\n"); continue; } @@ -332,15 +351,14 @@ void llvm::ComputeCrossModuleImport( const ModuleSummaryIndex &Index, const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries, StringMap<FunctionImporter::ImportMapTy> &ImportLists, - StringMap<FunctionImporter::ExportSetTy> &ExportLists, - const DenseSet<GlobalValue::GUID> *DeadSymbols) { + StringMap<FunctionImporter::ExportSetTy> &ExportLists) { // For each module that has function defined, compute the import/export lists. for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) { auto &ImportList = ImportLists[DefinedGVSummaries.first()]; DEBUG(dbgs() << "Computing import for Module '" << DefinedGVSummaries.first() << "'\n"); ComputeImportForModule(DefinedGVSummaries.second, Index, ImportList, - &ExportLists, DeadSymbols); + &ExportLists); } // When computing imports we added all GUIDs referenced by anything @@ -402,84 +420,71 @@ void llvm::ComputeCrossModuleImportForModule( #endif } -DenseSet<GlobalValue::GUID> llvm::computeDeadSymbols( - const ModuleSummaryIndex &Index, +void llvm::computeDeadSymbols( + ModuleSummaryIndex &Index, const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) { + assert(!Index.withGlobalValueDeadStripping()); if (!ComputeDead) - return DenseSet<GlobalValue::GUID>(); + return; if (GUIDPreservedSymbols.empty()) // Don't do anything when nothing is live, this is friendly with tests. - return DenseSet<GlobalValue::GUID>(); - DenseSet<GlobalValue::GUID> LiveSymbols = GUIDPreservedSymbols; - SmallVector<GlobalValue::GUID, 128> Worklist; - Worklist.reserve(LiveSymbols.size() * 2); - for (auto GUID : LiveSymbols) { - DEBUG(dbgs() << "Live root: " << GUID << "\n"); - Worklist.push_back(GUID); - } - // Add values flagged in the index as live roots to the worklist. 
- for (const auto &Entry : Index) { - bool IsLiveRoot = llvm::any_of( - Entry.second, - [&](const std::unique_ptr<llvm::GlobalValueSummary> &Summary) { - return Summary->liveRoot(); - }); - if (!IsLiveRoot) + return; + unsigned LiveSymbols = 0; + SmallVector<ValueInfo, 128> Worklist; + Worklist.reserve(GUIDPreservedSymbols.size() * 2); + for (auto GUID : GUIDPreservedSymbols) { + ValueInfo VI = Index.getValueInfo(GUID); + if (!VI) continue; - DEBUG(dbgs() << "Live root (summary): " << Entry.first << "\n"); - Worklist.push_back(Entry.first); + for (auto &S : VI.getSummaryList()) + S->setLive(true); } - while (!Worklist.empty()) { - auto GUID = Worklist.pop_back_val(); - auto It = Index.findGlobalValueSummaryList(GUID); - if (It == Index.end()) { - DEBUG(dbgs() << "Not in index: " << GUID << "\n"); - continue; - } - - // FIXME: we should only make the prevailing copy live here - for (auto &Summary : It->second) { - for (auto Ref : Summary->refs()) { - auto RefGUID = Ref.getGUID(); - if (LiveSymbols.insert(RefGUID).second) { - DEBUG(dbgs() << "Marking live (ref): " << RefGUID << "\n"); - Worklist.push_back(RefGUID); - } - } - if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) { - for (auto Call : FS->calls()) { - auto CallGUID = Call.first.getGUID(); - if (LiveSymbols.insert(CallGUID).second) { - DEBUG(dbgs() << "Marking live (call): " << CallGUID << "\n"); - Worklist.push_back(CallGUID); - } - } + // Add values flagged in the index as live roots to the worklist. + for (const auto &Entry : Index) + for (auto &S : Entry.second.SummaryList) + if (S->isLive()) { + DEBUG(dbgs() << "Live root: " << Entry.first << "\n"); + Worklist.push_back(ValueInfo(&Entry)); + ++LiveSymbols; + break; } + + // Make value live and add it to the worklist if it was not live before. 
+ // FIXME: we should only make the prevailing copy live here + auto visit = [&](ValueInfo VI) { + for (auto &S : VI.getSummaryList()) + if (S->isLive()) + return; + for (auto &S : VI.getSummaryList()) + S->setLive(true); + ++LiveSymbols; + Worklist.push_back(VI); + }; + + while (!Worklist.empty()) { + auto VI = Worklist.pop_back_val(); + for (auto &Summary : VI.getSummaryList()) { + for (auto Ref : Summary->refs()) + visit(Ref); + if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) + for (auto Call : FS->calls()) + visit(Call.first); if (auto *AS = dyn_cast<AliasSummary>(Summary.get())) { auto AliaseeGUID = AS->getAliasee().getOriginalName(); - if (LiveSymbols.insert(AliaseeGUID).second) { - DEBUG(dbgs() << "Marking live (alias): " << AliaseeGUID << "\n"); - Worklist.push_back(AliaseeGUID); - } + ValueInfo AliaseeVI = Index.getValueInfo(AliaseeGUID); + if (AliaseeVI) + visit(AliaseeVI); } } } - DenseSet<GlobalValue::GUID> DeadSymbols; - DeadSymbols.reserve( - std::min(Index.size(), Index.size() - LiveSymbols.size())); - for (auto &Entry : Index) { - auto GUID = Entry.first; - if (!LiveSymbols.count(GUID)) { - DEBUG(dbgs() << "Marking dead: " << GUID << "\n"); - DeadSymbols.insert(GUID); - } - } - DEBUG(dbgs() << LiveSymbols.size() << " symbols Live, and " - << DeadSymbols.size() << " symbols Dead \n"); - NumDeadSymbols += DeadSymbols.size(); - NumLiveSymbols += LiveSymbols.size(); - return DeadSymbols; + Index.setWithGlobalValueDeadStripping(); + + unsigned DeadSymbols = Index.size() - LiveSymbols; + DEBUG(dbgs() << LiveSymbols << " symbols Live, and " << DeadSymbols + << " symbols Dead \n"); + NumDeadSymbols += DeadSymbols; + NumLiveSymbols += LiveSymbols; } /// Compute the set of summaries needed for a ThinLTO backend compilation of @@ -522,9 +527,24 @@ llvm::EmitImportsFiles(StringRef ModulePath, StringRef OutputFilename, /// Fixup WeakForLinker linkages in \p TheModule based on summary analysis. void llvm::thinLTOResolveWeakForLinkerModule( Module &TheModule, const GVSummaryMapTy &DefinedGlobals) { + auto ConvertToDeclaration = [](GlobalValue &GV) { + DEBUG(dbgs() << "Converting to a declaration: `" << GV.getName() << "\n"); + if (Function *F = dyn_cast<Function>(&GV)) { + F->deleteBody(); + F->clearMetadata(); + } else if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) { + V->setInitializer(nullptr); + V->setLinkage(GlobalValue::ExternalLinkage); + V->clearMetadata(); + } else + // For now we don't resolve or drop aliases. Once we do we'll + // need to add support here for creating either a function or + // variable declaration, and return the new GlobalValue* for + // the caller to use. + llvm_unreachable("Expected function or variable"); + }; + auto updateLinkage = [&](GlobalValue &GV) { - if (!GlobalValue::isWeakForLinker(GV.getLinkage())) - return; // See if the global summary analysis computed a new resolved linkage. const auto &GS = DefinedGlobals.find(GV.getGUID()); if (GS == DefinedGlobals.end()) @@ -532,18 +552,40 @@ void llvm::thinLTOResolveWeakForLinkerModule( auto NewLinkage = GS->second->linkage(); if (NewLinkage == GV.getLinkage()) return; - DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName() << "` from " - << GV.getLinkage() << " to " << NewLinkage << "\n"); - GV.setLinkage(NewLinkage); - // Remove functions converted to available_externally from comdats, + + // Switch the linkage to weakany if asked for, e.g. we do this for + // linker redefined symbols (via --wrap or --defsym). 
+ // We record that the visibility should be changed here in `addThinLTO` + // as we need access to the resolution vectors for each input file in + // order to find which symbols have been redefined. + // We may consider reorganizing this code and moving the linkage recording + // somewhere else, e.g. in thinLTOResolveWeakForLinkerInIndex. + if (NewLinkage == GlobalValue::WeakAnyLinkage) { + GV.setLinkage(NewLinkage); + return; + } + + if (!GlobalValue::isWeakForLinker(GV.getLinkage())) + return; + // Check for a non-prevailing def that has interposable linkage + // (e.g. non-odr weak or linkonce). In that case we can't simply + // convert to available_externally, since it would lose the + // interposable property and possibly get inlined. Simply drop + // the definition in that case. + if (GlobalValue::isAvailableExternallyLinkage(NewLinkage) && + GlobalValue::isInterposableLinkage(GV.getLinkage())) + ConvertToDeclaration(GV); + else { + DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName() << "` from " + << GV.getLinkage() << " to " << NewLinkage << "\n"); + GV.setLinkage(NewLinkage); + } + // Remove declarations from comdats, including available_externally // as this is a declaration for the linker, and will be dropped eventually. // It is illegal for comdats to contain declarations. auto *GO = dyn_cast_or_null<GlobalObject>(&GV); - if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) { - assert(GO->hasAvailableExternallyLinkage() && - "Expected comdat on definition (possibly available external)"); + if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) GO->setComdat(nullptr); - } }; // Process functions and global now @@ -562,7 +604,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule, // the current module. StringSet<> AsmUndefinedRefs; ModuleSymbolTable::CollectAsmSymbols( - Triple(TheModule.getTargetTriple()), TheModule.getModuleInlineAsm(), + TheModule, [&AsmUndefinedRefs](StringRef Name, object::BasicSymbolRef::Flags Flags) { if (Flags & object::BasicSymbolRef::SF_Undefined) AsmUndefinedRefs.insert(Name); @@ -576,8 +618,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule, return true; // Lookup the linkage recorded in the summaries during global analysis. - const auto &GS = DefinedGlobals.find(GV.getGUID()); - GlobalValue::LinkageTypes Linkage; + auto GS = DefinedGlobals.find(GV.getGUID()); if (GS == DefinedGlobals.end()) { // Must have been promoted (possibly conservatively). Find original // name so that we can access the correct summary and see if it can @@ -589,7 +630,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule, std::string OrigId = GlobalValue::getGlobalIdentifier( OrigName, GlobalValue::InternalLinkage, TheModule.getSourceFileName()); - const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId)); + GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId)); if (GS == DefinedGlobals.end()) { // Also check the original non-promoted non-globalized name. In some // cases a preempted weak value is linked in as a local copy because @@ -597,15 +638,11 @@ void llvm::thinLTOInternalizeModule(Module &TheModule, // In that case, since it was originally not a local value, it was // recorded in the index using the original name. // FIXME: This may not be needed once PR27866 is fixed. 
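The interposable-definition handling added above in thinLTOResolveWeakForLinkerModule reduces to a small decision: a non-prevailing ODR copy can stay as an available_externally body (it is guaranteed identical, so inlining from it is safe), while a non-prevailing interposable copy (non-ODR weak or linkonce) must be dropped to a declaration. As a tiny decision-function sketch (resolveNonPrevailing is an illustrative name):

enum class Resolution { AvailableExternally, Declaration };

static Resolution resolveNonPrevailing(bool IsODR) {
  // Interposable copies cannot stay as available_externally: inlining them
  // could bake in a body the linker ultimately does not choose.
  return IsODR ? Resolution::AvailableExternally : Resolution::Declaration;
}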
- const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName)); + GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName)); assert(GS != DefinedGlobals.end()); - Linkage = GS->second->linkage(); - } else { - Linkage = GS->second->linkage(); } - } else - Linkage = GS->second->linkage(); - return !GlobalValue::isLocalLinkage(Linkage); + } + return !GlobalValue::isLocalLinkage(GS->second->linkage()); }; // FIXME: See if we can just internalize directly here via linkage changes @@ -617,14 +654,12 @@ void llvm::thinLTOInternalizeModule(Module &TheModule, // index. // Expected<bool> FunctionImporter::importFunctions( - Module &DestModule, const FunctionImporter::ImportMapTy &ImportList, - bool ForceImportReferencedDiscardableSymbols) { + Module &DestModule, const FunctionImporter::ImportMapTy &ImportList) { DEBUG(dbgs() << "Starting import for Module " << DestModule.getModuleIdentifier() << "\n"); unsigned ImportedCount = 0; - // Linker that will be used for importing function - Linker TheLinker(DestModule); + IRMover Mover(DestModule); // Do the actual import of functions now, one Module at a time std::set<StringRef> ModuleNameOrderedList; for (auto &FunctionsToImportPerModule : ImportList) { @@ -648,7 +683,7 @@ Expected<bool> FunctionImporter::importFunctions( auto &ImportGUIDs = FunctionsToImportPerModule->second; // Find the globals to import - DenseSet<const GlobalValue *> GlobalsToImport; + SetVector<GlobalValue *> GlobalsToImport; for (Function &F : *SrcModule) { if (!F.hasName()) continue; @@ -687,6 +722,13 @@ Expected<bool> FunctionImporter::importFunctions( } } for (GlobalAlias &GA : SrcModule->aliases()) { + // FIXME: This should eventually be controlled entirely by the summary. + if (FunctionImportGlobalProcessing::doImportAsDefinition( + &GA, &GlobalsToImport)) { + GlobalsToImport.insert(&GA); + continue; + } + if (!GA.hasName()) continue; auto GUID = GA.getGUID(); @@ -731,12 +773,9 @@ Expected<bool> FunctionImporter::importFunctions( << " from " << SrcModule->getSourceFileName() << "\n"; } - // Instruct the linker that the client will take care of linkonce resolution - unsigned Flags = Linker::Flags::None; - if (!ForceImportReferencedDiscardableSymbols) - Flags |= Linker::Flags::DontForceLinkLinkonceODR; - - if (TheLinker.linkInModule(std::move(SrcModule), Flags, &GlobalsToImport)) + if (Mover.move(std::move(SrcModule), GlobalsToImport.getArrayRef(), + [](GlobalValue &, IRMover::ValueAdder) {}, + /*IsPerformingImport=*/true)) report_fatal_error("Function Import: link error"); ImportedCount += GlobalsToImport.size(); @@ -778,7 +817,7 @@ static bool doImportingForModule(Module &M) { // is only enabled when testing importing via the 'opt' tool, which does // not do the ThinLink that would normally determine what values to promote. for (auto &I : *Index) { - for (auto &S : I.second) { + for (auto &S : I.second.SummaryList) { if (GlobalValue::isLocalLinkage(S->linkage())) S->setLinkage(GlobalValue::ExternalLinkage); } @@ -796,8 +835,7 @@ static bool doImportingForModule(Module &M) { return loadFile(Identifier, M.getContext()); }; FunctionImporter Importer(*Index, ModuleLoader); - Expected<bool> Result = Importer.importFunctions( - M, ImportList, !DontForceImportReferencedDiscardableSymbols); + Expected<bool> Result = Importer.importFunctions(M, ImportList); // FIXME: Probably need to propagate Errors through the pass manager. 
if (!Result) { diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp index 7a04de3..c91e8b4 100644 --- a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -25,7 +25,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/GlobalStatus.h" -#include <unordered_map> + using namespace llvm; #define DEBUG_TYPE "globaldce" @@ -50,7 +50,14 @@ namespace { if (skipModule(M)) return false; + // We need a minimally functional dummy module analysis manager. It needs + // to at least know about the possibility of proxying a function analysis + // manager. + FunctionAnalysisManager DummyFAM; ModuleAnalysisManager DummyMAM; + DummyMAM.registerPass( + [&] { return FunctionAnalysisManagerModuleProxy(DummyFAM); }); + auto PA = Impl.run(M, DummyMAM); return !PA.areAllPreserved(); } @@ -78,9 +85,67 @@ static bool isEmptyFunction(Function *F) { return RI.getReturnValue() == nullptr; } -PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) { +/// Compute the set of GlobalValue that depends from V. +/// The recursion stops as soon as a GlobalValue is met. +void GlobalDCEPass::ComputeDependencies(Value *V, + SmallPtrSetImpl<GlobalValue *> &Deps) { + if (auto *I = dyn_cast<Instruction>(V)) { + Function *Parent = I->getParent()->getParent(); + Deps.insert(Parent); + } else if (auto *GV = dyn_cast<GlobalValue>(V)) { + Deps.insert(GV); + } else if (auto *CE = dyn_cast<Constant>(V)) { + // Avoid walking the whole tree of a big ConstantExprs multiple times. + auto Where = ConstantDependenciesCache.find(CE); + if (Where != ConstantDependenciesCache.end()) { + auto const &K = Where->second; + Deps.insert(K.begin(), K.end()); + } else { + SmallPtrSetImpl<GlobalValue *> &LocalDeps = ConstantDependenciesCache[CE]; + for (User *CEUser : CE->users()) + ComputeDependencies(CEUser, LocalDeps); + Deps.insert(LocalDeps.begin(), LocalDeps.end()); + } + } +} + +void GlobalDCEPass::UpdateGVDependencies(GlobalValue &GV) { + SmallPtrSet<GlobalValue *, 8> Deps; + for (User *User : GV.users()) + ComputeDependencies(User, Deps); + Deps.erase(&GV); // Remove self-reference. + for (GlobalValue *GVU : Deps) { + GVDependencies.insert(std::make_pair(GVU, &GV)); + } +} + +/// Mark Global value as Live +void GlobalDCEPass::MarkLive(GlobalValue &GV, + SmallVectorImpl<GlobalValue *> *Updates) { + auto const Ret = AliveGlobals.insert(&GV); + if (!Ret.second) + return; + + if (Updates) + Updates->push_back(&GV); + if (Comdat *C = GV.getComdat()) { + for (auto &&CM : make_range(ComdatMembers.equal_range(C))) + MarkLive(*CM.second, Updates); // Recursion depth is only two because only + // globals in the same comdat are visited. + } +} + +PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { bool Changed = false; + // The algorithm first computes the set L of global variables that are + // trivially live. Then it walks the initialization of these variables to + // compute the globals used to initialize them, which effectively builds a + // directed graph where nodes are global variables, and an edge from A to B + // means B is used to initialize A. Finally, it propagates the liveness + // information through the graph starting from the nodes in L. Nodes note + // marked as alive are discarded. + // Remove empty functions from the global ctors list. 
Changed |= optimizeGlobalCtorsList(M, isEmptyFunction); @@ -103,21 +168,39 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) { // initializer. if (!GO.isDeclaration() && !GO.hasAvailableExternallyLinkage()) if (!GO.isDiscardableIfUnused()) - GlobalIsNeeded(&GO); + MarkLive(GO); + + UpdateGVDependencies(GO); } + // Compute direct dependencies of aliases. for (GlobalAlias &GA : M.aliases()) { Changed |= RemoveUnusedGlobalValue(GA); // Externally visible aliases are needed. if (!GA.isDiscardableIfUnused()) - GlobalIsNeeded(&GA); + MarkLive(GA); + + UpdateGVDependencies(GA); } + // Compute direct dependencies of ifuncs. for (GlobalIFunc &GIF : M.ifuncs()) { Changed |= RemoveUnusedGlobalValue(GIF); // Externally visible ifuncs are needed. if (!GIF.isDiscardableIfUnused()) - GlobalIsNeeded(&GIF); + MarkLive(GIF); + + UpdateGVDependencies(GIF); + } + + // Propagate liveness from collected Global Values through the computed + // dependencies. + SmallVector<GlobalValue *, 8> NewLiveGVs{AliveGlobals.begin(), + AliveGlobals.end()}; + while (!NewLiveGVs.empty()) { + GlobalValue *LGV = NewLiveGVs.pop_back_val(); + for (auto &&GVD : make_range(GVDependencies.equal_range(LGV))) + MarkLive(*GVD.second, &NewLiveGVs); } // Now that all globals which are needed are in the AliveGlobals set, we loop @@ -154,7 +237,7 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) { GA.setAliasee(nullptr); } - // The third pass drops targets of ifuncs which are dead... + // The fourth pass drops targets of ifuncs which are dead... std::vector<GlobalIFunc*> DeadIFuncs; for (GlobalIFunc &GIF : M.ifuncs()) if (!AliveGlobals.count(&GIF)) { @@ -188,7 +271,8 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) { // Make sure that all memory is released AliveGlobals.clear(); - SeenConstants.clear(); + ConstantDependenciesCache.clear(); + GVDependencies.clear(); ComdatMembers.clear(); if (Changed) @@ -196,60 +280,6 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) { return PreservedAnalyses::all(); } -/// GlobalIsNeeded - the specific global value as needed, and -/// recursively mark anything that it uses as also needed. -void GlobalDCEPass::GlobalIsNeeded(GlobalValue *G) { - // If the global is already in the set, no need to reprocess it. - if (!AliveGlobals.insert(G).second) - return; - - if (Comdat *C = G->getComdat()) { - for (auto &&CM : make_range(ComdatMembers.equal_range(C))) - GlobalIsNeeded(CM.second); - } - - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(G)) { - // If this is a global variable, we must make sure to add any global values - // referenced by the initializer to the alive set. - if (GV->hasInitializer()) - MarkUsedGlobalsAsNeeded(GV->getInitializer()); - } else if (GlobalIndirectSymbol *GIS = dyn_cast<GlobalIndirectSymbol>(G)) { - // The target of a global alias or ifunc is needed. - MarkUsedGlobalsAsNeeded(GIS->getIndirectSymbol()); - } else { - // Otherwise this must be a function object. We have to scan the body of - // the function looking for constants and global values which are used as - // operands. Any operands of these types must be processed to ensure that - // any globals used will be marked as needed. 
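The rewritten pass above is a classic mark-and-propagate: UpdateGVDependencies records, for each global, the globals it needs; externally visible globals seed AliveGlobals; and the worklist loop pushes liveness through those edges. A standalone sketch of the same scheme using only the standard library (the names here are illustrative, not LLVM's):

#include <map>
#include <set>
#include <string>
#include <vector>

using Global = std::string;

// Dependencies maps a global to each global it needs (uses). Roots are the
// globals that must be kept (externally visible, appending linkage, etc.).
std::set<Global> propagateLiveness(const std::multimap<Global, Global> &Dependencies,
                                   const std::vector<Global> &Roots) {
  std::set<Global> Alive(Roots.begin(), Roots.end());
  std::vector<Global> Worklist(Roots.begin(), Roots.end());
  while (!Worklist.empty()) {
    Global G = Worklist.back();
    Worklist.pop_back();
    // Everything a live global needs becomes live, and is queued exactly once.
    auto Range = Dependencies.equal_range(G);
    for (auto It = Range.first; It != Range.second; ++It)
      if (Alive.insert(It->second).second)
        Worklist.push_back(It->second);
  }
  return Alive; // globals not in this set can be dropped
}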
- Function *F = cast<Function>(G); - - for (Use &U : F->operands()) - MarkUsedGlobalsAsNeeded(cast<Constant>(U.get())); - - for (BasicBlock &BB : *F) - for (Instruction &I : BB) - for (Use &U : I.operands()) - if (GlobalValue *GV = dyn_cast<GlobalValue>(U)) - GlobalIsNeeded(GV); - else if (Constant *C = dyn_cast<Constant>(U)) - MarkUsedGlobalsAsNeeded(C); - } -} - -void GlobalDCEPass::MarkUsedGlobalsAsNeeded(Constant *C) { - if (GlobalValue *GV = dyn_cast<GlobalValue>(C)) - return GlobalIsNeeded(GV); - - // Loop over all of the operands of the constant, adding any globals they - // use to the list of needed globals. - for (Use &U : C->operands()) { - // If we've already processed this constant there's no need to do it again. - Constant *Op = dyn_cast<Constant>(U); - if (Op && SeenConstants.insert(Op).second) - MarkUsedGlobalsAsNeeded(Op); - } -} - // RemoveUnusedGlobalValue - Loop over all of the uses of the specified // GlobalValue, looking for the constant pointer ref that may be pointing to it. // If found, check to see if the constant pointer ref is safe to destroy, and if diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 5b0d5e3..93eab68 100644 --- a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -239,7 +239,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, // we delete a constant array, we may also be holding pointer to one of its // elements (or an element of one of its elements if we're dealing with an // array of arrays) in the worklist. - SmallVector<WeakVH, 8> WorkList(V->user_begin(), V->user_end()); + SmallVector<WeakTrackingVH, 8> WorkList(V->user_begin(), V->user_end()); while (!WorkList.empty()) { Value *UV = WorkList.pop_back_val(); if (!UV) @@ -837,7 +837,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) { // The global is initialized when the store to it occurs. new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false, 0, - SI->getOrdering(), SI->getSynchScope(), SI); + SI->getOrdering(), SI->getSyncScopeID(), SI); SI->eraseFromParent(); continue; } @@ -854,7 +854,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, // Replace the cmp X, 0 with a use of the bool value. // Sink the load to where the compare was, if atomic rules allow us to. Value *LV = new LoadInst(InitBool, InitBool->getName()+".val", false, 0, - LI->getOrdering(), LI->getSynchScope(), + LI->getOrdering(), LI->getSyncScopeID(), LI->isUnordered() ? (Instruction*)ICI : LI); InitBoolUsed = true; switch (ICI->getPredicate()) { @@ -1605,7 +1605,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { assert(LI->getOperand(0) == GV && "Not a copy!"); // Insert a new load, to preserve the saved value. StoreVal = new LoadInst(NewGV, LI->getName()+".b", false, 0, - LI->getOrdering(), LI->getSynchScope(), LI); + LI->getOrdering(), LI->getSyncScopeID(), LI); } else { assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) && "This is not a form that we understand!"); @@ -1614,12 +1614,12 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { } } new StoreInst(StoreVal, NewGV, false, 0, - SI->getOrdering(), SI->getSynchScope(), SI); + SI->getOrdering(), SI->getSyncScopeID(), SI); } else { // Change the load into a load of bool then a select. 
LoadInst *LI = cast<LoadInst>(UI); LoadInst *NLI = new LoadInst(NewGV, LI->getName()+".b", false, 0, - LI->getOrdering(), LI->getSynchScope(), LI); + LI->getOrdering(), LI->getSyncScopeID(), LI); Value *NSI; if (IsOneZero) NSI = new ZExtInst(NLI, LI->getType(), "", LI); @@ -1792,7 +1792,9 @@ static void makeAllConstantUsesInstructions(Constant *C) { NewU->insertBefore(UI); UI->replaceUsesOfWith(U, NewU); } - U->dropAllReferences(); + // We've replaced all the uses, so destroy the constant. (destroyConstant + // will update value handles and metadata.) + U->destroyConstant(); } } @@ -1819,12 +1821,14 @@ static bool processInternalGlobal( GS.AccessingFunction->doesNotRecurse() && isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV, LookupDomTree)) { + const DataLayout &DL = GV->getParent()->getDataLayout(); + DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n"); Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction ->getEntryBlock().begin()); Type *ElemTy = GV->getValueType(); // FIXME: Pass Global's alignment when globals have alignment - AllocaInst *Alloca = new AllocaInst(ElemTy, nullptr, + AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(), nullptr, GV->getName(), &FirstI); if (!isa<UndefValue>(GV->getInitializer())) new StoreInst(GV->getInitializer(), Alloca, &FirstI); @@ -1977,16 +1981,11 @@ static void ChangeCalleesToFastCall(Function *F) { } } -static AttributeSet StripNest(LLVMContext &C, const AttributeSet &Attrs) { - for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) { - unsigned Index = Attrs.getSlotIndex(i); - if (!Attrs.getSlotAttributes(i).hasAttribute(Index, Attribute::Nest)) - continue; - - // There can be only one. - return Attrs.removeAttribute(C, Index, Attribute::Nest); - } - +static AttributeList StripNest(LLVMContext &C, AttributeList Attrs) { + // There can be at most one attribute set with a nest attribute. + unsigned NestIndex; + if (Attrs.hasAttrSomewhere(Attribute::Nest, &NestIndex)) + return Attrs.removeAttribute(C, NestIndex, Attribute::Nest); return Attrs; } @@ -2027,6 +2026,24 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI, continue; } + // LLVM's definition of dominance allows instructions that are cyclic + // in unreachable blocks, e.g.: + // %pat = select i1 %condition, @global, i16* %pat + // because any instruction dominates an instruction in a block that's + // not reachable from entry. + // So, remove unreachable blocks from the function, because a) there's + // no point in analyzing them and b) GlobalOpt should otherwise grow + // some more complicated logic to break these cycles. + // Removing unreachable blocks might invalidate the dominator so we + // recalculate it. + if (!F->isDeclaration()) { + if (removeUnreachableBlocks(*F)) { + auto &DT = LookupDomTree(*F); + DT.recalculate(*F); + Changed = true; + } + } + Changed |= processGlobal(*F, TLI, LookupDomTree); if (!F->hasLocalLinkage()) @@ -2387,7 +2404,7 @@ OptimizeGlobalAliases(Module &M, } static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { - LibFunc::Func F = LibFunc::cxa_atexit; + LibFunc F = LibFunc_cxa_atexit; if (!TLI->has(F)) return nullptr; @@ -2396,7 +2413,7 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { return nullptr; // Make sure that the function has the correct prototype. 
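The OptimizeFunctions() hunk above removes unreachable blocks up front so the pass never has to reason about self-referential instructions such as %pat = select i1 %condition, @global, i16* %pat, which only "dominate" themselves because their block is unreachable from the entry. A sketch of that cleanup step, assuming the two LLVM utilities named in the diff; the helper name pruneUnreachable is hypothetical:

#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

// Drop unreachable blocks so later analyses never see values that are only
// reachable through such cycles; the dominator tree must be rebuilt afterwards.
static bool pruneUnreachable(Function &F, DominatorTree &DT) {
  if (F.isDeclaration() || !removeUnreachableBlocks(F))
    return false;
  DT.recalculate(F);
  return true;
}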
- if (!TLI->getLibFunc(*Fn, F) || F != LibFunc::cxa_atexit) + if (!TLI->getLibFunc(*Fn, F) || F != LibFunc_cxa_atexit) return nullptr; return Fn; diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp index bbbd096..e47d881 100644 --- a/contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp +++ b/contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/GlobalSplit.h" #include "llvm/ADT/StringExtras.h" #include "llvm/IR/Constants.h" @@ -23,6 +22,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" +#include "llvm/Transforms/IPO.h" #include <set> @@ -85,7 +85,16 @@ bool splitGlobal(GlobalVariable &GV) { uint64_t ByteOffset = cast<ConstantInt>( cast<ConstantAsMetadata>(Type->getOperand(0))->getValue()) ->getZExtValue(); - if (ByteOffset < SplitBegin || ByteOffset >= SplitEnd) + // Type metadata may be attached one byte after the end of the vtable, for + // classes without virtual methods in Itanium ABI. AFAIK, it is never + // attached to the first byte of a vtable. Subtract one to get the right + // slice. + // This is making an assumption that vtable groups are the only kinds of + // global variables that !type metadata can be attached to, and that they + // are either Itanium ABI vtable groups or contain a single vtable (i.e. + // Microsoft ABI vtables). + uint64_t AttachedTo = (ByteOffset == 0) ? ByteOffset : ByteOffset - 1; + if (AttachedTo < SplitBegin || AttachedTo >= SplitEnd) continue; SplitGV->addMetadata( LLVMContext::MD_type, diff --git a/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp b/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp index 916135e..f79b610 100644 --- a/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp +++ b/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp @@ -15,7 +15,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ValueTracking.h" @@ -24,6 +23,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include "llvm/Transforms/IPO.h" using namespace llvm; #define DEBUG_TYPE "ipconstprop" @@ -136,7 +136,13 @@ static bool PropagateConstantReturn(Function &F) { // For more details, see GlobalValue::mayBeDerefined. if (!F.isDefinitionExact()) return false; - + + // Don't touch naked functions. They may contain asm returning a + // value we don't see, so we may end up interprocedurally propagating + // the return value incorrectly. + if (F.hasFnAttribute(Attribute::Naked)) + return false; + // Check to see if this function returns a constant.
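The splitGlobal() change above re-anchors each !type annotation before deciding which slice of the split global owns it, because Itanium-ABI type metadata can be attached one byte past the end of a vtable. The rule reduces to a one-line adjustment; a tiny standalone illustration:

#include <cstdint>

// An annotation recorded at ByteOffset describes the address just past byte
// ByteOffset - 1, so attribute it to that byte; offset 0 stays at byte 0.
static bool typeAnnotationBelongsToSlice(uint64_t ByteOffset,
                                         uint64_t SplitBegin,
                                         uint64_t SplitEnd) {
  uint64_t AttachedTo = (ByteOffset == 0) ? ByteOffset : ByteOffset - 1;
  return AttachedTo >= SplitBegin && AttachedTo < SplitEnd;
}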
SmallVector<Value *,4> RetVals; StructType *STy = dyn_cast<StructType>(F.getReturnType()); diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp index 89518f3..5bb305c 100644 --- a/contrib/llvm/lib/Transforms/IPO/IPO.cpp +++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp @@ -13,10 +13,10 @@ // //===----------------------------------------------------------------------===// -#include "llvm-c/Initialization.h" #include "llvm-c/Transforms/IPO.h" -#include "llvm/InitializePasses.h" +#include "llvm-c/Initialization.h" #include "llvm/IR/LegacyPassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/FunctionAttrs.h" diff --git a/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp index 2ef299d..15d7515 100644 --- a/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp +++ b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -8,8 +8,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/InferFunctionAttrs.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" diff --git a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp index 1770445..50e7cc8 100644 --- a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp +++ b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp @@ -48,7 +48,7 @@ public: } explicit SimpleInliner(InlineParams Params) - : LegacyInlinerBase(ID), Params(Params) { + : LegacyInlinerBase(ID), Params(std::move(Params)) { initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); } @@ -61,7 +61,8 @@ public: [&](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; - return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, PSI); + return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, + /*GetBFI=*/None, PSI); } bool runOnSCC(CallGraphSCC &SCC) override; @@ -92,8 +93,12 @@ Pass *llvm::createFunctionInliningPass(int Threshold) { } Pass *llvm::createFunctionInliningPass(unsigned OptLevel, - unsigned SizeOptLevel) { - return new SimpleInliner(llvm::getInlineParams(OptLevel, SizeOptLevel)); + unsigned SizeOptLevel, + bool DisableInlineHotCallSite) { + auto Param = llvm::getInlineParams(OptLevel, SizeOptLevel); + if (DisableInlineHotCallSite) + Param.HotCallSiteThreshold = 0; + return new SimpleInliner(Param); } Pass *llvm::createFunctionInliningPass(InlineParams &Params) { diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp index 3f4731c..317770d 100644 --- a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/OptimizationDiagnosticInfo.h" @@ -260,8 +261,8 @@ static bool InlineCallIfPossible( /// Return true if inlining of CS can block the caller from being /// inlined which is proved to be more beneficial. \p IC is the /// estimated inline cost associated with callsite \p CS. 
-/// \p TotalAltCost will be set to the estimated cost of inlining the caller -/// if \p CS is suppressed for inlining. +/// \p TotalSecondaryCost will be set to the estimated cost of inlining the +/// caller if \p CS is suppressed for inlining. static bool shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC, int &TotalSecondaryCost, @@ -288,7 +289,7 @@ shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC, // treating them as truly abstract units etc. TotalSecondaryCost = 0; // The candidate cost to be imposed upon the current function. - int CandidateCost = IC.getCost() - (InlineConstants::CallPenalty + 1); + int CandidateCost = IC.getCost() - 1; // This bool tracks what happens if we do NOT inline C into B. bool callerWillBeRemoved = Caller->hasLocalLinkage(); // This bool tracks what happens if we DO inline C into B. @@ -325,7 +326,7 @@ shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC, // one is set very low by getInlineCost, in anticipation that Caller will // be removed entirely. We did not account for this above unless there // is only one caller of Caller. - if (callerWillBeRemoved && !Caller->use_empty()) + if (callerWillBeRemoved && !Caller->hasOneUse()) TotalSecondaryCost -= InlineConstants::LastCallToStaticBonus; if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost()) @@ -342,6 +343,7 @@ static bool shouldInline(CallSite CS, InlineCost IC = GetInlineCost(CS); Instruction *Call = CS.getInstruction(); Function *Callee = CS.getCalledFunction(); + Function *Caller = CS.getCaller(); if (IC.isAlways()) { DEBUG(dbgs() << " Inlining: cost=always" @@ -355,19 +357,20 @@ static bool shouldInline(CallSite CS, if (IC.isNever()) { DEBUG(dbgs() << " NOT Inlining: cost=never" << ", Call: " << *CS.getInstruction() << "\n"); - ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "NeverInline", Call) - << NV("Callee", Callee) - << " should never be inlined (cost=never)"); + ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) + << NV("Callee", Callee) << " not inlined into " + << NV("Caller", Caller) + << " because it should never be inlined (cost=never)"); return false; } - Function *Caller = CS.getCaller(); if (!IC) { DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost() << ", thres=" << (IC.getCostDelta() + IC.getCost()) << ", Call: " << *CS.getInstruction() << "\n"); - ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) - << NV("Callee", Callee) << " too costly to inline (cost=" + ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call) + << NV("Callee", Callee) << " not inlined into " + << NV("Caller", Caller) << " because too costly to inline (cost=" << NV("Cost", IC.getCost()) << ", threshold=" << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); return false; @@ -378,8 +381,8 @@ static bool shouldInline(CallSite CS, DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction() << " Cost = " << IC.getCost() << ", outer Cost = " << TotalSecondaryCost << '\n'); - ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, - "IncreaseCostInOtherContexts", Call) + ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts", + Call) << "Not inlining. 
Cost of inlining " << NV("Callee", Callee) << " increases the cost of inlining " << NV("Caller", Caller) << " in other contexts"); @@ -499,7 +502,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, std::swap(CallSites[i--], CallSites[--FirstCallInSCC]); InlinedArrayAllocasTy InlinedArrayAllocas; - InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache); + InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache, PSI); // Now that we have all of the call sites, loop over them and inline them if // it looks profitable to do so. @@ -516,52 +519,54 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, Function *Caller = CS.getCaller(); Function *Callee = CS.getCalledFunction(); - // If this call site is dead and it is to a readonly function, we should - // just delete the call instead of trying to inline it, regardless of - // size. This happens because IPSCCP propagates the result out of the - // call and then we're left with the dead call. - if (isInstructionTriviallyDead(CS.getInstruction(), &TLI)) { - DEBUG(dbgs() << " -> Deleting dead call: " << *CS.getInstruction() - << "\n"); - // Update the call graph by deleting the edge from Callee to Caller. - CG[Caller]->removeCallEdgeFor(CS); - CS.getInstruction()->eraseFromParent(); - ++NumCallsDeleted; - } else { - // We can only inline direct calls to non-declarations. - if (!Callee || Callee->isDeclaration()) - continue; + // We can only inline direct calls to non-declarations. + if (!Callee || Callee->isDeclaration()) + continue; + Instruction *Instr = CS.getInstruction(); + + bool IsTriviallyDead = isInstructionTriviallyDead(Instr, &TLI); + + int InlineHistoryID; + if (!IsTriviallyDead) { // If this call site was obtained by inlining another function, verify // that the include path for the function did not include the callee // itself. If so, we'd be recursively inlining the same function, // which would provide the same callsites, which would cause us to // infinitely inline. - int InlineHistoryID = CallSites[CSi].second; + InlineHistoryID = CallSites[CSi].second; if (InlineHistoryID != -1 && InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory)) continue; + } + // FIXME for new PM: because of the old PM we currently generate ORE and + // in turn BFI on demand. With the new PM, the ORE dependency should + // just become a regular analysis dependency. + OptimizationRemarkEmitter ORE(Caller); + + // If the policy determines that we should inline this function, + // delete the call instead. + if (!shouldInline(CS, GetInlineCost, ORE)) + continue; + + // If this call site is dead and it is to a readonly function, we should + // just delete the call instead of trying to inline it, regardless of + // size. This happens because IPSCCP propagates the result out of the + // call and then we're left with the dead call. + if (IsTriviallyDead) { + DEBUG(dbgs() << " -> Deleting dead call: " << *Instr << "\n"); + // Update the call graph by deleting the edge from Callee to Caller. + CG[Caller]->removeCallEdgeFor(CS); + Instr->eraseFromParent(); + ++NumCallsDeleted; + } else { // Get DebugLoc to report. CS will be invalid after Inliner. - DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); + DebugLoc DLoc = Instr->getDebugLoc(); BasicBlock *Block = CS.getParent(); - // FIXME for new PM: because of the old PM we currently generate ORE and - // in turn BFI on demand. With the new PM, the ORE dependency should - // just become a regular analysis dependency. 
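shouldBeDeferred(), referenced above, suppresses inlining of a call site when the caller itself looks like the better candidate: it totals an estimated cost for inlining the caller into each of the caller's own call sites and defers when that total is still smaller than the cost of the call site at hand. A deliberately simplified standalone model of just that comparison (the last-call-to-static bonus and caller-removal bookkeeping of the real code are omitted):

#include <numeric>
#include <vector>

// ThisSiteCost: estimated cost of inlining the callee at the current site.
// CallerSiteCosts: estimated cost of inlining the caller at each of its own
// call sites, were the current site left un-inlined.
static bool shouldDeferToCallerInlining(int ThisSiteCost,
                                        const std::vector<int> &CallerSiteCosts) {
  if (CallerSiteCosts.empty())
    return false; // nobody calls the caller; there is nothing to protect
  int TotalSecondaryCost =
      std::accumulate(CallerSiteCosts.begin(), CallerSiteCosts.end(), 0);
  // Defer only if keeping the caller small is estimated to be cheaper overall.
  return TotalSecondaryCost < ThisSiteCost;
}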
- OptimizationRemarkEmitter ORE(Caller); - - // If the policy determines that we should inline this function, - // try to do so. - using namespace ore; - if (!shouldInline(CS, GetInlineCost, ORE)) { - ORE.emit( - OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block) - << NV("Callee", Callee) << " will not be inlined into " - << NV("Caller", Caller)); - continue; - } // Attempt to inline the function. + using namespace ore; if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas, InlineHistoryID, InsertLifetime, AARGetter, ImportedFunctionsStats)) { @@ -638,22 +643,12 @@ bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) { ACT = &getAnalysis<AssumptionCacheTracker>(); PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - // We compute dedicated AA results for each function in the SCC as needed. We - // use a lambda referencing external objects so that they live long enough to - // be queried, but we re-use them each time. - Optional<BasicAAResult> BAR; - Optional<AAResults> AAR; - auto AARGetter = [&](Function &F) -> AAResults & { - BAR.emplace(createLegacyPMBasicAAResult(*this, F)); - AAR.emplace(createLegacyPMAAResults(*this, F, *BAR)); - return *AAR; - }; auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; return inlineCallsImpl(SCC, CG, GetAssumptionCache, PSI, TLI, InsertLifetime, [this](CallSite CS) { return getInlineCost(CS); }, - AARGetter, ImportedFunctionsStats); + LegacyAARGetter(*this), ImportedFunctionsStats); } /// Remove now-dead linkonce functions at the end of @@ -750,9 +745,6 @@ bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG, PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR) { - FunctionAnalysisManager &FAM = - AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG) - .getManager(); const ModuleAnalysisManager &MAM = AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG).getManager(); bool Changed = false; @@ -761,35 +753,52 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, Module &M = *InitialC.begin()->getFunction().getParent(); ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M); - std::function<AssumptionCache &(Function &)> GetAssumptionCache = - [&](Function &F) -> AssumptionCache & { - return FAM.getResult<AssumptionAnalysis>(F); - }; - - // Setup the data structure used to plumb customization into the - // `InlineFunction` routine. - InlineFunctionInfo IFI(/*cg=*/nullptr, &GetAssumptionCache); + // We use a single common worklist for calls across the entire SCC. We + // process these in-order and append new calls introduced during inlining to + // the end. + // + // Note that this particular order of processing is actually critical to + // avoid very bad behaviors. Consider *highly connected* call graphs where + // each function contains a small amount of code and a couple of calls to + // other functions. Because the LLVM inliner is fundamentally a bottom-up + // inliner, it can handle gracefully the fact that these all appear to be + // reasonable inlining candidates as it will flatten things until they become + // too big to inline, and then move on and flatten another batch. + // + // However, when processing call edges *within* an SCC we cannot rely on this + // bottom-up behavior.
As a consequence, with heavily connected *SCCs* of + // functions we can end up incrementally inlining N calls into each of + // N functions because each incremental inlining decision looks good and we + // don't have a topological ordering to prevent explosions. + // + // To compensate for this, we don't process transitive edges made immediate + // by inlining until we've done one pass of inlining across the entire SCC. + // Large, highly connected SCCs still lead to some amount of code bloat in + // this model, but it is uniformly spread across all the functions in the SCC + // and eventually they all become too large to inline, rather than + // incrementally making a single function grow in a super linear fashion. + SmallVector<std::pair<CallSite, int>, 16> Calls; - auto GetInlineCost = [&](CallSite CS) { - Function &Callee = *CS.getCalledFunction(); - auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee); - return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, PSI); - }; + // Populate the initial list of calls in this SCC. + for (auto &N : InitialC) { + // We want to generally process call sites top-down in order for + // simplifications stemming from replacing the call with the returned value + // after inlining to be visible to subsequent inlining decisions. + // FIXME: Using instructions sequence is a really bad way to do this. + // Instead we should do an actual RPO walk of the function body. + for (Instruction &I : instructions(N.getFunction())) + if (auto CS = CallSite(&I)) + if (Function *Callee = CS.getCalledFunction()) + if (!Callee->isDeclaration()) + Calls.push_back({CS, -1}); + } + if (Calls.empty()) + return PreservedAnalyses::all(); - // We use a worklist of nodes to process so that we can handle if the SCC - // structure changes and some nodes are no longer part of the current SCC. We - // also need to use an updatable pointer for the SCC as a consequence. - SmallVector<LazyCallGraph::Node *, 16> Nodes; - for (auto &N : InitialC) - Nodes.push_back(&N); + // Capture updatable variables for the current SCC and RefSCC. auto *C = &InitialC; auto *RC = &C->getOuterRefSCC(); - // We also use a secondary worklist of call sites within a particular node to - // allow quickly continuing to inline through newly inlined call sites where - // possible. - SmallVector<std::pair<CallSite, int>, 16> Calls; - // When inlining a callee produces new call sites, we want to keep track of // the fact that they were inlined from the callee. This allows us to avoid // infinite inlining in some obscure cases. To represent this, we use an @@ -805,34 +814,58 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // defer deleting these to make it easier to handle the call graph updates. SmallVector<Function *, 4> DeadFunctions; - do { - auto &N = *Nodes.pop_back_val(); + // Loop forward over all of the calls. Note that we cannot cache the size as + // inlining can introduce new calls that need to be processed. + for (int i = 0; i < (int)Calls.size(); ++i) { + // We expect the calls to typically be batched with sequences of calls that + // have the same caller, so we first set up some shared infrastructure for + // this caller. We also do any pruning we can at this layer on the caller + // alone.
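The comments above describe a single flat, append-only worklist of (call site, inline-history id) pairs for the whole SCC, walked by index so that entries appended during inlining are still visited. A standalone sketch of that discipline with placeholder types; inlineIfProfitable stands in for the cost check plus InlineFunction, and history tracking is simplified to the index of the originating entry:

#include <functional>
#include <string>
#include <utility>
#include <vector>

struct CallRecord { std::string Caller; std::string Callee; };

// Visit every pending call, including ones appended during the walk; the size
// is re-read on every iteration on purpose.
static void processWorklist(
    std::vector<std::pair<CallRecord, int>> &Calls,
    const std::function<std::vector<CallRecord>(const CallRecord &)> &inlineIfProfitable) {
  for (int i = 0; i < (int)Calls.size(); ++i) {
    // Inlining may expose calls that were inside the callee; append them,
    // remembering which worklist entry they came from so recursion through
    // the same callee can be cut off later.
    for (const CallRecord &NewCall : inlineIfProfitable(Calls[i].first))
      Calls.push_back({NewCall, i});
  }
}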
+ Function &F = *Calls[i].first.getCaller(); + LazyCallGraph::Node &N = *CG.lookup(F); if (CG.lookupSCC(N) != C) continue; - Function &F = N.getFunction(); if (F.hasFnAttribute(Attribute::OptimizeNone)) continue; + DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n"); + + // Get a FunctionAnalysisManager via a proxy for this particular node. We + // do this each time we visit a node as the SCC may have changed and as + // we're going to mutate this particular function we want to make sure the + // proxy is in place to forward any invalidation events. We can use the + // manager we get here for looking up results for functions other than this + // node however because those functions aren't going to be mutated by this + // pass. + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, CG) + .getManager(); + std::function<AssumptionCache &(Function &)> GetAssumptionCache = + [&](Function &F) -> AssumptionCache & { + return FAM.getResult<AssumptionAnalysis>(F); + }; + auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & { + return FAM.getResult<BlockFrequencyAnalysis>(F); + }; + + auto GetInlineCost = [&](CallSite CS) { + Function &Callee = *CS.getCalledFunction(); + auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee); + return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, {GetBFI}, + PSI); + }; + // Get the remarks emission analysis for the caller. auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); - // We want to generally process call sites top-down in order for - // simplifications stemming from replacing the call with the returned value - // after inlining to be visible to subsequent inlining decisions. So we - // walk the function backwards and then process the back of the vector. - // FIXME: Using reverse is a really bad way to do this. Instead we should - // do an actual PO walk of the function body. - for (Instruction &I : reverse(instructions(F))) - if (auto CS = CallSite(&I)) - if (Function *Callee = CS.getCalledFunction()) - if (!Callee->isDeclaration()) - Calls.push_back({CS, -1}); - + // Now process as many calls as we have within this caller in the sequence. + // We bail out as soon as the caller has to change so we can update the + // call graph and prepare the context of that new caller. bool DidInline = false; - while (!Calls.empty()) { + for (; i < (int)Calls.size() && Calls[i].first.getCaller() == &F; ++i) { int InlineHistoryID; CallSite CS; - std::tie(CS, InlineHistoryID) = Calls.pop_back_val(); + std::tie(CS, InlineHistoryID) = Calls[i]; Function &Callee = *CS.getCalledFunction(); if (InlineHistoryID != -1 && @@ -843,6 +876,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (!shouldInline(CS, GetInlineCost, ORE)) continue; + // Setup the data structure used to plumb customization into the + // `InlineFunction` routine. + InlineFunctionInfo IFI( + /*cg=*/nullptr, &GetAssumptionCache, PSI, + &FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())), + &FAM.getResult<BlockFrequencyAnalysis>(Callee)); + if (!InlineFunction(CS, IFI)) continue; DidInline = true; @@ -869,7 +909,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // To check this we also need to nuke any dead constant uses (perhaps // made dead by this operation on other functions).
Callee.removeDeadConstantUsers(); - if (Callee.use_empty()) { + if (Callee.use_empty() && !CG.isLibFunction(Callee)) { + Calls.erase( + std::remove_if(Calls.begin() + i + 1, Calls.end(), + [&Callee](const std::pair<CallSite, int> &Call) { + return Call.first.getCaller() == &Callee; + }), + Calls.end()); // Clear the body and queue the function itself for deletion when we // finish inlining and call graph updates. // Note that after this point, it is an error to do anything other @@ -882,6 +928,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, } } + // Back the call index up by one to put us in a good position to go around + // the outer loop. + --i; + if (!DidInline) continue; Changed = true; @@ -896,8 +946,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // below. for (Function *InlinedCallee : InlinedCallees) { LazyCallGraph::Node &CalleeN = *CG.lookup(*InlinedCallee); - for (LazyCallGraph::Edge &E : CalleeN) - RC->insertTrivialRefEdge(N, *E.getNode()); + for (LazyCallGraph::Edge &E : *CalleeN) + RC->insertTrivialRefEdge(N, E.getNode()); } InlinedCallees.clear(); @@ -908,8 +958,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // re-use the exact same logic for updating the call graph to reflect the // change. C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR); + DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n"); RC = &C->getOuterRefSCC(); - } while (!Nodes.empty()); + } // Now that we've finished inlining all of the calls across this SCC, delete // all of the trivially dead functions, updating the call graph and the CGSCC // sets. for (Function *DeadF : DeadFunctions) { // Get the necessary information out of the call graph and nuke the - // function there. + // function there. Also, clear out any cached analyses. auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF)); + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerCGSCCProxy>(DeadC, CG) + .getManager(); + FAM.clear(*DeadF); + AM.clear(DeadC); auto &DeadRC = DeadC.getOuterRefSCC(); CG.removeDeadFunction(*DeadF); @@ -933,5 +989,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // And delete the actual function from the module. M.getFunctionList().erase(DeadF); } - return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); + + if (!Changed) + return PreservedAnalyses::all(); + + // Even if we change the IR, we update the core CGSCC data structures and so + // can preserve the proxy to the function analysis manager.
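One subtle step above: when a callee ends up with no remaining uses and is queued for deletion, any not-yet-visited worklist entries whose caller is that callee have to be discarded, because their instructions are about to disappear with the body. The erase/remove_if idiom used for that, as a standalone sketch:

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

struct CallRecord { std::string Caller; std::string Callee; };

// Drop every pending entry after position Current whose caller is the
// function that has just become dead.
static void dropCallsFromDeadCaller(std::vector<std::pair<CallRecord, int>> &Calls,
                                    int Current, const std::string &DeadFunction) {
  Calls.erase(std::remove_if(Calls.begin() + Current + 1, Calls.end(),
                             [&](const std::pair<CallRecord, int> &Entry) {
                               return Entry.first.Caller == DeadFunction;
                             }),
              Calls.end());
}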
+ PreservedAnalyses PA; + PA.preserve<FunctionAnalysisManagerCGSCCProxy>(); + return PA; } diff --git a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp index f898c3b..c74b0a3 100644 --- a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp +++ b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp @@ -14,7 +14,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/IR/Dominators.h" @@ -22,6 +21,7 @@ #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/CodeExtractor.h" diff --git a/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index deb7e81..693df5e 100644 --- a/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TypeMetadataUtils.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -42,8 +43,6 @@ using namespace llvm; using namespace lowertypetests; -using SummaryAction = LowerTypeTestsSummaryAction; - #define DEBUG_TYPE "lowertypetests" STATISTIC(ByteArraySizeBits, "Byte array size in bits"); @@ -57,13 +56,13 @@ static cl::opt<bool> AvoidReuse( cl::desc("Try to avoid reuse of byte array addresses using aliases"), cl::Hidden, cl::init(true)); -static cl::opt<SummaryAction> ClSummaryAction( +static cl::opt<PassSummaryAction> ClSummaryAction( "lowertypetests-summary-action", cl::desc("What to do with the summary when running this pass"), - cl::values(clEnumValN(SummaryAction::None, "none", "Do nothing"), - clEnumValN(SummaryAction::Import, "import", + cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"), + clEnumValN(PassSummaryAction::Import, "import", "Import typeid resolutions from summary and globals"), - clEnumValN(SummaryAction::Export, "export", + clEnumValN(PassSummaryAction::Export, "export", "Export typeid resolutions to summary and globals")), cl::Hidden); @@ -208,17 +207,26 @@ struct ByteArrayInfo { class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> { GlobalObject *GO; size_t NTypes; + // For functions: true if this is a definition (either in the merged module or + // in one of the thinlto modules). + bool IsDefinition; + // For functions: true if this function is either defined or used in a thinlto + // module and its jumptable entry needs to be exported to thinlto backends. 
+ bool IsExported; friend TrailingObjects; size_t numTrailingObjects(OverloadToken<MDNode *>) const { return NTypes; } public: static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO, + bool IsDefinition, bool IsExported, ArrayRef<MDNode *> Types) { auto *GTM = static_cast<GlobalTypeMember *>(Alloc.Allocate( totalSizeToAlloc<MDNode *>(Types.size()), alignof(GlobalTypeMember))); GTM->GO = GO; GTM->NTypes = Types.size(); + GTM->IsDefinition = IsDefinition; + GTM->IsExported = IsExported; std::uninitialized_copy(Types.begin(), Types.end(), GTM->getTrailingObjects<MDNode *>()); return GTM; @@ -226,6 +234,12 @@ public: GlobalObject *getGlobal() const { return GO; } + bool isDefinition() const { + return IsDefinition; + } + bool isExported() const { + return IsExported; + } ArrayRef<MDNode *> types() const { return makeArrayRef(getTrailingObjects<MDNode *>(), NTypes); } @@ -234,10 +248,9 @@ public: class LowerTypeTestsModule { Module &M; - SummaryAction Action; - ModuleSummaryIndex *Summary; + ModuleSummaryIndex *ExportSummary; + const ModuleSummaryIndex *ImportSummary; - bool LinkerSubsectionsViaSymbols; Triple::ArchType Arch; Triple::OSType OS; Triple::ObjectFormatType ObjectFormat; @@ -253,15 +266,21 @@ class LowerTypeTestsModule { // Indirect function call index assignment counter for WebAssembly uint64_t IndirectIndex = 1; - // Mapping from type identifiers to the call sites that test them. - DenseMap<Metadata *, std::vector<CallInst *>> TypeTestCallSites; + // Mapping from type identifiers to the call sites that test them, as well as + // whether the type identifier needs to be exported to ThinLTO backends as + // part of the regular LTO phase of the ThinLTO pipeline (see exportTypeId). + struct TypeIdUserInfo { + std::vector<CallInst *> CallSites; + bool IsExported = false; + }; + DenseMap<Metadata *, TypeIdUserInfo> TypeIdUsers; /// This structure describes how to lower type tests for a particular type /// identifier. It is either built directly from the global analysis (during /// regular LTO or the regular LTO phase of ThinLTO), or indirectly using type /// identifier summaries and external symbol references (in ThinLTO backends). struct TypeIdLowering { - TypeTestResolution::Kind TheKind; + TypeTestResolution::Kind TheKind = TypeTestResolution::Unsat; /// All except Unsat: the start address within the combined global. Constant *OffsetedGlobal; @@ -274,9 +293,6 @@ class LowerTypeTestsModule { /// covering members of this type identifier as a multiple of 2^AlignLog2. Constant *SizeM1; - /// ByteArray, Inline, AllOnes: range of SizeM1 expressed as a bit width. - unsigned SizeM1BitWidth; - /// ByteArray: the byte array to test the address against. 
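GlobalTypeMember above relies on LLVM's TrailingObjects to co-allocate its variable-length list of MDNode* type annotations with the object itself out of a BumpPtrAllocator. A self-contained sketch of the same idiom with a plain unsigned payload (the class and member names here are illustrative, not part of the patch):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/TrailingObjects.h"
#include <cstddef>
#include <memory>

class PackedIds final : llvm::TrailingObjects<PackedIds, unsigned> {
  friend TrailingObjects;
  size_t NumIds;
  size_t numTrailingObjects(OverloadToken<unsigned>) const { return NumIds; }

public:
  // One allocation holds both the header and the trailing array of ids.
  static PackedIds *create(llvm::BumpPtrAllocator &Alloc,
                           llvm::ArrayRef<unsigned> Ids) {
    auto *P = static_cast<PackedIds *>(Alloc.Allocate(
        totalSizeToAlloc<unsigned>(Ids.size()), alignof(PackedIds)));
    P->NumIds = Ids.size();
    std::uninitialized_copy(Ids.begin(), Ids.end(),
                            P->getTrailingObjects<unsigned>());
    return P;
  }
  llvm::ArrayRef<unsigned> ids() const {
    return llvm::makeArrayRef(getTrailingObjects<unsigned>(), NumIds);
  }
};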
Constant *TheByteArray; @@ -291,6 +307,11 @@ class LowerTypeTestsModule { Function *WeakInitializerFn = nullptr; + void exportTypeId(StringRef TypeId, const TypeIdLowering &TIL); + TypeIdLowering importTypeId(StringRef TypeId); + void importTypeTest(CallInst *CI); + void importFunction(Function *F, bool isDefinition); + BitSetInfo buildBitSet(Metadata *TypeId, const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout); @@ -327,8 +348,8 @@ class LowerTypeTestsModule { void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions); public: - LowerTypeTestsModule(Module &M, SummaryAction Action, - ModuleSummaryIndex *Summary); + LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary, + const ModuleSummaryIndex *ImportSummary); bool lower(); // Lower the module using the action and summary passed as command line @@ -341,15 +362,17 @@ struct LowerTypeTests : public ModulePass { bool UseCommandLine = false; - SummaryAction Action; - ModuleSummaryIndex *Summary; + ModuleSummaryIndex *ExportSummary; + const ModuleSummaryIndex *ImportSummary; LowerTypeTests() : ModulePass(ID), UseCommandLine(true) { initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry()); } - LowerTypeTests(SummaryAction Action, ModuleSummaryIndex *Summary) - : ModulePass(ID), Action(Action), Summary(Summary) { + LowerTypeTests(ModuleSummaryIndex *ExportSummary, + const ModuleSummaryIndex *ImportSummary) + : ModulePass(ID), ExportSummary(ExportSummary), + ImportSummary(ImportSummary) { initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry()); } @@ -358,7 +381,7 @@ struct LowerTypeTests : public ModulePass { return false; if (UseCommandLine) return LowerTypeTestsModule::runForTesting(M); - return LowerTypeTestsModule(M, Action, Summary).lower(); + return LowerTypeTestsModule(M, ExportSummary, ImportSummary).lower(); } }; @@ -368,9 +391,10 @@ INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false, false) char LowerTypeTests::ID = 0; -ModulePass *llvm::createLowerTypeTestsPass(SummaryAction Action, - ModuleSummaryIndex *Summary) { - return new LowerTypeTests(Action, Summary); +ModulePass * +llvm::createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary, + const ModuleSummaryIndex *ImportSummary) { + return new LowerTypeTests(ExportSummary, ImportSummary); } /// Build a bit set for TypeId using the object layouts in @@ -467,13 +491,9 @@ void LowerTypeTestsModule::allocateByteArrays() { // Create an alias instead of RAUW'ing the gep directly. On x86 this ensures // that the pc-relative displacement is folded into the lea instead of the // test instruction getting another displacement. - if (LinkerSubsectionsViaSymbols) { - BAI->ByteArray->replaceAllUsesWith(GEP); - } else { - GlobalAlias *Alias = GlobalAlias::create( - Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, &M); - BAI->ByteArray->replaceAllUsesWith(Alias); - } + GlobalAlias *Alias = GlobalAlias::create( + Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, &M); + BAI->ByteArray->replaceAllUsesWith(Alias); BAI->ByteArray->eraseFromParent(); } @@ -494,10 +514,11 @@ Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B, return createMaskedBitTest(B, TIL.InlineBits, BitOffset); } else { Constant *ByteArray = TIL.TheByteArray; - if (!LinkerSubsectionsViaSymbols && AvoidReuse) { + if (AvoidReuse && !ImportSummary) { // Each use of the byte array uses a different alias. 
This makes the // backend less likely to reuse previously computed byte array addresses, // improving the security of the CFI mechanism based on this pass. + // This won't work when importing because TheByteArray is external. ByteArray = GlobalAlias::create(Int8Ty, 0, GlobalValue::PrivateLinkage, "bits_use", ByteArray, &M); } @@ -593,15 +614,31 @@ Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI, IntPtrTy)); Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL); - Constant *BitSizeConst = ConstantExpr::getZExt(TIL.SizeM1, IntPtrTy); - Value *OffsetInRange = B.CreateICmpULE(BitOffset, BitSizeConst); + Value *OffsetInRange = B.CreateICmpULE(BitOffset, TIL.SizeM1); // If the bit set is all ones, testing against it is unnecessary. if (TIL.TheKind == TypeTestResolution::AllOnes) return OffsetInRange; - TerminatorInst *Term = SplitBlockAndInsertIfThen(OffsetInRange, CI, false); - IRBuilder<> ThenB(Term); + // See if the intrinsic is used in the following common pattern: + // br(llvm.type.test(...), thenbb, elsebb) + // where nothing happens between the type test and the br. + // If so, create slightly simpler IR. + if (CI->hasOneUse()) + if (auto *Br = dyn_cast<BranchInst>(*CI->user_begin())) + if (CI->getNextNode() == Br) { + BasicBlock *Then = InitialBB->splitBasicBlock(CI->getIterator()); + BasicBlock *Else = Br->getSuccessor(1); + BranchInst *NewBr = BranchInst::Create(Then, Else, OffsetInRange); + NewBr->setMetadata(LLVMContext::MD_prof, + Br->getMetadata(LLVMContext::MD_prof)); + ReplaceInstWithInst(InitialBB->getTerminator(), NewBr); + + IRBuilder<> ThenB(CI); + return createBitSetTest(ThenB, TIL, BitOffset); + } + + IRBuilder<> ThenB(SplitBlockAndInsertIfThen(OffsetInRange, CI, false)); // Now that we know that the offset is in range and aligned, load the // appropriate bit from the bitset. @@ -672,21 +709,174 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables( ConstantInt::get(Int32Ty, I * 2)}; Constant *CombinedGlobalElemPtr = ConstantExpr::getGetElementPtr( NewInit->getType(), CombinedGlobal, CombinedGlobalIdxs); - if (LinkerSubsectionsViaSymbols) { - GV->replaceAllUsesWith(CombinedGlobalElemPtr); - } else { - assert(GV->getType()->getAddressSpace() == 0); - GlobalAlias *GAlias = GlobalAlias::create(NewTy->getElementType(I * 2), 0, - GV->getLinkage(), "", - CombinedGlobalElemPtr, &M); - GAlias->setVisibility(GV->getVisibility()); - GAlias->takeName(GV); - GV->replaceAllUsesWith(GAlias); - } + assert(GV->getType()->getAddressSpace() == 0); + GlobalAlias *GAlias = + GlobalAlias::create(NewTy->getElementType(I * 2), 0, GV->getLinkage(), + "", CombinedGlobalElemPtr, &M); + GAlias->setVisibility(GV->getVisibility()); + GAlias->takeName(GV); + GV->replaceAllUsesWith(GAlias); GV->eraseFromParent(); } } +/// Export the given type identifier so that ThinLTO backends may import it. +/// Type identifiers are exported by adding coarse-grained information about how +/// to test the type identifier to the summary, and creating symbols in the +/// object file (aliases and absolute symbols) containing fine-grained +/// information about the type identifier. 
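lowerTypeTestCall() above now recognizes the common shape br i1 (llvm.type.test ...), %then, %else and rewrites the existing branch in place (copying the !prof weights) instead of calling SplitBlockAndInsertIfThen, which keeps the CFG slightly simpler. The pattern match boils down to three checks; a small helper sketch assuming the IR classes used in the diff, with a hypothetical helper name:

#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Returns the conditional branch guarded solely by CI when CI's only use is
// that branch and nothing executes between them; otherwise nullptr.
static BranchInst *getBranchGuardedByCall(CallInst *CI) {
  if (!CI->hasOneUse())
    return nullptr;
  auto *Br = dyn_cast<BranchInst>(*CI->user_begin());
  if (!Br || CI->getNextNode() != Br)
    return nullptr;
  return Br;
}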
+void LowerTypeTestsModule::exportTypeId(StringRef TypeId, + const TypeIdLowering &TIL) { + TypeTestResolution &TTRes = + ExportSummary->getOrInsertTypeIdSummary(TypeId).TTRes; + TTRes.TheKind = TIL.TheKind; + + auto ExportGlobal = [&](StringRef Name, Constant *C) { + GlobalAlias *GA = + GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage, + "__typeid_" + TypeId + "_" + Name, C, &M); + GA->setVisibility(GlobalValue::HiddenVisibility); + }; + + if (TIL.TheKind != TypeTestResolution::Unsat) + ExportGlobal("global_addr", TIL.OffsetedGlobal); + + if (TIL.TheKind == TypeTestResolution::ByteArray || + TIL.TheKind == TypeTestResolution::Inline || + TIL.TheKind == TypeTestResolution::AllOnes) { + ExportGlobal("align", ConstantExpr::getIntToPtr(TIL.AlignLog2, Int8PtrTy)); + ExportGlobal("size_m1", ConstantExpr::getIntToPtr(TIL.SizeM1, Int8PtrTy)); + + uint64_t BitSize = cast<ConstantInt>(TIL.SizeM1)->getZExtValue() + 1; + if (TIL.TheKind == TypeTestResolution::Inline) + TTRes.SizeM1BitWidth = (BitSize <= 32) ? 5 : 6; + else + TTRes.SizeM1BitWidth = (BitSize <= 128) ? 7 : 32; + } + + if (TIL.TheKind == TypeTestResolution::ByteArray) { + ExportGlobal("byte_array", TIL.TheByteArray); + ExportGlobal("bit_mask", TIL.BitMask); + } + + if (TIL.TheKind == TypeTestResolution::Inline) + ExportGlobal("inline_bits", + ConstantExpr::getIntToPtr(TIL.InlineBits, Int8PtrTy)); +} + +LowerTypeTestsModule::TypeIdLowering +LowerTypeTestsModule::importTypeId(StringRef TypeId) { + const TypeIdSummary *TidSummary = ImportSummary->getTypeIdSummary(TypeId); + if (!TidSummary) + return {}; // Unsat: no globals match this type id. + const TypeTestResolution &TTRes = TidSummary->TTRes; + + TypeIdLowering TIL; + TIL.TheKind = TTRes.TheKind; + + auto ImportGlobal = [&](StringRef Name, unsigned AbsWidth) { + Constant *C = + M.getOrInsertGlobal(("__typeid_" + TypeId + "_" + Name).str(), Int8Ty); + auto *GV = dyn_cast<GlobalVariable>(C); + // We only need to set metadata if the global is newly created, in which + // case it would not have hidden visibility. + if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility) + return C; + + GV->setVisibility(GlobalValue::HiddenVisibility); + auto SetAbsRange = [&](uint64_t Min, uint64_t Max) { + auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min)); + auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max)); + GV->setMetadata(LLVMContext::MD_absolute_symbol, + MDNode::get(M.getContext(), {MinC, MaxC})); + }; + if (AbsWidth == IntPtrTy->getBitWidth()) + SetAbsRange(~0ull, ~0ull); // Full set. + else if (AbsWidth) + SetAbsRange(0, 1ull << AbsWidth); + return C; + }; + + if (TIL.TheKind != TypeTestResolution::Unsat) + TIL.OffsetedGlobal = ImportGlobal("global_addr", 0); + + if (TIL.TheKind == TypeTestResolution::ByteArray || + TIL.TheKind == TypeTestResolution::Inline || + TIL.TheKind == TypeTestResolution::AllOnes) { + TIL.AlignLog2 = ConstantExpr::getPtrToInt(ImportGlobal("align", 8), Int8Ty); + TIL.SizeM1 = ConstantExpr::getPtrToInt( + ImportGlobal("size_m1", TTRes.SizeM1BitWidth), IntPtrTy); + } + + if (TIL.TheKind == TypeTestResolution::ByteArray) { + TIL.TheByteArray = ImportGlobal("byte_array", 0); + TIL.BitMask = ImportGlobal("bit_mask", 8); + } + + if (TIL.TheKind == TypeTestResolution::Inline) + TIL.InlineBits = ConstantExpr::getPtrToInt( + ImportGlobal("inline_bits", 1 << TTRes.SizeM1BitWidth), + TTRes.SizeM1BitWidth <= 5 ? 
Int32Ty : Int64Ty); + + return TIL; +} + +void LowerTypeTestsModule::importTypeTest(CallInst *CI) { + auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1)); + if (!TypeIdMDVal) + report_fatal_error("Second argument of llvm.type.test must be metadata"); + + auto TypeIdStr = dyn_cast<MDString>(TypeIdMDVal->getMetadata()); + if (!TypeIdStr) + report_fatal_error( + "Second argument of llvm.type.test must be a metadata string"); + + TypeIdLowering TIL = importTypeId(TypeIdStr->getString()); + Value *Lowered = lowerTypeTestCall(TypeIdStr, CI, TIL); + CI->replaceAllUsesWith(Lowered); + CI->eraseFromParent(); +} + +// ThinLTO backend: the function F has a jump table entry; update this module +// accordingly. isDefinition describes the type of the jump table entry. +void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { + assert(F->getType()->getAddressSpace() == 0); + + // Declaration of a local function - nothing to do. + if (F->isDeclarationForLinker() && isDefinition) + return; + + GlobalValue::VisibilityTypes Visibility = F->getVisibility(); + std::string Name = F->getName(); + Function *FDecl; + + if (F->isDeclarationForLinker() && !isDefinition) { + // Declaration of an external function. + FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, + Name + ".cfi_jt", &M); + FDecl->setVisibility(GlobalValue::HiddenVisibility); + } else if (isDefinition) { + F->setName(Name + ".cfi"); + F->setLinkage(GlobalValue::ExternalLinkage); + F->setVisibility(GlobalValue::HiddenVisibility); + FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, + Name, &M); + FDecl->setVisibility(Visibility); + } else { + // Function definition without type metadata, where some other translation + // unit contained a declaration with type metadata. This normally happens + // during mixed CFI + non-CFI compilation. We do nothing with the function + // so that it is treated the same way as a function defined outside of the + // LTO unit. + return; + } + + if (F->isWeakForLinker()) + replaceWeakDeclarationWithJumpTablePtr(F, FDecl); + else + F->replaceAllUsesWith(FDecl); +} + void LowerTypeTestsModule::lowerTypeTestCalls( ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr, const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) { @@ -708,16 +898,12 @@ void LowerTypeTestsModule::lowerTypeTestCalls( TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr( Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)), TIL.AlignLog2 = ConstantInt::get(Int8Ty, BSI.AlignLog2); + TIL.SizeM1 = ConstantInt::get(IntPtrTy, BSI.BitSize - 1); if (BSI.isAllOnes()) { TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single : TypeTestResolution::AllOnes; - TIL.SizeM1BitWidth = (BSI.BitSize <= 128) ? 7 : 32; - TIL.SizeM1 = ConstantInt::get((BSI.BitSize <= 128) ? Int8Ty : Int32Ty, - BSI.BitSize - 1); } else if (BSI.BitSize <= 64) { TIL.TheKind = TypeTestResolution::Inline; - TIL.SizeM1BitWidth = (BSI.BitSize <= 32) ? 5 : 6; - TIL.SizeM1 = ConstantInt::get(Int8Ty, BSI.BitSize - 1); uint64_t InlineBits = 0; for (auto Bit : BSI.Bits) InlineBits |= uint64_t(1) << Bit; @@ -728,17 +914,19 @@ void LowerTypeTestsModule::lowerTypeTestCalls( (BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits); } else { TIL.TheKind = TypeTestResolution::ByteArray; - TIL.SizeM1BitWidth = (BSI.BitSize <= 128) ? 7 : 32; - TIL.SizeM1 = ConstantInt::get((BSI.BitSize <= 128) ? 
Int8Ty : Int32Ty, - BSI.BitSize - 1); ++NumByteArraysCreated; ByteArrayInfo *BAI = createByteArray(BSI); TIL.TheByteArray = BAI->ByteArray; TIL.BitMask = BAI->MaskGlobal; } + TypeIdUserInfo &TIUI = TypeIdUsers[TypeId]; + + if (TIUI.IsExported) + exportTypeId(cast<MDString>(TypeId)->getString(), TIL); + // Lower each call to llvm.type.test for this type identifier. - for (CallInst *CI : TypeTestCallSites[TypeId]) { + for (CallInst *CI : TIUI.CallSites) { ++NumTypeTestCallsLowered; Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL); CI->replaceAllUsesWith(Lowered); @@ -757,9 +945,9 @@ void LowerTypeTestsModule::verifyTypeMDNode(GlobalObject *GO, MDNode *Type) { report_fatal_error( "A member of a type identifier may not have an explicit section"); - if (isa<GlobalVariable>(GO) && GO->isDeclarationForLinker()) - report_fatal_error( - "A global var member of a type identifier must be a definition"); + // FIXME: We previously checked that global var member of a type identifier + // must be a definition, but the IR linker may leave type metadata on + // declarations. We should restore this check after fixing PR31759. auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Type->getOperand(0)); if (!OffsetConstMD) @@ -1012,7 +1200,6 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( // arithmetic that we normally use for globals. // FIXME: find a better way to represent the jumptable in the IR. - assert(!Functions.empty()); // Build a simple layout based on the regular layout of jump tables. @@ -1036,6 +1223,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( // references to the original functions with references to the aliases. for (unsigned I = 0; I != Functions.size(); ++I) { Function *F = cast<Function>(Functions[I]->getGlobal()); + bool IsDefinition = Functions[I]->isDefinition(); Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast( ConstantExpr::getInBoundsGetElementPtr( @@ -1043,8 +1231,18 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0), ConstantInt::get(IntPtrTy, I)}), F->getType()); - if (LinkerSubsectionsViaSymbols || F->isDeclarationForLinker()) { - + if (Functions[I]->isExported()) { + if (IsDefinition) { + ExportSummary->cfiFunctionDefs().insert(F->getName()); + } else { + GlobalAlias *JtAlias = GlobalAlias::create( + F->getValueType(), 0, GlobalValue::ExternalLinkage, + F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M); + JtAlias->setVisibility(GlobalValue::HiddenVisibility); + ExportSummary->cfiFunctionDecls().insert(F->getName()); + } + } + if (!IsDefinition) { if (F->isWeakForLinker()) replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr); else @@ -1052,9 +1250,8 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( } else { assert(F->getType()->getAddressSpace() == 0); - GlobalAlias *FAlias = GlobalAlias::create(F->getValueType(), 0, - F->getLinkage(), "", - CombinedGlobalElemPtr, &M); + GlobalAlias *FAlias = GlobalAlias::create( + F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M); FAlias->setVisibility(F->getVisibility()); FAlias->takeName(F); if (FAlias->hasName()) @@ -1173,15 +1370,12 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet( } /// Lower all type tests in this module. -LowerTypeTestsModule::LowerTypeTestsModule(Module &M, SummaryAction Action, - ModuleSummaryIndex *Summary) - : M(M), Action(Action), Summary(Summary) { - // FIXME: Use these fields. 
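lowerTypeTestCalls() above picks the cheapest lowering for each type identifier from the shape of its bitset: a single-member or all-ones set needs only the range check, anything that fits in 64 bits becomes an inline mask, and the rest falls back to a byte array. A standalone sketch of that decision:

#include <cstdint>
#include <set>

enum class ResolutionKind { Single, AllOnes, Inline, ByteArray };

// Bits holds the member offsets (already scaled by alignment) in [0, BitSize).
static ResolutionKind chooseResolution(uint64_t BitSize,
                                       const std::set<uint64_t> &Bits) {
  if (Bits.size() == BitSize) // every offset in range is a member
    return BitSize == 1 ? ResolutionKind::Single : ResolutionKind::AllOnes;
  if (BitSize <= 64)          // representable as one immediate bit mask
    return ResolutionKind::Inline;
  return ResolutionKind::ByteArray;
}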
- (void)this->Action; - (void)this->Summary; - +LowerTypeTestsModule::LowerTypeTestsModule( + Module &M, ModuleSummaryIndex *ExportSummary, + const ModuleSummaryIndex *ImportSummary) + : M(M), ExportSummary(ExportSummary), ImportSummary(ImportSummary) { + assert(!(ExportSummary && ImportSummary)); Triple TargetTriple(M.getTargetTriple()); - LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX(); Arch = TargetTriple.getArch(); OS = TargetTriple.getOS(); ObjectFormat = TargetTriple.getObjectFormat(); @@ -1203,7 +1397,11 @@ bool LowerTypeTestsModule::runForTesting(Module &M) { ExitOnErr(errorCodeToError(In.error())); } - bool Changed = LowerTypeTestsModule(M, ClSummaryAction, &Summary).lower(); + bool Changed = + LowerTypeTestsModule( + M, ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr, + ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr) + .lower(); if (!ClWriteSummary.empty()) { ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary + @@ -1222,9 +1420,40 @@ bool LowerTypeTestsModule::runForTesting(Module &M) { bool LowerTypeTestsModule::lower() { Function *TypeTestFunc = M.getFunction(Intrinsic::getName(Intrinsic::type_test)); - if (!TypeTestFunc || TypeTestFunc->use_empty()) + if ((!TypeTestFunc || TypeTestFunc->use_empty()) && !ExportSummary && + !ImportSummary) return false; + if (ImportSummary) { + if (TypeTestFunc) { + for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end(); + UI != UE;) { + auto *CI = cast<CallInst>((*UI++).getUser()); + importTypeTest(CI); + } + } + + SmallVector<Function *, 8> Defs; + SmallVector<Function *, 8> Decls; + for (auto &F : M) { + // CFI functions are either external, or promoted. A local function may + // have the same name, but it's not the one we are looking for. + if (F.hasLocalLinkage()) + continue; + if (ImportSummary->cfiFunctionDefs().count(F.getName())) + Defs.push_back(&F); + else if (ImportSummary->cfiFunctionDecls().count(F.getName())) + Decls.push_back(&F); + } + + for (auto F : Defs) + importFunction(F, /*isDefinition*/ true); + for (auto F : Decls) + importFunction(F, /*isDefinition*/ false); + + return true; + } + // Equivalence class set containing type identifiers and the globals that // reference them. This is used to partition the set of type identifiers in // the module into disjoint sets. 
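The disjoint-set bookkeeping referred to in the comment above is easiest to see in isolation. Below is a minimal, self-contained sketch (hypothetical type-identifier and vtable names, not taken from this patch) of how llvm::EquivalenceClasses groups each type identifier with the globals that carry its type metadata, so identifiers that share a global end up in one partition:

#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/Support/raw_ostream.h"
#include <string>

int main() {
  llvm::EquivalenceClasses<std::string> Classes;
  // Union each type identifier with the globals that reference it.
  Classes.unionSets("_ZTS1A", "vtable.A");
  Classes.unionSets("_ZTS1B", "vtable.B");
  // A global annotated with both identifiers merges the two classes.
  Classes.unionSets("vtable.A", "vtable.B");
  // An identifier with no shared globals stays in its own disjoint set.
  Classes.insert("_ZTS1C");
  llvm::outs() << "leader of vtable.B: "
               << Classes.getLeaderValue("vtable.B") << "\n";
  return 0;
}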
@@ -1247,13 +1476,76 @@ bool LowerTypeTestsModule::lower() { llvm::DenseMap<Metadata *, TIInfo> TypeIdInfo; unsigned I = 0; SmallVector<MDNode *, 2> Types; + + struct ExportedFunctionInfo { + CfiFunctionLinkage Linkage; + MDNode *FuncMD; // {name, linkage, type[, type...]} + }; + DenseMap<StringRef, ExportedFunctionInfo> ExportedFunctions; + if (ExportSummary) { + NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions"); + if (CfiFunctionsMD) { + for (auto FuncMD : CfiFunctionsMD->operands()) { + assert(FuncMD->getNumOperands() >= 2); + StringRef FunctionName = + cast<MDString>(FuncMD->getOperand(0))->getString(); + if (!ExportSummary->isGUIDLive(GlobalValue::getGUID( + GlobalValue::dropLLVMManglingEscape(FunctionName)))) + continue; + CfiFunctionLinkage Linkage = static_cast<CfiFunctionLinkage>( + cast<ConstantAsMetadata>(FuncMD->getOperand(1)) + ->getValue() + ->getUniqueInteger() + .getZExtValue()); + auto P = ExportedFunctions.insert({FunctionName, {Linkage, FuncMD}}); + if (!P.second && P.first->second.Linkage != CFL_Definition) + P.first->second = {Linkage, FuncMD}; + } + + for (const auto &P : ExportedFunctions) { + StringRef FunctionName = P.first; + CfiFunctionLinkage Linkage = P.second.Linkage; + MDNode *FuncMD = P.second.FuncMD; + Function *F = M.getFunction(FunctionName); + if (!F) + F = Function::Create( + FunctionType::get(Type::getVoidTy(M.getContext()), false), + GlobalVariable::ExternalLinkage, FunctionName, &M); + + if (Linkage == CFL_Definition) + F->eraseMetadata(LLVMContext::MD_type); + + if (F->isDeclaration()) { + if (Linkage == CFL_WeakDeclaration) + F->setLinkage(GlobalValue::ExternalWeakLinkage); + + SmallVector<MDNode *, 2> Types; + for (unsigned I = 2; I < FuncMD->getNumOperands(); ++I) + F->addMetadata(LLVMContext::MD_type, + *cast<MDNode>(FuncMD->getOperand(I).get())); + } + } + } + } + for (GlobalObject &GO : M.global_objects()) { + if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker()) + continue; + Types.clear(); GO.getMetadata(LLVMContext::MD_type, Types); if (Types.empty()) continue; - auto *GTM = GlobalTypeMember::create(Alloc, &GO, Types); + bool IsDefinition = !GO.isDeclarationForLinker(); + bool IsExported = false; + if (isa<Function>(GO) && ExportedFunctions.count(GO.getName())) { + IsDefinition |= ExportedFunctions[GO.getName()].Linkage == CFL_Definition; + IsExported = true; + } + + auto *GTM = + GlobalTypeMember::create(Alloc, &GO, IsDefinition, IsExported, Types); for (MDNode *Type : Types) { verifyTypeMDNode(&GO, Type); auto &Info = TypeIdInfo[cast<MDNode>(Type)->getOperand(1)]; @@ -1262,33 +1554,56 @@ bool LowerTypeTestsModule::lower() { } } - for (const Use &U : TypeTestFunc->uses()) { - auto CI = cast<CallInst>(U.getUser()); + auto AddTypeIdUse = [&](Metadata *TypeId) -> TypeIdUserInfo & { + // Add the call site to the list of call sites for this type identifier. We + // also use TypeIdUsers to keep track of whether we have seen this type + // identifier before. If we have, we don't need to re-add the referenced + // globals to the equivalence class. + auto Ins = TypeIdUsers.insert({TypeId, {}}); + if (Ins.second) { + // Add the type identifier to the equivalence class. + GlobalClassesTy::iterator GCI = GlobalClasses.insert(TypeId); + GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI); + + // Add the referenced globals to the type identifier's equivalence class. 
+ for (GlobalTypeMember *GTM : TypeIdInfo[TypeId].RefGlobals) + CurSet = GlobalClasses.unionSets( + CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM))); + } + + return Ins.first->second; + }; - auto BitSetMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1)); - if (!BitSetMDVal) - report_fatal_error("Second argument of llvm.type.test must be metadata"); - auto BitSet = BitSetMDVal->getMetadata(); + if (TypeTestFunc) { + for (const Use &U : TypeTestFunc->uses()) { + auto CI = cast<CallInst>(U.getUser()); - // Add the call site to the list of call sites for this type identifier. We - // also use TypeTestCallSites to keep track of whether we have seen this - // type identifier before. If we have, we don't need to re-add the - // referenced globals to the equivalence class. - std::pair<DenseMap<Metadata *, std::vector<CallInst *>>::iterator, bool> - Ins = TypeTestCallSites.insert( - std::make_pair(BitSet, std::vector<CallInst *>())); - Ins.first->second.push_back(CI); - if (!Ins.second) - continue; + auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1)); + if (!TypeIdMDVal) + report_fatal_error("Second argument of llvm.type.test must be metadata"); + auto TypeId = TypeIdMDVal->getMetadata(); + AddTypeIdUse(TypeId).CallSites.push_back(CI); + } + } - // Add the type identifier to the equivalence class. - GlobalClassesTy::iterator GCI = GlobalClasses.insert(BitSet); - GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI); + if (ExportSummary) { + DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID; + for (auto &P : TypeIdInfo) { + if (auto *TypeId = dyn_cast<MDString>(P.first)) + MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back( + TypeId); + } - // Add the referenced globals to the type identifier's equivalence class. - for (GlobalTypeMember *GTM : TypeIdInfo[BitSet].RefGlobals) - CurSet = GlobalClasses.unionSets( - CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM))); + for (auto &P : *ExportSummary) { + for (auto &S : P.second.SummaryList) { + auto *FS = dyn_cast<FunctionSummary>(S.get()); + if (!FS || !ExportSummary->isGlobalValueLive(FS)) + continue; + for (GlobalValue::GUID G : FS->type_tests()) + for (Metadata *MD : MetadataByGUID[G]) + AddTypeIdUse(MD).IsExported = true; + } + } } if (GlobalClasses.empty()) @@ -1349,8 +1664,9 @@ bool LowerTypeTestsModule::lower() { PreservedAnalyses LowerTypeTestsPass::run(Module &M, ModuleAnalysisManager &AM) { - bool Changed = - LowerTypeTestsModule(M, SummaryAction::None, /*Summary=*/nullptr).lower(); + bool Changed = LowerTypeTestsModule(M, /*ExportSummary=*/nullptr, + /*ImportSummary=*/nullptr) + .lower(); if (!Changed) return PreservedAnalyses::all(); return PreservedAnalyses::none(); diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp index e0bb0eb..0e478ba 100644 --- a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -96,8 +96,10 @@ #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/ValueHandle.h" @@ -127,6 +129,26 @@ static cl::opt<unsigned> NumFunctionsForSanityCheck( "'0' disables this check. 
Works only with '-debug' key."), cl::init(0), cl::Hidden); +// Under option -mergefunc-preserve-debug-info we: +// - Do not create a new function for a thunk. +// - Retain the debug info for a thunk's parameters (and associated +// instructions for the debug info) from the entry block. +// Note: -debug will display the algorithm at work. +// - Create debug-info for the call (to the shared implementation) made by +// a thunk and its return value. +// - Erase the rest of the function, retaining the (minimally sized) entry +// block to create a thunk. +// - Preserve a thunk's call site to point to the thunk even when both occur +// within the same translation unit, to aid debugability. Note that this +// behaviour differs from the underlying -mergefunc implementation which +// modifies the thunk's call site to point to the shared implementation +// when both occur within the same translation unit. +static cl::opt<bool> + MergeFunctionsPDI("mergefunc-preserve-debug-info", cl::Hidden, + cl::init(false), + cl::desc("Preserve debug info in thunk when mergefunc " + "transformations are made.")); + namespace { class FunctionNode { @@ -185,11 +207,13 @@ private: /// A work queue of functions that may have been modified and should be /// analyzed again. - std::vector<WeakVH> Deferred; + std::vector<WeakTrackingVH> Deferred; /// Checks the rules of order relation introduced among functions set. /// Returns true, if sanity check has been passed, and false if failed. - bool doSanityCheck(std::vector<WeakVH> &Worklist); +#ifndef NDEBUG + bool doSanityCheck(std::vector<WeakTrackingVH> &Worklist); +#endif /// Insert a ComparableFunction into the FnTree, or merge it away if it's /// equal to one that's already present. @@ -215,8 +239,21 @@ private: /// Replace G with a thunk or an alias to F. Deletes G. void writeThunkOrAlias(Function *F, Function *G); - /// Replace G with a simple tail call to bitcast(F). Also replace direct uses - /// of G with bitcast(F). Deletes G. + /// Fill PDIUnrelatedWL with instructions from the entry block that are + /// unrelated to parameter related debug info. + void filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock, + std::vector<Instruction *> &PDIUnrelatedWL); + + /// Erase the rest of the CFG (i.e. barring the entry block). + void eraseTail(Function *G); + + /// Erase the instructions in PDIUnrelatedWL as they are unrelated to the + /// parameter debug info, from the entry block. + void eraseInstsUnrelatedToPDI(std::vector<Instruction *> &PDIUnrelatedWL); + + /// Replace G with a simple tail call to bitcast(F). Also (unless + /// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F), + /// delete G. void writeThunk(Function *F, Function *G); /// Replace G with an alias to F. Deletes G. 
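To make the thunk machinery declared above concrete, here is a source-level analogy (hypothetical sumA/sumB functions, not LLVM API): after merging, one copy keeps the body and the other is reduced to a forwarding thunk, a single call plus a return of its result, which is the shape writeThunk later builds in IR with CreateCall and CreateRet.

#include <vector>

// Hypothetical pair of functions with identical bodies.
static int sumA(const std::vector<int> &V) {
  int S = 0;
  for (int X : V)
    S += X;
  return S;
}

// After merging, the second copy keeps only a thunk: one (tail) call to the
// surviving implementation and a return of its result.
static int sumB(const std::vector<int> &V) {
  return sumA(V);
}

int main() {
  std::vector<int> V{1, 2, 3};
  return sumA(V) == sumB(V) ? 0 : 1; // both entry points still agree
}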
@@ -248,7 +285,8 @@ ModulePass *llvm::createMergeFunctionsPass() { return new MergeFunctions(); } -bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { +#ifndef NDEBUG +bool MergeFunctions::doSanityCheck(std::vector<WeakTrackingVH> &Worklist) { if (const unsigned Max = NumFunctionsForSanityCheck) { unsigned TripleNumber = 0; bool Valid = true; @@ -256,10 +294,12 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { dbgs() << "MERGEFUNC-SANITY: Started for first " << Max << " functions.\n"; unsigned i = 0; - for (std::vector<WeakVH>::iterator I = Worklist.begin(), E = Worklist.end(); + for (std::vector<WeakTrackingVH>::iterator I = Worklist.begin(), + E = Worklist.end(); I != E && i < Max; ++I, ++i) { unsigned j = i; - for (std::vector<WeakVH>::iterator J = I; J != E && j < Max; ++J, ++j) { + for (std::vector<WeakTrackingVH>::iterator J = I; J != E && j < Max; + ++J, ++j) { Function *F1 = cast<Function>(*I); Function *F2 = cast<Function>(*J); int Res1 = FunctionComparator(F1, F2, &GlobalNumbers).compare(); @@ -269,8 +309,7 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { if (Res1 != -Res2) { dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber << "\n"; - F1->dump(); - F2->dump(); + dbgs() << *F1 << '\n' << *F2 << '\n'; Valid = false; } @@ -278,7 +317,7 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { continue; unsigned k = j; - for (std::vector<WeakVH>::iterator K = J; K != E && k < Max; + for (std::vector<WeakTrackingVH>::iterator K = J; K != E && k < Max; ++k, ++K, ++TripleNumber) { if (K == J) continue; @@ -305,9 +344,7 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { << TripleNumber << "\n"; dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", " << Res4 << "\n"; - F1->dump(); - F2->dump(); - F3->dump(); + dbgs() << *F1 << '\n' << *F2 << '\n' << *F3 << '\n'; Valid = false; } } @@ -319,6 +356,7 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { } return true; } +#endif bool MergeFunctions::runOnModule(Module &M) { if (skipModule(M)) @@ -349,12 +387,12 @@ bool MergeFunctions::runOnModule(Module &M) { // consider merging it. Otherwise it is dropped and never considered again. if ((I != S && std::prev(I)->first == I->first) || (std::next(I) != IE && std::next(I)->first == I->first) ) { - Deferred.push_back(WeakVH(I->second)); + Deferred.push_back(WeakTrackingVH(I->second)); } } do { - std::vector<WeakVH> Worklist; + std::vector<WeakTrackingVH> Worklist; Deferred.swap(Worklist); DEBUG(doSanityCheck(Worklist)); @@ -363,7 +401,7 @@ bool MergeFunctions::runOnModule(Module &M) { DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n'); // Insert functions and merge them. - for (WeakVH &I : Worklist) { + for (WeakTrackingVH &I : Worklist) { if (!I) continue; Function *F = cast<Function>(I); @@ -400,19 +438,15 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { // Transferring other attributes may help other optimizations, but that // should be done uniformly and not in this ad-hoc way. 
auto &Context = New->getContext(); - auto NewFuncAttrs = New->getAttributes(); - auto CallSiteAttrs = CS.getAttributes(); - - CallSiteAttrs = CallSiteAttrs.addAttributes( - Context, AttributeSet::ReturnIndex, NewFuncAttrs.getRetAttributes()); - - for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++) { - AttributeSet Attrs = NewFuncAttrs.getParamAttributes(argIdx); - if (Attrs.getNumSlots()) - CallSiteAttrs = CallSiteAttrs.addAttributes(Context, argIdx, Attrs); - } - - CS.setAttributes(CallSiteAttrs); + auto NewPAL = New->getAttributes(); + SmallVector<AttributeSet, 4> NewArgAttrs; + for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++) + NewArgAttrs.push_back(NewPAL.getParamAttributes(argIdx)); + // Don't transfer attributes from the function to the callee. Function + // attributes typically aren't relevant to the calling convention or ABI. + CS.setAttributes(AttributeList::get(Context, /*FnAttrs=*/AttributeSet(), + NewPAL.getRetAttributes(), + NewArgAttrs)); remove(CS.getInstruction()->getParent()->getParent()); U->set(BitcastNew); @@ -461,51 +495,242 @@ static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) { return Builder.CreateBitCast(V, DestTy); } -// Replace G with a simple tail call to bitcast(F). Also replace direct uses -// of G with bitcast(F). Deletes G. +// Erase the instructions in PDIUnrelatedWL as they are unrelated to the +// parameter debug info, from the entry block. +void MergeFunctions::eraseInstsUnrelatedToPDI( + std::vector<Instruction *> &PDIUnrelatedWL) { + + DEBUG(dbgs() << " Erasing instructions (in reverse order of appearance in " + "entry block) unrelated to parameter debug info from entry " + "block: {\n"); + while (!PDIUnrelatedWL.empty()) { + Instruction *I = PDIUnrelatedWL.back(); + DEBUG(dbgs() << " Deleting Instruction: "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << "\n"); + I->eraseFromParent(); + PDIUnrelatedWL.pop_back(); + } + DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter " + "debug info from entry block. \n"); +} + +// Reduce G to its entry block. +void MergeFunctions::eraseTail(Function *G) { + + std::vector<BasicBlock *> WorklistBB; + for (Function::iterator BBI = std::next(G->begin()), BBE = G->end(); + BBI != BBE; ++BBI) { + BBI->dropAllReferences(); + WorklistBB.push_back(&*BBI); + } + while (!WorklistBB.empty()) { + BasicBlock *BB = WorklistBB.back(); + BB->eraseFromParent(); + WorklistBB.pop_back(); + } +} + +// We are interested in the following instructions from the entry block as being +// related to parameter debug info: +// - @llvm.dbg.declare +// - stores from the incoming parameters to locations on the stack-frame +// - allocas that create these locations on the stack-frame +// - @llvm.dbg.value +// - the entry block's terminator +// The rest are unrelated to debug info for the parameters; fill up +// PDIUnrelatedWL with such instructions. 
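// As a sketch of the filtering (hypothetical IR, not from a test case), for a
// function taking an i32 %x the entry block might be split as follows:
//   %x.addr = alloca i32                                       ; kept: the parameter's stack slot
//   call void @llvm.dbg.declare(metadata i32* %x.addr, ...)    ; kept: parameter debug info
//   store i32 %x, i32* %x.addr                                 ; kept: spill of the incoming parameter
//   %t = mul i32 %x, 7                                         ; filtered into PDIUnrelatedWL
//   br label %cont                                             ; kept: the entry block terminator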
+void MergeFunctions::filterInstsUnrelatedToPDI( + BasicBlock *GEntryBlock, std::vector<Instruction *> &PDIUnrelatedWL) { + + std::set<Instruction *> PDIRelated; + for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end(); + BI != BIE; ++BI) { + if (auto *DVI = dyn_cast<DbgValueInst>(&*BI)) { + DEBUG(dbgs() << " Deciding: "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + DILocalVariable *DILocVar = DVI->getVariable(); + if (DILocVar->isParameter()) { + DEBUG(dbgs() << " Include (parameter): "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + PDIRelated.insert(&*BI); + } else { + DEBUG(dbgs() << " Delete (!parameter): "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + } else if (auto *DDI = dyn_cast<DbgDeclareInst>(&*BI)) { + DEBUG(dbgs() << " Deciding: "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + DILocalVariable *DILocVar = DDI->getVariable(); + if (DILocVar->isParameter()) { + DEBUG(dbgs() << " Parameter: "); + DEBUG(DILocVar->print(dbgs())); + AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress()); + if (AI) { + DEBUG(dbgs() << " Processing alloca users: "); + DEBUG(dbgs() << "\n"); + for (User *U : AI->users()) { + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (Value *Arg = SI->getValueOperand()) { + if (dyn_cast<Argument>(Arg)) { + DEBUG(dbgs() << " Include: "); + DEBUG(AI->print(dbgs())); + DEBUG(dbgs() << "\n"); + PDIRelated.insert(AI); + DEBUG(dbgs() << " Include (parameter): "); + DEBUG(SI->print(dbgs())); + DEBUG(dbgs() << "\n"); + PDIRelated.insert(SI); + DEBUG(dbgs() << " Include: "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + PDIRelated.insert(&*BI); + } else { + DEBUG(dbgs() << " Delete (!parameter): "); + DEBUG(SI->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + } + } else { + DEBUG(dbgs() << " Defer: "); + DEBUG(U->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + } + } else { + DEBUG(dbgs() << " Delete (alloca NULL): "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + } else { + DEBUG(dbgs() << " Delete (!parameter): "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + } else if (dyn_cast<TerminatorInst>(BI) == GEntryBlock->getTerminator()) { + DEBUG(dbgs() << " Will Include Terminator: "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + PDIRelated.insert(&*BI); + } else { + DEBUG(dbgs() << " Defer: "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + } + DEBUG(dbgs() + << " Report parameter debug info related/related instructions: {\n"); + for (BasicBlock::iterator BI = GEntryBlock->begin(), BE = GEntryBlock->end(); + BI != BE; ++BI) { + + Instruction *I = &*BI; + if (PDIRelated.find(I) == PDIRelated.end()) { + DEBUG(dbgs() << " !PDIRelated: "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << "\n"); + PDIUnrelatedWL.push_back(I); + } else { + DEBUG(dbgs() << " PDIRelated: "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + } + DEBUG(dbgs() << " }\n"); +} + +// Replace G with a simple tail call to bitcast(F). Also (unless +// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F), +// delete G. Under MergeFunctionsPDI, we use G itself for creating +// the thunk as we preserve the debug info (and associated instructions) +// from G's entry block pertaining to G's incoming arguments which are +// passed on as corresponding arguments in the call that G makes to F. +// For better debugability, under MergeFunctionsPDI, we do not modify G's +// call sites to point to F even when within the same translation unit. 
void MergeFunctions::writeThunk(Function *F, Function *G) { - if (!G->isInterposable()) { - // Redirect direct callers of G to F. + if (!G->isInterposable() && !MergeFunctionsPDI) { + // Redirect direct callers of G to F. (See note on MergeFunctionsPDI + // above). replaceDirectCallers(G, F); } // If G was internal then we may have replaced all uses of G with F. If so, - // stop here and delete G. There's no need for a thunk. - if (G->hasLocalLinkage() && G->use_empty()) { + // stop here and delete G. There's no need for a thunk. (See note on + // MergeFunctionsPDI above). + if (G->hasLocalLinkage() && G->use_empty() && !MergeFunctionsPDI) { G->eraseFromParent(); return; } - Function *NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "", - G->getParent()); - BasicBlock *BB = BasicBlock::Create(F->getContext(), "", NewG); - IRBuilder<> Builder(BB); + BasicBlock *GEntryBlock = nullptr; + std::vector<Instruction *> PDIUnrelatedWL; + BasicBlock *BB = nullptr; + Function *NewG = nullptr; + if (MergeFunctionsPDI) { + DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) Do not create a new " + "function as thunk; retain original: " + << G->getName() << "()\n"); + GEntryBlock = &G->getEntryBlock(); + DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related " + "debug info for " + << G->getName() << "() {\n"); + filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL); + GEntryBlock->getTerminator()->eraseFromParent(); + BB = GEntryBlock; + } else { + NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "", + G->getParent()); + BB = BasicBlock::Create(F->getContext(), "", NewG); + } + IRBuilder<> Builder(BB); + Function *H = MergeFunctionsPDI ? G : NewG; SmallVector<Value *, 16> Args; unsigned i = 0; FunctionType *FFTy = F->getFunctionType(); - for (Argument & AI : NewG->args()) { + for (Argument & AI : H->args()) { Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i))); ++i; } CallInst *CI = Builder.CreateCall(F, Args); + ReturnInst *RI = nullptr; CI->setTailCall(); CI->setCallingConv(F->getCallingConv()); CI->setAttributes(F->getAttributes()); - if (NewG->getReturnType()->isVoidTy()) { - Builder.CreateRetVoid(); + if (H->getReturnType()->isVoidTy()) { + RI = Builder.CreateRetVoid(); } else { - Builder.CreateRet(createCast(Builder, CI, NewG->getReturnType())); + RI = Builder.CreateRet(createCast(Builder, CI, H->getReturnType())); } - NewG->copyAttributesFrom(G); - NewG->takeName(G); - removeUsers(G); - G->replaceAllUsesWith(NewG); - G->eraseFromParent(); + if (MergeFunctionsPDI) { + DISubprogram *DIS = G->getSubprogram(); + if (DIS) { + DebugLoc CIDbgLoc = DebugLoc::get(DIS->getScopeLine(), 0, DIS); + DebugLoc RIDbgLoc = DebugLoc::get(DIS->getScopeLine(), 0, DIS); + CI->setDebugLoc(CIDbgLoc); + RI->setDebugLoc(RIDbgLoc); + } else { + DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) No DISubprogram for " + << G->getName() << "()\n"); + } + eraseTail(G); + eraseInstsUnrelatedToPDI(PDIUnrelatedWL); + DEBUG(dbgs() << "} // End of parameter related debug info filtering for: " + << G->getName() << "()\n"); + } else { + NewG->copyAttributesFrom(G); + NewG->takeName(G); + removeUsers(G); + G->replaceAllUsesWith(NewG); + G->eraseFromParent(); + } - DEBUG(dbgs() << "writeThunk: " << NewG->getName() << '\n'); + DEBUG(dbgs() << "writeThunk: " << H->getName() << '\n'); ++NumThunksWritten; } diff --git a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp index 7ef3fc1..8840435 100644 --- 
a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -16,8 +16,15 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -27,19 +34,177 @@ #include "llvm/Transforms/Utils/CodeExtractor.h" using namespace llvm; -#define DEBUG_TYPE "partialinlining" - -STATISTIC(NumPartialInlined, "Number of functions partially inlined"); +#define DEBUG_TYPE "partial-inlining" + +STATISTIC(NumPartialInlined, + "Number of callsites functions partially inlined into."); + +// Command line option to disable partial-inlining. The default is false: +static cl::opt<bool> + DisablePartialInlining("disable-partial-inlining", cl::init(false), + cl::Hidden, cl::desc("Disable partial ininling")); +// This is an option used by testing: +static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis", + cl::init(false), cl::ZeroOrMore, + cl::ReallyHidden, + cl::desc("Skip Cost Analysis")); + +static cl::opt<unsigned> MaxNumInlineBlocks( + "max-num-inline-blocks", cl::init(5), cl::Hidden, + cl::desc("Max Number of Blocks To be Partially Inlined")); + +// Command line option to set the maximum number of partial inlining allowed +// for the module. The default value of -1 means no limit. +static cl::opt<int> MaxNumPartialInlining( + "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, + cl::desc("Max number of partial inlining. The default is unlimited")); + +// Used only when PGO or user annotated branch data is absent. It is +// the least value that is used to weigh the outline region. If BFI +// produces larger value, the BFI value will be used. +static cl::opt<int> + OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), + cl::Hidden, cl::ZeroOrMore, + cl::desc("Relative frequency of outline region to " + "the entry block")); + +static cl::opt<unsigned> ExtraOutliningPenalty( + "partial-inlining-extra-penalty", cl::init(0), cl::Hidden, + cl::desc("A debug option to add additional penalty to the computed one.")); namespace { + +struct FunctionOutliningInfo { + FunctionOutliningInfo() + : Entries(), ReturnBlock(nullptr), NonReturnBlock(nullptr), + ReturnBlockPreds() {} + // Returns the number of blocks to be inlined including all blocks + // in Entries and one return block. + unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; } + + // A set of blocks including the function entry that guard + // the region to be outlined. + SmallVector<BasicBlock *, 4> Entries; + // The return block that is not included in the outlined region. + BasicBlock *ReturnBlock; + // The dominating block of the region to be outlined. 
+ BasicBlock *NonReturnBlock; + // The set of blocks in Entries that are predecessors to ReturnBlock + SmallVector<BasicBlock *, 4> ReturnBlockPreds; +}; + struct PartialInlinerImpl { - PartialInlinerImpl(InlineFunctionInfo IFI) : IFI(IFI) {} + PartialInlinerImpl( + std::function<AssumptionCache &(Function &)> *GetAC, + std::function<TargetTransformInfo &(Function &)> *GTTI, + Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI, + ProfileSummaryInfo *ProfSI) + : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {} bool run(Module &M); Function *unswitchFunction(Function *F); + // This class speculatively clones the function to be partially inlined. + // At the end of partial inlining, the remaining callsites to the cloned + // function that are not partially inlined will be fixed up to reference + // the original function, and the cloned function will be erased. + struct FunctionCloner { + FunctionCloner(Function *F, FunctionOutliningInfo *OI); + ~FunctionCloner(); + + // Prepare for function outlining: making sure there is only + // one incoming edge from the extracted/outlined region to + // the return block. + void NormalizeReturnBlock(); + + // Do function outlining: + Function *doFunctionOutlining(); + + Function *OrigFunc = nullptr; + Function *ClonedFunc = nullptr; + Function *OutlinedFunc = nullptr; + BasicBlock *OutliningCallBB = nullptr; + // ClonedFunc is inlined in one of its callers after function + // outlining. + bool IsFunctionInlined = false; + // The cost of the region to be outlined. + int OutlinedRegionCost = 0; + std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr; + std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr; + }; + private: - InlineFunctionInfo IFI; + int NumPartialInlining = 0; + std::function<AssumptionCache &(Function &)> *GetAssumptionCache; + std::function<TargetTransformInfo &(Function &)> *GetTTI; + Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI; + ProfileSummaryInfo *PSI; + + // Return the frequency of the OutliningCallBB relative to F's entry point. + // The result is no larger than 1 and is represented as a BranchProbability. + // (Note that the outlined region's 'head' block can only have incoming + // edges from the guarding entry blocks). + BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner); + + // Return true if the callee of CS should be partially inlined with + // profit. + bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner, + BlockFrequency WeightedOutliningRcost, + OptimizationRemarkEmitter &ORE); + + // Try to inline DuplicateFunction (cloned from F with a call to + // the OutlinedFunction) into its callers. Return true + // if there is any successful inlining. + bool tryPartialInline(FunctionCloner &Cloner); + + // Compute the mapping from use site of DuplicateFunction to the enclosing + // BB's profile count.
+ void computeCallsiteToProfCountMap(Function *DuplicateFunction, + DenseMap<User *, uint64_t> &SiteCountMap); + + bool IsLimitReached() { + return (MaxNumPartialInlining != -1 && + NumPartialInlining >= MaxNumPartialInlining); + } + + static CallSite getCallSite(User *U) { + CallSite CS; + if (CallInst *CI = dyn_cast<CallInst>(U)) + CS = CallSite(CI); + else if (InvokeInst *II = dyn_cast<InvokeInst>(U)) + CS = CallSite(II); + else + llvm_unreachable("All uses must be calls"); + return CS; + } + + static CallSite getOneCallSiteTo(Function *F) { + User *User = *F->user_begin(); + return getCallSite(User); + } + + std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) { + CallSite CS = getOneCallSiteTo(F); + DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); + BasicBlock *Block = CS.getParent(); + return std::make_tuple(DLoc, Block); + } + + // Returns the costs associated with function outlining: + // - The first value is the non-weighted runtime cost for making the call + // to the outlined function, including the addtional setup cost in the + // outlined function itself; + // - The second value is the estimated size of the new call sequence in + // basic block Cloner.OutliningCallBB; + std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner); + // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to + // approximate both the size and runtime cost (Note that in the current + // inline cost analysis, there is no clear distinction there either). + static int computeBBInlineCost(BasicBlock *BB); + + std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F); + }; + struct PartialInlinerLegacyPass : public ModulePass { static char ID; // Pass identification, replacement for typeid PartialInlinerLegacyPass() : ModulePass(ID) { @@ -48,124 +213,713 @@ struct PartialInlinerLegacyPass : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<ProfileSummaryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } bool runOnModule(Module &M) override { if (skipModule(M)) return false; AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); + TargetTransformInfoWrapperPass *TTIWP = + &getAnalysis<TargetTransformInfoWrapperPass>(); + ProfileSummaryInfo *PSI = + getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; - InlineFunctionInfo IFI(nullptr, &GetAssumptionCache); - return PartialInlinerImpl(IFI).run(M); + + std::function<TargetTransformInfo &(Function &)> GetTTI = + [&TTIWP](Function &F) -> TargetTransformInfo & { + return TTIWP->getTTI(F); + }; + + return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M); } }; } -Function *PartialInlinerImpl::unswitchFunction(Function *F) { - // First, verify that this function is an unswitching candidate... 
+std::unique_ptr<FunctionOutliningInfo> +PartialInlinerImpl::computeOutliningInfo(Function *F) { BasicBlock *EntryBlock = &F->front(); BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator()); if (!BR || BR->isUnconditional()) - return nullptr; + return std::unique_ptr<FunctionOutliningInfo>(); + + // Returns true if Succ is BB's successor + auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) { + return is_contained(successors(BB), Succ); + }; + + auto SuccSize = [](BasicBlock *BB) { + return std::distance(succ_begin(BB), succ_end(BB)); + }; + + auto IsReturnBlock = [](BasicBlock *BB) { + TerminatorInst *TI = BB->getTerminator(); + return isa<ReturnInst>(TI); + }; + + auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) { + if (IsReturnBlock(Succ1)) + return std::make_tuple(Succ1, Succ2); + if (IsReturnBlock(Succ2)) + return std::make_tuple(Succ2, Succ1); + + return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); + }; + + // Detect a triangular shape: + auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) { + if (IsSuccessor(Succ1, Succ2)) + return std::make_tuple(Succ1, Succ2); + if (IsSuccessor(Succ2, Succ1)) + return std::make_tuple(Succ2, Succ1); + + return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); + }; + + std::unique_ptr<FunctionOutliningInfo> OutliningInfo = + llvm::make_unique<FunctionOutliningInfo>(); + + BasicBlock *CurrEntry = EntryBlock; + bool CandidateFound = false; + do { + // The number of blocks to be inlined has already reached + // the limit. When MaxNumInlineBlocks is set to 0 or 1, this + // disables partial inlining for the function. + if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks) + break; + + if (SuccSize(CurrEntry) != 2) + break; + + BasicBlock *Succ1 = *succ_begin(CurrEntry); + BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1); + + BasicBlock *ReturnBlock, *NonReturnBlock; + std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); + + if (ReturnBlock) { + OutliningInfo->Entries.push_back(CurrEntry); + OutliningInfo->ReturnBlock = ReturnBlock; + OutliningInfo->NonReturnBlock = NonReturnBlock; + CandidateFound = true; + break; + } + + BasicBlock *CommSucc; + BasicBlock *OtherSucc; + std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2); + + if (!CommSucc) + break; - BasicBlock *ReturnBlock = nullptr; - BasicBlock *NonReturnBlock = nullptr; - unsigned ReturnCount = 0; - for (BasicBlock *BB : successors(EntryBlock)) { - if (isa<ReturnInst>(BB->getTerminator())) { - ReturnBlock = BB; - ReturnCount++; - } else - NonReturnBlock = BB; + OutliningInfo->Entries.push_back(CurrEntry); + CurrEntry = OtherSucc; + + } while (true); + + if (!CandidateFound) + return std::unique_ptr<FunctionOutliningInfo>(); + + // Do sanity check of the entries: threre should not + // be any successors (not in the entry set) other than + // {ReturnBlock, NonReturnBlock} + assert(OutliningInfo->Entries[0] == &F->front() && + "Function Entry must be the first in Entries vector"); + DenseSet<BasicBlock *> Entries; + for (BasicBlock *E : OutliningInfo->Entries) + Entries.insert(E); + + // Returns true of BB has Predecessor which is not + // in Entries set. 
+ auto HasNonEntryPred = [Entries](BasicBlock *BB) { + for (auto Pred : predecessors(BB)) { + if (!Entries.count(Pred)) + return true; + } + return false; + }; + auto CheckAndNormalizeCandidate = + [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) { + for (BasicBlock *E : OutliningInfo->Entries) { + for (auto Succ : successors(E)) { + if (Entries.count(Succ)) + continue; + if (Succ == OutliningInfo->ReturnBlock) + OutliningInfo->ReturnBlockPreds.push_back(E); + else if (Succ != OutliningInfo->NonReturnBlock) + return false; + } + // There should not be any outside incoming edges either: + if (HasNonEntryPred(E)) + return false; + } + return true; + }; + + if (!CheckAndNormalizeCandidate(OutliningInfo.get())) + return std::unique_ptr<FunctionOutliningInfo>(); + + // Now further growing the candidate's inlining region by + // peeling off dominating blocks from the outlining region: + while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) { + BasicBlock *Cand = OutliningInfo->NonReturnBlock; + if (SuccSize(Cand) != 2) + break; + + if (HasNonEntryPred(Cand)) + break; + + BasicBlock *Succ1 = *succ_begin(Cand); + BasicBlock *Succ2 = *(succ_begin(Cand) + 1); + + BasicBlock *ReturnBlock, *NonReturnBlock; + std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); + if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock) + break; + + if (NonReturnBlock->getSinglePredecessor() != Cand) + break; + + // Now grow and update OutlininigInfo: + OutliningInfo->Entries.push_back(Cand); + OutliningInfo->NonReturnBlock = NonReturnBlock; + OutliningInfo->ReturnBlockPreds.push_back(Cand); + Entries.insert(Cand); } - if (ReturnCount != 1) - return nullptr; + return OutliningInfo; +} + +// Check if there is PGO data or user annoated branch data: +static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { + if (F->getEntryCount()) + return true; + // Now check if any of the entry block has MD_prof data: + for (auto *E : OI->Entries) { + BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator()); + if (!BR || BR->isUnconditional()) + continue; + uint64_t T, F; + if (BR->extractProfMetadata(T, F)) + return true; + } + return false; +} + +BranchProbability +PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) { + + auto EntryFreq = + Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock()); + auto OutliningCallFreq = + Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB); + + auto OutlineRegionRelFreq = + BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(), + EntryFreq.getFrequency()); + + if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get())) + return OutlineRegionRelFreq; + + // When profile data is not available, we need to be conservative in + // estimating the overall savings. Static branch prediction can usually + // guess the branch direction right (taken/non-taken), but the guessed + // branch probability is usually not biased enough. In case when the + // outlined region is predicted to be likely, its probability needs + // to be made higher (more biased) to not under-estimate the cost of + // function outlining. On the other hand, if the outlined region + // is predicted to be less likely, the predicted probablity is usually + // higher than the actual. For instance, the actual probability of the + // less likely target is only 5%, but the guessed probablity can be + // 40%. In the latter case, there is no need for further adjustement. + // FIXME: add an option for this. 
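// As an illustration with made-up numbers: a statically guessed relative
// frequency of 40/100 is below the 45/100 cut-off and is returned unchanged,
// while a guess of 60/100 is raised to at least OutlineRegionFreqPercent
// (75/100 by default), making the outlined call look costlier and the
// partial-inlining decision more conservative. Real profile data, handled
// above, bypasses this adjustment entirely.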
+ if (OutlineRegionRelFreq < BranchProbability(45, 100)) + return OutlineRegionRelFreq; + + OutlineRegionRelFreq = std::max( + OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); + + return OutlineRegionRelFreq; +} + +bool PartialInlinerImpl::shouldPartialInline( + CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost, + OptimizationRemarkEmitter &ORE) { + + using namespace ore; + if (SkipCostAnalysis) + return true; + + Instruction *Call = CS.getInstruction(); + Function *Callee = CS.getCalledFunction(); + assert(Callee == Cloner.ClonedFunc); + + Function *Caller = CS.getCaller(); + auto &CalleeTTI = (*GetTTI)(*Callee); + InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI, + *GetAssumptionCache, GetBFI, PSI); + + if (IC.isAlways()) { + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call) + << NV("Callee", Cloner.OrigFunc) + << " should always be fully inlined, not partially"); + return false; + } + + if (IC.isNever()) { + ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) + << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " + << NV("Caller", Caller) + << " because it should never be inlined (cost=never)"); + return false; + } + + if (!IC) { + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) + << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " + << NV("Caller", Caller) << " because too costly to inline (cost=" + << NV("Cost", IC.getCost()) << ", threshold=" + << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); + return false; + } + const DataLayout &DL = Caller->getParent()->getDataLayout(); + + // The savings of eliminating the call: + int NonWeightedSavings = getCallsiteCost(CS, DL); + BlockFrequency NormWeightedSavings(NonWeightedSavings); + + // Weighted saving is smaller than weighted cost, return false + if (NormWeightedSavings < WeightedOutliningRcost) { + ORE.emit( + OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call) + << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " + << NV("Caller", Caller) << " runtime overhead (overhead=" + << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency()) + << ", savings=" + << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")" + << " of making the outlined call is too high"); + + return false; + } + + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call) + << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into " + << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) + << " (threshold=" + << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); + return true; +} + +// TODO: Ideally we should share Inliner's InlineCost Analysis code. +// For now use a simplified version. The returned 'InlineCost' will be used +// to esimate the size cost as well as runtime cost of the BB. 
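// A worked example of the simplified model (illustrative only): a block with
// one two-argument call, three ordinary arithmetic instructions, an alloca
// and a dbg.value would be charged getCallsiteCost(...) for the call plus
// 3 * InlineConstants::InstrCost; the alloca and the debug intrinsic are
// treated as free, as are bitcasts, int/ptr casts, all-zero-index GEPs and
// lifetime markers.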
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { + int InlineCost = 0; + const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + if (isa<DbgInfoIntrinsic>(I)) + continue; + + switch (I->getOpcode()) { + case Instruction::BitCast: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::Alloca: + continue; + case Instruction::GetElementPtr: + if (cast<GetElementPtrInst>(I)->hasAllZeroIndices()) + continue; + default: + break; + } + + IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I); + if (IntrInst) { + if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start || + IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) + continue; + } + + if (CallInst *CI = dyn_cast<CallInst>(I)) { + InlineCost += getCallsiteCost(CallSite(CI), DL); + continue; + } + + if (InvokeInst *II = dyn_cast<InvokeInst>(I)) { + InlineCost += getCallsiteCost(CallSite(II), DL); + continue; + } + + if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { + InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost; + continue; + } + InlineCost += InlineConstants::InstrCost; + } + return InlineCost; +} + +std::tuple<int, int> +PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) { + + // Now compute the cost of the call sequence to the outlined function + // 'OutlinedFunction' in BB 'OutliningCallBB': + int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB); + + // Now compute the cost of the extracted/outlined function itself: + int OutlinedFunctionCost = 0; + for (BasicBlock &BB : *Cloner.OutlinedFunc) { + OutlinedFunctionCost += computeBBInlineCost(&BB); + } + + assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && + "Outlined function cost should be no less than the outlined region"); + // The code extractor introduces a new root and exit stub blocks with + // additional unconditional branches. Those branches will be eliminated + // later with bb layout. The cost should be adjusted accordingly: + OutlinedFunctionCost -= 2 * InlineConstants::InstrCost; + + int OutliningRuntimeOverhead = + OutliningFuncCallCost + + (OutlinedFunctionCost - Cloner.OutlinedRegionCost) + + ExtraOutliningPenalty; + + return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead); +} + +// Create the callsite to profile count map which is +// used to update the original function's entry count, +// after the function is partially inlined into the callsite. 
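// Example with made-up counts: if the callee's entry count is 1000 and call
// sites whose enclosing blocks have profile counts 400 and 250 are partially
// inlined, tryPartialInline subtracts those counts and rewrites the original
// function's entry count to 350, keeping the hotness of the remaining
// out-of-line copy realistic.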
+void PartialInlinerImpl::computeCallsiteToProfCountMap( + Function *DuplicateFunction, + DenseMap<User *, uint64_t> &CallSiteToProfCountMap) { + std::vector<User *> Users(DuplicateFunction->user_begin(), + DuplicateFunction->user_end()); + Function *CurrentCaller = nullptr; + std::unique_ptr<BlockFrequencyInfo> TempBFI; + BlockFrequencyInfo *CurrentCallerBFI = nullptr; + + auto ComputeCurrBFI = [&,this](Function *Caller) { + // For the old pass manager: + if (!GetBFI) { + DominatorTree DT(*Caller); + LoopInfo LI(DT); + BranchProbabilityInfo BPI(*Caller, LI); + TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); + CurrentCallerBFI = TempBFI.get(); + } else { + // New pass manager: + CurrentCallerBFI = &(*GetBFI)(*Caller); + } + }; + + for (User *User : Users) { + CallSite CS = getCallSite(User); + Function *Caller = CS.getCaller(); + if (CurrentCaller != Caller) { + CurrentCaller = Caller; + ComputeCurrBFI(Caller); + } else { + assert(CurrentCallerBFI && "CallerBFI is not set"); + } + BasicBlock *CallBB = CS.getInstruction()->getParent(); + auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB); + if (Count) + CallSiteToProfCountMap[User] = *Count; + else + CallSiteToProfCountMap[User] = 0; + } +} + +PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F, + FunctionOutliningInfo *OI) + : OrigFunc(F) { + ClonedOI = llvm::make_unique<FunctionOutliningInfo>(); // Clone the function, so that we can hack away on it. ValueToValueMapTy VMap; - Function *DuplicateFunction = CloneFunction(F, VMap); - DuplicateFunction->setLinkage(GlobalValue::InternalLinkage); - BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[EntryBlock]); - BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[ReturnBlock]); - BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[NonReturnBlock]); + ClonedFunc = CloneFunction(F, VMap); + ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]); + ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]); + for (BasicBlock *BB : OI->Entries) { + ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB])); + } + for (BasicBlock *E : OI->ReturnBlockPreds) { + BasicBlock *NewE = cast<BasicBlock>(VMap[E]); + ClonedOI->ReturnBlockPreds.push_back(NewE); + } // Go ahead and update all uses to the duplicate, so that we can just // use the inliner functionality when we're done hacking. - F->replaceAllUsesWith(DuplicateFunction); + F->replaceAllUsesWith(ClonedFunc); +} + +void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() { + + auto getFirstPHI = [](BasicBlock *BB) { + BasicBlock::iterator I = BB->begin(); + PHINode *FirstPhi = nullptr; + while (I != BB->end()) { + PHINode *Phi = dyn_cast<PHINode>(I); + if (!Phi) + break; + if (!FirstPhi) { + FirstPhi = Phi; + break; + } + } + return FirstPhi; + }; // Special hackery is needed with PHI nodes that have inputs from more than // one extracted block. For simplicity, just split the PHIs into a two-level // sequence of PHIs, some of which will go in the extracted region, and some // of which will go outside. 
- BasicBlock *PreReturn = NewReturnBlock; - NewReturnBlock = NewReturnBlock->splitBasicBlock( - NewReturnBlock->getFirstNonPHI()->getIterator()); + BasicBlock *PreReturn = ClonedOI->ReturnBlock; + // only split block when necessary: + PHINode *FirstPhi = getFirstPHI(PreReturn); + unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size(); + + if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1) + return; + + auto IsTrivialPhi = [](PHINode *PN) -> Value * { + Value *CommonValue = PN->getIncomingValue(0); + if (all_of(PN->incoming_values(), + [&](Value *V) { return V == CommonValue; })) + return CommonValue; + return nullptr; + }; + + ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock( + ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator()); BasicBlock::iterator I = PreReturn->begin(); - Instruction *Ins = &NewReturnBlock->front(); + Instruction *Ins = &ClonedOI->ReturnBlock->front(); + SmallVector<Instruction *, 4> DeadPhis; while (I != PreReturn->end()) { PHINode *OldPhi = dyn_cast<PHINode>(I); if (!OldPhi) break; - PHINode *RetPhi = PHINode::Create(OldPhi->getType(), 2, "", Ins); + PHINode *RetPhi = + PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins); OldPhi->replaceAllUsesWith(RetPhi); - Ins = NewReturnBlock->getFirstNonPHI(); + Ins = ClonedOI->ReturnBlock->getFirstNonPHI(); RetPhi->addIncoming(&*I, PreReturn); - RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewEntryBlock), - NewEntryBlock); - OldPhi->removeIncomingValue(NewEntryBlock); + for (BasicBlock *E : ClonedOI->ReturnBlockPreds) { + RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E); + OldPhi->removeIncomingValue(E); + } + // After incoming values splitting, the old phi may become trivial. + // Keeping the trivial phi can introduce definition inside the outline + // region which is live-out, causing necessary overhead (load, store + // arg passing etc). + if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) { + OldPhi->replaceAllUsesWith(OldPhiVal); + DeadPhis.push_back(OldPhi); + } ++I; - } - NewEntryBlock->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock); + } + for (auto *DP : DeadPhis) + DP->eraseFromParent(); + + for (auto E : ClonedOI->ReturnBlockPreds) { + E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); + } +} + +Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() { + // Returns true if the block is to be partial inlined into the caller + // (i.e. not to be extracted to the out of line function) + auto ToBeInlined = [&, this](BasicBlock *BB) { + return BB == ClonedOI->ReturnBlock || + (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) != + ClonedOI->Entries.end()); + }; // Gather up the blocks that we're going to extract. std::vector<BasicBlock *> ToExtract; - ToExtract.push_back(NewNonReturnBlock); - for (BasicBlock &BB : *DuplicateFunction) - if (&BB != NewEntryBlock && &BB != NewReturnBlock && - &BB != NewNonReturnBlock) + ToExtract.push_back(ClonedOI->NonReturnBlock); + OutlinedRegionCost += + PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock); + for (BasicBlock &BB : *ClonedFunc) + if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) { ToExtract.push_back(&BB); + // FIXME: the code extractor may hoist/sink more code + // into the outlined function which may make the outlining + // overhead (the difference of the outlined function cost + // and OutliningRegionCost) look larger. 
+ OutlinedRegionCost += computeBBInlineCost(&BB); + } // The CodeExtractor needs a dominator tree. DominatorTree DT; - DT.recalculate(*DuplicateFunction); + DT.recalculate(*ClonedFunc); // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. LoopInfo LI(DT); - BranchProbabilityInfo BPI(*DuplicateFunction, LI); - BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI); + BranchProbabilityInfo BPI(*ClonedFunc, LI); + ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); // Extract the body of the if. - Function *ExtractedFunction = - CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI) - .extractCodeRegion(); + OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, + ClonedFuncBFI.get(), &BPI) + .extractCodeRegion(); + + if (OutlinedFunc) { + OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc) + .getInstruction() + ->getParent(); + assert(OutliningCallBB->getParent() == ClonedFunc); + } - // Inline the top-level if test into all callers. - std::vector<User *> Users(DuplicateFunction->user_begin(), - DuplicateFunction->user_end()); - for (User *User : Users) - if (CallInst *CI = dyn_cast<CallInst>(User)) - InlineFunction(CI, IFI); - else if (InvokeInst *II = dyn_cast<InvokeInst>(User)) - InlineFunction(II, IFI); + return OutlinedFunc; +} +PartialInlinerImpl::FunctionCloner::~FunctionCloner() { // Ditch the duplicate, since we're done with it, and rewrite all remaining // users (function pointers, etc.) back to the original function. - DuplicateFunction->replaceAllUsesWith(F); - DuplicateFunction->eraseFromParent(); + ClonedFunc->replaceAllUsesWith(OrigFunc); + ClonedFunc->eraseFromParent(); + if (!IsFunctionInlined) { + // Remove the function that is speculatively created if there is no + // reference. + if (OutlinedFunc) + OutlinedFunc->eraseFromParent(); + } +} - ++NumPartialInlined; +Function *PartialInlinerImpl::unswitchFunction(Function *F) { + + if (F->hasAddressTaken()) + return nullptr; + + // Let inliner handle it + if (F->hasFnAttribute(Attribute::AlwaysInline)) + return nullptr; + + if (F->hasFnAttribute(Attribute::NoInline)) + return nullptr; + + if (PSI->isFunctionEntryCold(F)) + return nullptr; + + if (F->user_begin() == F->user_end()) + return nullptr; + + std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F); - return ExtractedFunction; + if (!OI) + return nullptr; + + FunctionCloner Cloner(F, OI.get()); + Cloner.NormalizeReturnBlock(); + Function *OutlinedFunction = Cloner.doFunctionOutlining(); + + bool AnyInline = tryPartialInline(Cloner); + + if (AnyInline) + return OutlinedFunction; + + return nullptr; +} + +bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { + int NonWeightedRcost; + int SizeCost; + + if (Cloner.OutlinedFunc == nullptr) + return false; + + std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner); + + auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); + auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq; + + // The call sequence to the outlined function is larger than the original + // outlined region size, it does not increase the chances of inlining + // the function with outlining (The inliner usies the size increase to + // model the cost of inlining a callee). 
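// Illustrative numbers: if the region to be outlined costs
// 3 * InlineConstants::InstrCost but the call sequence replacing it (argument
// set-up plus the call itself) costs 5 * InstrCost, partial inlining would
// make the remaining function look larger to the inliner, not smaller, so
// the check below bails out and emits the OutlineRegionTooSmall remark.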
+ if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) { + OptimizationRemarkEmitter ORE(Cloner.OrigFunc); + DebugLoc DLoc; + BasicBlock *Block; + std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc); + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", + DLoc, Block) + << ore::NV("Function", Cloner.OrigFunc) + << " not partially inlined into callers (Original Size = " + << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost) + << ", Size of call sequence to outlined function = " + << ore::NV("NewSize", SizeCost) << ")"); + return false; + } + + assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() && + "F's users should all be replaced!"); + + std::vector<User *> Users(Cloner.ClonedFunc->user_begin(), + Cloner.ClonedFunc->user_end()); + + DenseMap<User *, uint64_t> CallSiteToProfCountMap; + if (Cloner.OrigFunc->getEntryCount()) + computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); + + auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount(); + uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0); + + bool AnyInline = false; + for (User *User : Users) { + CallSite CS = getCallSite(User); + + if (IsLimitReached()) + continue; + + OptimizationRemarkEmitter ORE(CS.getCaller()); + + if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE)) + continue; + + ORE.emit( + OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction()) + << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " + << ore::NV("Caller", CS.getCaller())); + + InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI); + InlineFunction(CS, IFI); + + // Now update the entry count: + if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { + uint64_t CallSiteCount = CallSiteToProfCountMap[User]; + CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount); + } + + AnyInline = true; + NumPartialInlining++; + // Update the stats + NumPartialInlined++; + } + + if (AnyInline) { + Cloner.IsFunctionInlined = true; + if (CalleeEntryCount) + Cloner.OrigFunc->setEntryCount(CalleeEntryCountV); + } + + return AnyInline; } bool PartialInlinerImpl::run(Module &M) { + if (DisablePartialInlining) + return false; + std::vector<Function *> Worklist; Worklist.reserve(M.size()); for (Function &F : M) @@ -203,6 +957,8 @@ char PartialInlinerLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", "Partial Inliner", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner", "Partial Inliner", false, false) @@ -213,12 +969,25 @@ ModulePass *llvm::createPartialInliningPass() { PreservedAnalyses PartialInlinerPass::run(Module &M, ModuleAnalysisManager &AM) { auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & { return FAM.getResult<AssumptionAnalysis>(F); }; - InlineFunctionInfo IFI(nullptr, &GetAssumptionCache); - if (PartialInlinerImpl(IFI).run(M)) + + std::function<BlockFrequencyInfo &(Function &)> GetBFI = + [&FAM](Function &F) -> BlockFrequencyInfo & { + return FAM.getResult<BlockFrequencyAnalysis>(F); + }; + + std::function<TargetTransformInfo &(Function &)> GetTTI = + [&FAM](Function &F) -> TargetTransformInfo & { + return FAM.getResult<TargetIRAnalysis>(F); + }; 
+ + ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); + + if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M)) return PreservedAnalyses::none(); return PreservedAnalyses::all(); } diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 941efb2..0b319f6 100644 --- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -38,21 +38,22 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Vectorize.h" using namespace llvm; static cl::opt<bool> -RunLoopVectorization("vectorize-loops", cl::Hidden, - cl::desc("Run the Loop vectorization passes")); + RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden, + cl::ZeroOrMore, cl::desc("Run Partial inlinining pass")); static cl::opt<bool> -RunSLPVectorization("vectorize-slp", cl::Hidden, - cl::desc("Run the SLP vectorization passes")); + RunLoopVectorization("vectorize-loops", cl::Hidden, + cl::desc("Run the Loop vectorization passes")); static cl::opt<bool> -RunBBVectorization("vectorize-slp-aggressive", cl::Hidden, - cl::desc("Run the BB vectorization passes")); +RunSLPVectorization("vectorize-slp", cl::Hidden, + cl::desc("Run the SLP vectorization passes")); static cl::opt<bool> UseGVNAfterVectorization("use-gvn-after-vectorization", @@ -67,10 +68,6 @@ static cl::opt<bool> RunLoopRerolling("reroll-loops", cl::Hidden, cl::desc("Run the loop rerolling pass")); -static cl::opt<bool> RunLoadCombine("combine-loads", cl::init(false), - cl::Hidden, - cl::desc("Run the load combining pass")); - static cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden, cl::desc("Run the NewGVN pass")); @@ -93,10 +90,6 @@ static cl::opt<CFLAAType> clEnumValN(CFLAAType::Both, "both", "Enable both variants of CFL-AA"))); -static cl::opt<bool> -EnableMLSM("mlsm", cl::init(true), cl::Hidden, - cl::desc("Enable motion of merged load and store")); - static cl::opt<bool> EnableLoopInterchange( "enable-loopinterchange", cl::init(false), cl::Hidden, cl::desc("Enable the new, experimental LoopInterchange Pass")); @@ -140,15 +133,28 @@ static cl::opt<int> PreInlineThreshold( cl::desc("Control the amount of inlining in pre-instrumentation inliner " "(default = 75)")); +static cl::opt<bool> EnableEarlyCSEMemSSA( + "enable-earlycse-memssa", cl::init(true), cl::Hidden, + cl::desc("Enable the EarlyCSE w/ MemorySSA pass (default = on)")); + static cl::opt<bool> EnableGVNHoist( "enable-gvn-hoist", cl::init(false), cl::Hidden, - cl::desc("Enable the GVN hoisting pass")); + cl::desc("Enable the GVN hoisting pass (default = off)")); static cl::opt<bool> DisableLibCallsShrinkWrap("disable-libcalls-shrinkwrap", cl::init(false), cl::Hidden, cl::desc("Disable shrink-wrap library calls")); +static cl::opt<bool> + EnableSimpleLoopUnswitch("enable-simple-loop-unswitch", cl::init(false), + cl::Hidden, + cl::desc("Enable the simple loop unswitch pass.")); + +static cl::opt<bool> EnableGVNSink( + "enable-gvn-sink", cl::init(false), cl::Hidden, + cl::desc("Enable the GVN sinking pass (default = off)")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -156,11 +162,9 @@ PassManagerBuilder::PassManagerBuilder() { Inliner = nullptr; DisableUnitAtATime = false; DisableUnrollLoops = false; - BBVectorize = 
RunBBVectorization; SLPVectorize = RunSLPVectorization; LoopVectorize = RunLoopVectorization; RerollLoops = RunLoopRerolling; - LoadCombine = RunLoadCombine; NewGVN = RunNewGVN; DisableGVNLoadPRE = false; VerifyInput = false; @@ -172,6 +176,7 @@ PassManagerBuilder::PassManagerBuilder() { PGOInstrUse = RunPGOInstrUse; PrepareForThinLTO = EnablePrepareForThinLTO; PerformThinLTO = false; + DivergentTarget = false; } PassManagerBuilder::~PassManagerBuilder() { @@ -183,6 +188,13 @@ PassManagerBuilder::~PassManagerBuilder() { static ManagedStatic<SmallVector<std::pair<PassManagerBuilder::ExtensionPointTy, PassManagerBuilder::ExtensionFn>, 8> > GlobalExtensions; +/// Check if GlobalExtensions is constructed and not empty. +/// Since GlobalExtensions is a managed static, calling 'empty()' will trigger +/// the construction of the object. +static bool GlobalExtensionsNotEmpty() { + return GlobalExtensions.isConstructed() && !GlobalExtensions->empty(); +} + void PassManagerBuilder::addGlobalExtension( PassManagerBuilder::ExtensionPointTy Ty, PassManagerBuilder::ExtensionFn Fn) { @@ -195,9 +207,12 @@ void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) { void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy, legacy::PassManagerBase &PM) const { - for (unsigned i = 0, e = GlobalExtensions->size(); i != e; ++i) - if ((*GlobalExtensions)[i].first == ETy) - (*GlobalExtensions)[i].second(*this, PM); + if (GlobalExtensionsNotEmpty()) { + for (auto &Ext : *GlobalExtensions) { + if (Ext.first == ETy) + Ext.second(*this, PM); + } + } for (unsigned i = 0, e = Extensions.size(); i != e; ++i) if (Extensions[i].first == ETy) Extensions[i].second(*this, PM); @@ -248,18 +263,17 @@ void PassManagerBuilder::populateFunctionPassManager( FPM.add(createCFGSimplificationPass()); FPM.add(createSROAPass()); FPM.add(createEarlyCSEPass()); - if(EnableGVNHoist) - FPM.add(createGVNHoistPass()); FPM.add(createLowerExpectIntrinsicPass()); } // Do PGO instrumentation generation or use pass as the option specified. void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) { - if (!EnablePGOInstrGen && PGOInstrUse.empty()) + if (!EnablePGOInstrGen && PGOInstrUse.empty() && PGOSampleUse.empty()) return; // Perform the preinline and cleanup passes for O1 and above. // And avoid doing them if optimizing for size. - if (OptLevel > 0 && SizeLevel == 0 && !DisablePreInliner) { + if (OptLevel > 0 && SizeLevel == 0 && !DisablePreInliner && + PGOSampleUse.empty()) { // Create preinline pass. We construct an InlineParams object and specify // the threshold here to avoid the command line options of the regular // inliner to influence pre-inlining. The only fields of InlineParams we @@ -283,17 +297,32 @@ void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) { InstrProfOptions Options; if (!PGOInstrGen.empty()) Options.InstrProfileOutput = PGOInstrGen; + Options.DoCounterPromotion = true; + MPM.add(createLoopRotatePass()); MPM.add(createInstrProfilingLegacyPass(Options)); } if (!PGOInstrUse.empty()) MPM.add(createPGOInstrumentationUseLegacyPass(PGOInstrUse)); + // Indirect call promotion that promotes intra-module targets only. + // For ThinLTO this is done earlier due to interactions with globalopt + // for imported functions. We don't run this at -O0. + if (OptLevel > 0) + MPM.add( + createPGOIndirectCallPromotionLegacyPass(false, !PGOSampleUse.empty())); } void PassManagerBuilder::addFunctionSimplificationPasses( legacy::PassManagerBase &MPM) { // Start of function pass. 
// Break up aggregate allocas, using SSAUpdater. MPM.add(createSROAPass()); - MPM.add(createEarlyCSEPass()); // Catch trivial redundancies + MPM.add(createEarlyCSEPass(EnableEarlyCSEMemSSA)); // Catch trivial redundancies + if (EnableGVNHoist) + MPM.add(createGVNHoistPass()); + if (EnableGVNSink) { + MPM.add(createGVNSinkPass()); + MPM.add(createCFGSimplificationPass()); + } + // Speculative execution if the target has divergent branches; otherwise nop. MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass()); MPM.add(createJumpThreadingPass()); // Thread jumps. @@ -305,29 +334,37 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createLibCallsShrinkWrapPass()); addExtensionsToPM(EP_Peephole, MPM); + // Optimize memory intrinsic calls based on the profiled size information. + if (SizeLevel == 0) + MPM.add(createPGOMemOPSizeOptLegacyPass()); + MPM.add(createTailCallEliminationPass()); // Eliminate tail calls MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions // Rotate Loop - disable header duplication at -Oz MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); MPM.add(createLICMPass()); // Hoist loop invariants - MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); + if (EnableSimpleLoopUnswitch) + MPM.add(createSimpleLoopUnswitchLegacyPass()); + else + MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); MPM.add(createCFGSimplificationPass()); addInstructionCombiningPass(MPM); MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. + addExtensionsToPM(EP_LateLoopOptimizations, MPM); MPM.add(createLoopDeletionPass()); // Delete dead loops + if (EnableLoopInterchange) { MPM.add(createLoopInterchangePass()); // Interchange loops MPM.add(createCFGSimplificationPass()); } if (!DisableUnrollLoops) - MPM.add(createSimpleLoopUnrollPass()); // Unroll small loops + MPM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops addExtensionsToPM(EP_LoopOptimizerEnd, MPM); if (OptLevel > 1) { - if (EnableMLSM) - MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds + MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds MPM.add(NewGVN ? createNewGVNPass() : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies } @@ -352,29 +389,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses( if (RerollLoops) MPM.add(createLoopRerollPass()); - if (!RunSLPAfterLoopVectorization) { - if (SLPVectorize) - MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. - - if (BBVectorize) { - MPM.add(createBBVectorizePass()); - addInstructionCombiningPass(MPM); - addExtensionsToPM(EP_Peephole, MPM); - if (OptLevel > 1 && UseGVNAfterVectorization) - MPM.add(NewGVN - ? createNewGVNPass() - : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies - else - MPM.add(createEarlyCSEPass()); // Catch trivial redundancies - - // BBVectorize may have significantly shortened a loop body; unroll again. - if (!DisableUnrollLoops) - MPM.add(createLoopUnrollPass()); - } - } - - if (LoadCombine) - MPM.add(createLoadCombinePass()); + if (!RunSLPAfterLoopVectorization && SLPVectorize) + MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs @@ -409,14 +425,17 @@ void PassManagerBuilder::populateModulePassManager( // builds. 
The function merging pass is if (MergeFunctions) MPM.add(createMergeFunctionsPass()); - else if (!GlobalExtensions->empty() || !Extensions.empty()) + else if (GlobalExtensionsNotEmpty() || !Extensions.empty()) MPM.add(createBarrierNoopPass()); + addExtensionsToPM(EP_EnabledOnOptLevel0, MPM); + + // Rename anon globals to be able to export them in the summary. + // This has to be done after we add the extensions to the pass manager + // as there could be passes (e.g. Adddress sanitizer) which introduce + // new unnamed globals. if (PrepareForThinLTO) - // Rename anon globals to be able to export them in the summary. MPM.add(createNameAnonGlobalPass()); - - addExtensionsToPM(EP_EnabledOnOptLevel0, MPM); return; } @@ -434,7 +453,16 @@ void PassManagerBuilder::populateModulePassManager( // earlier in the pass pipeline, here before globalopt. Otherwise imported // available_externally functions look unreferenced and are removed. if (PerformThinLTO) - MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true)); + MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true, + !PGOSampleUse.empty())); + + // For SamplePGO in ThinLTO compile phase, we do not want to unroll loops + // as it will change the CFG too much to make the 2nd profile annotation + // in backend more difficult. + bool PrepareForThinLTOUsingPGOSampleProfile = + PrepareForThinLTO && !PGOSampleUse.empty(); + if (PrepareForThinLTOUsingPGOSampleProfile) + DisableUnrollLoops = true; if (!DisableUnitAtATime) { // Infer attributes about declarations if possible. @@ -454,15 +482,13 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE } - if (!PerformThinLTO) { - /// PGO instrumentation is added during the compile phase for ThinLTO, do - /// not run it a second time + // For SamplePGO in ThinLTO compile phase, we do not want to do indirect + // call promotion as it will change the CFG too much to make the 2nd + // profile annotation in backend more difficult. + // PGO instrumentation is added during the compile phase for ThinLTO, do + // not run it a second time + if (!PerformThinLTO && !PrepareForThinLTOUsingPGOSampleProfile) addPGOInstrPasses(MPM); - // Indirect call promotion that promotes intra-module targets only. - // For ThinLTO this is done earlier due to interactions with globalopt - // for imported functions. - MPM.add(createPGOIndirectCallPromotionLegacyPass()); - } if (EnableNonLTOGlobalsModRef) // We add a module alias analysis pass here. In part due to bugs in the @@ -489,6 +515,8 @@ void PassManagerBuilder::populateModulePassManager( // pass manager that we are specifically trying to avoid. To prevent this // we must insert a no-op module pass to reset the pass manager. MPM.add(createBarrierNoopPass()); + if (RunPartialInlining) + MPM.add(createPartialInliningPass()); if (!DisableUnitAtATime && OptLevel > 1 && !PrepareForLTO && !PrepareForThinLTO) @@ -589,42 +617,24 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createCorrelatedValuePropagationPass()); addInstructionCombiningPass(MPM); MPM.add(createLICMPass()); - MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); + MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); MPM.add(createCFGSimplificationPass()); addInstructionCombiningPass(MPM); } - if (RunSLPAfterLoopVectorization) { - if (SLPVectorize) { - MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. 
- if (OptLevel > 1 && ExtraVectorizerPasses) { - MPM.add(createEarlyCSEPass()); - } - } - - if (BBVectorize) { - MPM.add(createBBVectorizePass()); - addInstructionCombiningPass(MPM); - addExtensionsToPM(EP_Peephole, MPM); - if (OptLevel > 1 && UseGVNAfterVectorization) - MPM.add(NewGVN - ? createNewGVNPass() - : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies - else - MPM.add(createEarlyCSEPass()); // Catch trivial redundancies - - // BBVectorize may have significantly shortened a loop body; unroll again. - if (!DisableUnrollLoops) - MPM.add(createLoopUnrollPass()); + if (RunSLPAfterLoopVectorization && SLPVectorize) { + MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + if (OptLevel > 1 && ExtraVectorizerPasses) { + MPM.add(createEarlyCSEPass()); } } addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createCFGSimplificationPass()); + MPM.add(createLateCFGSimplificationPass()); // Switches to lookup tables addInstructionCombiningPass(MPM); if (!DisableUnrollLoops) { - MPM.add(createLoopUnrollPass()); // Unroll small loops + MPM.add(createLoopUnrollPass(OptLevel)); // Unroll small loops // LoopUnroll may generate some redundency to cleanup. addInstructionCombiningPass(MPM); @@ -662,6 +672,11 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createLoopSinkPass()); // Get rid of LCSSA nodes. MPM.add(createInstructionSimplifierPass()); + + // LoopSink (and other loop passes since the last simplifyCFG) might have + // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. + MPM.add(createCFGSimplificationPass()); + addExtensionsToPM(EP_OptimizerLast, MPM); } @@ -684,7 +699,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should // produce the same result as if we only do promotion here. - PM.add(createPGOIndirectCallPromotionLegacyPass(true)); + PM.add( + createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty())); // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function @@ -703,7 +719,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createGlobalSplitPass()); // Apply whole-program devirtualization and virtual constant propagation. - PM.add(createWholeProgramDevirtPass()); + PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr)); // That's all we need at opt level 1. if (OptLevel == 1) @@ -759,8 +775,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createGlobalsAAWrapperPass()); // IP alias analysis. PM.add(createLICMPass()); // Hoist loop invariants. - if (EnableMLSM) - PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds. + PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds. PM.add(NewGVN ? createNewGVNPass() : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. PM.add(createMemCpyOptPass()); // Remove dead memcpys. 
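For context on how the flags touched in this file are consumed, the sketch below shows how a frontend might configure PassManagerBuilder for an -O2 pipeline. It assumes the headers from this revision; the exact member set varies between LLVM releases, so treat the field list as an assumption rather than a stable API. DivergentTarget is the new flag this diff threads into LoopUnswitch.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

// Configure an -O2-style legacy pipeline and populate the given managers.
void buildPipeline(llvm::legacy::PassManager &MPM,
                   llvm::legacy::FunctionPassManager &FPM) {
  llvm::PassManagerBuilder PMB;
  PMB.OptLevel = 2;
  PMB.SizeLevel = 0;
  PMB.Inliner = llvm::createFunctionInliningPass();
  PMB.SLPVectorize = true;
  PMB.LoopVectorize = true;
  PMB.DivergentTarget = false; // set true for GPU-like targets so loop
                               // unswitching stays conservative
  PMB.populateFunctionPassManager(FPM);
  PMB.populateModulePassManager(MPM);
}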
@@ -775,11 +790,11 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createLoopInterchangePass()); if (!DisableUnrollLoops) - PM.add(createSimpleLoopUnrollPass()); // Unroll small loops + PM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops PM.add(createLoopVectorizePass(true, LoopVectorize)); // The vectorizer may have significantly shortened a loop body; unroll again. if (!DisableUnrollLoops) - PM.add(createLoopUnrollPass()); + PM.add(createLoopUnrollPass(OptLevel)); // Now that we've optimized loops (in particular loop induction variables), // we may have exposed more scalar opportunities. Run parts of the scalar @@ -799,9 +814,6 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // alignments. PM.add(createAlignmentFromAssumptionsPass()); - if (LoadCombine) - PM.add(createLoadCombinePass()); - // Cleanup and simplify the code after the scalar optimizations. addInstructionCombiningPass(PM); addExtensionsToPM(EP_Peephole, PM); @@ -833,6 +845,23 @@ void PassManagerBuilder::populateThinLTOPassManager( if (VerifyInput) PM.add(createVerifierPass()); + if (ImportSummary) { + // These passes import type identifier resolutions for whole-program + // devirtualization and CFI. They must run early because other passes may + // disturb the specific instruction patterns that these passes look for, + // creating dependencies on resolutions that may not appear in the summary. + // + // For example, GVN may transform the pattern assume(type.test) appearing in + // two basic blocks into assume(phi(type.test, type.test)), which would + // transform a dependency on a WPD resolution into a dependency on a type + // identifier resolution for CFI. + // + // Also, WPD has access to more precise information than ICP and can + // devirtualize more effectively, so it should operate on the IR first. + PM.add(createWholeProgramDevirtPass(nullptr, ImportSummary)); + PM.add(createLowerTypeTestsPass(nullptr, ImportSummary)); + } + populateModulePassManager(PM); if (VerifyOutput) @@ -849,6 +878,12 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) { if (OptLevel != 0) addLTOOptimizationPasses(PM); + else { + // The whole-program-devirt pass needs to run at -O0 because only it knows + // about the llvm.type.checked.load intrinsic: it needs to both lower the + // intrinsic itself and handle it in the summary. + PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr)); + } // Create a function that performs CFI checks for cross-DSO calls with targets // in the current module. @@ -857,8 +892,7 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) { // Lower type metadata and the type.test intrinsic. This pass supports Clang's // control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at // link time if CFI is enabled. The pass does nothing if CFI is disabled. 
- PM.add(createLowerTypeTestsPass(LowerTypeTestsSummaryAction::None, - /*Summary=*/nullptr)); + PM.add(createLowerTypeTestsPass(ExportSummary, nullptr)); if (OptLevel != 0) addLateLTOOptimizationPasses(PM); diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp index d9acb9b..3fd5984 100644 --- a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp +++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp @@ -14,10 +14,8 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/EHPersonalities.h" @@ -28,6 +26,8 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; diff --git a/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp index 6a43f8d..6baada2 100644 --- a/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -42,7 +43,9 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ValueSymbolTable.h" #include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/SampleProfReader.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -50,6 +53,7 @@ #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/Cloning.h" #include <cctype> @@ -159,21 +163,26 @@ protected: ErrorOr<uint64_t> getInstWeight(const Instruction &I); ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB); const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const; + std::vector<const FunctionSamples *> + findIndirectCallFunctionSamples(const Instruction &I) const; const FunctionSamples *findFunctionSamples(const Instruction &I) const; - bool inlineHotFunctions(Function &F); + bool inlineHotFunctions(Function &F, + DenseSet<GlobalValue::GUID> &ImportGUIDs); void printEdgeWeight(raw_ostream &OS, Edge E); void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const; void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); bool computeBlockWeights(Function &F); void findEquivalenceClasses(Function &F); + template <bool IsPostDom> void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants, - DominatorTreeBase<BasicBlock> *DomTree); + DominatorTreeBase<BasicBlock, IsPostDom> *DomTree); + void propagateWeights(Function &F); uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(Function &F); bool propagateThroughEdges(Function &F, bool UpdateBlockCount); void computeDominanceAndLoopInfo(Function &F); - unsigned getOffset(unsigned L, unsigned H) const; + unsigned getOffset(const DILocation *DIL) const; void clearFunctionData(); /// \brief Map basic blocks to their 
computed weights. @@ -202,9 +211,15 @@ protected: /// the same number of times. EquivalenceClassMap EquivalenceClass; + /// Map from function name to Function *. Used to find the function from + /// the function name. If the function name contains suffix, additional + /// entry is added to map from the stripped name to the function if there + /// is one-to-one mapping. + StringMap<Function *> SymbolMap; + /// \brief Dominance, post-dominance and loop information. std::unique_ptr<DominatorTree> DT; - std::unique_ptr<DominatorTreeBase<BasicBlock>> PDT; + std::unique_ptr<PostDomTreeBase<BasicBlock>> PDT; std::unique_ptr<LoopInfo> LI; AssumptionCacheTracker *ACT; @@ -326,11 +341,12 @@ SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const { // If there are inlined callsites in this function, count the samples found // in the respective bodies. However, do not bother counting callees with 0 // total samples, these are callees that were never invoked at runtime. - for (const auto &I : FS->getCallsiteSamples()) { - const FunctionSamples *CalleeSamples = &I.second; - if (callsiteIsHot(FS, CalleeSamples)) - Count += countUsedRecords(CalleeSamples); - } + for (const auto &I : FS->getCallsiteSamples()) + for (const auto &J : I.second) { + const FunctionSamples *CalleeSamples = &J.second; + if (callsiteIsHot(FS, CalleeSamples)) + Count += countUsedRecords(CalleeSamples); + } return Count; } @@ -343,11 +359,12 @@ SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const { unsigned Count = FS->getBodySamples().size(); // Only count records in hot callsites. - for (const auto &I : FS->getCallsiteSamples()) { - const FunctionSamples *CalleeSamples = &I.second; - if (callsiteIsHot(FS, CalleeSamples)) - Count += countBodyRecords(CalleeSamples); - } + for (const auto &I : FS->getCallsiteSamples()) + for (const auto &J : I.second) { + const FunctionSamples *CalleeSamples = &J.second; + if (callsiteIsHot(FS, CalleeSamples)) + Count += countBodyRecords(CalleeSamples); + } return Count; } @@ -362,11 +379,12 @@ SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const { Total += I.second.getSamples(); // Only count samples in hot callsites. - for (const auto &I : FS->getCallsiteSamples()) { - const FunctionSamples *CalleeSamples = &I.second; - if (callsiteIsHot(FS, CalleeSamples)) - Total += countBodySamples(CalleeSamples); - } + for (const auto &I : FS->getCallsiteSamples()) + for (const auto &J : I.second) { + const FunctionSamples *CalleeSamples = &J.second; + if (callsiteIsHot(FS, CalleeSamples)) + Total += countBodySamples(CalleeSamples); + } return Total; } @@ -398,15 +416,11 @@ void SampleProfileLoader::clearFunctionData() { CoverageTracker.clear(); } -/// \brief Returns the offset of lineno \p L to head_lineno \p H -/// -/// \param L Lineno -/// \param H Header lineno of the function -/// -/// \returns offset to the header lineno. 16 bits are used to represent offset. +/// Returns the line offset to the start line of the subprogram. /// We assume that a single function will not exceed 65535 LOC. -unsigned SampleProfileLoader::getOffset(unsigned L, unsigned H) const { - return (L - H) & 0xffff; +unsigned SampleProfileLoader::getOffset(const DILocation *DIL) const { + return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) & + 0xffff; } /// \brief Print the weight of edge \p E on stream \p OS. @@ -451,8 +465,7 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS, /// \param Inst Instruction to query. 
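The new getOffset above keys every profile lookup on the difference between an instruction's source line and the enclosing subprogram's start line, masked to 16 bits under the stated assumption that no function exceeds 65535 lines. A standalone sketch of that computation (lineOffset is a hypothetical free function; the real code reads both values from a DILocation):

#include <cstdint>
#include <iostream>

// Line offset used to index the sample profile: instruction line minus the
// subprogram header line, kept in 16 bits.
uint32_t lineOffset(uint32_t InstLine, uint32_t SubprogramHeaderLine) {
  return (InstLine - SubprogramHeaderLine) & 0xffff;
}

int main() {
  std::cout << lineOffset(120, 100) << "\n"; // 20
  // Unsigned wrap-around is masked away, so an instruction whose debug line
  // precedes the header (e.g. via macros) still yields a 16-bit offset.
  std::cout << lineOffset(90, 100) << "\n";  // 65526
}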
/// /// \returns the weight of \p Inst. -ErrorOr<uint64_t> -SampleProfileLoader::getInstWeight(const Instruction &Inst) { +ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) { const DebugLoc &DLoc = Inst.getDebugLoc(); if (!DLoc) return std::error_code(); @@ -470,19 +483,14 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) { // If a call/invoke instruction is inlined in profile, but not inlined here, // it means that the inlined callsite has no sample, thus the call // instruction should have 0 count. - bool IsCall = isa<CallInst>(Inst) || isa<InvokeInst>(Inst); - if (IsCall && findCalleeFunctionSamples(Inst)) + if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) && + findCalleeFunctionSamples(Inst)) return 0; const DILocation *DIL = DLoc; - unsigned Lineno = DLoc.getLine(); - unsigned HeaderLineno = DIL->getScope()->getSubprogram()->getLine(); - - uint32_t LineOffset = getOffset(Lineno, HeaderLineno); - uint32_t Discriminator = DIL->getDiscriminator(); - ErrorOr<uint64_t> R = IsCall - ? FS->findCallSamplesAt(LineOffset, Discriminator) - : FS->findSamplesAt(LineOffset, Discriminator); + uint32_t LineOffset = getOffset(DIL); + uint32_t Discriminator = DIL->getBaseDiscriminator(); + ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator); if (R) { bool FirstMark = CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get()); @@ -491,13 +499,14 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) { LLVMContext &Ctx = F->getContext(); emitOptimizationRemark( Ctx, DEBUG_TYPE, *F, DLoc, - Twine("Applied ") + Twine(*R) + " samples from profile (offset: " + - Twine(LineOffset) + + Twine("Applied ") + Twine(*R) + + " samples from profile (offset: " + Twine(LineOffset) + ((Discriminator) ? Twine(".") + Twine(Discriminator) : "") + ")"); } - DEBUG(dbgs() << " " << Lineno << "." << DIL->getDiscriminator() << ":" - << Inst << " (line offset: " << Lineno - HeaderLineno << "." - << DIL->getDiscriminator() << " - weight: " << R.get() + DEBUG(dbgs() << " " << DLoc.getLine() << "." + << DIL->getBaseDiscriminator() << ":" << Inst + << " (line offset: " << LineOffset << "." + << DIL->getBaseDiscriminator() << " - weight: " << R.get() << ")\n"); } return R; @@ -511,8 +520,7 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) { /// \param BB The basic block to query. /// /// \returns the weight for \p BB. -ErrorOr<uint64_t> -SampleProfileLoader::getBlockWeight(const BasicBlock *BB) { +ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) { uint64_t Max = 0; bool HasWeight = false; for (auto &I : BB->getInstList()) { @@ -565,16 +573,49 @@ SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const { if (!DIL) { return nullptr; } - DISubprogram *SP = DIL->getScope()->getSubprogram(); - if (!SP) - return nullptr; + + StringRef CalleeName; + if (const CallInst *CI = dyn_cast<CallInst>(&Inst)) + if (Function *Callee = CI->getCalledFunction()) + CalleeName = Callee->getName(); const FunctionSamples *FS = findFunctionSamples(Inst); if (FS == nullptr) return nullptr; - return FS->findFunctionSamplesAt(LineLocation( - getOffset(DIL->getLine(), SP->getLine()), DIL->getDiscriminator())); + return FS->findFunctionSamplesAt( + LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()), CalleeName); +} + +/// Returns a vector of FunctionSamples that are the indirect call targets +/// of \p Inst. The vector is sorted by the total number of samples. 
+std::vector<const FunctionSamples *> +SampleProfileLoader::findIndirectCallFunctionSamples( + const Instruction &Inst) const { + const DILocation *DIL = Inst.getDebugLoc(); + std::vector<const FunctionSamples *> R; + + if (!DIL) { + return R; + } + + const FunctionSamples *FS = findFunctionSamples(Inst); + if (FS == nullptr) + return R; + + if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt( + LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()))) { + if (M->size() == 0) + return R; + for (const auto &NameFS : *M) { + R.push_back(&NameFS.second); + } + std::sort(R.begin(), R.end(), + [](const FunctionSamples *L, const FunctionSamples *R) { + return L->getTotalSamples() > R->getTotalSamples(); + }); + } + return R; } /// \brief Get the FunctionSamples for an instruction. @@ -588,23 +629,23 @@ SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const { /// \returns the FunctionSamples pointer to the inlined instance. const FunctionSamples * SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { - SmallVector<LineLocation, 10> S; + SmallVector<std::pair<LineLocation, StringRef>, 10> S; const DILocation *DIL = Inst.getDebugLoc(); - if (!DIL) { + if (!DIL) return Samples; - } + + const DILocation *PrevDIL = DIL; for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) { - DISubprogram *SP = DIL->getScope()->getSubprogram(); - if (!SP) - return nullptr; - S.push_back(LineLocation(getOffset(DIL->getLine(), SP->getLine()), - DIL->getDiscriminator())); + S.push_back(std::make_pair( + LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()), + PrevDIL->getScope()->getSubprogram()->getLinkageName())); + PrevDIL = DIL; } if (S.size() == 0) return Samples; const FunctionSamples *FS = Samples; for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) { - FS = FS->findFunctionSamplesAt(S[i]); + FS = FS->findFunctionSamplesAt(S[i].first, S[i].second); } return FS; } @@ -614,14 +655,17 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { /// Iteratively traverse all callsites of the function \p F, and find if /// the corresponding inlined instance exists and is hot in profile. If /// it is hot enough, inline the callsites and adds new callsites of the -/// callee into the caller. -/// -/// TODO: investigate the possibility of not invoking InlineFunction directly. +/// callee into the caller. If the call is an indirect call, first promote +/// it to direct call. Each indirect call is limited with a single target. /// /// \param F function to perform iterative inlining. +/// \param ImportGUIDs a set to be updated to include all GUIDs that come +/// from a different module but inlined in the profiled binary. /// /// \returns True if there is any inline happened. 
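findIndirectCallFunctionSamples above hands the promoter its candidates ordered hottest-first, so the caller can simply walk the vector from the front. A minimal sketch of that ordering, with a hypothetical TargetSamples record standing in for FunctionSamples:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical record for one promotion candidate at an indirect call site.
struct TargetSamples {
  std::string Name;
  uint64_t TotalSamples;
};

// Mirrors the comparator above: descending by total samples.
void sortByHotness(std::vector<TargetSamples> &Targets) {
  std::sort(Targets.begin(), Targets.end(),
            [](const TargetSamples &L, const TargetSamples &R) {
              return L.TotalSamples > R.TotalSamples;
            });
}

int main() {
  std::vector<TargetSamples> T = {{"foo", 10}, {"bar", 500}, {"baz", 40}};
  sortByHotness(T);
  for (const TargetSamples &TS : T)
    std::cout << TS.Name << " " << TS.TotalSamples << "\n";
  // bar 500, baz 40, foo 10
}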
-bool SampleProfileLoader::inlineHotFunctions(Function &F) { +bool SampleProfileLoader::inlineHotFunctions( + Function &F, DenseSet<GlobalValue::GUID> &ImportGUIDs) { + DenseSet<Instruction *> PromotedInsns; bool Changed = false; LLVMContext &Ctx = F.getContext(); std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&]( @@ -635,7 +679,7 @@ bool SampleProfileLoader::inlineHotFunctions(Function &F) { for (auto &I : BB.getInstList()) { const FunctionSamples *FS = nullptr; if ((isa<CallInst>(I) || isa<InvokeInst>(I)) && - (FS = findCalleeFunctionSamples(I))) { + !isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) { Candidates.push_back(&I); if (callsiteIsHot(Samples, FS)) Hot = true; @@ -647,18 +691,55 @@ bool SampleProfileLoader::inlineHotFunctions(Function &F) { } for (auto I : CIS) { InlineFunctionInfo IFI(nullptr, ACT ? &GetAssumptionCache : nullptr); - CallSite CS(I); - Function *CalledFunction = CS.getCalledFunction(); - if (!CalledFunction || !CalledFunction->getSubprogram()) + Function *CalledFunction = CallSite(I).getCalledFunction(); + // Do not inline recursive calls. + if (CalledFunction == &F) continue; + Instruction *DI = I; + if (!CalledFunction && !PromotedInsns.count(I) && + CallSite(I).isIndirectCall()) + for (const auto *FS : findIndirectCallFunctionSamples(*I)) { + auto CalleeFunctionName = FS->getName(); + // If it is a recursive call, we do not inline it as it could bloat + // the code exponentially. There is way to better handle this, e.g. + // clone the caller first, and inline the cloned caller if it is + // recursive. As llvm does not inline recursive calls, we will simply + // ignore it instead of handling it explicitly. + if (CalleeFunctionName == F.getName()) + continue; + const char *Reason = "Callee function not available"; + auto R = SymbolMap.find(CalleeFunctionName); + if (R == SymbolMap.end()) + continue; + CalledFunction = R->getValue(); + if (CalledFunction && isLegalToPromote(I, CalledFunction, &Reason)) { + // The indirect target was promoted and inlined in the profile, as a + // result, we do not have profile info for the branch probability. + // We set the probability to 80% taken to indicate that the static + // call is likely taken. + DI = dyn_cast<Instruction>( + promoteIndirectCall(I, CalledFunction, 80, 100, false) + ->stripPointerCasts()); + PromotedInsns.insert(I); + } else { + DEBUG(dbgs() << "\nFailed to promote indirect call to " + << CalleeFunctionName << " because " << Reason + << "\n"); + continue; + } + } + if (!CalledFunction || !CalledFunction->getSubprogram()) { + findCalleeFunctionSamples(*I)->findImportedFunctions( + ImportGUIDs, F.getParent(), + Samples->getTotalSamples() * SampleProfileHotThreshold / 100); + continue; + } DebugLoc DLoc = I->getDebugLoc(); - uint64_t NumSamples = findCalleeFunctionSamples(*I)->getTotalSamples(); - if (InlineFunction(CS, IFI)) { + if (InlineFunction(CallSite(DI), IFI)) { LocalChanged = true; emitOptimizationRemark(Ctx, DEBUG_TYPE, F, DLoc, Twine("inlined hot callee '") + - CalledFunction->getName() + "' with " + - Twine(NumSamples) + " samples into '" + + CalledFunction->getName() + "' into '" + F.getName() + "'"); } } @@ -694,9 +775,10 @@ bool SampleProfileLoader::inlineHotFunctions(Function &F) { /// \param DomTree Opposite dominator tree. If \p Descendants is filled /// with blocks from \p BB1's dominator tree, then /// this is the post-dominator tree, and vice versa. 
+template <bool IsPostDom> void SampleProfileLoader::findEquivalencesFor( BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants, - DominatorTreeBase<BasicBlock> *DomTree) { + DominatorTreeBase<BasicBlock, IsPostDom> *DomTree) { const BasicBlock *EC = EquivalenceClass[BB1]; uint64_t Weight = BlockWeights[EC]; for (const auto *BB2 : Descendants) { @@ -994,6 +1076,26 @@ void SampleProfileLoader::buildEdges(Function &F) { } } +/// Sorts the CallTargetMap \p M by count in descending order and stores the +/// sorted result in \p Sorted. Returns the total counts. +static uint64_t SortCallTargets(SmallVector<InstrProfValueData, 2> &Sorted, + const SampleRecord::CallTargetMap &M) { + Sorted.clear(); + uint64_t Sum = 0; + for (auto I = M.begin(); I != M.end(); ++I) { + Sum += I->getValue(); + Sorted.push_back({Function::getGUID(I->getKey()), I->getValue()}); + } + std::sort(Sorted.begin(), Sorted.end(), + [](const InstrProfValueData &L, const InstrProfValueData &R) { + if (L.Count == R.Count) + return L.Value > R.Value; + else + return L.Count > R.Count; + }); + return Sum; +} + /// \brief Propagate weights into edges /// /// The following rules are applied to every block BB in the CFG: @@ -1015,10 +1117,6 @@ void SampleProfileLoader::propagateWeights(Function &F) { bool Changed = true; unsigned I = 0; - // Add an entry count to the function using the samples gathered - // at the function entry. - F.setEntryCount(Samples->getHeadSamples() + 1); - // If BB weight is larger than its corresponding loop's header BB weight, // use the BB weight to replace the loop header BB weight. for (auto &BI : F) { @@ -1071,13 +1169,32 @@ void SampleProfileLoader::propagateWeights(Function &F) { if (BlockWeights[BB]) { for (auto &I : BB->getInstList()) { - if (CallInst *CI = dyn_cast<CallInst>(&I)) { - if (!dyn_cast<IntrinsicInst>(&I)) { - SmallVector<uint32_t, 1> Weights; - Weights.push_back(BlockWeights[BB]); - CI->setMetadata(LLVMContext::MD_prof, - MDB.createBranchWeights(Weights)); - } + if (!isa<CallInst>(I) && !isa<InvokeInst>(I)) + continue; + CallSite CS(&I); + if (!CS.getCalledFunction()) { + const DebugLoc &DLoc = I.getDebugLoc(); + if (!DLoc) + continue; + const DILocation *DIL = DLoc; + uint32_t LineOffset = getOffset(DIL); + uint32_t Discriminator = DIL->getBaseDiscriminator(); + + const FunctionSamples *FS = findFunctionSamples(I); + if (!FS) + continue; + auto T = FS->findCallTargetMapAt(LineOffset, Discriminator); + if (!T || T.get().size() == 0) + continue; + SmallVector<InstrProfValueData, 2> SortedCallTargets; + uint64_t Sum = SortCallTargets(SortedCallTargets, T.get()); + annotateValueSite(*I.getParent()->getParent()->getParent(), I, + SortedCallTargets, Sum, IPVK_IndirectCallTarget, + SortedCallTargets.size()); + } else if (!dyn_cast<IntrinsicInst>(&I)) { + SmallVector<uint32_t, 1> Weights; + Weights.push_back(BlockWeights[BB]); + I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); } } } @@ -1087,8 +1204,11 @@ void SampleProfileLoader::propagateWeights(Function &F) { if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)) continue; + DebugLoc BranchLoc = TI->getDebugLoc(); DEBUG(dbgs() << "\nGetting weights for branch at line " - << TI->getDebugLoc().getLine() << ".\n"); + << ((BranchLoc) ? 
Twine(BranchLoc.getLine()) + : Twine("<UNKNOWN LOCATION>")) + << ".\n"); SmallVector<uint32_t, 4> Weights; uint32_t MaxWeight = 0; DebugLoc MaxDestLoc; @@ -1115,13 +1235,16 @@ void SampleProfileLoader::propagateWeights(Function &F) { } } + uint64_t TempWeight; // Only set weights if there is at least one non-zero weight. // In any other case, let the analyzer set weights. - if (MaxWeight > 0) { + // Do not set weights if the weights are present. In ThinLTO, the profile + // annotation is done twice. If the first annotation already set the + // weights, the second pass does not need to set it. + if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) { DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n"); TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); - DebugLoc BranchLoc = TI->getDebugLoc(); emitOptimizationRemark( Ctx, DEBUG_TYPE, F, MaxDestLoc, Twine("most popular destination for conditional branches at ") + @@ -1163,7 +1286,7 @@ void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) { DT.reset(new DominatorTree); DT->recalculate(F); - PDT.reset(new DominatorTreeBase<BasicBlock>(true)); + PDT.reset(new PostDomTreeBase<BasicBlock>()); PDT->recalculate(F); LI.reset(new LoopInfo); @@ -1228,12 +1351,19 @@ bool SampleProfileLoader::emitAnnotations(Function &F) { DEBUG(dbgs() << "Line number for the first instruction in " << F.getName() << ": " << getFunctionLoc(F) << "\n"); - Changed |= inlineHotFunctions(F); + DenseSet<GlobalValue::GUID> ImportGUIDs; + Changed |= inlineHotFunctions(F, ImportGUIDs); // Compute basic block weights. Changed |= computeBlockWeights(F); if (Changed) { + // Add an entry count to the function using the samples gathered at the + // function entry. Also sets the GUIDs that comes from a different + // module but inlined in the profiled binary. This is aiming at making + // the IR match the profiled binary before annotation. + F.setEntryCount(Samples->getHeadSamples() + 1, &ImportGUIDs); + // Compute dominance and loop info needed for propagation. computeDominanceAndLoopInfo(F); @@ -1309,6 +1439,26 @@ bool SampleProfileLoader::runOnModule(Module &M) { for (const auto &I : Reader->getProfiles()) TotalCollectedSamples += I.second.getTotalSamples(); + // Populate the symbol map. + for (const auto &N_F : M.getValueSymbolTable()) { + std::string OrigName = N_F.getKey(); + Function *F = dyn_cast<Function>(N_F.getValue()); + if (F == nullptr) + continue; + SymbolMap[OrigName] = F; + auto pos = OrigName.find('.'); + if (pos != std::string::npos) { + std::string NewName = OrigName.substr(0, pos); + auto r = SymbolMap.insert(std::make_pair(NewName, F)); + // Failiing to insert means there is already an entry in SymbolMap, + // thus there are multiple functions that are mapped to the same + // stripped name. In this case of name conflicting, set the value + // to nullptr to avoid confusion. 
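The SymbolMap population above keeps a second, suffix-stripped entry per function and deliberately poisons that entry on a name clash, so an ambiguous stripped name is never used to resolve an indirect-call target. A self-contained sketch of the policy, with string values standing in for Function pointers and the empty string playing the role of nullptr:

#include <iostream>
#include <string>
#include <unordered_map>

// Hypothetical stand-in for the StringMap<Function *> used above.
using SymbolMap = std::unordered_map<std::string, std::string>;

void addFunction(SymbolMap &Map, const std::string &Name) {
  Map[Name] = Name;
  // Names like "foo.llvm.1234" also get an entry under the stripped name
  // "foo", but only while that mapping stays one-to-one.
  auto Pos = Name.find('.');
  if (Pos == std::string::npos)
    return;
  std::string Stripped = Name.substr(0, Pos);
  auto R = Map.insert({Stripped, Name});
  if (!R.second)               // a second candidate for the stripped name:
    R.first->second.clear();   // mark it ambiguous instead of guessing
}

int main() {
  SymbolMap Map;
  addFunction(Map, "foo.llvm.1");
  addFunction(Map, "bar");
  addFunction(Map, "foo.llvm.2"); // makes the stripped name ambiguous
  std::cout << "foo -> '" << Map["foo"] << "'\n"; // prints ''
  std::cout << "bar -> '" << Map["bar"] << "'\n"; // prints 'bar'
}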
+ if (!r.second) + r.first->second = nullptr; + } + } + bool retval = false; for (auto &F : M) if (!F.isDeclaration()) { @@ -1329,7 +1479,7 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { bool SampleProfileLoader::runOnFunction(Function &F) { F.setEntryCount(0); Samples = Reader->getSamplesFor(F); - if (!Samples->empty()) + if (Samples && !Samples->empty()) return emitAnnotations(F); return false; } @@ -1337,7 +1487,8 @@ bool SampleProfileLoader::runOnFunction(Function &F) { PreservedAnalyses SampleProfileLoaderPass::run(Module &M, ModuleAnalysisManager &AM) { - SampleProfileLoader SampleLoader(SampleProfileFile); + SampleProfileLoader SampleLoader( + ProfileFileName.empty() ? SampleProfileFile : ProfileFileName); SampleLoader.doInitialization(M); diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp index 8f6f161..de1b51e 100644 --- a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -20,7 +20,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" @@ -30,6 +29,7 @@ #include "llvm/IR/TypeFinder.h" #include "llvm/IR/ValueSymbolTable.h" #include "llvm/Pass.h" +#include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -323,6 +323,14 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { LiveGVs.insert(GVE); } + std::set<DICompileUnit *> LiveCUs; + // Any CU referenced from a subprogram is live. + for (DISubprogram *SP : F.subprograms()) { + if (SP->getUnit()) + LiveCUs.insert(SP->getUnit()); + } + + bool HasDeadCUs = false; for (DICompileUnit *DIC : F.compile_units()) { // Create our live global variable list. bool GlobalVariableChange = false; @@ -341,6 +349,11 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { GlobalVariableChange = true; } + if (!LiveGlobalVariables.empty()) + LiveCUs.insert(DIC); + else if (!LiveCUs.count(DIC)) + HasDeadCUs = true; + // If we found dead global variables, replace the current global // variable list with our new live global variable list. if (GlobalVariableChange) { @@ -352,5 +365,16 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { LiveGlobalVariables.clear(); } + if (HasDeadCUs) { + // Delete the old node and replace it with a new one + NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu"); + NMD->clearOperands(); + if (!LiveCUs.empty()) { + for (DICompileUnit *CU : LiveCUs) + NMD->addOperand(CU); + } + Changed = true; + } + return Changed; } diff --git a/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 3680cfc..8ef6bb6 100644 --- a/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -6,99 +6,67 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This pass prepares a module containing type metadata for ThinLTO by splitting -// it into regular and thin LTO parts if possible, and writing both parts to -// a multi-module bitcode file. Modules that do not contain type metadata are -// written unmodified as a single module. 
-// -//===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TypeMetadataUtils.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; namespace { -// Produce a unique identifier for this module by taking the MD5 sum of the -// names of the module's strong external symbols. This identifier is -// normally guaranteed to be unique, or the program would fail to link due to -// multiply defined symbols. -// -// If the module has no strong external symbols (such a module may still have a -// semantic effect if it performs global initialization), we cannot produce a -// unique identifier for this module, so we return the empty string, which -// causes the entire module to be written as a regular LTO module. -std::string getModuleId(Module *M) { - MD5 Md5; - bool ExportsSymbols = false; - auto AddGlobal = [&](GlobalValue &GV) { - if (GV.isDeclaration() || GV.getName().startswith("llvm.") || - !GV.hasExternalLinkage()) - return; - ExportsSymbols = true; - Md5.update(GV.getName()); - Md5.update(ArrayRef<uint8_t>{0}); - }; - - for (auto &F : *M) - AddGlobal(F); - for (auto &GV : M->globals()) - AddGlobal(GV); - for (auto &GA : M->aliases()) - AddGlobal(GA); - for (auto &IF : M->ifuncs()) - AddGlobal(IF); - - if (!ExportsSymbols) - return ""; - - MD5::MD5Result R; - Md5.final(R); - - SmallString<32> Str; - MD5::stringifyResult(R, Str); - return ("$" + Str).str(); -} - // Promote each local-linkage entity defined by ExportM and used by ImportM by // changing visibility and appending the given ModuleId. 
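The removed getModuleId above (its replacement, getUniqueModuleId, lives in ModuleUtils) derives a per-module suffix from the names of strong external symbols and returns an empty id when the module exports nothing, in which case the caller falls back to writing a regular LTO module. The sketch below reproduces that shape; Sym and moduleIdFromExports are hypothetical stand-ins for the GlobalValue walk, and std::hash replaces MD5 purely to keep the example self-contained.

#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical view of one global: its name plus whether it is a strong
// external definition (the only symbols that feed the module id).
struct Sym {
  std::string Name;
  bool StrongExternal;
};

std::string moduleIdFromExports(const std::vector<Sym> &Syms) {
  std::string Blob;
  bool ExportsSymbols = false;
  for (const Sym &S : Syms) {
    if (!S.StrongExternal || S.Name.rfind("llvm.", 0) == 0)
      continue; // skip locals and llvm.* names, as the pass does
    ExportsSymbols = true;
    Blob += S.Name;
    Blob += '\0'; // separator between symbol names
  }
  if (!ExportsSymbols)
    return ""; // no unique id: write the whole module as regular LTO
  return "$" + std::to_string(std::hash<std::string>{}(Blob));
}

int main() {
  std::cout << moduleIdFromExports({{"main", true}, {"helper", false}}) << "\n";
  std::cout << "'" << moduleIdFromExports({{"local_only", false}}) << "'\n"; // ''
}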
-void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) { - auto PromoteInternal = [&](GlobalValue &ExportGV) { +void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId, + SetVector<GlobalValue *> &PromoteExtra) { + DenseMap<const Comdat *, Comdat *> RenamedComdats; + for (auto &ExportGV : ExportM.global_values()) { if (!ExportGV.hasLocalLinkage()) - return; + continue; + + auto Name = ExportGV.getName(); + GlobalValue *ImportGV = ImportM.getNamedValue(Name); + if ((!ImportGV || ImportGV->use_empty()) && !PromoteExtra.count(&ExportGV)) + continue; - GlobalValue *ImportGV = ImportM.getNamedValue(ExportGV.getName()); - if (!ImportGV || ImportGV->use_empty()) - return; + std::string NewName = (Name + ModuleId).str(); - std::string NewName = (ExportGV.getName() + ModuleId).str(); + if (const auto *C = ExportGV.getComdat()) + if (C->getName() == Name) + RenamedComdats.try_emplace(C, ExportM.getOrInsertComdat(NewName)); ExportGV.setName(NewName); ExportGV.setLinkage(GlobalValue::ExternalLinkage); ExportGV.setVisibility(GlobalValue::HiddenVisibility); - ImportGV->setName(NewName); - ImportGV->setVisibility(GlobalValue::HiddenVisibility); - }; + if (ImportGV) { + ImportGV->setName(NewName); + ImportGV->setVisibility(GlobalValue::HiddenVisibility); + } + } - for (auto &F : ExportM) - PromoteInternal(F); - for (auto &GV : ExportM.globals()) - PromoteInternal(GV); - for (auto &GA : ExportM.aliases()) - PromoteInternal(GA); - for (auto &IF : ExportM.ifuncs()) - PromoteInternal(IF); + if (!RenamedComdats.empty()) + for (auto &GO : ExportM.global_objects()) + if (auto *C = GO.getComdat()) { + auto Replacement = RenamedComdats.find(C); + if (Replacement != RenamedComdats.end()) + GO.setComdat(Replacement->second); + } } // Promote all internal (i.e. 
distinct) type ids used by the module by replacing @@ -194,24 +162,7 @@ void simplifyExternals(Module &M) { } void filterModule( - Module *M, std::function<bool(const GlobalValue *)> ShouldKeepDefinition) { - for (Function &F : *M) { - if (ShouldKeepDefinition(&F)) - continue; - - F.deleteBody(); - F.clearMetadata(); - } - - for (GlobalVariable &GV : M->globals()) { - if (ShouldKeepDefinition(&GV)) - continue; - - GV.setInitializer(nullptr); - GV.setLinkage(GlobalValue::ExternalLinkage); - GV.clearMetadata(); - } - + Module *M, function_ref<bool(const GlobalValue *)> ShouldKeepDefinition) { for (Module::alias_iterator I = M->alias_begin(), E = M->alias_end(); I != E;) { GlobalAlias *GA = &*I++; @@ -219,65 +170,227 @@ void filterModule( continue; GlobalObject *GO; - if (I->getValueType()->isFunctionTy()) + if (GA->getValueType()->isFunctionTy()) GO = Function::Create(cast<FunctionType>(GA->getValueType()), GlobalValue::ExternalLinkage, "", M); else GO = new GlobalVariable( *M, GA->getValueType(), false, GlobalValue::ExternalLinkage, - (Constant *)nullptr, "", (GlobalVariable *)nullptr, + nullptr, "", nullptr, GA->getThreadLocalMode(), GA->getType()->getAddressSpace()); GO->takeName(GA); GA->replaceAllUsesWith(GO); GA->eraseFromParent(); } + + for (Function &F : *M) { + if (ShouldKeepDefinition(&F)) + continue; + + F.deleteBody(); + F.setComdat(nullptr); + F.clearMetadata(); + } + + for (GlobalVariable &GV : M->globals()) { + if (ShouldKeepDefinition(&GV)) + continue; + + GV.setInitializer(nullptr); + GV.setLinkage(GlobalValue::ExternalLinkage); + GV.setComdat(nullptr); + GV.clearMetadata(); + } +} + +void forEachVirtualFunction(Constant *C, function_ref<void(Function *)> Fn) { + if (auto *F = dyn_cast<Function>(C)) + return Fn(F); + if (isa<GlobalValue>(C)) + return; + for (Value *Op : C->operands()) + forEachVirtualFunction(cast<Constant>(Op), Fn); } // If it's possible to split M into regular and thin LTO parts, do so and write // a multi-module bitcode file with the two parts to OS. Otherwise, write only a // regular LTO bitcode file to OS. -void splitAndWriteThinLTOBitcode(raw_ostream &OS, Module &M) { - std::string ModuleId = getModuleId(&M); +void splitAndWriteThinLTOBitcode( + raw_ostream &OS, raw_ostream *ThinLinkOS, + function_ref<AAResults &(Function &)> AARGetter, Module &M) { + std::string ModuleId = getUniqueModuleId(&M); if (ModuleId.empty()) { // We couldn't generate a module ID for this module, just write it out as a // regular LTO module. WriteBitcodeToFile(&M, OS); + if (ThinLinkOS) + // We don't have a ThinLTO part, but still write the module to the + // ThinLinkOS if requested so that the expected output file is produced. + WriteBitcodeToFile(&M, *ThinLinkOS); return; } promoteTypeIds(M, ModuleId); - auto IsInMergedM = [&](const GlobalValue *GV) { - auto *GVar = dyn_cast<GlobalVariable>(GV->getBaseObject()); - if (!GVar) - return false; - + // Returns whether a global has attached type metadata. Such globals may + // participate in CFI or whole-program devirtualization, so they need to + // appear in the merged module instead of the thin LTO module. + auto HasTypeMetadata = [&](const GlobalObject *GO) { SmallVector<MDNode *, 1> MDs; - GVar->getMetadata(LLVMContext::MD_type, MDs); + GO->getMetadata(LLVMContext::MD_type, MDs); return !MDs.empty(); }; + // Collect the set of virtual functions that are eligible for virtual constant + // propagation. 
Each eligible function must not access memory, must return + // an integer of width <=64 bits, must take at least one argument, must not + // use its first argument (assumed to be "this") and all arguments other than + // the first one must be of <=64 bit integer type. + // + // Note that we test whether this copy of the function is readnone, rather + // than testing function attributes, which must hold for any copy of the + // function, even a less optimized version substituted at link time. This is + // sound because the virtual constant propagation optimizations effectively + // inline all implementations of the virtual function into each call site, + // rather than using function attributes to perform local optimization. + std::set<const Function *> EligibleVirtualFns; + // If any member of a comdat lives in MergedM, put all members of that + // comdat in MergedM to keep the comdat together. + DenseSet<const Comdat *> MergedMComdats; + for (GlobalVariable &GV : M.globals()) + if (HasTypeMetadata(&GV)) { + if (const auto *C = GV.getComdat()) + MergedMComdats.insert(C); + forEachVirtualFunction(GV.getInitializer(), [&](Function *F) { + auto *RT = dyn_cast<IntegerType>(F->getReturnType()); + if (!RT || RT->getBitWidth() > 64 || F->arg_empty() || + !F->arg_begin()->use_empty()) + return; + for (auto &Arg : make_range(std::next(F->arg_begin()), F->arg_end())) { + auto *ArgT = dyn_cast<IntegerType>(Arg.getType()); + if (!ArgT || ArgT->getBitWidth() > 64) + return; + } + if (!F->isDeclaration() && + computeFunctionBodyMemoryAccess(*F, AARGetter(*F)) == MAK_ReadNone) + EligibleVirtualFns.insert(F); + }); + } + ValueToValueMapTy VMap; - std::unique_ptr<Module> MergedM(CloneModule(&M, VMap, IsInMergedM)); + std::unique_ptr<Module> MergedM( + CloneModule(&M, VMap, [&](const GlobalValue *GV) -> bool { + if (const auto *C = GV->getComdat()) + if (MergedMComdats.count(C)) + return true; + if (auto *F = dyn_cast<Function>(GV)) + return EligibleVirtualFns.count(F); + if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject())) + return HasTypeMetadata(GVar); + return false; + })); + StripDebugInfo(*MergedM); + + for (Function &F : *MergedM) + if (!F.isDeclaration()) { + // Reset the linkage of all functions eligible for virtual constant + // propagation. The canonical definitions live in the thin LTO module so + // that they can be imported. + F.setLinkage(GlobalValue::AvailableExternallyLinkage); + F.setComdat(nullptr); + } - filterModule(&M, [&](const GlobalValue *GV) { return !IsInMergedM(GV); }); + SetVector<GlobalValue *> CfiFunctions; + for (auto &F : M) + if ((!F.hasLocalLinkage() || F.hasAddressTaken()) && HasTypeMetadata(&F)) + CfiFunctions.insert(&F); + + // Remove all globals with type metadata, globals with comdats that live in + // MergedM, and aliases pointing to such globals from the thin LTO module. 
+ filterModule(&M, [&](const GlobalValue *GV) { + if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject())) + if (HasTypeMetadata(GVar)) + return false; + if (const auto *C = GV->getComdat()) + if (MergedMComdats.count(C)) + return false; + return true; + }); + + promoteInternals(*MergedM, M, ModuleId, CfiFunctions); + promoteInternals(M, *MergedM, ModuleId, CfiFunctions); + + SmallVector<MDNode *, 8> CfiFunctionMDs; + for (auto V : CfiFunctions) { + Function &F = *cast<Function>(V); + SmallVector<MDNode *, 2> Types; + F.getMetadata(LLVMContext::MD_type, Types); + + auto &Ctx = MergedM->getContext(); + SmallVector<Metadata *, 4> Elts; + Elts.push_back(MDString::get(Ctx, F.getName())); + CfiFunctionLinkage Linkage; + if (!F.isDeclarationForLinker()) + Linkage = CFL_Definition; + else if (F.isWeakForLinker()) + Linkage = CFL_WeakDeclaration; + else + Linkage = CFL_Declaration; + Elts.push_back(ConstantAsMetadata::get( + llvm::ConstantInt::get(Type::getInt8Ty(Ctx), Linkage))); + for (auto Type : Types) + Elts.push_back(Type); + CfiFunctionMDs.push_back(MDTuple::get(Ctx, Elts)); + } - promoteInternals(*MergedM, M, ModuleId); - promoteInternals(M, *MergedM, ModuleId); + if(!CfiFunctionMDs.empty()) { + NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("cfi.functions"); + for (auto MD : CfiFunctionMDs) + NMD->addOperand(MD); + } simplifyExternals(*MergedM); - SmallVector<char, 0> Buffer; - BitcodeWriter W(Buffer); - // FIXME: Try to re-use BSI and PFI from the original module here. - ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, nullptr); - W.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index, - /*GenerateHash=*/true); + ProfileSummaryInfo PSI(M); + ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI); - W.writeModule(MergedM.get()); + // Mark the merged module as requiring full LTO. We still want an index for + // it though, so that it can participate in summary-based dead stripping. + MergedM->addModuleFlag(Module::Error, "ThinLTO", uint32_t(0)); + ModuleSummaryIndex MergedMIndex = + buildModuleSummaryIndex(*MergedM, nullptr, &PSI); + SmallVector<char, 0> Buffer; + + BitcodeWriter W(Buffer); + // Save the module hash produced for the full bitcode, which will + // be used in the backends, and use that in the minimized bitcode + // produced for the full link. + ModuleHash ModHash = {{0}}; + W.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index, + /*GenerateHash=*/true, &ModHash); + W.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false, + &MergedMIndex); + W.writeSymtab(); + W.writeStrtab(); OS << Buffer; + + // If a minimized bitcode module was requested for the thin link, + // strip the debug info (the merged module was already stripped above) + // and write it to the given OS. + if (ThinLinkOS) { + Buffer.clear(); + BitcodeWriter W2(Buffer); + StripDebugInfo(M); + W2.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index, + /*GenerateHash=*/false, &ModHash); + W2.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false, + &MergedMIndex); + W2.writeSymtab(); + W2.writeStrtab(); + *ThinLinkOS << Buffer; + } } // Returns whether this module needs to be split because it uses type metadata. 
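The EligibleVirtualFns collection earlier in this hunk applies the eligibility rules spelled out in the comment above it. Condensed into a standalone predicate, with hypothetical FnDesc/ArgDesc records standing in for llvm::Function and the AAResults query:

#include <cstddef>
#include <vector>

// Hypothetical stand-ins for the facts the pass reads off an llvm::Function.
struct ArgDesc {
  bool IsInteger;
  unsigned BitWidth;
  bool IsUsed;
};

struct FnDesc {
  bool IsDeclaration;
  bool IsReadNone;           // what computeFunctionBodyMemoryAccess reports
  bool ReturnsInteger;
  unsigned RetBitWidth;
  std::vector<ArgDesc> Args; // Args[0] models "this"
};

// A virtual function referenced from a typed vtable initializer is cloned into
// the merged module for virtual constant propagation only if it passes these
// checks.
bool isEligibleForVirtualConstProp(const FnDesc &F) {
  if (!F.ReturnsInteger || F.RetBitWidth > 64)
    return false;
  if (F.Args.empty() || F.Args[0].IsUsed)           // "this" must be unused
    return false;
  for (std::size_t I = 1; I < F.Args.size(); ++I)
    if (!F.Args[I].IsInteger || F.Args[I].BitWidth > 64)
      return false;
  // Only defined bodies that provably do not access memory can be evaluated.
  return !F.IsDeclaration && F.IsReadNone;
}

Functions that pass this test end up in both halves of the split; the merged copy is demoted to available_externally above so that the canonical definition remains importable from the thin LTO module.
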
@@ -292,28 +405,45 @@ bool requiresSplit(Module &M) { return false; } -void writeThinLTOBitcode(raw_ostream &OS, Module &M, - const ModuleSummaryIndex *Index) { +void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS, + function_ref<AAResults &(Function &)> AARGetter, + Module &M, const ModuleSummaryIndex *Index) { // See if this module has any type metadata. If so, we need to split it. if (requiresSplit(M)) - return splitAndWriteThinLTOBitcode(OS, M); + return splitAndWriteThinLTOBitcode(OS, ThinLinkOS, AARGetter, M); // Otherwise we can just write it out as a regular module. + + // Save the module hash produced for the full bitcode, which will + // be used in the backends, and use that in the minimized bitcode + // produced for the full link. + ModuleHash ModHash = {{0}}; WriteBitcodeToFile(&M, OS, /*ShouldPreserveUseListOrder=*/false, Index, - /*GenerateHash=*/true); + /*GenerateHash=*/true, &ModHash); + // If a minimized bitcode module was requested for the thin link, + // strip the debug info and write it to the given OS. + if (ThinLinkOS) { + StripDebugInfo(M); + WriteBitcodeToFile(&M, *ThinLinkOS, /*ShouldPreserveUseListOrder=*/false, + Index, + /*GenerateHash=*/false, &ModHash); + } } class WriteThinLTOBitcode : public ModulePass { raw_ostream &OS; // raw_ostream to print on + // The output stream on which to emit a minimized module for use + // just in the thin link, if requested. + raw_ostream *ThinLinkOS; public: static char ID; // Pass identification, replacement for typeid - WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()) { + WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()), ThinLinkOS(nullptr) { initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry()); } - explicit WriteThinLTOBitcode(raw_ostream &o) - : ModulePass(ID), OS(o) { + explicit WriteThinLTOBitcode(raw_ostream &o, raw_ostream *ThinLinkOS) + : ModulePass(ID), OS(o), ThinLinkOS(ThinLinkOS) { initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry()); } @@ -322,12 +452,14 @@ public: bool runOnModule(Module &M) override { const ModuleSummaryIndex *Index = &(getAnalysis<ModuleSummaryIndexWrapperPass>().getIndex()); - writeThinLTOBitcode(OS, M, Index); + writeThinLTOBitcode(OS, ThinLinkOS, LegacyAARGetter(*this), M, Index); return true; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<ModuleSummaryIndexWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; } // anonymous namespace @@ -335,10 +467,25 @@ public: char WriteThinLTOBitcode::ID = 0; INITIALIZE_PASS_BEGIN(WriteThinLTOBitcode, "write-thinlto-bitcode", "Write ThinLTO Bitcode", false, true) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(WriteThinLTOBitcode, "write-thinlto-bitcode", "Write ThinLTO Bitcode", false, true) -ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str) { - return new WriteThinLTOBitcode(Str); +ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str, + raw_ostream *ThinLinkOS) { + return new WriteThinLTOBitcode(Str, ThinLinkOS); +} + +PreservedAnalyses +llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) { + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + writeThinLTOBitcode(OS, ThinLinkOS, + [&FAM](Function &F) -> AAResults & { + return FAM.getResult<AAManager>(F); + }, 
+ M, &AM.getResult<ModuleSummaryIndexAnalysis>(M)); + return PreservedAnalyses::all(); } diff --git a/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 844cc0f..00769cd 100644 --- a/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -25,6 +25,20 @@ // returns 0, or a single vtable's function returns 1, replace each virtual // call with a comparison of the vptr against that vtable's address. // +// This pass is intended to be used during the regular and thin LTO pipelines. +// During regular LTO, the pass determines the best optimization for each +// virtual call and applies the resolutions directly to virtual calls that are +// eligible for virtual call optimization (i.e. calls that use either of the +// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics). During +// ThinLTO, the pass operates in two phases: +// - Export phase: this is run during the thin link over a single merged module +// that contains all vtables with !type metadata that participate in the link. +// The pass computes a resolution for each virtual call and stores it in the +// type identifier summary. +// - Import phase: this is run during the thin backends over the individual +// modules. The pass applies the resolutions previously computed during the +// import phase to each eligible virtual call. +// //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/WholeProgramDevirt.h" @@ -32,9 +46,11 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/TypeMetadataUtils.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" @@ -54,12 +70,16 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndexYAML.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/MathExtras.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/Transforms/Utils/Evaluator.h" #include <algorithm> #include <cstddef> @@ -72,6 +92,26 @@ using namespace wholeprogramdevirt; #define DEBUG_TYPE "wholeprogramdevirt" +static cl::opt<PassSummaryAction> ClSummaryAction( + "wholeprogramdevirt-summary-action", + cl::desc("What to do with the summary when running this pass"), + cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"), + clEnumValN(PassSummaryAction::Import, "import", + "Import typeid resolutions from summary and globals"), + clEnumValN(PassSummaryAction::Export, "export", + "Export typeid resolutions to summary and globals")), + cl::Hidden); + +static cl::opt<std::string> ClReadSummary( + "wholeprogramdevirt-read-summary", + cl::desc("Read summary from given YAML file before running pass"), + cl::Hidden); + +static cl::opt<std::string> ClWriteSummary( + "wholeprogramdevirt-write-summary", + cl::desc("Write summary to given YAML file after running pass"), + cl::Hidden); + // Find the minimum offset that we may store a value of size Size bits at. 
If // IsAfter is set, look for an offset before the object, otherwise look for an // offset after the object. @@ -259,15 +299,92 @@ struct VirtualCallSite { } }; +// Call site information collected for a specific VTableSlot and possibly a list +// of constant integer arguments. The grouping by arguments is handled by the +// VTableSlotInfo class. +struct CallSiteInfo { + /// The set of call sites for this slot. Used during regular LTO and the + /// import phase of ThinLTO (as well as the export phase of ThinLTO for any + /// call sites that appear in the merged module itself); in each of these + /// cases we are directly operating on the call sites at the IR level. + std::vector<VirtualCallSite> CallSites; + + // These fields are used during the export phase of ThinLTO and reflect + // information collected from function summaries. + + /// Whether any function summary contains an llvm.assume(llvm.type.test) for + /// this slot. + bool SummaryHasTypeTestAssumeUsers; + + /// CFI-specific: a vector containing the list of function summaries that use + /// the llvm.type.checked.load intrinsic and therefore will require + /// resolutions for llvm.type.test in order to implement CFI checks if + /// devirtualization was unsuccessful. If devirtualization was successful, the + /// pass will clear this vector by calling markDevirt(). If at the end of the + /// pass the vector is non-empty, we will need to add a use of llvm.type.test + /// to each of the function summaries in the vector. + std::vector<FunctionSummary *> SummaryTypeCheckedLoadUsers; + + bool isExported() const { + return SummaryHasTypeTestAssumeUsers || + !SummaryTypeCheckedLoadUsers.empty(); + } + + /// As explained in the comment for SummaryTypeCheckedLoadUsers. + void markDevirt() { SummaryTypeCheckedLoadUsers.clear(); } +}; + +// Call site information collected for a specific VTableSlot. +struct VTableSlotInfo { + // The set of call sites which do not have all constant integer arguments + // (excluding "this"). + CallSiteInfo CSInfo; + + // The set of call sites with all constant integer arguments (excluding + // "this"), grouped by argument list. + std::map<std::vector<uint64_t>, CallSiteInfo> ConstCSInfo; + + void addCallSite(Value *VTable, CallSite CS, unsigned *NumUnsafeUses); + +private: + CallSiteInfo &findCallSiteInfo(CallSite CS); +}; + +CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallSite CS) { + std::vector<uint64_t> Args; + auto *CI = dyn_cast<IntegerType>(CS.getType()); + if (!CI || CI->getBitWidth() > 64 || CS.arg_empty()) + return CSInfo; + for (auto &&Arg : make_range(CS.arg_begin() + 1, CS.arg_end())) { + auto *CI = dyn_cast<ConstantInt>(Arg); + if (!CI || CI->getBitWidth() > 64) + return CSInfo; + Args.push_back(CI->getZExtValue()); + } + return ConstCSInfo[Args]; +} + +void VTableSlotInfo::addCallSite(Value *VTable, CallSite CS, + unsigned *NumUnsafeUses) { + findCallSiteInfo(CS).CallSites.push_back({VTable, CS, NumUnsafeUses}); +} + struct DevirtModule { Module &M; + function_ref<AAResults &(Function &)> AARGetter; + + ModuleSummaryIndex *ExportSummary; + const ModuleSummaryIndex *ImportSummary; + IntegerType *Int8Ty; PointerType *Int8PtrTy; IntegerType *Int32Ty; + IntegerType *Int64Ty; + IntegerType *IntPtrTy; bool RemarksEnabled; - MapVector<VTableSlot, std::vector<VirtualCallSite>> CallSlots; + MapVector<VTableSlot, VTableSlotInfo> CallSlots; // This map keeps track of the number of "unsafe" uses of a loaded function // pointer. 
The key is the associated llvm.type.test intrinsic call generated @@ -279,11 +396,18 @@ struct DevirtModule { // true. std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest; - DevirtModule(Module &M) - : M(M), Int8Ty(Type::getInt8Ty(M.getContext())), + DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter, + ModuleSummaryIndex *ExportSummary, + const ModuleSummaryIndex *ImportSummary) + : M(M), AARGetter(AARGetter), ExportSummary(ExportSummary), + ImportSummary(ImportSummary), Int8Ty(Type::getInt8Ty(M.getContext())), Int8PtrTy(Type::getInt8PtrTy(M.getContext())), Int32Ty(Type::getInt32Ty(M.getContext())), - RemarksEnabled(areRemarksEnabled()) {} + Int64Ty(Type::getInt64Ty(M.getContext())), + IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)), + RemarksEnabled(areRemarksEnabled()) { + assert(!(ExportSummary && ImportSummary)); + } bool areRemarksEnabled(); @@ -298,57 +422,169 @@ struct DevirtModule { tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot, const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset); + + void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn, + bool &IsExported); bool trySingleImplDevirt(MutableArrayRef<VirtualCallTarget> TargetsForSlot, - MutableArrayRef<VirtualCallSite> CallSites); + VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res); + bool tryEvaluateFunctionsWithArgs( MutableArrayRef<VirtualCallTarget> TargetsForSlot, - ArrayRef<ConstantInt *> Args); - bool tryUniformRetValOpt(IntegerType *RetType, - MutableArrayRef<VirtualCallTarget> TargetsForSlot, - MutableArrayRef<VirtualCallSite> CallSites); + ArrayRef<uint64_t> Args); + + void applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, + uint64_t TheRetVal); + bool tryUniformRetValOpt(MutableArrayRef<VirtualCallTarget> TargetsForSlot, + CallSiteInfo &CSInfo, + WholeProgramDevirtResolution::ByArg *Res); + + // Returns the global symbol name that is used to export information about the + // given vtable slot and list of arguments. + std::string getGlobalName(VTableSlot Slot, ArrayRef<uint64_t> Args, + StringRef Name); + + // This function is called during the export phase to create a symbol + // definition containing information about the given vtable slot and list of + // arguments. + void exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name, + Constant *C); + + // This function is called during the import phase to create a reference to + // the symbol definition created during the export phase. + Constant *importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, + StringRef Name, unsigned AbsWidth = 0); + + void applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, bool IsOne, + Constant *UniqueMemberAddr); bool tryUniqueRetValOpt(unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot, - MutableArrayRef<VirtualCallSite> CallSites); + CallSiteInfo &CSInfo, + WholeProgramDevirtResolution::ByArg *Res, + VTableSlot Slot, ArrayRef<uint64_t> Args); + + void applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName, + Constant *Byte, Constant *Bit); bool tryVirtualConstProp(MutableArrayRef<VirtualCallTarget> TargetsForSlot, - ArrayRef<VirtualCallSite> CallSites); + VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res, VTableSlot Slot); void rebuildGlobal(VTableBits &B); + // Apply the summary resolution for Slot to all virtual calls in SlotInfo. 
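VTableSlotInfo::findCallSiteInfo in the previous hunk decides which bucket a virtual call lands in before any optimization runs. A simplified standalone model of that grouping (ConstArgs, SlotCalls and addCall are illustrative names, and the return-type width check is omitted):

#include <cstdint>
#include <map>
#include <optional>
#include <vector>

// Flattened view of one call site's arguments after "this": nullopt marks an
// argument that is not a constant integer of <= 64 bits.
using ConstArgs = std::vector<std::optional<uint64_t>>;

struct SlotCalls {
  std::vector<int> Mixed;                                  // like CSInfo
  std::map<std::vector<uint64_t>, std::vector<int>> Const; // like ConstCSInfo
};

// A call joins a per-argument-list bucket only if every argument after "this"
// is a small constant integer; otherwise it falls back to the generic bucket.
void addCall(SlotCalls &Slot, int CallIdx, const ConstArgs &Args) {
  std::vector<uint64_t> Key;
  for (const auto &A : Args) {
    if (!A) {
      Slot.Mixed.push_back(CallIdx);
      return;
    }
    Key.push_back(*A);
  }
  Slot.Const[Key].push_back(CallIdx);
}

Grouping by the exact constant argument list is what lets the per-argument optimizations below (uniform return value, unique return value, virtual constant propagation) treat every call in a bucket identically.
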
+ void importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo); + + // If we were able to eliminate all unsafe uses for a type checked load, + // eliminate the associated type tests by replacing them with true. + void removeRedundantTypeTests(); + bool run(); + + // Lower the module using the action and summary passed as command line + // arguments. For testing purposes only. + static bool runForTesting(Module &M, + function_ref<AAResults &(Function &)> AARGetter); }; struct WholeProgramDevirt : public ModulePass { static char ID; - WholeProgramDevirt() : ModulePass(ID) { + bool UseCommandLine = false; + + ModuleSummaryIndex *ExportSummary; + const ModuleSummaryIndex *ImportSummary; + + WholeProgramDevirt() : ModulePass(ID), UseCommandLine(true) { + initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry()); + } + + WholeProgramDevirt(ModuleSummaryIndex *ExportSummary, + const ModuleSummaryIndex *ImportSummary) + : ModulePass(ID), ExportSummary(ExportSummary), + ImportSummary(ImportSummary) { initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry()); } bool runOnModule(Module &M) override { if (skipModule(M)) return false; + if (UseCommandLine) + return DevirtModule::runForTesting(M, LegacyAARGetter(*this)); + return DevirtModule(M, LegacyAARGetter(*this), ExportSummary, ImportSummary) + .run(); + } - return DevirtModule(M).run(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; } // end anonymous namespace -INITIALIZE_PASS(WholeProgramDevirt, "wholeprogramdevirt", - "Whole program devirtualization", false, false) +INITIALIZE_PASS_BEGIN(WholeProgramDevirt, "wholeprogramdevirt", + "Whole program devirtualization", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(WholeProgramDevirt, "wholeprogramdevirt", + "Whole program devirtualization", false, false) char WholeProgramDevirt::ID = 0; -ModulePass *llvm::createWholeProgramDevirtPass() { - return new WholeProgramDevirt; +ModulePass * +llvm::createWholeProgramDevirtPass(ModuleSummaryIndex *ExportSummary, + const ModuleSummaryIndex *ImportSummary) { + return new WholeProgramDevirt(ExportSummary, ImportSummary); } PreservedAnalyses WholeProgramDevirtPass::run(Module &M, - ModuleAnalysisManager &) { - if (!DevirtModule(M).run()) + ModuleAnalysisManager &AM) { + auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + auto AARGetter = [&](Function &F) -> AAResults & { + return FAM.getResult<AAManager>(F); + }; + if (!DevirtModule(M, AARGetter, nullptr, nullptr).run()) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } +bool DevirtModule::runForTesting( + Module &M, function_ref<AAResults &(Function &)> AARGetter) { + ModuleSummaryIndex Summary; + + // Handle the command-line summary arguments. This code is for testing + // purposes only, so we handle errors directly. + if (!ClReadSummary.empty()) { + ExitOnError ExitOnErr("-wholeprogramdevirt-read-summary: " + ClReadSummary + + ": "); + auto ReadSummaryFile = + ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary))); + + yaml::Input In(ReadSummaryFile->getBuffer()); + In >> Summary; + ExitOnErr(errorCodeToError(In.error())); + } + + bool Changed = + DevirtModule( + M, AARGetter, + ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr, + ClSummaryAction == PassSummaryAction::Import ? 
&Summary : nullptr) + .run(); + + if (!ClWriteSummary.empty()) { + ExitOnError ExitOnErr( + "-wholeprogramdevirt-write-summary: " + ClWriteSummary + ": "); + std::error_code EC; + raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text); + ExitOnErr(errorCodeToError(EC)); + + yaml::Output Out(OS); + Out << Summary; + } + + return Changed; +} + void DevirtModule::buildTypeIdentifierMap( std::vector<VTableBits> &Bits, DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) { @@ -443,9 +679,31 @@ bool DevirtModule::tryFindVirtualCallTargets( return !TargetsForSlot.empty(); } +void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, + Constant *TheFn, bool &IsExported) { + auto Apply = [&](CallSiteInfo &CSInfo) { + for (auto &&VCallSite : CSInfo.CallSites) { + if (RemarksEnabled) + VCallSite.emitRemark("single-impl", TheFn->getName()); + VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast( + TheFn, VCallSite.CS.getCalledValue()->getType())); + // This use is no longer unsafe. + if (VCallSite.NumUnsafeUses) + --*VCallSite.NumUnsafeUses; + } + if (CSInfo.isExported()) { + IsExported = true; + CSInfo.markDevirt(); + } + }; + Apply(SlotInfo.CSInfo); + for (auto &P : SlotInfo.ConstCSInfo) + Apply(P.second); +} + bool DevirtModule::trySingleImplDevirt( MutableArrayRef<VirtualCallTarget> TargetsForSlot, - MutableArrayRef<VirtualCallSite> CallSites) { + VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res) { // See if the program contains a single implementation of this virtual // function. Function *TheFn = TargetsForSlot[0].Fn; @@ -453,39 +711,51 @@ bool DevirtModule::trySingleImplDevirt( if (TheFn != Target.Fn) return false; + // If so, update each call site to call that implementation directly. if (RemarksEnabled) TargetsForSlot[0].WasDevirt = true; - // If so, update each call site to call that implementation directly. - for (auto &&VCallSite : CallSites) { - if (RemarksEnabled) - VCallSite.emitRemark("single-impl", TheFn->getName()); - VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast( - TheFn, VCallSite.CS.getCalledValue()->getType())); - // This use is no longer unsafe. - if (VCallSite.NumUnsafeUses) - --*VCallSite.NumUnsafeUses; + + bool IsExported = false; + applySingleImplDevirt(SlotInfo, TheFn, IsExported); + if (!IsExported) + return false; + + // If the only implementation has local linkage, we must promote to external + // to make it visible to thin LTO objects. We can only get here during the + // ThinLTO export phase. + if (TheFn->hasLocalLinkage()) { + TheFn->setLinkage(GlobalValue::ExternalLinkage); + TheFn->setVisibility(GlobalValue::HiddenVisibility); + TheFn->setName(TheFn->getName() + "$merged"); } + + Res->TheKind = WholeProgramDevirtResolution::SingleImpl; + Res->SingleImplName = TheFn->getName(); + return true; } bool DevirtModule::tryEvaluateFunctionsWithArgs( MutableArrayRef<VirtualCallTarget> TargetsForSlot, - ArrayRef<ConstantInt *> Args) { + ArrayRef<uint64_t> Args) { // Evaluate each function and store the result in each target's RetVal // field. 
for (VirtualCallTarget &Target : TargetsForSlot) { if (Target.Fn->arg_size() != Args.size() + 1) return false; - for (unsigned I = 0; I != Args.size(); ++I) - if (Target.Fn->getFunctionType()->getParamType(I + 1) != - Args[I]->getType()) - return false; Evaluator Eval(M.getDataLayout(), nullptr); SmallVector<Constant *, 2> EvalArgs; EvalArgs.push_back( Constant::getNullValue(Target.Fn->getFunctionType()->getParamType(0))); - EvalArgs.insert(EvalArgs.end(), Args.begin(), Args.end()); + for (unsigned I = 0; I != Args.size(); ++I) { + auto *ArgTy = dyn_cast<IntegerType>( + Target.Fn->getFunctionType()->getParamType(I + 1)); + if (!ArgTy) + return false; + EvalArgs.push_back(ConstantInt::get(ArgTy, Args[I])); + } + Constant *RetVal; if (!Eval.EvaluateFunction(Target.Fn, RetVal, EvalArgs) || !isa<ConstantInt>(RetVal)) @@ -495,9 +765,18 @@ bool DevirtModule::tryEvaluateFunctionsWithArgs( return true; } +void DevirtModule::applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, + uint64_t TheRetVal) { + for (auto Call : CSInfo.CallSites) + Call.replaceAndErase( + "uniform-ret-val", FnName, RemarksEnabled, + ConstantInt::get(cast<IntegerType>(Call.CS.getType()), TheRetVal)); + CSInfo.markDevirt(); +} + bool DevirtModule::tryUniformRetValOpt( - IntegerType *RetType, MutableArrayRef<VirtualCallTarget> TargetsForSlot, - MutableArrayRef<VirtualCallSite> CallSites) { + MutableArrayRef<VirtualCallTarget> TargetsForSlot, CallSiteInfo &CSInfo, + WholeProgramDevirtResolution::ByArg *Res) { // Uniform return value optimization. If all functions return the same // constant, replace all calls with that constant. uint64_t TheRetVal = TargetsForSlot[0].RetVal; @@ -505,19 +784,77 @@ bool DevirtModule::tryUniformRetValOpt( if (Target.RetVal != TheRetVal) return false; - auto TheRetValConst = ConstantInt::get(RetType, TheRetVal); - for (auto Call : CallSites) - Call.replaceAndErase("uniform-ret-val", TargetsForSlot[0].Fn->getName(), - RemarksEnabled, TheRetValConst); + if (CSInfo.isExported()) { + Res->TheKind = WholeProgramDevirtResolution::ByArg::UniformRetVal; + Res->Info = TheRetVal; + } + + applyUniformRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), TheRetVal); if (RemarksEnabled) for (auto &&Target : TargetsForSlot) Target.WasDevirt = true; return true; } +std::string DevirtModule::getGlobalName(VTableSlot Slot, + ArrayRef<uint64_t> Args, + StringRef Name) { + std::string FullName = "__typeid_"; + raw_string_ostream OS(FullName); + OS << cast<MDString>(Slot.TypeID)->getString() << '_' << Slot.ByteOffset; + for (uint64_t Arg : Args) + OS << '_' << Arg; + OS << '_' << Name; + return OS.str(); +} + +void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, + StringRef Name, Constant *C) { + GlobalAlias *GA = GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage, + getGlobalName(Slot, Args, Name), C, &M); + GA->setVisibility(GlobalValue::HiddenVisibility); +} + +Constant *DevirtModule::importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, + StringRef Name, unsigned AbsWidth) { + Constant *C = M.getOrInsertGlobal(getGlobalName(Slot, Args, Name), Int8Ty); + auto *GV = dyn_cast<GlobalVariable>(C); + // We only need to set metadata if the global is newly created, in which + // case it would not have hidden visibility. 
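getGlobalName above fixes the naming contract between exportGlobal and importGlobal: the export phase publishes a constant under a predictable symbol, and the import phase re-derives the same name in another module. A self-contained version of the scheme (typeidSymbolName is an illustrative name, not part of the pass):

#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

// For example, type id "_ZTS1A" at byte offset 0 with constant arguments
// {1, 2} exporting "byte" yields "__typeid__ZTS1A_0_1_2_byte".
std::string typeidSymbolName(const std::string &TypeId, uint64_t ByteOffset,
                             const std::vector<uint64_t> &Args,
                             const std::string &Name) {
  std::ostringstream OS;
  OS << "__typeid_" << TypeId << '_' << ByteOffset;
  for (uint64_t Arg : Args)
    OS << '_' << Arg;
  OS << '_' << Name;
  return OS.str();
}
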
+ if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility) + return C; + + GV->setVisibility(GlobalValue::HiddenVisibility); + auto SetAbsRange = [&](uint64_t Min, uint64_t Max) { + auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min)); + auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max)); + GV->setMetadata(LLVMContext::MD_absolute_symbol, + MDNode::get(M.getContext(), {MinC, MaxC})); + }; + if (AbsWidth == IntPtrTy->getBitWidth()) + SetAbsRange(~0ull, ~0ull); // Full set. + else if (AbsWidth) + SetAbsRange(0, 1ull << AbsWidth); + return GV; +} + +void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, + bool IsOne, + Constant *UniqueMemberAddr) { + for (auto &&Call : CSInfo.CallSites) { + IRBuilder<> B(Call.CS.getInstruction()); + Value *Cmp = B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, + Call.VTable, UniqueMemberAddr); + Cmp = B.CreateZExt(Cmp, Call.CS->getType()); + Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, Cmp); + } + CSInfo.markDevirt(); +} + bool DevirtModule::tryUniqueRetValOpt( unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot, - MutableArrayRef<VirtualCallSite> CallSites) { + CallSiteInfo &CSInfo, WholeProgramDevirtResolution::ByArg *Res, + VTableSlot Slot, ArrayRef<uint64_t> Args) { // IsOne controls whether we look for a 0 or a 1. auto tryUniqueRetValOptFor = [&](bool IsOne) { const TypeMemberInfo *UniqueMember = nullptr; @@ -533,16 +870,23 @@ bool DevirtModule::tryUniqueRetValOpt( // checked for a uniform return value in tryUniformRetValOpt. assert(UniqueMember); - // Replace each call with the comparison. - for (auto &&Call : CallSites) { - IRBuilder<> B(Call.CS.getInstruction()); - Value *OneAddr = B.CreateBitCast(UniqueMember->Bits->GV, Int8PtrTy); - OneAddr = B.CreateConstGEP1_64(OneAddr, UniqueMember->Offset); - Value *Cmp = B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, - Call.VTable, OneAddr); - Call.replaceAndErase("unique-ret-val", TargetsForSlot[0].Fn->getName(), - RemarksEnabled, Cmp); + Constant *UniqueMemberAddr = + ConstantExpr::getBitCast(UniqueMember->Bits->GV, Int8PtrTy); + UniqueMemberAddr = ConstantExpr::getGetElementPtr( + Int8Ty, UniqueMemberAddr, + ConstantInt::get(Int64Ty, UniqueMember->Offset)); + + if (CSInfo.isExported()) { + Res->TheKind = WholeProgramDevirtResolution::ByArg::UniqueRetVal; + Res->Info = IsOne; + + exportGlobal(Slot, Args, "unique_member", UniqueMemberAddr); } + + // Replace each call with the comparison. + applyUniqueRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), IsOne, + UniqueMemberAddr); + // Update devirtualization statistics for targets. 
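applyUniqueRetValOpt above replaces each virtual call with a pointer comparison against the one vtable whose implementation returns the sought value, zero-extended to the call's return type. A runtime-level sketch of what the emitted IR computes (uniqueRetVal is purely illustrative):

#include <cstdint>

// VTable is the call site's vtable pointer operand; UniqueMemberAddr
// corresponds to the bitcast + GEP of UniqueMember->Bits->GV by its offset.
uint64_t uniqueRetVal(const void *VTable, const void *UniqueMemberAddr,
                      bool IsOne) {
  bool Cmp = IsOne ? (VTable == UniqueMemberAddr)  // icmp eq
                   : (VTable != UniqueMemberAddr); // icmp ne
  return Cmp ? 1 : 0;                              // zext i1 to the return type
}
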
if (RemarksEnabled) for (auto &&Target : TargetsForSlot) @@ -560,9 +904,30 @@ bool DevirtModule::tryUniqueRetValOpt( return false; } +void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName, + Constant *Byte, Constant *Bit) { + for (auto Call : CSInfo.CallSites) { + auto *RetType = cast<IntegerType>(Call.CS.getType()); + IRBuilder<> B(Call.CS.getInstruction()); + Value *Addr = B.CreateGEP(Int8Ty, Call.VTable, Byte); + if (RetType->getBitWidth() == 1) { + Value *Bits = B.CreateLoad(Addr); + Value *BitsAndBit = B.CreateAnd(Bits, Bit); + auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0)); + Call.replaceAndErase("virtual-const-prop-1-bit", FnName, RemarksEnabled, + IsBitSet); + } else { + Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo()); + Value *Val = B.CreateLoad(RetType, ValAddr); + Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled, Val); + } + } + CSInfo.markDevirt(); +} + bool DevirtModule::tryVirtualConstProp( - MutableArrayRef<VirtualCallTarget> TargetsForSlot, - ArrayRef<VirtualCallSite> CallSites) { + MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res, VTableSlot Slot) { // This only works if the function returns an integer. auto RetType = dyn_cast<IntegerType>(TargetsForSlot[0].Fn->getReturnType()); if (!RetType) @@ -571,55 +936,38 @@ bool DevirtModule::tryVirtualConstProp( if (BitWidth > 64) return false; - // Make sure that each function does not access memory, takes at least one - // argument, does not use its first argument (which we assume is 'this'), - // and has the same return type. + // Make sure that each function is defined, does not access memory, takes at + // least one argument, does not use its first argument (which we assume is + // 'this'), and has the same return type. + // + // Note that we test whether this copy of the function is readnone, rather + // than testing function attributes, which must hold for any copy of the + // function, even a less optimized version substituted at link time. This is + // sound because the virtual constant propagation optimizations effectively + // inline all implementations of the virtual function into each call site, + // rather than using function attributes to perform local optimization. for (VirtualCallTarget &Target : TargetsForSlot) { - if (!Target.Fn->doesNotAccessMemory() || Target.Fn->arg_empty() || - !Target.Fn->arg_begin()->use_empty() || + if (Target.Fn->isDeclaration() || + computeFunctionBodyMemoryAccess(*Target.Fn, AARGetter(*Target.Fn)) != + MAK_ReadNone || + Target.Fn->arg_empty() || !Target.Fn->arg_begin()->use_empty() || Target.Fn->getReturnType() != RetType) return false; } - // Group call sites by the list of constant arguments they pass. - // The comparator ensures deterministic ordering. 
- struct ByAPIntValue { - bool operator()(const std::vector<ConstantInt *> &A, - const std::vector<ConstantInt *> &B) const { - return std::lexicographical_compare( - A.begin(), A.end(), B.begin(), B.end(), - [](ConstantInt *AI, ConstantInt *BI) { - return AI->getValue().ult(BI->getValue()); - }); - } - }; - std::map<std::vector<ConstantInt *>, std::vector<VirtualCallSite>, - ByAPIntValue> - VCallSitesByConstantArg; - for (auto &&VCallSite : CallSites) { - std::vector<ConstantInt *> Args; - if (VCallSite.CS.getType() != RetType) - continue; - for (auto &&Arg : - make_range(VCallSite.CS.arg_begin() + 1, VCallSite.CS.arg_end())) { - if (!isa<ConstantInt>(Arg)) - break; - Args.push_back(cast<ConstantInt>(&Arg)); - } - if (Args.size() + 1 != VCallSite.CS.arg_size()) - continue; - - VCallSitesByConstantArg[Args].push_back(VCallSite); - } - - for (auto &&CSByConstantArg : VCallSitesByConstantArg) { + for (auto &&CSByConstantArg : SlotInfo.ConstCSInfo) { if (!tryEvaluateFunctionsWithArgs(TargetsForSlot, CSByConstantArg.first)) continue; - if (tryUniformRetValOpt(RetType, TargetsForSlot, CSByConstantArg.second)) + WholeProgramDevirtResolution::ByArg *ResByArg = nullptr; + if (Res) + ResByArg = &Res->ResByArg[CSByConstantArg.first]; + + if (tryUniformRetValOpt(TargetsForSlot, CSByConstantArg.second, ResByArg)) continue; - if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second)) + if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second, + ResByArg, Slot, CSByConstantArg.first)) continue; // Find an allocation offset in bits in all vtables associated with the @@ -659,26 +1007,20 @@ bool DevirtModule::tryVirtualConstProp( for (auto &&Target : TargetsForSlot) Target.WasDevirt = true; - // Rewrite each call to a load from OffsetByte/OffsetBit. - for (auto Call : CSByConstantArg.second) { - IRBuilder<> B(Call.CS.getInstruction()); - Value *Addr = B.CreateConstGEP1_64(Call.VTable, OffsetByte); - if (BitWidth == 1) { - Value *Bits = B.CreateLoad(Addr); - Value *Bit = ConstantInt::get(Int8Ty, 1ULL << OffsetBit); - Value *BitsAndBit = B.CreateAnd(Bits, Bit); - auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0)); - Call.replaceAndErase("virtual-const-prop-1-bit", - TargetsForSlot[0].Fn->getName(), - RemarksEnabled, IsBitSet); - } else { - Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo()); - Value *Val = B.CreateLoad(RetType, ValAddr); - Call.replaceAndErase("virtual-const-prop", - TargetsForSlot[0].Fn->getName(), - RemarksEnabled, Val); - } + Constant *ByteConst = ConstantInt::get(Int32Ty, OffsetByte); + Constant *BitConst = ConstantInt::get(Int8Ty, 1ULL << OffsetBit); + + if (CSByConstantArg.second.isExported()) { + ResByArg->TheKind = WholeProgramDevirtResolution::ByArg::VirtualConstProp; + exportGlobal(Slot, CSByConstantArg.first, "byte", + ConstantExpr::getIntToPtr(ByteConst, Int8PtrTy)); + exportGlobal(Slot, CSByConstantArg.first, "bit", + ConstantExpr::getIntToPtr(BitConst, Int8PtrTy)); } + + // Rewrite each call to a load from OffsetByte/OffsetBit. 
+ applyVirtualConstProp(CSByConstantArg.second, + TargetsForSlot[0].Fn->getName(), ByteConst, BitConst); } return true; } @@ -733,7 +1075,11 @@ bool DevirtModule::areRemarksEnabled() { if (FL.empty()) return false; const Function &Fn = FL.front(); - auto DI = OptimizationRemark(DEBUG_TYPE, Fn, DebugLoc(), ""); + + const auto &BBL = Fn.getBasicBlockList(); + if (BBL.empty()) + return false; + auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBL.front()); return DI.isEnabled(); } @@ -766,8 +1112,8 @@ void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc, Value *Ptr = CI->getArgOperand(0)->stripPointerCasts(); if (SeenPtrs.insert(Ptr).second) { for (DevirtCallSite Call : DevirtCalls) { - CallSlots[{TypeId, Call.Offset}].push_back( - {CI->getArgOperand(0), Call.CS, nullptr}); + CallSlots[{TypeId, Call.Offset}].addCallSite(CI->getArgOperand(0), + Call.CS, nullptr); } } } @@ -853,14 +1199,79 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) { if (HasNonCallUses) ++NumUnsafeUses; for (DevirtCallSite Call : DevirtCalls) { - CallSlots[{TypeId, Call.Offset}].push_back( - {Ptr, Call.CS, &NumUnsafeUses}); + CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CS, + &NumUnsafeUses); } CI->eraseFromParent(); } } +void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) { + const TypeIdSummary *TidSummary = + ImportSummary->getTypeIdSummary(cast<MDString>(Slot.TypeID)->getString()); + if (!TidSummary) + return; + auto ResI = TidSummary->WPDRes.find(Slot.ByteOffset); + if (ResI == TidSummary->WPDRes.end()) + return; + const WholeProgramDevirtResolution &Res = ResI->second; + + if (Res.TheKind == WholeProgramDevirtResolution::SingleImpl) { + // The type of the function in the declaration is irrelevant because every + // call site will cast it to the correct type. + auto *SingleImpl = M.getOrInsertFunction( + Res.SingleImplName, Type::getVoidTy(M.getContext())); + + // This is the import phase so we should not be exporting anything. + bool IsExported = false; + applySingleImplDevirt(SlotInfo, SingleImpl, IsExported); + assert(!IsExported); + } + + for (auto &CSByConstantArg : SlotInfo.ConstCSInfo) { + auto I = Res.ResByArg.find(CSByConstantArg.first); + if (I == Res.ResByArg.end()) + continue; + auto &ResByArg = I->second; + // FIXME: We should figure out what to do about the "function name" argument + // to the apply* functions, as the function names are unavailable during the + // importing phase. For now we just pass the empty string. This does not + // impact correctness because the function names are just used for remarks. 
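applyVirtualConstProp above rewrites each grouped call into a load from the bytes that rebuildGlobal lays out next to the vtable. A host-side sketch of the value the emitted IR produces, assuming (unlike the real pass) that host and target byte order agree; loadVirtualConstant is an illustrative name:

#include <cstdint>
#include <cstring>

uint64_t loadVirtualConstant(const unsigned char *VTable, int OffsetByte,
                             unsigned RetBitWidth, unsigned char BitMask) {
  const unsigned char *Addr = VTable + OffsetByte; // gep i8, VTable, Byte
  if (RetBitWidth == 1)
    return (*Addr & BitMask) != 0;                 // load i8; and; icmp ne 0
  uint64_t Val = 0;
  std::memcpy(&Val, Addr, RetBitWidth / 8);        // bitcast + load iN
  return Val;
}
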
+ switch (ResByArg.TheKind) { + case WholeProgramDevirtResolution::ByArg::UniformRetVal: + applyUniformRetValOpt(CSByConstantArg.second, "", ResByArg.Info); + break; + case WholeProgramDevirtResolution::ByArg::UniqueRetVal: { + Constant *UniqueMemberAddr = + importGlobal(Slot, CSByConstantArg.first, "unique_member"); + applyUniqueRetValOpt(CSByConstantArg.second, "", ResByArg.Info, + UniqueMemberAddr); + break; + } + case WholeProgramDevirtResolution::ByArg::VirtualConstProp: { + Constant *Byte = importGlobal(Slot, CSByConstantArg.first, "byte", 32); + Byte = ConstantExpr::getPtrToInt(Byte, Int32Ty); + Constant *Bit = importGlobal(Slot, CSByConstantArg.first, "bit", 8); + Bit = ConstantExpr::getPtrToInt(Bit, Int8Ty); + applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit); + } + default: + break; + } + } +} + +void DevirtModule::removeRedundantTypeTests() { + auto True = ConstantInt::getTrue(M.getContext()); + for (auto &&U : NumUnsafeUsesForTypeTest) { + if (U.second == 0) { + U.first->replaceAllUsesWith(True); + U.first->eraseFromParent(); + } + } +} + bool DevirtModule::run() { Function *TypeTestFunc = M.getFunction(Intrinsic::getName(Intrinsic::type_test)); @@ -868,7 +1279,11 @@ bool DevirtModule::run() { M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load)); Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume)); - if ((!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc || + // Normally if there are no users of the devirtualization intrinsics in the + // module, this pass has nothing to do. But if we are exporting, we also need + // to handle any users that appear only in the function summaries. + if (!ExportSummary && + (!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc || AssumeFunc->use_empty()) && (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty())) return false; @@ -879,6 +1294,17 @@ bool DevirtModule::run() { if (TypeCheckedLoadFunc) scanTypeCheckedLoadUsers(TypeCheckedLoadFunc); + if (ImportSummary) { + for (auto &S : CallSlots) + importResolution(S.first, S.second); + + removeRedundantTypeTests(); + + // The rest of the code is only necessary when exporting or during regular + // LTO, so we are done. + return true; + } + // Rebuild type metadata into a map for easy lookup. std::vector<VTableBits> Bits; DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap; @@ -886,6 +1312,53 @@ bool DevirtModule::run() { if (TypeIdMap.empty()) return true; + // Collect information from summary about which calls to try to devirtualize. + if (ExportSummary) { + DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID; + for (auto &P : TypeIdMap) { + if (auto *TypeId = dyn_cast<MDString>(P.first)) + MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back( + TypeId); + } + + for (auto &P : *ExportSummary) { + for (auto &S : P.second.SummaryList) { + auto *FS = dyn_cast<FunctionSummary>(S.get()); + if (!FS) + continue; + // FIXME: Only add live functions. 
+ for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) { + for (Metadata *MD : MetadataByGUID[VF.GUID]) { + CallSlots[{MD, VF.Offset}].CSInfo.SummaryHasTypeTestAssumeUsers = + true; + } + } + for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) { + for (Metadata *MD : MetadataByGUID[VF.GUID]) { + CallSlots[{MD, VF.Offset}] + .CSInfo.SummaryTypeCheckedLoadUsers.push_back(FS); + } + } + for (const FunctionSummary::ConstVCall &VC : + FS->type_test_assume_const_vcalls()) { + for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) { + CallSlots[{MD, VC.VFunc.Offset}] + .ConstCSInfo[VC.Args] + .SummaryHasTypeTestAssumeUsers = true; + } + } + for (const FunctionSummary::ConstVCall &VC : + FS->type_checked_load_const_vcalls()) { + for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) { + CallSlots[{MD, VC.VFunc.Offset}] + .ConstCSInfo[VC.Args] + .SummaryTypeCheckedLoadUsers.push_back(FS); + } + } + } + } + } + // For each (type, offset) pair: bool DidVirtualConstProp = false; std::map<std::string, Function*> DevirtTargets; @@ -894,19 +1367,39 @@ bool DevirtModule::run() { // function implementation at offset S.first.ByteOffset, and add to // TargetsForSlot. std::vector<VirtualCallTarget> TargetsForSlot; - if (!tryFindVirtualCallTargets(TargetsForSlot, TypeIdMap[S.first.TypeID], - S.first.ByteOffset)) - continue; - - if (!trySingleImplDevirt(TargetsForSlot, S.second) && - tryVirtualConstProp(TargetsForSlot, S.second)) + if (tryFindVirtualCallTargets(TargetsForSlot, TypeIdMap[S.first.TypeID], + S.first.ByteOffset)) { + WholeProgramDevirtResolution *Res = nullptr; + if (ExportSummary && isa<MDString>(S.first.TypeID)) + Res = &ExportSummary + ->getOrInsertTypeIdSummary( + cast<MDString>(S.first.TypeID)->getString()) + .WPDRes[S.first.ByteOffset]; + + if (!trySingleImplDevirt(TargetsForSlot, S.second, Res) && + tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first)) DidVirtualConstProp = true; - // Collect functions devirtualized at least for one call site for stats. - if (RemarksEnabled) - for (const auto &T : TargetsForSlot) - if (T.WasDevirt) - DevirtTargets[T.Fn->getName()] = T.Fn; + // Collect functions devirtualized at least for one call site for stats. + if (RemarksEnabled) + for (const auto &T : TargetsForSlot) + if (T.WasDevirt) + DevirtTargets[T.Fn->getName()] = T.Fn; + } + + // CFI-specific: if we are exporting and any llvm.type.checked.load + // intrinsics were *not* devirtualized, we need to add the resulting + // llvm.type.test intrinsics to the function summaries so that the + // LowerTypeTests pass will export them. + if (ExportSummary && isa<MDString>(S.first.TypeID)) { + auto GUID = + GlobalValue::getGUID(cast<MDString>(S.first.TypeID)->getString()); + for (auto FS : S.second.CSInfo.SummaryTypeCheckedLoadUsers) + FS->addTypeTest(GUID); + for (auto &CCS : S.second.ConstCSInfo) + for (auto FS : CCS.second.SummaryTypeCheckedLoadUsers) + FS->addTypeTest(GUID); + } } if (RemarksEnabled) { @@ -914,23 +1407,12 @@ bool DevirtModule::run() { for (const auto &DT : DevirtTargets) { Function *F = DT.second; DISubprogram *SP = F->getSubprogram(); - DebugLoc DL = SP ? DebugLoc::get(SP->getScopeLine(), 0, SP) : DebugLoc(); - emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, DL, + emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, SP, Twine("devirtualized ") + F->getName()); } } - // If we were able to eliminate all unsafe uses for a type checked load, - // eliminate the type test by replacing it with true. 
-  if (TypeCheckedLoadFunc) {
-    auto True = ConstantInt::getTrue(M.getContext());
-    for (auto &&U : NumUnsafeUsesForTypeTest) {
-      if (U.second == 0) {
-        U.first->replaceAllUsesWith(True);
-        U.first->eraseFromParent();
-      }
-    }
-  }
+  removeRedundantTypeTests();
 
   // Rebuild each global we touched as part of virtual constant propagation to
   // include the before and after bytes.
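The export and import phases communicate only through per-slot records in the combined summary: the export phase fills WPDRes entries via getOrInsertTypeIdSummary, and importResolution looks them up by type identifier and byte offset. A simplified model of that handshake with plain containers standing in for ModuleSummaryIndex (all type and function names here are illustrative):

#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Cut-down stand-in for WholeProgramDevirtResolution.
struct Resolution {
  enum Kind { Indir, SingleImpl } TheKind = Indir;
  std::string SingleImplName; // callee when TheKind == SingleImpl
  struct ByArg {
    enum Kind { Indir, UniformRetVal, UniqueRetVal, VirtualConstProp } TheKind = Indir;
    uint64_t Info = 0;        // uniform return value, or the IsOne flag
  };
  std::map<std::vector<uint64_t>, ByArg> ResByArg;
};

// Slots are keyed the same way the pass keys its work list:
// (type identifier, byte offset within the vtable).
using SlotKey = std::pair<std::string, uint64_t>;

// Export phase (thin link over the merged module): record the decision.
void recordSingleImpl(std::map<SlotKey, Resolution> &Summary, SlotKey Slot,
                      std::string Target) {
  Resolution &R = Summary[Slot];
  R.TheKind = Resolution::SingleImpl;
  R.SingleImplName = std::move(Target);
}

// Import phase (per-module thin backend): look the decision up and apply it.
const Resolution *lookupResolution(const std::map<SlotKey, Resolution> &Summary,
                                   const SlotKey &Slot) {
  auto It = Summary.find(Slot);
  return It == Summary.end() ? nullptr : &It->second;
}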