diff options
Diffstat (limited to 'contrib/llvm/lib/Transforms')
103 files changed, 66551 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp new file mode 100644 index 0000000..0c650cf --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -0,0 +1,906 @@ +//===-- ArgumentPromotion.cpp - Promote by-reference arguments ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass promotes "by reference" arguments to be "by value" arguments. In +// practice, this means looking for internal functions that have pointer +// arguments. If it can prove, through the use of alias analysis, that an +// argument is *only* loaded, then it can pass the value into the function +// instead of the address of the value. This can cause recursive simplification +// of code and lead to the elimination of allocas (especially in C++ template +// code like the STL). +// +// This pass also handles aggregate arguments that are passed into a function, +// scalarizing them if the elements of the aggregate are only loaded. Note that +// by default it refuses to scalarize aggregates which would require passing in +// more than three operands to the function, because passing thousands of +// operands for a large array or structure is unprofitable! This limit can be +// configured or disabled, however. +// +// Note that this transformation could also be done for arguments that are only +// stored to (returning the value instead), but does not currently. This case +// would be best handled when and if LLVM begins supporting multiple return +// values from functions. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "argpromotion" +#include "llvm/Transforms/IPO.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Module.h" +#include "llvm/CallGraphSCCPass.h" +#include "llvm/Instructions.h" +#include "llvm/LLVMContext.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include <set> +using namespace llvm; + +STATISTIC(NumArgumentsPromoted , "Number of pointer arguments promoted"); +STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted"); +STATISTIC(NumByValArgsPromoted , "Number of byval arguments promoted"); +STATISTIC(NumArgumentsDead , "Number of dead pointer args eliminated"); + +namespace { + /// ArgPromotion - The 'by reference' to 'by value' argument promotion pass. 
+ /// + struct ArgPromotion : public CallGraphSCCPass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AliasAnalysis>(); + CallGraphSCCPass::getAnalysisUsage(AU); + } + + virtual bool runOnSCC(CallGraphSCC &SCC); + static char ID; // Pass identification, replacement for typeid + explicit ArgPromotion(unsigned maxElements = 3) + : CallGraphSCCPass(ID), maxElements(maxElements) { + initializeArgPromotionPass(*PassRegistry::getPassRegistry()); + } + + /// A vector used to hold the indices of a single GEP instruction + typedef std::vector<uint64_t> IndicesVector; + + private: + CallGraphNode *PromoteArguments(CallGraphNode *CGN); + bool isSafeToPromoteArgument(Argument *Arg, bool isByVal) const; + CallGraphNode *DoPromotion(Function *F, + SmallPtrSet<Argument*, 8> &ArgsToPromote, + SmallPtrSet<Argument*, 8> &ByValArgsToTransform); + /// The maximum number of elements to expand, or 0 for unlimited. + unsigned maxElements; + }; +} + +char ArgPromotion::ID = 0; +INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion", + "Promote 'by reference' arguments to scalars", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_END(ArgPromotion, "argpromotion", + "Promote 'by reference' arguments to scalars", false, false) + +Pass *llvm::createArgumentPromotionPass(unsigned maxElements) { + return new ArgPromotion(maxElements); +} + +bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) { + bool Changed = false, LocalChange; + + do { // Iterate until we stop promoting from this SCC. + LocalChange = false; + // Attempt to promote arguments from all functions in this SCC. + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + if (CallGraphNode *CGN = PromoteArguments(*I)) { + LocalChange = true; + SCC.ReplaceNode(*I, CGN); + } + } + Changed |= LocalChange; // Remember that we changed something. 
+ } while (LocalChange); + + return Changed; +} + +/// PromoteArguments - This method checks the specified function to see if there +/// are any promotable arguments and if it is safe to promote the function (for +/// example, all callers are direct). If safe to promote some arguments, it +/// calls the DoPromotion method. +/// +CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) { + Function *F = CGN->getFunction(); + + // Make sure that it is local to this module. + if (!F || !F->hasLocalLinkage()) return 0; + + // First check: see if there are any pointer arguments! If not, quick exit. + SmallVector<std::pair<Argument*, unsigned>, 16> PointerArgs; + unsigned ArgNo = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I, ++ArgNo) + if (I->getType()->isPointerTy()) + PointerArgs.push_back(std::pair<Argument*, unsigned>(I, ArgNo)); + if (PointerArgs.empty()) return 0; + + // Second check: make sure that all callers are direct callers. We can't + // transform functions that have indirect callers. Also see if the function + // is self-recursive. + bool isSelfRecursive = false; + for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); + UI != E; ++UI) { + CallSite CS(*UI); + // Must be a direct call. + if (CS.getInstruction() == 0 || !CS.isCallee(UI)) return 0; + + if (CS.getInstruction()->getParent()->getParent() == F) + isSelfRecursive = true; + } + + // Check to see which arguments are promotable. If an argument is promotable, + // add it to ArgsToPromote. 
+ SmallPtrSet<Argument*, 8> ArgsToPromote; + SmallPtrSet<Argument*, 8> ByValArgsToTransform; + for (unsigned i = 0; i != PointerArgs.size(); ++i) { + bool isByVal = F->paramHasAttr(PointerArgs[i].second+1, Attribute::ByVal); + Argument *PtrArg = PointerArgs[i].first; + const Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType(); + + // If this is a byval argument, and if the aggregate type is small, just + // pass the elements, which is always safe. + if (isByVal) { + if (const StructType *STy = dyn_cast<StructType>(AgTy)) { + if (maxElements > 0 && STy->getNumElements() > maxElements) { + DEBUG(dbgs() << "argpromotion disable promoting argument '" + << PtrArg->getName() << "' because it would require adding more" + << " than " << maxElements << " arguments to the function.\n"); + continue; + } + + // If all the elements are single-value types, we can promote it. + bool AllSimple = true; + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + if (!STy->getElementType(i)->isSingleValueType()) { + AllSimple = false; + break; + } + } + + // Safe to transform, don't even bother trying to "promote" it. + // Passing the elements as a scalar will allow scalarrepl to hack on + // the new alloca we introduce. + if (AllSimple) { + ByValArgsToTransform.insert(PtrArg); + continue; + } + } + } + + // If the argument is a recursive type and we're in a recursive + // function, we could end up infinitely peeling the function argument. + if (isSelfRecursive) { + if (const StructType *STy = dyn_cast<StructType>(AgTy)) { + bool RecursiveType = false; + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + if (STy->getElementType(i) == PtrArg->getType()) { + RecursiveType = true; + break; + } + } + if (RecursiveType) + continue; + } + } + + // Otherwise, see if we can promote the pointer to its value. + if (isSafeToPromoteArgument(PtrArg, isByVal)) + ArgsToPromote.insert(PtrArg); + } + + // No promotable pointer arguments. 
+ if (ArgsToPromote.empty() && ByValArgsToTransform.empty()) + return 0; + + return DoPromotion(F, ArgsToPromote, ByValArgsToTransform); +} + +/// AllCallersPassInValidPointerForArgument - Return true if we can prove that +/// all callees pass in a valid pointer for the specified function argument. +static bool AllCallersPassInValidPointerForArgument(Argument *Arg) { + Function *Callee = Arg->getParent(); + + unsigned ArgNo = std::distance(Callee->arg_begin(), + Function::arg_iterator(Arg)); + + // Look at all call sites of the function. At this pointer we know we only + // have direct callees. + for (Value::use_iterator UI = Callee->use_begin(), E = Callee->use_end(); + UI != E; ++UI) { + CallSite CS(*UI); + assert(CS && "Should only have direct calls!"); + + if (!CS.getArgument(ArgNo)->isDereferenceablePointer()) + return false; + } + return true; +} + +/// Returns true if Prefix is a prefix of longer. That means, Longer has a size +/// that is greater than or equal to the size of prefix, and each of the +/// elements in Prefix is the same as the corresponding elements in Longer. +/// +/// This means it also returns true when Prefix and Longer are equal! +static bool IsPrefix(const ArgPromotion::IndicesVector &Prefix, + const ArgPromotion::IndicesVector &Longer) { + if (Prefix.size() > Longer.size()) + return false; + for (unsigned i = 0, e = Prefix.size(); i != e; ++i) + if (Prefix[i] != Longer[i]) + return false; + return true; +} + + +/// Checks if Indices, or a prefix of Indices, is in Set. +static bool PrefixIn(const ArgPromotion::IndicesVector &Indices, + std::set<ArgPromotion::IndicesVector> &Set) { + std::set<ArgPromotion::IndicesVector>::iterator Low; + Low = Set.upper_bound(Indices); + if (Low != Set.begin()) + Low--; + // Low is now the last element smaller than or equal to Indices. This means + // it points to a prefix of Indices (possibly Indices itself), if such + // prefix exists. 
+ // + // This load is safe if any prefix of its operands is safe to load. + return Low != Set.end() && IsPrefix(*Low, Indices); +} + +/// Mark the given indices (ToMark) as safe in the given set of indices +/// (Safe). Marking safe usually means adding ToMark to Safe. However, if there +/// is already a prefix of Indices in Safe, Indices are implicitely marked safe +/// already. Furthermore, any indices that Indices is itself a prefix of, are +/// removed from Safe (since they are implicitely safe because of Indices now). +static void MarkIndicesSafe(const ArgPromotion::IndicesVector &ToMark, + std::set<ArgPromotion::IndicesVector> &Safe) { + std::set<ArgPromotion::IndicesVector>::iterator Low; + Low = Safe.upper_bound(ToMark); + // Guard against the case where Safe is empty + if (Low != Safe.begin()) + Low--; + // Low is now the last element smaller than or equal to Indices. This + // means it points to a prefix of Indices (possibly Indices itself), if + // such prefix exists. + if (Low != Safe.end()) { + if (IsPrefix(*Low, ToMark)) + // If there is already a prefix of these indices (or exactly these + // indices) marked a safe, don't bother adding these indices + return; + + // Increment Low, so we can use it as a "insert before" hint + ++Low; + } + // Insert + Low = Safe.insert(Low, ToMark); + ++Low; + // If there we're a prefix of longer index list(s), remove those + std::set<ArgPromotion::IndicesVector>::iterator End = Safe.end(); + while (Low != End && IsPrefix(ToMark, *Low)) { + std::set<ArgPromotion::IndicesVector>::iterator Remove = Low; + ++Low; + Safe.erase(Remove); + } +} + +/// isSafeToPromoteArgument - As you might guess from the name of this method, +/// it checks to see if it is both safe and useful to promote the argument. +/// This method limits promotion of aggregates to only promote up to three +/// elements of the aggregate in order to avoid exploding the number of +/// arguments passed in. 
+bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const { + typedef std::set<IndicesVector> GEPIndicesSet; + + // Quick exit for unused arguments + if (Arg->use_empty()) + return true; + + // We can only promote this argument if all of the uses are loads, or are GEP + // instructions (with constant indices) that are subsequently loaded. + // + // Promoting the argument causes it to be loaded in the caller + // unconditionally. This is only safe if we can prove that either the load + // would have happened in the callee anyway (ie, there is a load in the entry + // block) or the pointer passed in at every call site is guaranteed to be + // valid. + // In the former case, invalid loads can happen, but would have happened + // anyway, in the latter case, invalid loads won't happen. This prevents us + // from introducing an invalid load that wouldn't have happened in the + // original code. + // + // This set will contain all sets of indices that are loaded in the entry + // block, and thus are safe to unconditionally load in the caller. + GEPIndicesSet SafeToUnconditionallyLoad; + + // This set contains all the sets of indices that we are planning to promote. + // This makes it possible to limit the number of arguments added. + GEPIndicesSet ToPromote; + + // If the pointer is always valid, any load with first index 0 is valid. + if (isByVal || AllCallersPassInValidPointerForArgument(Arg)) + SafeToUnconditionallyLoad.insert(IndicesVector(1, 0)); + + // First, iterate the entry block and mark loads of (geps of) arguments as + // safe. 
+ BasicBlock *EntryBlock = Arg->getParent()->begin(); + // Declare this here so we can reuse it + IndicesVector Indices; + for (BasicBlock::iterator I = EntryBlock->begin(), E = EntryBlock->end(); + I != E; ++I) + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + Value *V = LI->getPointerOperand(); + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) { + V = GEP->getPointerOperand(); + if (V == Arg) { + // This load actually loads (part of) Arg? Check the indices then. + Indices.reserve(GEP->getNumIndices()); + for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end(); + II != IE; ++II) + if (ConstantInt *CI = dyn_cast<ConstantInt>(*II)) + Indices.push_back(CI->getSExtValue()); + else + // We found a non-constant GEP index for this argument? Bail out + // right away, can't promote this argument at all. + return false; + + // Indices checked out, mark them as safe + MarkIndicesSafe(Indices, SafeToUnconditionallyLoad); + Indices.clear(); + } + } else if (V == Arg) { + // Direct loads are equivalent to a GEP with a single 0 index. + MarkIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad); + } + } + + // Now, iterate all uses of the argument to see if there are any uses that are + // not (GEP+)loads, or any (GEP+)loads that are not safe to promote. + SmallVector<LoadInst*, 16> Loads; + IndicesVector Operands; + for (Value::use_iterator UI = Arg->use_begin(), E = Arg->use_end(); + UI != E; ++UI) { + User *U = *UI; + Operands.clear(); + if (LoadInst *LI = dyn_cast<LoadInst>(U)) { + if (LI->isVolatile()) return false; // Don't hack volatile loads + Loads.push_back(LI); + // Direct loads are equivalent to a GEP with a zero index and then a load. + Operands.push_back(0); + } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) { + if (GEP->use_empty()) { + // Dead GEP's cause trouble later. Just remove them if we run into + // them. 
+ getAnalysis<AliasAnalysis>().deleteValue(GEP); + GEP->eraseFromParent(); + // TODO: This runs the above loop over and over again for dead GEPs + // Couldn't we just do increment the UI iterator earlier and erase the + // use? + return isSafeToPromoteArgument(Arg, isByVal); + } + + // Ensure that all of the indices are constants. + for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end(); + i != e; ++i) + if (ConstantInt *C = dyn_cast<ConstantInt>(*i)) + Operands.push_back(C->getSExtValue()); + else + return false; // Not a constant operand GEP! + + // Ensure that the only users of the GEP are load instructions. + for (Value::use_iterator UI = GEP->use_begin(), E = GEP->use_end(); + UI != E; ++UI) + if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) { + if (LI->isVolatile()) return false; // Don't hack volatile loads + Loads.push_back(LI); + } else { + // Other uses than load? + return false; + } + } else { + return false; // Not a load or a GEP. + } + + // Now, see if it is safe to promote this load / loads of this GEP. Loading + // is safe if Operands, or a prefix of Operands, is marked as safe. + if (!PrefixIn(Operands, SafeToUnconditionallyLoad)) + return false; + + // See if we are already promoting a load with these indices. If not, check + // to make sure that we aren't promoting too many elements. If so, nothing + // to do. + if (ToPromote.find(Operands) == ToPromote.end()) { + if (maxElements > 0 && ToPromote.size() == maxElements) { + DEBUG(dbgs() << "argpromotion not promoting argument '" + << Arg->getName() << "' because it would require adding more " + << "than " << maxElements << " arguments to the function.\n"); + // We limit aggregate promotion to only promoting up to a fixed number + // of elements of the aggregate. + return false; + } + ToPromote.insert(Operands); + } + } + + if (Loads.empty()) return true; // No users, this is a dead argument. 
+ + // Okay, now we know that the argument is only used by load instructions and + // it is safe to unconditionally perform all of them. Use alias analysis to + // check to see if the pointer is guaranteed to not be modified from entry of + // the function to each of the load instructions. + + // Because there could be several/many load instructions, remember which + // blocks we know to be transparent to the load. + SmallPtrSet<BasicBlock*, 16> TranspBlocks; + + AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + + for (unsigned i = 0, e = Loads.size(); i != e; ++i) { + // Check to see if the load is invalidated from the start of the block to + // the load itself. + LoadInst *Load = Loads[i]; + BasicBlock *BB = Load->getParent(); + + AliasAnalysis::Location Loc = AA.getLocation(Load); + if (AA.canInstructionRangeModify(BB->front(), *Load, Loc)) + return false; // Pointer is invalidated! + + // Now check every path from the entry block to the load for transparency. + // To do this, we perform a depth first search on the inverse CFG from the + // loading block. + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + BasicBlock *P = *PI; + for (idf_ext_iterator<BasicBlock*, SmallPtrSet<BasicBlock*, 16> > + I = idf_ext_begin(P, TranspBlocks), + E = idf_ext_end(P, TranspBlocks); I != E; ++I) + if (AA.canBasicBlockModify(**I, Loc)) + return false; + } + } + + // If the path from the entry of the function to each load is free of + // instructions that potentially invalidate the load, we can make the + // transformation! + return true; +} + +/// DoPromotion - This method actually performs the promotion of the specified +/// arguments, and returns the new function. At this point, we know that it's +/// safe to do so. 
+CallGraphNode *ArgPromotion::DoPromotion(Function *F, + SmallPtrSet<Argument*, 8> &ArgsToPromote, + SmallPtrSet<Argument*, 8> &ByValArgsToTransform) { + + // Start by computing a new prototype for the function, which is the same as + // the old function, but has modified arguments. + const FunctionType *FTy = F->getFunctionType(); + std::vector<const Type*> Params; + + typedef std::set<IndicesVector> ScalarizeTable; + + // ScalarizedElements - If we are promoting a pointer that has elements + // accessed out of it, keep track of which elements are accessed so that we + // can add one argument for each. + // + // Arguments that are directly loaded will have a zero element value here, to + // handle cases where there are both a direct load and GEP accesses. + // + std::map<Argument*, ScalarizeTable> ScalarizedElements; + + // OriginalLoads - Keep track of a representative load instruction from the + // original function so that we can tell the alias analysis implementation + // what the new GEP/Load instructions we are inserting look like. + std::map<IndicesVector, LoadInst*> OriginalLoads; + + // Attributes - Keep track of the parameter attributes for the arguments + // that we are *not* promoting. For the ones that we do promote, the parameter + // attributes are lost + SmallVector<AttributeWithIndex, 8> AttributesVec; + const AttrListPtr &PAL = F->getAttributes(); + + // Add any return attributes. + if (Attributes attrs = PAL.getRetAttributes()) + AttributesVec.push_back(AttributeWithIndex::get(0, attrs)); + + // First, determine the new argument list + unsigned ArgIndex = 1; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++ArgIndex) { + if (ByValArgsToTransform.count(I)) { + // Simple byval argument? Just add all the struct element types. 
+ const Type *AgTy = cast<PointerType>(I->getType())->getElementType(); + const StructType *STy = cast<StructType>(AgTy); + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) + Params.push_back(STy->getElementType(i)); + ++NumByValArgsPromoted; + } else if (!ArgsToPromote.count(I)) { + // Unchanged argument + Params.push_back(I->getType()); + if (Attributes attrs = PAL.getParamAttributes(ArgIndex)) + AttributesVec.push_back(AttributeWithIndex::get(Params.size(), attrs)); + } else if (I->use_empty()) { + // Dead argument (which are always marked as promotable) + ++NumArgumentsDead; + } else { + // Okay, this is being promoted. This means that the only uses are loads + // or GEPs which are only used by loads + + // In this table, we will track which indices are loaded from the argument + // (where direct loads are tracked as no indices). + ScalarizeTable &ArgIndices = ScalarizedElements[I]; + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E; + ++UI) { + Instruction *User = cast<Instruction>(*UI); + assert(isa<LoadInst>(User) || isa<GetElementPtrInst>(User)); + IndicesVector Indices; + Indices.reserve(User->getNumOperands() - 1); + // Since loads will only have a single operand, and GEPs only a single + // non-index operand, this will record direct loads without any indices, + // and gep+loads with the GEP indices. + for (User::op_iterator II = User->op_begin() + 1, IE = User->op_end(); + II != IE; ++II) + Indices.push_back(cast<ConstantInt>(*II)->getSExtValue()); + // GEPs with a single 0 index can be merged with direct loads + if (Indices.size() == 1 && Indices.front() == 0) + Indices.clear(); + ArgIndices.insert(Indices); + LoadInst *OrigLoad; + if (LoadInst *L = dyn_cast<LoadInst>(User)) + OrigLoad = L; + else + // Take any load, we will use it only to update Alias Analysis + OrigLoad = cast<LoadInst>(User->use_back()); + OriginalLoads[Indices] = OrigLoad; + } + + // Add a parameter to the function for each element passed in. 
+ for (ScalarizeTable::iterator SI = ArgIndices.begin(), + E = ArgIndices.end(); SI != E; ++SI) { + // not allowed to dereference ->begin() if size() is 0 + Params.push_back(GetElementPtrInst::getIndexedType(I->getType(), + SI->begin(), + SI->end())); + assert(Params.back()); + } + + if (ArgIndices.size() == 1 && ArgIndices.begin()->empty()) + ++NumArgumentsPromoted; + else + ++NumAggregatesPromoted; + } + } + + // Add any function attributes. + if (Attributes attrs = PAL.getFnAttributes()) + AttributesVec.push_back(AttributeWithIndex::get(~0, attrs)); + + const Type *RetTy = FTy->getReturnType(); + + // Work around LLVM bug PR56: the CWriter cannot emit varargs functions which + // have zero fixed arguments. + bool ExtraArgHack = false; + if (Params.empty() && FTy->isVarArg()) { + ExtraArgHack = true; + Params.push_back(Type::getInt32Ty(F->getContext())); + } + + // Construct the new function type using the new arguments. + FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg()); + + // Create the new function body and insert it into the module. + Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName()); + NF->copyAttributesFrom(F); + + + DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n" + << "From: " << *F); + + // Recompute the parameter attributes list based on the new arguments for + // the function. + NF->setAttributes(AttrListPtr::get(AttributesVec.begin(), + AttributesVec.end())); + AttributesVec.clear(); + + F->getParent()->getFunctionList().insert(F, NF); + NF->takeName(F); + + // Get the alias analysis information that we need to update to reflect our + // changes. + AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + + // Get the callgraph information that we need to update to reflect our + // changes. + CallGraph &CG = getAnalysis<CallGraph>(); + + // Get a new callgraph node for NF. 
+ CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF); + + // Loop over all of the callers of the function, transforming the call sites + // to pass in the loaded pointers. + // + SmallVector<Value*, 16> Args; + while (!F->use_empty()) { + CallSite CS(F->use_back()); + assert(CS.getCalledFunction() == F); + Instruction *Call = CS.getInstruction(); + const AttrListPtr &CallPAL = CS.getAttributes(); + + // Add any return attributes. + if (Attributes attrs = CallPAL.getRetAttributes()) + AttributesVec.push_back(AttributeWithIndex::get(0, attrs)); + + // Loop over the operands, inserting GEP and loads in the caller as + // appropriate. + CallSite::arg_iterator AI = CS.arg_begin(); + ArgIndex = 1; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I, ++AI, ++ArgIndex) + if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) { + Args.push_back(*AI); // Unmodified argument + + if (Attributes Attrs = CallPAL.getParamAttributes(ArgIndex)) + AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs)); + + } else if (ByValArgsToTransform.count(I)) { + // Emit a GEP and load for each element of the struct. + const Type *AgTy = cast<PointerType>(I->getType())->getElementType(); + const StructType *STy = cast<StructType>(AgTy); + Value *Idxs[2] = { + ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), 0 }; + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); + Value *Idx = GetElementPtrInst::Create(*AI, Idxs, Idxs+2, + (*AI)->getName()+"."+utostr(i), + Call); + // TODO: Tell AA about the new values? + Args.push_back(new LoadInst(Idx, Idx->getName()+".val", Call)); + } + } else if (!I->use_empty()) { + // Non-dead argument: insert GEPs and loads as appropriate. + ScalarizeTable &ArgIndices = ScalarizedElements[I]; + // Store the Value* version of the indices in here, but declare it now + // for reuse. 
+ std::vector<Value*> Ops; + for (ScalarizeTable::iterator SI = ArgIndices.begin(), + E = ArgIndices.end(); SI != E; ++SI) { + Value *V = *AI; + LoadInst *OrigLoad = OriginalLoads[*SI]; + if (!SI->empty()) { + Ops.reserve(SI->size()); + const Type *ElTy = V->getType(); + for (IndicesVector::const_iterator II = SI->begin(), + IE = SI->end(); II != IE; ++II) { + // Use i32 to index structs, and i64 for others (pointers/arrays). + // This satisfies GEP constraints. + const Type *IdxTy = (ElTy->isStructTy() ? + Type::getInt32Ty(F->getContext()) : + Type::getInt64Ty(F->getContext())); + Ops.push_back(ConstantInt::get(IdxTy, *II)); + // Keep track of the type we're currently indexing. + ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(*II); + } + // And create a GEP to extract those indices. + V = GetElementPtrInst::Create(V, Ops.begin(), Ops.end(), + V->getName()+".idx", Call); + Ops.clear(); + AA.copyValue(OrigLoad->getOperand(0), V); + } + // Since we're replacing a load make sure we take the alignment + // of the previous load. + LoadInst *newLoad = new LoadInst(V, V->getName()+".val", Call); + newLoad->setAlignment(OrigLoad->getAlignment()); + // Transfer the TBAA info too. + newLoad->setMetadata(LLVMContext::MD_tbaa, + OrigLoad->getMetadata(LLVMContext::MD_tbaa)); + Args.push_back(newLoad); + AA.copyValue(OrigLoad, Args.back()); + } + } + + if (ExtraArgHack) + Args.push_back(Constant::getNullValue(Type::getInt32Ty(F->getContext()))); + + // Push any varargs arguments on the list. + for (; AI != CS.arg_end(); ++AI, ++ArgIndex) { + Args.push_back(*AI); + if (Attributes Attrs = CallPAL.getParamAttributes(ArgIndex)) + AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs)); + } + + // Add any function attributes. 
+ if (Attributes attrs = CallPAL.getFnAttributes()) + AttributesVec.push_back(AttributeWithIndex::get(~0, attrs)); + + Instruction *New; + if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { + New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), + Args.begin(), Args.end(), "", Call); + cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); + cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(AttributesVec.begin(), + AttributesVec.end())); + } else { + New = CallInst::Create(NF, Args.begin(), Args.end(), "", Call); + cast<CallInst>(New)->setCallingConv(CS.getCallingConv()); + cast<CallInst>(New)->setAttributes(AttrListPtr::get(AttributesVec.begin(), + AttributesVec.end())); + if (cast<CallInst>(Call)->isTailCall()) + cast<CallInst>(New)->setTailCall(); + } + Args.clear(); + AttributesVec.clear(); + + // Update the alias analysis implementation to know that we are replacing + // the old call with a new one. + AA.replaceWithNewValue(Call, New); + + // Update the callgraph to know that the callsite has been transformed. + CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()]; + CalleeNode->replaceCallEdge(Call, New, NF_CGN); + + if (!Call->use_empty()) { + Call->replaceAllUsesWith(New); + New->takeName(Call); + } + + // Finally, remove the old call from the program, reducing the use-count of + // F. + Call->eraseFromParent(); + } + + // Since we have now created the new function, splice the body of the old + // function right into the new function, leaving the old rotting hulk of the + // function empty. + NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); + + // Loop over the argument list, transfering uses of the old arguments over to + // the new arguments, also transfering over the names as well. 
+ // + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), + I2 = NF->arg_begin(); I != E; ++I) { + if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) { + // If this is an unmodified argument, move the name and users over to the + // new version. + I->replaceAllUsesWith(I2); + I2->takeName(I); + AA.replaceWithNewValue(I, I2); + ++I2; + continue; + } + + if (ByValArgsToTransform.count(I)) { + // In the callee, we create an alloca, and store each of the new incoming + // arguments into the alloca. + Instruction *InsertPt = NF->begin()->begin(); + + // Just add all the struct element types. + const Type *AgTy = cast<PointerType>(I->getType())->getElementType(); + Value *TheAlloca = new AllocaInst(AgTy, 0, "", InsertPt); + const StructType *STy = cast<StructType>(AgTy); + Value *Idxs[2] = { + ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), 0 }; + + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); + Value *Idx = + GetElementPtrInst::Create(TheAlloca, Idxs, Idxs+2, + TheAlloca->getName()+"."+Twine(i), + InsertPt); + I2->setName(I->getName()+"."+Twine(i)); + new StoreInst(I2++, Idx, InsertPt); + } + + // Anything that used the arg should now use the alloca. + I->replaceAllUsesWith(TheAlloca); + TheAlloca->takeName(I); + AA.replaceWithNewValue(I, TheAlloca); + continue; + } + + if (I->use_empty()) { + AA.deleteValue(I); + continue; + } + + // Otherwise, if we promoted this argument, then all users are load + // instructions (or GEPs with only load users), and all loads should be + // using the new argument that we added. 
+ ScalarizeTable &ArgIndices = ScalarizedElements[I]; + + while (!I->use_empty()) { + if (LoadInst *LI = dyn_cast<LoadInst>(I->use_back())) { + assert(ArgIndices.begin()->empty() && + "Load element should sort to front!"); + I2->setName(I->getName()+".val"); + LI->replaceAllUsesWith(I2); + AA.replaceWithNewValue(LI, I2); + LI->eraseFromParent(); + DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName() + << "' in function '" << F->getName() << "'\n"); + } else { + GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->use_back()); + IndicesVector Operands; + Operands.reserve(GEP->getNumIndices()); + for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end(); + II != IE; ++II) + Operands.push_back(cast<ConstantInt>(*II)->getSExtValue()); + + // GEPs with a single 0 index can be merged with direct loads + if (Operands.size() == 1 && Operands.front() == 0) + Operands.clear(); + + Function::arg_iterator TheArg = I2; + for (ScalarizeTable::iterator It = ArgIndices.begin(); + *It != Operands; ++It, ++TheArg) { + assert(It != ArgIndices.end() && "GEP not handled??"); + } + + std::string NewName = I->getName(); + for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + NewName += "." + utostr(Operands[i]); + } + NewName += ".val"; + TheArg->setName(NewName); + + DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName() + << "' of function '" << NF->getName() << "'\n"); + + // All of the uses must be load instructions. Replace them all with + // the argument specified by ArgNo. + while (!GEP->use_empty()) { + LoadInst *L = cast<LoadInst>(GEP->use_back()); + L->replaceAllUsesWith(TheArg); + AA.replaceWithNewValue(L, TheArg); + L->eraseFromParent(); + } + AA.deleteValue(GEP); + GEP->eraseFromParent(); + } + } + + // Increment I2 past all of the arguments added for this promoted pointer. + for (unsigned i = 0, e = ArgIndices.size(); i != e; ++i) + ++I2; + } + + // Notify the alias analysis implementation that we inserted a new argument. 
+ if (ExtraArgHack) + AA.copyValue(Constant::getNullValue(Type::getInt32Ty(F->getContext())), + NF->arg_begin()); + + + // Tell the alias analysis that the old function is about to disappear. + AA.replaceWithNewValue(F, NF); + + + NF_CGN->stealCalledFunctionsFrom(CG[F]); + + // Now that the old function is dead, delete it. If there is a dangling + // reference to the CallgraphNode, just leave the dead function around for + // someone else to nuke. + CallGraphNode *CGN = CG[F]; + if (CGN->getNumReferences() == 0) + delete CG.removeFunctionFromModule(CGN); + else + F->setLinkage(Function::ExternalLinkage); + + return NF_CGN; +} diff --git a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp new file mode 100644 index 0000000..a21efce --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp @@ -0,0 +1,190 @@ +//===- ConstantMerge.cpp - Merge duplicate global constants ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interface to a pass that merges duplicate global +// constants together into a single constant that is shared. This is useful +// because some passes (ie TraceValues) insert a lot of string constants into +// the program, regardless of whether or not an existing string is available. +// +// Algorithm: ConstantMerge is designed to build up a map of available constants +// and eliminate duplicates when it is initialized. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "constmerge" +#include "llvm/Transforms/IPO.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumMerged, "Number of global constants merged"); + +namespace { + struct ConstantMerge : public ModulePass { + static char ID; // Pass identification, replacement for typeid + ConstantMerge() : ModulePass(ID) { + initializeConstantMergePass(*PassRegistry::getPassRegistry()); + } + + // run - For this pass, process all of the globals in the module, + // eliminating duplicate constants. + // + bool runOnModule(Module &M); + }; +} + +char ConstantMerge::ID = 0; +INITIALIZE_PASS(ConstantMerge, "constmerge", + "Merge Duplicate Global Constants", false, false) + +ModulePass *llvm::createConstantMergePass() { return new ConstantMerge(); } + + + +/// Find values that are marked as llvm.used. +static void FindUsedValues(GlobalVariable *LLVMUsed, + SmallPtrSet<const GlobalValue*, 8> &UsedValues) { + if (LLVMUsed == 0) return; + ConstantArray *Inits = dyn_cast<ConstantArray>(LLVMUsed->getInitializer()); + if (Inits == 0) return; + + for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) + if (GlobalValue *GV = + dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts())) + UsedValues.insert(GV); +} + +// True if A is better than B. +static bool IsBetterCannonical(const GlobalVariable &A, + const GlobalVariable &B) { + if (!A.hasLocalLinkage() && B.hasLocalLinkage()) + return true; + + if (A.hasLocalLinkage() && !B.hasLocalLinkage()) + return false; + + return A.hasUnnamedAddr(); +} + +bool ConstantMerge::runOnModule(Module &M) { + // Find all the globals that are marked "used". These cannot be merged. 
+  SmallPtrSet<const GlobalValue*, 8> UsedGlobals;
+  FindUsedValues(M.getGlobalVariable("llvm.used"), UsedGlobals);
+  FindUsedValues(M.getGlobalVariable("llvm.compiler.used"), UsedGlobals);
+
+  // Map unique constant/section pairs to globals.  We don't want to merge
+  // globals in different sections.
+  DenseMap<Constant*, GlobalVariable*> CMap;
+
+  // Replacements - This vector contains a list of replacements to perform.
+  SmallVector<std::pair<GlobalVariable*, GlobalVariable*>, 32> Replacements;
+
+  bool MadeChange = false;
+
+  // Iterate constant merging while we are still making progress.  Merging two
+  // constants together may allow us to merge other constants together if the
+  // second level constants have initializers which point to the globals that
+  // were just merged.
+  while (1) {
+
+    // First: Find the canonical constants others will be merged with.
+    for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+         GVI != E; ) {
+      GlobalVariable *GV = GVI++;
+
+      // If this GV is dead, remove it.
+      GV->removeDeadConstantUsers();
+      if (GV->use_empty() && GV->hasLocalLinkage()) {
+        GV->eraseFromParent();
+        continue;
+      }
+
+      // Only process constants with initializers in the default address space.
+      if (!GV->isConstant() || !GV->hasDefinitiveInitializer() ||
+          GV->getType()->getAddressSpace() != 0 || GV->hasSection() ||
+          // Don't touch values marked with attribute(used).
+          UsedGlobals.count(GV))
+        continue;
+
+      Constant *Init = GV->getInitializer();
+
+      // Check to see if the initializer is already known.
+      GlobalVariable *&Slot = CMap[Init];
+
+      // If this is the first constant we find or if the old one is local,
+      // replace with the current one. If the current is externally visible
+      // it cannot be replaced, but can be the canonical constant we merge with.
+      if (Slot == 0 || IsBetterCannonical(*GV, *Slot)) {
+        Slot = GV;
+      }
+    }
+
+    // Second: identify all globals that can be merged together, filling in
+    // the Replacements vector.
We cannot do the replacement in this pass
+    // because doing so may cause initializers of other globals to be rewritten,
+    // invalidating the Constant* pointers in CMap.
+    for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+         GVI != E; ) {
+      GlobalVariable *GV = GVI++;
+
+      // Only process constants with initializers in the default address space.
+      if (!GV->isConstant() || !GV->hasDefinitiveInitializer() ||
+          GV->getType()->getAddressSpace() != 0 || GV->hasSection() ||
+          // Don't touch values marked with attribute(used).
+          UsedGlobals.count(GV))
+        continue;
+
+      // We can only replace constants with local linkage.
+      if (!GV->hasLocalLinkage())
+        continue;
+
+      Constant *Init = GV->getInitializer();
+
+      // Check to see if the initializer is already known.
+      GlobalVariable *Slot = CMap[Init];
+
+      if (!Slot || Slot == GV)
+        continue;
+
+      if (!Slot->hasUnnamedAddr() && !GV->hasUnnamedAddr())
+        continue;
+
+      if (!GV->hasUnnamedAddr())
+        Slot->setUnnamedAddr(false);
+
+      // Make all uses of the duplicate constant use the canonical version.
+      Replacements.push_back(std::make_pair(GV, Slot));
+    }
+
+    if (Replacements.empty())
+      return MadeChange;
+    CMap.clear();
+
+    // Now that we have figured out which replacements must be made, do them all
+    // now.  This avoids invalidating the pointers in CMap, which are unneeded
+    // now.
+    for (unsigned i = 0, e = Replacements.size(); i != e; ++i) {
+      // Eliminate any uses of the dead global.
+      Replacements[i].first->replaceAllUsesWith(Replacements[i].second);
+
+      // Delete the global value from the module.
+ assert(Replacements[i].first->hasLocalLinkage() && + "Refusing to delete an externally visible global variable."); + Replacements[i].first->eraseFromParent(); + } + + NumMerged += Replacements.size(); + Replacements.clear(); + } +} diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp new file mode 100644 index 0000000..b423221 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -0,0 +1,1003 @@ +//===-- DeadArgumentElimination.cpp - Eliminate dead arguments ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass deletes dead arguments from internal functions. Dead argument +// elimination removes arguments which are directly dead, as well as arguments +// only passed into function calls as dead arguments of other functions. This +// pass also deletes dead return values in a similar way. +// +// This pass is often useful as a cleanup pass to run after aggressive +// interprocedural passes, which add possibly-dead arguments or return values. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "deadargelim" +#include "llvm/Transforms/IPO.h" +#include "llvm/CallingConv.h" +#include "llvm/Constant.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include <map> +#include <set> +using namespace llvm; + +STATISTIC(NumArgumentsEliminated, "Number of unread args removed"); +STATISTIC(NumRetValsEliminated , "Number of unused return values removed"); +STATISTIC(NumArgumentsReplacedWithUndef, + "Number of unread args replaced with undef"); +namespace { + /// DAE - The dead argument elimination pass. + /// + class DAE : public ModulePass { + public: + + /// Struct that represents (part of) either a return value or a function + /// argument. Used so that arguments and return values can be used + /// interchangably. + struct RetOrArg { + RetOrArg(const Function *F, unsigned Idx, bool IsArg) : F(F), Idx(Idx), + IsArg(IsArg) {} + const Function *F; + unsigned Idx; + bool IsArg; + + /// Make RetOrArg comparable, so we can put it into a map. + bool operator<(const RetOrArg &O) const { + if (F != O.F) + return F < O.F; + else if (Idx != O.Idx) + return Idx < O.Idx; + else + return IsArg < O.IsArg; + } + + /// Make RetOrArg comparable, so we can easily iterate the multimap. + bool operator==(const RetOrArg &O) const { + return F == O.F && Idx == O.Idx && IsArg == O.IsArg; + } + + std::string getDescription() const { + return std::string((IsArg ? 
"Argument #" : "Return value #")) +
+          utostr(Idx) + " of function " + F->getNameStr();
+      }
+    };
+
+    /// Liveness enum - During our initial pass over the program, we determine
+    /// that things are either alive or maybe alive. We don't mark anything
+    /// explicitly dead (even if we know they are), since anything not alive
+    /// with no registered uses (in Uses) will never be marked alive and will
+    /// thus become dead in the end.
+    enum Liveness { Live, MaybeLive };
+
+    /// Convenience wrapper
+    RetOrArg CreateRet(const Function *F, unsigned Idx) {
+      return RetOrArg(F, Idx, false);
+    }
+    /// Convenience wrapper
+    RetOrArg CreateArg(const Function *F, unsigned Idx) {
+      return RetOrArg(F, Idx, true);
+    }
+
+    typedef std::multimap<RetOrArg, RetOrArg> UseMap;
+    /// This maps a return value or argument to any MaybeLive return values or
+    /// arguments it uses. This allows the MaybeLive values to be marked live
+    /// when any of its users is marked live.
+    /// For example (indices are left out for clarity):
+    ///  - Uses[ret F] = ret G
+    ///    This means that F calls G, and F returns the value returned by G.
+    ///  - Uses[arg F] = ret G
+    ///    This means that some function calls G and passes its result as an
+    ///    argument to F.
+    ///  - Uses[ret F] = arg F
+    ///    This means that F returns one of its own arguments.
+    ///  - Uses[arg F] = arg G
+    ///    This means that G calls F and passes one of its own (G's) arguments
+    ///    directly to F.
+    UseMap Uses;
+
+    typedef std::set<RetOrArg> LiveSet;
+    typedef std::set<const Function*> LiveFuncSet;
+
+    /// This set contains all values that have been determined to be live.
+    LiveSet LiveValues;
+    /// This set contains all values that cannot be changed in any way.
+    LiveFuncSet LiveFunctions;
+
+    typedef SmallVector<RetOrArg, 5> UseVector;
+
+  protected:
+    // DAH uses this to specify a different ID.
+ explicit DAE(char &ID) : ModulePass(ID) {} + + public: + static char ID; // Pass identification, replacement for typeid + DAE() : ModulePass(ID) { + initializeDAEPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M); + + virtual bool ShouldHackArguments() const { return false; } + + private: + Liveness MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses); + Liveness SurveyUse(Value::const_use_iterator U, UseVector &MaybeLiveUses, + unsigned RetValNum = 0); + Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses); + + void SurveyFunction(const Function &F); + void MarkValue(const RetOrArg &RA, Liveness L, + const UseVector &MaybeLiveUses); + void MarkLive(const RetOrArg &RA); + void MarkLive(const Function &F); + void PropagateLiveness(const RetOrArg &RA); + bool RemoveDeadStuffFromFunction(Function *F); + bool DeleteDeadVarargs(Function &Fn); + bool RemoveDeadArgumentsFromCallers(Function &Fn); + }; +} + + +char DAE::ID = 0; +INITIALIZE_PASS(DAE, "deadargelim", "Dead Argument Elimination", false, false) + +namespace { + /// DAH - DeadArgumentHacking pass - Same as dead argument elimination, but + /// deletes arguments to functions which are external. This is only for use + /// by bugpoint. + struct DAH : public DAE { + static char ID; + DAH() : DAE(ID) {} + + virtual bool ShouldHackArguments() const { return true; } + }; +} + +char DAH::ID = 0; +INITIALIZE_PASS(DAH, "deadarghaX0r", + "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)", + false, false) + +/// createDeadArgEliminationPass - This pass removes arguments from functions +/// which are not used by the body of the function. +/// +ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); } +ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); } + +/// DeleteDeadVarargs - If this is an function that takes a ... list, and if +/// llvm.vastart is never called, the varargs list is dead for the function. 
+bool DAE::DeleteDeadVarargs(Function &Fn) { + assert(Fn.getFunctionType()->isVarArg() && "Function isn't varargs!"); + if (Fn.isDeclaration() || !Fn.hasLocalLinkage()) return false; + + // Ensure that the function is only directly called. + if (Fn.hasAddressTaken()) + return false; + + // Okay, we know we can transform this function if safe. Scan its body + // looking for calls to llvm.vastart. + for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::vastart) + return false; + } + } + } + + // If we get here, there are no calls to llvm.vastart in the function body, + // remove the "..." and adjust all the calls. + + // Start by computing a new prototype for the function, which is the same as + // the old function, but doesn't have isVarArg set. + const FunctionType *FTy = Fn.getFunctionType(); + + std::vector<const Type*> Params(FTy->param_begin(), FTy->param_end()); + FunctionType *NFTy = FunctionType::get(FTy->getReturnType(), + Params, false); + unsigned NumArgs = Params.size(); + + // Create the new function body and insert it into the module... + Function *NF = Function::Create(NFTy, Fn.getLinkage()); + NF->copyAttributesFrom(&Fn); + Fn.getParent()->getFunctionList().insert(&Fn, NF); + NF->takeName(&Fn); + + // Loop over all of the callers of the function, transforming the call sites + // to pass in a smaller number of arguments into the new function. + // + std::vector<Value*> Args; + while (!Fn.use_empty()) { + CallSite CS(Fn.use_back()); + Instruction *Call = CS.getInstruction(); + + // Pass all the same arguments. + Args.assign(CS.arg_begin(), CS.arg_begin() + NumArgs); + + // Drop any attributes that were on the vararg arguments. 
+ AttrListPtr PAL = CS.getAttributes(); + if (!PAL.isEmpty() && PAL.getSlot(PAL.getNumSlots() - 1).Index > NumArgs) { + SmallVector<AttributeWithIndex, 8> AttributesVec; + for (unsigned i = 0; PAL.getSlot(i).Index <= NumArgs; ++i) + AttributesVec.push_back(PAL.getSlot(i)); + if (Attributes FnAttrs = PAL.getFnAttributes()) + AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); + PAL = AttrListPtr::get(AttributesVec.begin(), AttributesVec.end()); + } + + Instruction *New; + if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { + New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), + Args.begin(), Args.end(), "", Call); + cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); + cast<InvokeInst>(New)->setAttributes(PAL); + } else { + New = CallInst::Create(NF, Args.begin(), Args.end(), "", Call); + cast<CallInst>(New)->setCallingConv(CS.getCallingConv()); + cast<CallInst>(New)->setAttributes(PAL); + if (cast<CallInst>(Call)->isTailCall()) + cast<CallInst>(New)->setTailCall(); + } + New->setDebugLoc(Call->getDebugLoc()); + + Args.clear(); + + if (!Call->use_empty()) + Call->replaceAllUsesWith(New); + + New->takeName(Call); + + // Finally, remove the old call from the program, reducing the use-count of + // F. + Call->eraseFromParent(); + } + + // Since we have now created the new function, splice the body of the old + // function right into the new function, leaving the old rotting hulk of the + // function empty. + NF->getBasicBlockList().splice(NF->begin(), Fn.getBasicBlockList()); + + // Loop over the argument list, transfering uses of the old arguments over to + // the new arguments, also transfering over the names as well. While we're at + // it, remove the dead arguments from the DeadArguments list. + // + for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(), + I2 = NF->arg_begin(); I != E; ++I, ++I2) { + // Move the name and users over to the new version. 
+ I->replaceAllUsesWith(I2); + I2->takeName(I); + } + + // Finally, nuke the old function. + Fn.eraseFromParent(); + return true; +} + +/// RemoveDeadArgumentsFromCallers - Checks if the given function has any +/// arguments that are unused, and changes the caller parameters to be undefined +/// instead. +bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn) +{ + if (Fn.isDeclaration()) + return false; + + // Functions with local linkage should already have been handled. + if (Fn.hasLocalLinkage()) + return false; + + if (Fn.use_empty()) + return false; + + llvm::SmallVector<unsigned, 8> UnusedArgs; + for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(); + I != E; ++I) { + Argument *Arg = I; + + if (Arg->use_empty() && !Arg->hasByValAttr()) + UnusedArgs.push_back(Arg->getArgNo()); + } + + if (UnusedArgs.empty()) + return false; + + bool Changed = false; + + for (Function::use_iterator I = Fn.use_begin(), E = Fn.use_end(); + I != E; ++I) { + CallSite CS(*I); + if (!CS || !CS.isCallee(I)) + continue; + + // Now go through all unused args and replace them with "undef". + for (unsigned I = 0, E = UnusedArgs.size(); I != E; ++I) { + unsigned ArgNo = UnusedArgs[I]; + + Value *Arg = CS.getArgument(ArgNo); + CS.setArgument(ArgNo, UndefValue::get(Arg->getType())); + ++NumArgumentsReplacedWithUndef; + Changed = true; + } + } + + return Changed; +} + +/// Convenience function that returns the number of return values. It returns 0 +/// for void functions and 1 for functions not returning a struct. It returns +/// the number of struct elements for functions returning a struct. +static unsigned NumRetVals(const Function *F) { + if (F->getReturnType()->isVoidTy()) + return 0; + else if (const StructType *STy = dyn_cast<StructType>(F->getReturnType())) + return STy->getNumElements(); + else + return 1; +} + +/// MarkIfNotLive - This checks Use for liveness in LiveValues. If Use is not +/// live, it adds Use to the MaybeLiveUses argument. 
Returns the determined
+/// liveness of Use.
+DAE::Liveness DAE::MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses) {
+  // We're live if our use or its Function is already marked as live.
+  if (LiveFunctions.count(Use.F) || LiveValues.count(Use))
+    return Live;
+
+  // We're maybe live otherwise, but remember that we must become live if
+  // Use becomes live.
+  MaybeLiveUses.push_back(Use);
+  return MaybeLive;
+}
+
+
+/// SurveyUse - This looks at a single use of an argument or return value
+/// and determines if it should be alive or not. Adds this use to MaybeLiveUses
+/// if it causes the used value to become MaybeLive.
+///
+/// RetValNum is the return value number to use when this use is used in a
+/// return instruction. This is used in the recursion, you should always leave
+/// it at 0.
+DAE::Liveness DAE::SurveyUse(Value::const_use_iterator U,
+                             UseVector &MaybeLiveUses, unsigned RetValNum) {
+  const User *V = *U;
+  if (const ReturnInst *RI = dyn_cast<ReturnInst>(V)) {
+    // The value is returned from a function. It's only live when the
+    // function's return value is live. We use RetValNum here, for the case
+    // that U is really a use of an insertvalue instruction that uses the
+    // original Use.
+    RetOrArg Use = CreateRet(RI->getParent()->getParent(), RetValNum);
+    // We might be live, depending on the liveness of Use.
+    return MarkIfNotLive(Use, MaybeLiveUses);
+  }
+  if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(V)) {
+    if (U.getOperandNo() != InsertValueInst::getAggregateOperandIndex()
+        && IV->hasIndices())
+      // The use we are examining is inserted into an aggregate. Our liveness
+      // depends on all uses of that aggregate, but if it is used as a return
+      // value, only the index at which we were inserted counts.
+      RetValNum = *IV->idx_begin();
+
+    // Note that if we are used as the aggregate operand to the insertvalue,
+    // we don't change RetValNum, but do survey all our uses.
+ + Liveness Result = MaybeLive; + for (Value::const_use_iterator I = IV->use_begin(), + E = V->use_end(); I != E; ++I) { + Result = SurveyUse(I, MaybeLiveUses, RetValNum); + if (Result == Live) + break; + } + return Result; + } + + if (ImmutableCallSite CS = V) { + const Function *F = CS.getCalledFunction(); + if (F) { + // Used in a direct call. + + // Find the argument number. We know for sure that this use is an + // argument, since if it was the function argument this would be an + // indirect call and the we know can't be looking at a value of the + // label type (for the invoke instruction). + unsigned ArgNo = CS.getArgumentNo(U); + + if (ArgNo >= F->getFunctionType()->getNumParams()) + // The value is passed in through a vararg! Must be live. + return Live; + + assert(CS.getArgument(ArgNo) + == CS->getOperand(U.getOperandNo()) + && "Argument is not where we expected it"); + + // Value passed to a normal call. It's only live when the corresponding + // argument to the called function turns out live. + RetOrArg Use = CreateArg(F, ArgNo); + return MarkIfNotLive(Use, MaybeLiveUses); + } + } + // Used in any other way? Value must be live. + return Live; +} + +/// SurveyUses - This looks at all the uses of the given value +/// Returns the Liveness deduced from the uses of this value. +/// +/// Adds all uses that cause the result to be MaybeLive to MaybeLiveRetUses. If +/// the result is Live, MaybeLiveUses might be modified but its content should +/// be ignored (since it might not be complete). +DAE::Liveness DAE::SurveyUses(const Value *V, UseVector &MaybeLiveUses) { + // Assume it's dead (which will only hold if there are no uses at all..). + Liveness Result = MaybeLive; + // Check each use. 
+ for (Value::const_use_iterator I = V->use_begin(), + E = V->use_end(); I != E; ++I) { + Result = SurveyUse(I, MaybeLiveUses); + if (Result == Live) + break; + } + return Result; +} + +// SurveyFunction - This performs the initial survey of the specified function, +// checking out whether or not it uses any of its incoming arguments or whether +// any callers use the return value. This fills in the LiveValues set and Uses +// map. +// +// We consider arguments of non-internal functions to be intrinsically alive as +// well as arguments to functions which have their "address taken". +// +void DAE::SurveyFunction(const Function &F) { + unsigned RetCount = NumRetVals(&F); + // Assume all return values are dead + typedef SmallVector<Liveness, 5> RetVals; + RetVals RetValLiveness(RetCount, MaybeLive); + + typedef SmallVector<UseVector, 5> RetUses; + // These vectors map each return value to the uses that make it MaybeLive, so + // we can add those to the Uses map if the return value really turns out to be + // MaybeLive. Initialized to a list of RetCount empty lists. + RetUses MaybeLiveRetUses(RetCount); + + for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) + if (const ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) + if (RI->getNumOperands() != 0 && RI->getOperand(0)->getType() + != F.getFunctionType()->getReturnType()) { + // We don't support old style multiple return values. + MarkLive(F); + return; + } + + if (!F.hasLocalLinkage() && (!ShouldHackArguments() || F.isIntrinsic())) { + MarkLive(F); + return; + } + + DEBUG(dbgs() << "DAE - Inspecting callers for fn: " << F.getName() << "\n"); + // Keep track of the number of live retvals, so we can skip checks once all + // of them turn out to be live. + unsigned NumLiveRetVals = 0; + const Type *STy = dyn_cast<StructType>(F.getReturnType()); + // Loop all uses of the function. 
+ for (Value::const_use_iterator I = F.use_begin(), E = F.use_end(); + I != E; ++I) { + // If the function is PASSED IN as an argument, its address has been + // taken. + ImmutableCallSite CS(*I); + if (!CS || !CS.isCallee(I)) { + MarkLive(F); + return; + } + + // If this use is anything other than a call site, the function is alive. + const Instruction *TheCall = CS.getInstruction(); + if (!TheCall) { // Not a direct call site? + MarkLive(F); + return; + } + + // If we end up here, we are looking at a direct call to our function. + + // Now, check how our return value(s) is/are used in this caller. Don't + // bother checking return values if all of them are live already. + if (NumLiveRetVals != RetCount) { + if (STy) { + // Check all uses of the return value. + for (Value::const_use_iterator I = TheCall->use_begin(), + E = TheCall->use_end(); I != E; ++I) { + const ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(*I); + if (Ext && Ext->hasIndices()) { + // This use uses a part of our return value, survey the uses of + // that part and store the results for this index only. + unsigned Idx = *Ext->idx_begin(); + if (RetValLiveness[Idx] != Live) { + RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]); + if (RetValLiveness[Idx] == Live) + NumLiveRetVals++; + } + } else { + // Used by something else than extractvalue. Mark all return + // values as live. + for (unsigned i = 0; i != RetCount; ++i ) + RetValLiveness[i] = Live; + NumLiveRetVals = RetCount; + break; + } + } + } else { + // Single return value + RetValLiveness[0] = SurveyUses(TheCall, MaybeLiveRetUses[0]); + if (RetValLiveness[0] == Live) + NumLiveRetVals = RetCount; + } + } + } + + // Now we've inspected all callers, record the liveness of our return values. + for (unsigned i = 0; i != RetCount; ++i) + MarkValue(CreateRet(&F, i), RetValLiveness[i], MaybeLiveRetUses[i]); + + DEBUG(dbgs() << "DAE - Inspecting args for fn: " << F.getName() << "\n"); + + // Now, check all of our arguments. 
+ unsigned i = 0; + UseVector MaybeLiveArgUses; + for (Function::const_arg_iterator AI = F.arg_begin(), + E = F.arg_end(); AI != E; ++AI, ++i) { + // See what the effect of this use is (recording any uses that cause + // MaybeLive in MaybeLiveArgUses). + Liveness Result = SurveyUses(AI, MaybeLiveArgUses); + // Mark the result. + MarkValue(CreateArg(&F, i), Result, MaybeLiveArgUses); + // Clear the vector again for the next iteration. + MaybeLiveArgUses.clear(); + } +} + +/// MarkValue - This function marks the liveness of RA depending on L. If L is +/// MaybeLive, it also takes all uses in MaybeLiveUses and records them in Uses, +/// such that RA will be marked live if any use in MaybeLiveUses gets marked +/// live later on. +void DAE::MarkValue(const RetOrArg &RA, Liveness L, + const UseVector &MaybeLiveUses) { + switch (L) { + case Live: MarkLive(RA); break; + case MaybeLive: + { + // Note any uses of this value, so this return value can be + // marked live whenever one of the uses becomes live. + for (UseVector::const_iterator UI = MaybeLiveUses.begin(), + UE = MaybeLiveUses.end(); UI != UE; ++UI) + Uses.insert(std::make_pair(*UI, RA)); + break; + } + } +} + +/// MarkLive - Mark the given Function as alive, meaning that it cannot be +/// changed in any way. Additionally, +/// mark any values that are used as this function's parameters or by its return +/// values (according to Uses) live as well. +void DAE::MarkLive(const Function &F) { + DEBUG(dbgs() << "DAE - Intrinsically live fn: " << F.getName() << "\n"); + // Mark the function as live. + LiveFunctions.insert(&F); + // Mark all arguments as live. + for (unsigned i = 0, e = F.arg_size(); i != e; ++i) + PropagateLiveness(CreateArg(&F, i)); + // Mark all return values as live. + for (unsigned i = 0, e = NumRetVals(&F); i != e; ++i) + PropagateLiveness(CreateRet(&F, i)); +} + +/// MarkLive - Mark the given return value or argument as live. 
Additionally,
+/// mark any values that are used by this value (according to Uses) live as
+/// well.
+void DAE::MarkLive(const RetOrArg &RA) {
+  if (LiveFunctions.count(RA.F))
+    return; // Function was already marked Live.
+
+  if (!LiveValues.insert(RA).second)
+    return; // We were already marked Live.
+
+  DEBUG(dbgs() << "DAE - Marking " << RA.getDescription() << " live\n");
+  PropagateLiveness(RA);
+}
+
+/// PropagateLiveness - Given that RA is a live value, propagate its liveness
+/// to any other values it uses (according to Uses).
+void DAE::PropagateLiveness(const RetOrArg &RA) {
+  // We don't use upper_bound (or equal_range) here, because our recursive call
+  // to ourselves is likely to cause the upper_bound (which is the first value
+  // not belonging to RA) to become erased and the iterator invalidated.
+  UseMap::iterator Begin = Uses.lower_bound(RA);
+  UseMap::iterator E = Uses.end();
+  UseMap::iterator I;
+  for (I = Begin; I != E && I->first == RA; ++I)
+    MarkLive(I->second);
+
+  // Erase RA from the Uses map (from the lower bound to wherever we ended up
+  // after the loop).
+  Uses.erase(Begin, I);
+}
+
+// RemoveDeadStuffFromFunction - Remove any arguments and return values from F
+// that are not in LiveValues. Transform the function and all of the callees of
+// the function to not have these arguments and return values.
+//
+bool DAE::RemoveDeadStuffFromFunction(Function *F) {
+  // Don't modify fully live functions
+  if (LiveFunctions.count(F))
+    return false;
+
+  // Start by computing a new prototype for the function, which is the same as
+  // the old function, but has fewer arguments and a different return type.
+  const FunctionType *FTy = F->getFunctionType();
+  std::vector<const Type*> Params;
+
+  // Set up to build a new list of parameter attributes.
+  SmallVector<AttributeWithIndex, 8> AttributesVec;
+  const AttrListPtr &PAL = F->getAttributes();
+
+  // The existing function return attributes.
+ Attributes RAttrs = PAL.getRetAttributes(); + Attributes FnAttrs = PAL.getFnAttributes(); + + // Find out the new return value. + + const Type *RetTy = FTy->getReturnType(); + const Type *NRetTy = NULL; + unsigned RetCount = NumRetVals(F); + + // -1 means unused, other numbers are the new index + SmallVector<int, 5> NewRetIdxs(RetCount, -1); + std::vector<const Type*> RetTypes; + if (RetTy->isVoidTy()) { + NRetTy = RetTy; + } else { + const StructType *STy = dyn_cast<StructType>(RetTy); + if (STy) + // Look at each of the original return values individually. + for (unsigned i = 0; i != RetCount; ++i) { + RetOrArg Ret = CreateRet(F, i); + if (LiveValues.erase(Ret)) { + RetTypes.push_back(STy->getElementType(i)); + NewRetIdxs[i] = RetTypes.size() - 1; + } else { + ++NumRetValsEliminated; + DEBUG(dbgs() << "DAE - Removing return value " << i << " from " + << F->getName() << "\n"); + } + } + else + // We used to return a single value. + if (LiveValues.erase(CreateRet(F, 0))) { + RetTypes.push_back(RetTy); + NewRetIdxs[0] = 0; + } else { + DEBUG(dbgs() << "DAE - Removing return value from " << F->getName() + << "\n"); + ++NumRetValsEliminated; + } + if (RetTypes.size() > 1) + // More than one return type? Return a struct with them. Also, if we used + // to return a struct and didn't change the number of return values, + // return a struct again. This prevents changing {something} into + // something and {} into void. + // Make the new struct packed if we used to return a packed struct + // already. + NRetTy = StructType::get(STy->getContext(), RetTypes, STy->isPacked()); + else if (RetTypes.size() == 1) + // One return type? Just a simple value then, but only if we didn't use to + // return a struct with that simple value before. + NRetTy = RetTypes.front(); + else if (RetTypes.size() == 0) + // No return types? Make it void, but only if we didn't use to return {}. 
+ NRetTy = Type::getVoidTy(F->getContext()); + } + + assert(NRetTy && "No new return type found?"); + + // Remove any incompatible attributes, but only if we removed all return + // values. Otherwise, ensure that we don't have any conflicting attributes + // here. Currently, this should not be possible, but special handling might be + // required when new return value attributes are added. + if (NRetTy->isVoidTy()) + RAttrs &= ~Attribute::typeIncompatible(NRetTy); + else + assert((RAttrs & Attribute::typeIncompatible(NRetTy)) == 0 + && "Return attributes no longer compatible?"); + + if (RAttrs) + AttributesVec.push_back(AttributeWithIndex::get(0, RAttrs)); + + // Remember which arguments are still alive. + SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false); + // Construct the new parameter list from non-dead arguments. Also construct + // a new set of parameter attributes to correspond. Skip the first parameter + // attribute, since that belongs to the return value. + unsigned i = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I, ++i) { + RetOrArg Arg = CreateArg(F, i); + if (LiveValues.erase(Arg)) { + Params.push_back(I->getType()); + ArgAlive[i] = true; + + // Get the original parameter attributes (skipping the first one, that is + // for the return value. + if (Attributes Attrs = PAL.getParamAttributes(i + 1)) + AttributesVec.push_back(AttributeWithIndex::get(Params.size(), Attrs)); + } else { + ++NumArgumentsEliminated; + DEBUG(dbgs() << "DAE - Removing argument " << i << " (" << I->getName() + << ") from " << F->getName() << "\n"); + } + } + + if (FnAttrs != Attribute::None) + AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); + + // Reconstruct the AttributesList based on the vector we constructed. + AttrListPtr NewPAL = AttrListPtr::get(AttributesVec.begin(), + AttributesVec.end()); + + // Create the new function type based on the recomputed parameters. 
+ FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg()); + + // No change? + if (NFTy == FTy) + return false; + + // Create the new function body and insert it into the module... + Function *NF = Function::Create(NFTy, F->getLinkage()); + NF->copyAttributesFrom(F); + NF->setAttributes(NewPAL); + // Insert the new function before the old function, so we won't be processing + // it again. + F->getParent()->getFunctionList().insert(F, NF); + NF->takeName(F); + + // Loop over all of the callers of the function, transforming the call sites + // to pass in a smaller number of arguments into the new function. + // + std::vector<Value*> Args; + while (!F->use_empty()) { + CallSite CS(F->use_back()); + Instruction *Call = CS.getInstruction(); + + AttributesVec.clear(); + const AttrListPtr &CallPAL = CS.getAttributes(); + + // The call return attributes. + Attributes RAttrs = CallPAL.getRetAttributes(); + Attributes FnAttrs = CallPAL.getFnAttributes(); + // Adjust in case the function was changed to return void. + RAttrs &= ~Attribute::typeIncompatible(NF->getReturnType()); + if (RAttrs) + AttributesVec.push_back(AttributeWithIndex::get(0, RAttrs)); + + // Declare these outside of the loops, so we can reuse them for the second + // loop, which loops the varargs. + CallSite::arg_iterator I = CS.arg_begin(); + unsigned i = 0; + // Loop over those operands, corresponding to the normal arguments to the + // original function, and add those that are still alive. + for (unsigned e = FTy->getNumParams(); i != e; ++I, ++i) + if (ArgAlive[i]) { + Args.push_back(*I); + // Get original parameter attributes, but skip return attributes. + if (Attributes Attrs = CallPAL.getParamAttributes(i + 1)) + AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs)); + } + + // Push any varargs arguments on the list. Don't forget their attributes. 
+ for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) { + Args.push_back(*I); + if (Attributes Attrs = CallPAL.getParamAttributes(i + 1)) + AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs)); + } + + if (FnAttrs != Attribute::None) + AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); + + // Reconstruct the AttributesList based on the vector we constructed. + AttrListPtr NewCallPAL = AttrListPtr::get(AttributesVec.begin(), + AttributesVec.end()); + + Instruction *New; + if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { + New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), + Args.begin(), Args.end(), "", Call); + cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); + cast<InvokeInst>(New)->setAttributes(NewCallPAL); + } else { + New = CallInst::Create(NF, Args.begin(), Args.end(), "", Call); + cast<CallInst>(New)->setCallingConv(CS.getCallingConv()); + cast<CallInst>(New)->setAttributes(NewCallPAL); + if (cast<CallInst>(Call)->isTailCall()) + cast<CallInst>(New)->setTailCall(); + } + New->setDebugLoc(Call->getDebugLoc()); + + Args.clear(); + + if (!Call->use_empty()) { + if (New->getType() == Call->getType()) { + // Return type not changed? Just replace users then. + Call->replaceAllUsesWith(New); + New->takeName(Call); + } else if (New->getType()->isVoidTy()) { + // Our return value has uses, but they will get removed later on. + // Replace by null for now. + if (!Call->getType()->isX86_MMXTy()) + Call->replaceAllUsesWith(Constant::getNullValue(Call->getType())); + } else { + assert(RetTy->isStructTy() && + "Return type changed, but not into a void. The old return type" + " must have been a struct!"); + Instruction *InsertPt = Call; + if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { + BasicBlock::iterator IP = II->getNormalDest()->begin(); + while (isa<PHINode>(IP)) ++IP; + InsertPt = IP; + } + + // We used to return a struct. 
Instead of doing smart stuff with all the + // uses of this struct, we will just rebuild it using + // extract/insertvalue chaining and let instcombine clean that up. + // + // Start out building up our return value from undef + Value *RetVal = UndefValue::get(RetTy); + for (unsigned i = 0; i != RetCount; ++i) + if (NewRetIdxs[i] != -1) { + Value *V; + if (RetTypes.size() > 1) + // We are still returning a struct, so extract the value from our + // return value + V = ExtractValueInst::Create(New, NewRetIdxs[i], "newret", + InsertPt); + else + // We are now returning a single element, so just insert that + V = New; + // Insert the value at the old position + RetVal = InsertValueInst::Create(RetVal, V, i, "oldret", InsertPt); + } + // Now, replace all uses of the old call instruction with the return + // struct we built + Call->replaceAllUsesWith(RetVal); + New->takeName(Call); + } + } + + // Finally, remove the old call from the program, reducing the use-count of + // F. + Call->eraseFromParent(); + } + + // Since we have now created the new function, splice the body of the old + // function right into the new function, leaving the old rotting hulk of the + // function empty. + NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); + + // Loop over the argument list, transfering uses of the old arguments over to + // the new arguments, also transfering over the names as well. + i = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), + I2 = NF->arg_begin(); I != E; ++I, ++i) + if (ArgAlive[i]) { + // If this is a live argument, move the name and users over to the new + // version. + I->replaceAllUsesWith(I2); + I2->takeName(I); + ++I2; + } else { + // If this argument is dead, replace any uses of it with null constants + // (these are guaranteed to become unused later on). 
+ if (!I->getType()->isX86_MMXTy()) + I->replaceAllUsesWith(Constant::getNullValue(I->getType())); + } + + // If we change the return value of the function we must rewrite any return + // instructions. Check this now. + if (F->getReturnType() != NF->getReturnType()) + for (Function::iterator BB = NF->begin(), E = NF->end(); BB != E; ++BB) + if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) { + Value *RetVal; + + if (NFTy->getReturnType()->isVoidTy()) { + RetVal = 0; + } else { + assert (RetTy->isStructTy()); + // The original return value was a struct, insert + // extractvalue/insertvalue chains to extract only the values we need + // to return and insert them into our new result. + // This does generate messy code, but we'll let it to instcombine to + // clean that up. + Value *OldRet = RI->getOperand(0); + // Start out building up our return value from undef + RetVal = UndefValue::get(NRetTy); + for (unsigned i = 0; i != RetCount; ++i) + if (NewRetIdxs[i] != -1) { + ExtractValueInst *EV = ExtractValueInst::Create(OldRet, i, + "oldret", RI); + if (RetTypes.size() > 1) { + // We're still returning a struct, so reinsert the value into + // our new return value at the new index + + RetVal = InsertValueInst::Create(RetVal, EV, NewRetIdxs[i], + "newret", RI); + } else { + // We are now only returning a simple value, so just return the + // extracted value. + RetVal = EV; + } + } + } + // Replace the return instruction with one returning the new return + // value (possibly 0 if we became void). + ReturnInst::Create(F->getContext(), RetVal, RI); + BB->getInstList().erase(RI); + } + + // Now that the old function is dead, delete it. + F->eraseFromParent(); + + return true; +} + +bool DAE::runOnModule(Module &M) { + bool Changed = false; + + // First pass: Do a simple check to see if any functions can have their "..." + // removed. We can do this if they never call va_start. 
This loop cannot be + // fused with the next loop, because deleting a function invalidates + // information computed while surveying other functions. + DEBUG(dbgs() << "DAE - Deleting dead varargs\n"); + for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { + Function &F = *I++; + if (F.getFunctionType()->isVarArg()) + Changed |= DeleteDeadVarargs(F); + } + + // Second phase:loop through the module, determining which arguments are live. + // We assume all arguments are dead unless proven otherwise (allowing us to + // determine that dead arguments passed into recursive functions are dead). + // + DEBUG(dbgs() << "DAE - Determining liveness\n"); + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) + SurveyFunction(*I); + + // Now, remove all dead arguments and return values from each function in + // turn. + for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { + // Increment now, because the function will probably get removed (ie. + // replaced by a new one). + Function *F = I++; + Changed |= RemoveDeadStuffFromFunction(F); + } + + // Finally, look for any unused parameters in functions with non-local + // linkage and replace the passed in parameters with undef. + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function& F = *I; + + Changed |= RemoveDeadArgumentsFromCallers(F); + } + + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/IPO/DeadTypeElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadTypeElimination.cpp new file mode 100644 index 0000000..a509931 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/DeadTypeElimination.cpp @@ -0,0 +1,111 @@ +//===- DeadTypeElimination.cpp - Eliminate unused types for symbol table --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This pass is used to cleanup the output of GCC. It eliminate names for types +// that are unused in the entire translation unit, using the FindUsedTypes pass. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "deadtypeelim" +#include "llvm/Transforms/IPO.h" +#include "llvm/Analysis/FindUsedTypes.h" +#include "llvm/Module.h" +#include "llvm/TypeSymbolTable.h" +#include "llvm/DerivedTypes.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumKilled, "Number of unused typenames removed from symtab"); + +namespace { + struct DTE : public ModulePass { + static char ID; // Pass identification, replacement for typeid + DTE() : ModulePass(ID) { + initializeDTEPass(*PassRegistry::getPassRegistry()); + } + + // doPassInitialization - For this pass, it removes global symbol table + // entries for primitive types. These are never used for linking in GCC and + // they make the output uglier to look at, so we nuke them. + // + // Also, initialize instance variables. + // + bool runOnModule(Module &M); + + // getAnalysisUsage - This function needs FindUsedTypes to do its job... + // + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<FindUsedTypes>(); + } + }; +} + +char DTE::ID = 0; +INITIALIZE_PASS_BEGIN(DTE, "deadtypeelim", "Dead Type Elimination", + false, false) +INITIALIZE_PASS_DEPENDENCY(FindUsedTypes) +INITIALIZE_PASS_END(DTE, "deadtypeelim", "Dead Type Elimination", false, false) + +ModulePass *llvm::createDeadTypeEliminationPass() { + return new DTE(); +} + + +// ShouldNukeSymtabEntry - Return true if this module level symbol table entry +// should be eliminated. +// +static inline bool ShouldNukeSymtabEntry(const Type *Ty){ + // Nuke all names for primitive types! 
+ if (Ty->isPrimitiveType() || Ty->isIntegerTy()) + return true; + + // Nuke all pointers to primitive types as well... + if (const PointerType *PT = dyn_cast<PointerType>(Ty)) + if (PT->getElementType()->isPrimitiveType() || + PT->getElementType()->isIntegerTy()) + return true; + + return false; +} + +// run - For this pass, it removes global symbol table entries for primitive +// types. These are never used for linking in GCC and they make the output +// uglier to look at, so we nuke them. Also eliminate types that are never used +// in the entire program as indicated by FindUsedTypes. +// +bool DTE::runOnModule(Module &M) { + bool Changed = false; + + TypeSymbolTable &ST = M.getTypeSymbolTable(); + std::set<const Type *> UsedTypes = getAnalysis<FindUsedTypes>().getTypes(); + + // Check the symbol table for superfluous type entries... + // + // Grab the 'type' plane of the module symbol... + TypeSymbolTable::iterator TI = ST.begin(); + TypeSymbolTable::iterator TE = ST.end(); + while ( TI != TE ) { + // If this entry should be unconditionally removed, or if we detect that + // the type is not used, remove it. + const Type *RHS = TI->second; + if (ShouldNukeSymtabEntry(RHS) || !UsedTypes.count(RHS)) { + ST.remove(TI++); + ++NumKilled; + Changed = true; + } else { + ++TI; + // We only need to leave one name for each type. + UsedTypes.erase(RHS); + } + } + + return Changed; +} + +// vim: sw=2 diff --git a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp new file mode 100644 index 0000000..9d432de --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp @@ -0,0 +1,80 @@ +//===-- ExtractGV.cpp - Global Value extraction pass ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This pass extracts global values +// +//===----------------------------------------------------------------------===// + +#include "llvm/Instructions.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Constants.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/ADT/SetVector.h" +#include <algorithm> +using namespace llvm; + +namespace { + /// @brief A pass to extract specific functions and their dependencies. + class GVExtractorPass : public ModulePass { + SetVector<GlobalValue *> Named; + bool deleteStuff; + public: + static char ID; // Pass identification, replacement for typeid + + /// FunctionExtractorPass - If deleteFn is true, this pass deletes as the + /// specified function. Otherwise, it deletes as much of the module as + /// possible, except for the function specified. + /// + explicit GVExtractorPass(std::vector<GlobalValue*>& GVs, bool deleteS = true) + : ModulePass(ID), Named(GVs.begin(), GVs.end()), deleteStuff(deleteS) {} + + bool runOnModule(Module &M) { + // Visit the global inline asm. + if (!deleteStuff) + M.setModuleInlineAsm(""); + + // For simplicity, just give all GlobalValues ExternalLinkage. A trickier + // implementation could figure out which GlobalValues are actually + // referenced by the Named set, and which GlobalValues in the rest of + // the module are referenced by the NamedSet, and get away with leaving + // more internal and private things internal and private. But for now, + // be conservative and simple. + + // Visit the GlobalVariables. + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) { + if (I->hasLocalLinkage()) + I->setVisibility(GlobalValue::HiddenVisibility); + I->setLinkage(GlobalValue::ExternalLinkage); + if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) + I->setInitializer(0); + } + + // Visit the Functions. 
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + if (I->hasLocalLinkage()) + I->setVisibility(GlobalValue::HiddenVisibility); + I->setLinkage(GlobalValue::ExternalLinkage); + if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) + I->deleteBody(); + } + + return true; + } + }; + + char GVExtractorPass::ID = 0; +} + +ModulePass *llvm::createGVExtractionPass(std::vector<GlobalValue*>& GVs, + bool deleteFn) { + return new GVExtractorPass(GVs, deleteFn); +} diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp new file mode 100644 index 0000000..95decec --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -0,0 +1,380 @@ +//===- FunctionAttrs.cpp - Pass which marks functions readnone or readonly ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple interprocedural pass which walks the +// call-graph, looking for functions which do not access or only read +// non-local memory, and marking them readnone/readonly. In addition, +// it marks function arguments (of pointer type) 'nocapture' if a call +// to the function does not create any copies of the pointer value that +// outlive the call. This more or less means that the pointer is only +// dereferenced, and not returned from the function or stored in a global. +// This pass is implemented as a bottom-up traversal of the call-graph. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "functionattrs" +#include "llvm/Transforms/IPO.h" +#include "llvm/CallGraphSCCPass.h" +#include "llvm/GlobalVariable.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/UniqueVector.h" +#include "llvm/Support/InstIterator.h" +using namespace llvm; + +STATISTIC(NumReadNone, "Number of functions marked readnone"); +STATISTIC(NumReadOnly, "Number of functions marked readonly"); +STATISTIC(NumNoCapture, "Number of arguments marked nocapture"); +STATISTIC(NumNoAlias, "Number of function returns marked noalias"); + +namespace { + struct FunctionAttrs : public CallGraphSCCPass { + static char ID; // Pass identification, replacement for typeid + FunctionAttrs() : CallGraphSCCPass(ID), AA(0) { + initializeFunctionAttrsPass(*PassRegistry::getPassRegistry()); + } + + // runOnSCC - Analyze the SCC, performing the transformation if possible. + bool runOnSCC(CallGraphSCC &SCC); + + // AddReadAttrs - Deduce readonly/readnone attributes for the SCC. + bool AddReadAttrs(const CallGraphSCC &SCC); + + // AddNoCaptureAttrs - Deduce nocapture attributes for the SCC. + bool AddNoCaptureAttrs(const CallGraphSCC &SCC); + + // IsFunctionMallocLike - Does this function allocate new memory? + bool IsFunctionMallocLike(Function *F, + SmallPtrSet<Function*, 8> &) const; + + // AddNoAliasAttrs - Deduce noalias attributes for the SCC. 
+ bool AddNoAliasAttrs(const CallGraphSCC &SCC); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<AliasAnalysis>(); + CallGraphSCCPass::getAnalysisUsage(AU); + } + + private: + AliasAnalysis *AA; + }; +} + +char FunctionAttrs::ID = 0; +INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs", + "Deduce function attributes", false, false) +INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_END(FunctionAttrs, "functionattrs", + "Deduce function attributes", false, false) + +Pass *llvm::createFunctionAttrsPass() { return new FunctionAttrs(); } + + +/// AddReadAttrs - Deduce readonly/readnone attributes for the SCC. +bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) { + SmallPtrSet<Function*, 8> SCCNodes; + + // Fill SCCNodes with the elements of the SCC. Used for quickly + // looking up whether a given CallGraphNode is in this SCC. + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) + SCCNodes.insert((*I)->getFunction()); + + // Check if any of the functions in the SCC read or write memory. If they + // write memory then they can't be marked readnone or readonly. + bool ReadsMemory = false; + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); + + if (F == 0) + // External node - may write memory. Just give up. + return false; + + AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(F); + if (MRB == AliasAnalysis::DoesNotAccessMemory) + // Already perfect! + continue; + + // Definitions with weak linkage may be overridden at linktime with + // something that writes memory, so treat them like declarations. + if (F->isDeclaration() || F->mayBeOverridden()) { + if (!AliasAnalysis::onlyReadsMemory(MRB)) + // May write memory. Just give up. + return false; + + ReadsMemory = true; + continue; + } + + // Scan the function body for instructions that may read or write memory. 
+ for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) { + Instruction *I = &*II; + + // Some instructions can be ignored even if they read or write memory. + // Detect these now, skipping to the next instruction if one is found. + CallSite CS(cast<Value>(I)); + if (CS) { + // Ignore calls to functions in the same SCC. + if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction())) + continue; + AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(CS); + // If the call doesn't access arbitrary memory, we may be able to + // figure out something. + if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { + // If the call does access argument pointees, check each argument. + if (AliasAnalysis::doesAccessArgPointees(MRB)) + // Check whether all pointer arguments point to local memory, and + // ignore calls that only access local memory. + for (CallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end(); + CI != CE; ++CI) { + Value *Arg = *CI; + if (Arg->getType()->isPointerTy()) { + AliasAnalysis::Location Loc(Arg, + AliasAnalysis::UnknownSize, + I->getMetadata(LLVMContext::MD_tbaa)); + if (!AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) { + if (MRB & AliasAnalysis::Mod) + // Writes non-local memory. Give up. + return false; + if (MRB & AliasAnalysis::Ref) + // Ok, it reads non-local memory. + ReadsMemory = true; + } + } + } + continue; + } + // The call could access any memory. If that includes writes, give up. + if (MRB & AliasAnalysis::Mod) + return false; + // If it reads, note it. + if (MRB & AliasAnalysis::Ref) + ReadsMemory = true; + continue; + } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + // Ignore non-volatile loads from local memory. + if (!LI->isVolatile()) { + AliasAnalysis::Location Loc = AA->getLocation(LI); + if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) + continue; + } + } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + // Ignore non-volatile stores to local memory. 
+ if (!SI->isVolatile()) { + AliasAnalysis::Location Loc = AA->getLocation(SI); + if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) + continue; + } + } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) { + // Ignore vaargs on local memory. + AliasAnalysis::Location Loc = AA->getLocation(VI); + if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) + continue; + } + + // Any remaining instructions need to be taken seriously! Check if they + // read or write memory. + if (I->mayWriteToMemory()) + // Writes memory. Just give up. + return false; + + // If this instruction may read memory, remember that. + ReadsMemory |= I->mayReadFromMemory(); + } + } + + // Success! Functions in this SCC do not access memory, or only read memory. + // Give them the appropriate attribute. + bool MadeChange = false; + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); + + if (F->doesNotAccessMemory()) + // Already perfect! + continue; + + if (F->onlyReadsMemory() && ReadsMemory) + // No change. + continue; + + MadeChange = true; + + // Clear out any existing attributes. + F->removeAttribute(~0, Attribute::ReadOnly | Attribute::ReadNone); + + // Add in the new attribute. + F->addAttribute(~0, ReadsMemory? Attribute::ReadOnly : Attribute::ReadNone); + + if (ReadsMemory) + ++NumReadOnly; + else + ++NumReadNone; + } + + return MadeChange; +} + +/// AddNoCaptureAttrs - Deduce nocapture attributes for the SCC. +bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) { + bool Changed = false; + + // Check each function in turn, determining which pointer arguments are not + // captured. + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); + + if (F == 0) + // External node - skip it; + continue; + + // Definitions with weak linkage may be overridden at linktime with + // something that writes memory, so treat them like declarations. 
+ if (F->isDeclaration() || F->mayBeOverridden()) + continue; + + for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A!=E; ++A) + if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr() && + !PointerMayBeCaptured(A, true, /*StoreCaptures=*/false)) { + A->addAttr(Attribute::NoCapture); + ++NumNoCapture; + Changed = true; + } + } + + return Changed; +} + +/// IsFunctionMallocLike - A function is malloc-like if it returns either null +/// or a pointer that doesn't alias any other pointer visible to the caller. +bool FunctionAttrs::IsFunctionMallocLike(Function *F, + SmallPtrSet<Function*, 8> &SCCNodes) const { + UniqueVector<Value *> FlowsToReturn; + for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) + if (ReturnInst *Ret = dyn_cast<ReturnInst>(I->getTerminator())) + FlowsToReturn.insert(Ret->getReturnValue()); + + for (unsigned i = 0; i != FlowsToReturn.size(); ++i) { + Value *RetVal = FlowsToReturn[i+1]; // UniqueVector[0] is reserved. + + if (Constant *C = dyn_cast<Constant>(RetVal)) { + if (!C->isNullValue() && !isa<UndefValue>(C)) + return false; + + continue; + } + + if (isa<Argument>(RetVal)) + return false; + + if (Instruction *RVI = dyn_cast<Instruction>(RetVal)) + switch (RVI->getOpcode()) { + // Extend the analysis by looking upwards. + case Instruction::BitCast: + case Instruction::GetElementPtr: + FlowsToReturn.insert(RVI->getOperand(0)); + continue; + case Instruction::Select: { + SelectInst *SI = cast<SelectInst>(RVI); + FlowsToReturn.insert(SI->getTrueValue()); + FlowsToReturn.insert(SI->getFalseValue()); + continue; + } + case Instruction::PHI: { + PHINode *PN = cast<PHINode>(RVI); + for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + FlowsToReturn.insert(PN->getIncomingValue(i)); + continue; + } + + // Check whether the pointer came from an allocation. 
+ case Instruction::Alloca: + break; + case Instruction::Call: + case Instruction::Invoke: { + CallSite CS(RVI); + if (CS.paramHasAttr(0, Attribute::NoAlias)) + break; + if (CS.getCalledFunction() && + SCCNodes.count(CS.getCalledFunction())) + break; + } // fall-through + default: + return false; // Did not come from an allocation. + } + + if (PointerMayBeCaptured(RetVal, false, /*StoreCaptures=*/false)) + return false; + } + + return true; +} + +/// AddNoAliasAttrs - Deduce noalias attributes for the SCC. +bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) { + SmallPtrSet<Function*, 8> SCCNodes; + + // Fill SCCNodes with the elements of the SCC. Used for quickly + // looking up whether a given CallGraphNode is in this SCC. + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) + SCCNodes.insert((*I)->getFunction()); + + // Check each function in turn, determining which functions return noalias + // pointers. + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); + + if (F == 0) + // External node - skip it; + return false; + + // Already noalias. + if (F->doesNotAlias(0)) + continue; + + // Definitions with weak linkage may be overridden at linktime, so + // treat them like declarations. + if (F->isDeclaration() || F->mayBeOverridden()) + return false; + + // We annotate noalias return values, which are only applicable to + // pointer types. 
+ if (!F->getReturnType()->isPointerTy()) + continue; + + if (!IsFunctionMallocLike(F, SCCNodes)) + return false; + } + + bool MadeChange = false; + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); + if (F->doesNotAlias(0) || !F->getReturnType()->isPointerTy()) + continue; + + F->setDoesNotAlias(0); + ++NumNoAlias; + MadeChange = true; + } + + return MadeChange; +} + +bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) { + AA = &getAnalysis<AliasAnalysis>(); + + bool Changed = AddReadAttrs(SCC); + Changed |= AddNoCaptureAttrs(SCC); + Changed |= AddNoAliasAttrs(SCC); + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp new file mode 100644 index 0000000..2b427aa --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -0,0 +1,211 @@ +//===-- GlobalDCE.cpp - DCE unreachable internal functions ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This transform is designed to eliminate unreachable internal globals from the +// program. It uses an aggressive algorithm, searching out globals that are +// known to be alive. After it finds all of the globals which are needed, it +// deletes whatever is left over. This allows it to delete recursive chunks of +// the program which are unreachable. 
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "globaldce"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumAliases  , "Number of global aliases removed");
+STATISTIC(NumFunctions, "Number of functions removed");
+STATISTIC(NumVariables, "Number of global variables removed");
+
+namespace {
+  struct GlobalDCE : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    GlobalDCE() : ModulePass(ID) {
+      initializeGlobalDCEPass(*PassRegistry::getPassRegistry());
+    }
+
+    // run - Do the GlobalDCE pass on the specified module, optionally updating
+    // the specified callgraph to reflect the changes.
+    //
+    bool runOnModule(Module &M);
+
+  private:
+    // AliveGlobals - The transitively-reachable set computed by GlobalIsNeeded;
+    // anything not in this set after the marking phase is deleted.
+    SmallPtrSet<GlobalValue*, 32> AliveGlobals;
+
+    /// GlobalIsNeeded - mark the specific global value as needed, and
+    /// recursively mark anything that it uses as also needed.
+    void GlobalIsNeeded(GlobalValue *GV);
+    void MarkUsedGlobalsAsNeeded(Constant *C);
+
+    bool RemoveUnusedGlobalValue(GlobalValue &GV);
+  };
+}
+
+char GlobalDCE::ID = 0;
+INITIALIZE_PASS(GlobalDCE, "globaldce",
+                "Dead Global Elimination", false, false)
+
+ModulePass *llvm::createGlobalDCEPass() { return new GlobalDCE(); }
+
+bool GlobalDCE::runOnModule(Module &M) {
+  bool Changed = false;
+
+  // Loop over the module, adding globals which are obviously necessary.
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    Changed |= RemoveUnusedGlobalValue(*I);
+    // Functions with external linkage are needed if they have a body
+    if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage() &&
+        !I->isDeclaration() && !I->hasAvailableExternallyLinkage())
+      GlobalIsNeeded(I);
+  }
+
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    Changed |= RemoveUnusedGlobalValue(*I);
+    // Externally visible & appending globals are needed, if they have an
+    // initializer.
+    if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage() &&
+        !I->isDeclaration() && !I->hasAvailableExternallyLinkage())
+      GlobalIsNeeded(I);
+  }
+
+  for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+       I != E; ++I) {
+    Changed |= RemoveUnusedGlobalValue(*I);
+    // Externally visible aliases are needed.
+    if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage())
+      GlobalIsNeeded(I);
+  }
+
+  // Now that all globals which are needed are in the AliveGlobals set, we loop
+  // through the program, deleting those which are not alive.
+  //
+
+  // The first pass is to drop initializers of global variables which are dead.
+  std::vector<GlobalVariable*> DeadGlobalVars;   // Keep track of dead globals
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I)
+    if (!AliveGlobals.count(I)) {
+      DeadGlobalVars.push_back(I);         // Keep track of dead globals
+      I->setInitializer(0);
+    }
+
+  // The second pass drops the bodies of functions which are dead...
+  std::vector<Function*> DeadFunctions;
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (!AliveGlobals.count(I)) {
+      DeadFunctions.push_back(I);         // Keep track of dead globals
+      if (!I->isDeclaration())
+        I->deleteBody();
+    }
+
+  // The third pass drops targets of aliases which are dead...
+  std::vector<GlobalAlias*> DeadAliases;
+  for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E;
+       ++I)
+    if (!AliveGlobals.count(I)) {
+      DeadAliases.push_back(I);
+      I->setAliasee(0);
+    }
+
+  // Dropping bodies/initializers/aliasees above severed all references between
+  // dead objects, so each can now be erased without dangling uses.
+  if (!DeadFunctions.empty()) {
+    // Now that all interferences have been dropped, delete the actual objects
+    // themselves.
+    for (unsigned i = 0, e = DeadFunctions.size(); i != e; ++i) {
+      RemoveUnusedGlobalValue(*DeadFunctions[i]);
+      M.getFunctionList().erase(DeadFunctions[i]);
+    }
+    NumFunctions += DeadFunctions.size();
+    Changed = true;
+  }
+
+  if (!DeadGlobalVars.empty()) {
+    for (unsigned i = 0, e = DeadGlobalVars.size(); i != e; ++i) {
+      RemoveUnusedGlobalValue(*DeadGlobalVars[i]);
+      M.getGlobalList().erase(DeadGlobalVars[i]);
+    }
+    NumVariables += DeadGlobalVars.size();
+    Changed = true;
+  }
+
+  // Now delete any dead aliases.
+  if (!DeadAliases.empty()) {
+    for (unsigned i = 0, e = DeadAliases.size(); i != e; ++i) {
+      RemoveUnusedGlobalValue(*DeadAliases[i]);
+      M.getAliasList().erase(DeadAliases[i]);
+    }
+    NumAliases += DeadAliases.size();
+    Changed = true;
+  }
+
+  // Make sure that all memory is released
+  AliveGlobals.clear();
+
+  return Changed;
+}
+
+/// GlobalIsNeeded - mark the specific global value as needed, and
+/// recursively mark anything that it uses as also needed.
+void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
+  // If the global is already in the set, no need to reprocess it.
+  if (!AliveGlobals.insert(G))
+    return;
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(G)) {
+    // If this is a global variable, we must make sure to add any global values
+    // referenced by the initializer to the alive set.
+    if (GV->hasInitializer())
+      MarkUsedGlobalsAsNeeded(GV->getInitializer());
+  } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(G)) {
+    // The target of a global alias is needed.
+    MarkUsedGlobalsAsNeeded(GA->getAliasee());
+  } else {
+    // Otherwise this must be a function object.  We have to scan the body of
+    // the function looking for constants and global values which are used as
+    // operands.  Any operands of these types must be processed to ensure that
+    // any globals used will be marked as needed.
+    Function *F = cast<Function>(G);
+
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+        for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U)
+          if (GlobalValue *GV = dyn_cast<GlobalValue>(*U))
+            GlobalIsNeeded(GV);
+          else if (Constant *C = dyn_cast<Constant>(*U))
+            MarkUsedGlobalsAsNeeded(C);
+  }
+}
+
+/// MarkUsedGlobalsAsNeeded - Recursively mark every global value referenced
+/// (directly or through nested constant expressions) by C as needed.
+void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) {
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return GlobalIsNeeded(GV);
+
+  // Loop over all of the operands of the constant, adding any globals they
+  // use to the list of needed globals.
+  for (User::op_iterator I = C->op_begin(), E = C->op_end(); I != E; ++I)
+    if (Constant *OpC = dyn_cast<Constant>(*I))
+      MarkUsedGlobalsAsNeeded(OpC);
+}
+
+// RemoveUnusedGlobalValue - Loop over all of the uses of the specified
+// GlobalValue, looking for the constant pointer ref that may be pointing to it.
+// If found, check to see if the constant pointer ref is safe to destroy, and if
+// so, nuke it.  This will reduce the reference count on the global value, which
+// might make it deader.
+//
+bool GlobalDCE::RemoveUnusedGlobalValue(GlobalValue &GV) {
+  if (GV.use_empty()) return false;
+  GV.removeDeadConstantUsers();
+  return GV.use_empty();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
new file mode 100644
index 0000000..d4cb712
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -0,0 +1,2728 @@
+//===- GlobalOpt.cpp - Optimize Global Variables --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License.  See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms simple global variables that never have their address
+// taken.  If obviously true, it marks read/write globals as constant, deletes
+// variables only stored to, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "globalopt"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumMarked    , "Number of globals marked constant");
+STATISTIC(NumUnnamed   , "Number of globals marked unnamed_addr");
+STATISTIC(NumSRA       , "Number of aggregate globals broken into scalars");
+STATISTIC(NumHeapSRA   , "Number of heap objects SRA'd");
+STATISTIC(NumSubstitute,"Number of globals with initializers stored into them");
+STATISTIC(NumDeleted   , "Number of globals deleted");
+STATISTIC(NumFnDeleted , "Number of functions deleted");
+STATISTIC(NumGlobUses  , "Number of global uses devirtualized");
+STATISTIC(NumLocalized , "Number of globals localized");
+STATISTIC(NumShrunkToBool  , "Number of global vars shrunk to booleans");
+STATISTIC(NumFastCallFns   , "Number of functions converted to fastcc");
+STATISTIC(NumCtorsEvaluated, "Number of static ctors evaluated");
+STATISTIC(NumNestRemoved   , "Number of nest attributes removed");
+STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
+STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
+
+namespace {
+  struct GlobalStatus;
+  struct GlobalOpt : public ModulePass {
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // Intentionally empty: this pass declares no required analyses.
+    }
+    static char ID; // Pass identification, replacement for typeid
+    GlobalOpt() : ModulePass(ID) {
+      initializeGlobalOptPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnModule(Module &M);
+
+  private:
+    GlobalVariable *FindGlobalCtors(Module &M);
+    bool OptimizeFunctions(Module &M);
+    bool OptimizeGlobalVars(Module &M);
+    bool OptimizeGlobalAliases(Module &M);
+    bool OptimizeGlobalCtorsList(GlobalVariable *&GCL);
+    bool ProcessGlobal(GlobalVariable *GV,Module::global_iterator &GVI);
+    bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI,
+                               const SmallPtrSet<const PHINode*, 16> &PHIUsers,
+                               const GlobalStatus &GS);
+  };
+}
+
+char GlobalOpt::ID = 0;
+INITIALIZE_PASS(GlobalOpt, "globalopt",
+                "Global Variable Optimizer", false, false)
+
+ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); }
+
+namespace {
+
+/// GlobalStatus - As we analyze each global, keep track of some information
+/// about it.  If we find out that the address of the global is taken, none of
+/// this info will be accurate.
+struct GlobalStatus {
+  /// isCompared - True if the global's address is used in a comparison.
+  bool isCompared;
+
+  /// isLoaded - True if the global is ever loaded.  If the global isn't ever
+  /// loaded it can be deleted.
+  bool isLoaded;
+
+  /// StoredType - Keep track of what stores to the global look like.
+  ///
+  enum StoredType {
+    /// NotStored - There is no store to this global.  It can thus be marked
+    /// constant.
+    NotStored,
+
+    /// isInitializerStored - This global is stored to, but the only thing
+    /// stored is the constant it was initialized with.  This is only tracked
+    /// for scalar globals.
+    isInitializerStored,
+
+    /// isStoredOnce - This global is stored to, but only its initializer and
+    /// one other value is ever stored to it.  If this global isStoredOnce, we
+    /// track the value stored to it in StoredOnceValue below.  This is only
+    /// tracked for scalar globals.
+    isStoredOnce,
+
+    /// isStored - This global is stored to by multiple values or something else
+    /// that we cannot track.
+    isStored
+  } StoredType;
+
+  /// StoredOnceValue - If only one value (besides the initializer constant) is
+  /// ever stored to this global, keep track of what value it is.
+  Value *StoredOnceValue;
+
+  /// AccessingFunction/HasMultipleAccessingFunctions - These start out
+  /// null/false.  When the first accessing function is noticed, it is recorded.
+  /// When a second different accessing function is noticed,
+  /// HasMultipleAccessingFunctions is set to true.
+  const Function *AccessingFunction;
+  bool HasMultipleAccessingFunctions;
+
+  /// HasNonInstructionUser - Set to true if this global has a user that is not
+  /// an instruction (e.g. a constant expr or GV initializer).
+  bool HasNonInstructionUser;
+
+  /// HasPHIUser - Set to true if this global has a user that is a PHI node.
+  bool HasPHIUser;
+
+  GlobalStatus() : isCompared(false), isLoaded(false), StoredType(NotStored),
+                   StoredOnceValue(0), AccessingFunction(0),
+                   HasMultipleAccessingFunctions(false),
+                   HasNonInstructionUser(false), HasPHIUser(false) {}
+};
+
+}
+
+// SafeToDestroyConstant - It is safe to destroy a constant iff it is only used
+// by constants itself.  Note that constants cannot be cyclic, so this test is
+// pretty easy to implement recursively.
+//
+static bool SafeToDestroyConstant(const Constant *C) {
+  if (isa<GlobalValue>(C)) return false;
+
+  for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E;
+       ++UI)
+    if (const Constant *CU = dyn_cast<Constant>(*UI)) {
+      if (!SafeToDestroyConstant(CU)) return false;
+    } else
+      return false;
+  return true;
+}
+
+
+/// AnalyzeGlobal - Look at all uses of the global and fill in the GlobalStatus
+/// structure.  If the global has its address taken, return true to indicate we
+/// can't do anything with it.
+///
+static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
+                          SmallPtrSet<const PHINode*, 16> &PHIUsers) {
+  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
+       ++UI) {
+    const User *U = *UI;
+    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+      GS.HasNonInstructionUser = true;
+
+      // If the result of the constantexpr isn't pointer type, then we won't
+      // know to expect it in various places.  Just reject early.
+      if (!isa<PointerType>(CE->getType())) return true;
+
+      if (AnalyzeGlobal(CE, GS, PHIUsers)) return true;
+    } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
+      if (!GS.HasMultipleAccessingFunctions) {
+        const Function *F = I->getParent()->getParent();
+        if (GS.AccessingFunction == 0)
+          GS.AccessingFunction = F;
+        else if (GS.AccessingFunction != F)
+          GS.HasMultipleAccessingFunctions = true;
+      }
+      if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        GS.isLoaded = true;
+        if (LI->isVolatile()) return true;  // Don't hack on volatile loads.
+      } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        // Don't allow a store OF the address, only stores TO the address.
+        if (SI->getOperand(0) == V) return true;
+
+        if (SI->isVolatile()) return true;  // Don't hack on volatile stores.
+
+        // If this is a direct store to the global (i.e., the global is a scalar
+        // value, not an aggregate), keep more specific information about
+        // stores.
+        if (GS.StoredType != GlobalStatus::isStored) {
+          if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(
+                                                          SI->getOperand(1))) {
+            Value *StoredVal = SI->getOperand(0);
+            if (StoredVal == GV->getInitializer()) {
+              if (GS.StoredType < GlobalStatus::isInitializerStored)
+                GS.StoredType = GlobalStatus::isInitializerStored;
+            } else if (isa<LoadInst>(StoredVal) &&
+                       cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
+              // Storing back a value just loaded from the same global is as
+              // harmless as re-storing the initializer.
+              if (GS.StoredType < GlobalStatus::isInitializerStored)
+                GS.StoredType = GlobalStatus::isInitializerStored;
+            } else if (GS.StoredType < GlobalStatus::isStoredOnce) {
+              GS.StoredType = GlobalStatus::isStoredOnce;
+              GS.StoredOnceValue = StoredVal;
+            } else if (GS.StoredType == GlobalStatus::isStoredOnce &&
+                       GS.StoredOnceValue == StoredVal) {
+              // noop.
+            } else {
+              GS.StoredType = GlobalStatus::isStored;
+            }
+          } else {
+            GS.StoredType = GlobalStatus::isStored;
+          }
+        }
+      } else if (isa<GetElementPtrInst>(I)) {
+        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
+      } else if (isa<SelectInst>(I)) {
+        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
+      } else if (const PHINode *PN = dyn_cast<PHINode>(I)) {
+        // PHI nodes we can check just like select or GEP instructions, but we
+        // have to be careful about infinite recursion.
+        if (PHIUsers.insert(PN))  // Not already visited.
+          if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
+        GS.HasPHIUser = true;
+      } else if (isa<CmpInst>(I)) {
+        GS.isCompared = true;
+      } else if (isa<MemTransferInst>(I)) {
+        // memcpy/memmove: dest operand counts as a store, source as a load.
+        const MemTransferInst *MTI = cast<MemTransferInst>(I);
+        if (MTI->getArgOperand(0) == V)
+          GS.StoredType = GlobalStatus::isStored;
+        if (MTI->getArgOperand(1) == V)
+          GS.isLoaded = true;
+      } else if (isa<MemSetInst>(I)) {
+        assert(cast<MemSetInst>(I)->getArgOperand(0) == V &&
+               "Memset only takes one pointer!");
+        GS.StoredType = GlobalStatus::isStored;
+      } else {
+        return true;  // Any other non-load instruction might take address!
+      }
+    } else if (const Constant *C = dyn_cast<Constant>(U)) {
+      GS.HasNonInstructionUser = true;
+      // We might have a dead and dangling constant hanging off of here.
+      if (!SafeToDestroyConstant(C))
+        return true;
+    } else {
+      GS.HasNonInstructionUser = true;
+      // Otherwise must be some other user.
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// getAggregateConstantElement - Return the element of the constant aggregate
+/// Agg selected by the constant index Idx, or null if Idx is not a ConstantInt
+/// or is out of range for the aggregate.
+static Constant *getAggregateConstantElement(Constant *Agg, Constant *Idx) {
+  ConstantInt *CI = dyn_cast<ConstantInt>(Idx);
+  if (!CI) return 0;
+  unsigned IdxV = CI->getZExtValue();
+
+  if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Agg)) {
+    if (IdxV < CS->getNumOperands()) return CS->getOperand(IdxV);
+  } else if (ConstantArray *CA = dyn_cast<ConstantArray>(Agg)) {
+    if (IdxV < CA->getNumOperands()) return CA->getOperand(IdxV);
+  } else if (ConstantVector *CP = dyn_cast<ConstantVector>(Agg)) {
+    if (IdxV < CP->getNumOperands()) return CP->getOperand(IdxV);
+  } else if (isa<ConstantAggregateZero>(Agg)) {
+    if (const StructType *STy = dyn_cast<StructType>(Agg->getType())) {
+      if (IdxV < STy->getNumElements())
+        return Constant::getNullValue(STy->getElementType(IdxV));
+    } else if (const SequentialType *STy =
+               dyn_cast<SequentialType>(Agg->getType())) {
+      return Constant::getNullValue(STy->getElementType());
+    }
+  } else if (isa<UndefValue>(Agg)) {
+    if (const StructType *STy = dyn_cast<StructType>(Agg->getType())) {
+      if (IdxV < STy->getNumElements())
+        return UndefValue::get(STy->getElementType(IdxV));
+    } else if (const SequentialType *STy =
+               dyn_cast<SequentialType>(Agg->getType())) {
+      return UndefValue::get(STy->getElementType());
+    }
+  }
+  return 0;
+}
+
+
+/// CleanupConstantGlobalUsers - We just marked GV constant.  Loop over all
+/// users of the global, cleaning up the obvious ones.  This is largely just a
+/// quick scan over the use list to clean up the easy and obvious cruft.  This
+/// returns true if it made a change.
+static bool CleanupConstantGlobalUsers(Value *V, Constant *Init) {
+  bool Changed = false;
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;) {
+    User *U = *UI++;
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      if (Init) {
+        // Replace the load with the initializer.
+        LI->replaceAllUsesWith(Init);
+        LI->eraseFromParent();
+        Changed = true;
+      }
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      // Store must be unreachable or storing Init into the global.
+      SI->eraseFromParent();
+      Changed = true;
+    } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+      if (CE->getOpcode() == Instruction::GetElementPtr) {
+        Constant *SubInit = 0;
+        if (Init)
+          SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+        Changed |= CleanupConstantGlobalUsers(CE, SubInit);
+      } else if (CE->getOpcode() == Instruction::BitCast &&
+                 CE->getType()->isPointerTy()) {
+        // Pointer cast, delete any stores and memsets to the global.
+        Changed |= CleanupConstantGlobalUsers(CE, 0);
+      }
+
+      if (CE->use_empty()) {
+        CE->destroyConstant();
+        Changed = true;
+      }
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+      // Do not transform "gepinst (gep constexpr (GV))" here, because forming
+      // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold
+      // and will invalidate our notion of what Init is.
+      Constant *SubInit = 0;
+      if (!isa<ConstantExpr>(GEP->getOperand(0))) {
+        ConstantExpr *CE =
+          dyn_cast_or_null<ConstantExpr>(ConstantFoldInstruction(GEP));
+        if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
+          SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+      }
+      Changed |= CleanupConstantGlobalUsers(GEP, SubInit);
+
+      if (GEP->use_empty()) {
+        GEP->eraseFromParent();
+        Changed = true;
+      }
+    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv
+      if (MI->getRawDest() == V) {
+        MI->eraseFromParent();
+        Changed = true;
+      }
+
+    } else if (Constant *C = dyn_cast<Constant>(U)) {
+      // If we have a chain of dead constantexprs or other things dangling from
+      // us, and if they are all dead, nuke them without remorse.
+      if (SafeToDestroyConstant(C)) {
+        C->destroyConstant();
+        // This could have invalidated UI, start over from scratch.
+        CleanupConstantGlobalUsers(V, Init);
+        return true;
+      }
+    }
+  }
+  return Changed;
+}
+
+/// isSafeSROAElementUse - Return true if the specified instruction is a safe
+/// user of a derived expression from a global that we want to SROA.
+static bool isSafeSROAElementUse(Value *V) {
+  // We might have a dead and dangling constant hanging off of here.
+  if (Constant *C = dyn_cast<Constant>(V))
+    return SafeToDestroyConstant(C);
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+
+  // Loads are ok.
+  if (isa<LoadInst>(I)) return true;
+
+  // Stores *to* the pointer are ok.
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->getOperand(0) != V;
+
+  // Otherwise, it must be a GEP.  It may only index with a leading zero and
+  // constant-null second index, and all of its users must in turn be safe.
+  GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I);
+  if (GEPI == 0) return false;
+
+  if (GEPI->getNumOperands() < 3 || !isa<Constant>(GEPI->getOperand(1)) ||
+      !cast<Constant>(GEPI->getOperand(1))->isNullValue())
+    return false;
+
+  for (Value::use_iterator I = GEPI->use_begin(), E = GEPI->use_end();
+       I != E; ++I)
+    if (!isSafeSROAElementUse(*I))
+      return false;
+  return true;
+}
+
+
+/// IsUserOfGlobalSafeForSRA - U is a direct user of the specified global value.
+/// Look at it and its uses and decide whether it is safe to SROA this global.
+///
+static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
+  // The user of the global must be a GEP Inst or a ConstantExpr GEP.
+  if (!isa<GetElementPtrInst>(U) &&
+      (!isa<ConstantExpr>(U) ||
+       cast<ConstantExpr>(U)->getOpcode() != Instruction::GetElementPtr))
+    return false;
+
+  // Check to see if this ConstantExpr GEP is SRA'able.  In particular, we
+  // don't like < 3 operand CE's, and we don't like non-constant integer
+  // indices.  This enforces that all uses are 'gep GV, 0, C, ...' for some
+  // value of C.
+  if (U->getNumOperands() < 3 || !isa<Constant>(U->getOperand(1)) ||
+      !cast<Constant>(U->getOperand(1))->isNullValue() ||
+      !isa<ConstantInt>(U->getOperand(2)))
+    return false;
+
+  gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U);
+  ++GEPI;  // Skip over the pointer index.
+
+  // If this is a use of an array allocation, do a bit more checking for sanity.
+  if (const ArrayType *AT = dyn_cast<ArrayType>(*GEPI)) {
+    uint64_t NumElements = AT->getNumElements();
+    ConstantInt *Idx = cast<ConstantInt>(U->getOperand(2));
+
+    // Check to make sure that index falls within the array.  If not,
+    // something funny is going on, so we won't do the optimization.
+    //
+    if (Idx->getZExtValue() >= NumElements)
+      return false;
+
+    // We cannot scalar repl this level of the array unless any array
+    // sub-indices are in-range constants.  In particular, consider:
+    // A[0][i].  We cannot know that the user isn't doing invalid things like
+    // allowing i to index an out-of-range subscript that accesses A[1].
+    //
+    // Scalar replacing *just* the outer index of the array is probably not
+    // going to be a win anyway, so just give up.
+    for (++GEPI; // Skip array index.
+         GEPI != E;
+         ++GEPI) {
+      uint64_t NumElements;
+      if (const ArrayType *SubArrayTy = dyn_cast<ArrayType>(*GEPI))
+        NumElements = SubArrayTy->getNumElements();
+      else if (const VectorType *SubVectorTy = dyn_cast<VectorType>(*GEPI))
+        NumElements = SubVectorTy->getNumElements();
+      else {
+        assert((*GEPI)->isStructTy() &&
+               "Indexed GEP type is not array, vector, or struct!");
+        continue;
+      }
+
+      ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand());
+      if (!IdxVal || IdxVal->getZExtValue() >= NumElements)
+        return false;
+    }
+  }
+
+  // Finally, every transitive use of the GEP itself must be safe.
+  for (Value::use_iterator I = U->use_begin(), E = U->use_end(); I != E; ++I)
+    if (!isSafeSROAElementUse(*I))
+      return false;
+  return true;
+}
+
+/// GlobalUsersSafeToSRA - Look at all uses of the global and decide whether it
+/// is safe for us to perform this transformation.
+///
+static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
+  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end();
+       UI != E; ++UI) {
+    if (!IsUserOfGlobalSafeForSRA(*UI, GV))
+      return false;
+  }
+  return true;
+}
+
+
+/// SRAGlobal - Perform scalar replacement of aggregates on the specified global
+/// variable.  This opens the door for other optimizations by exposing the
+/// behavior of the program in a more fine-grained way.  We have determined that
+/// this transformation is safe already.  We return the first global variable we
+/// insert so that the caller can reprocess it.
+static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
+  // Make sure this global only has simple uses that we can SRA.
+  if (!GlobalUsersSafeToSRA(GV))
+    return 0;
+
+  assert(GV->hasLocalLinkage() && !GV->isConstant());
+  Constant *Init = GV->getInitializer();
+  const Type *Ty = Init->getType();
+
+  std::vector<GlobalVariable*> NewGlobals;
+  Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
+
+  // Get the alignment of the global, either explicit or target-specific.
+  unsigned StartAlignment = GV->getAlignment();
+  if (StartAlignment == 0)
+    StartAlignment = TD.getABITypeAlignment(GV->getType());
+
+  if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+    // Structs: create one new internal global per field.
+    NewGlobals.reserve(STy->getNumElements());
+    const StructLayout &Layout = *TD.getStructLayout(STy);
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+      Constant *In = getAggregateConstantElement(Init,
+                    ConstantInt::get(Type::getInt32Ty(STy->getContext()), i));
+      assert(In && "Couldn't get element of initializer?");
+      GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false,
+                                               GlobalVariable::InternalLinkage,
+                                               In, GV->getName()+"."+Twine(i),
+                                               GV->isThreadLocal(),
+                                              GV->getType()->getAddressSpace());
+      Globals.insert(GV, NGV);
+      NewGlobals.push_back(NGV);
+
+      // Calculate the known alignment of the field.  If the original aggregate
+      // had 256 byte alignment for example, something might depend on that:
+      // propagate info to each field.
+      uint64_t FieldOffset = Layout.getElementOffset(i);
+      unsigned NewAlign = (unsigned)MinAlign(StartAlignment, FieldOffset);
+      if (NewAlign > TD.getABITypeAlignment(STy->getElementType(i)))
+        NGV->setAlignment(NewAlign);
+    }
+  } else if (const SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
+    // Arrays and vectors: split into one global per element.
+    unsigned NumElements = 0;
+    if (const ArrayType *ATy = dyn_cast<ArrayType>(STy))
+      NumElements = ATy->getNumElements();
+    else
+      NumElements = cast<VectorType>(STy)->getNumElements();
+
+    if (NumElements > 16 && GV->hasNUsesOrMore(16))
+      return 0; // It's not worth it.
+    NewGlobals.reserve(NumElements);
+
+    uint64_t EltSize = TD.getTypeAllocSize(STy->getElementType());
+    unsigned EltAlign = TD.getABITypeAlignment(STy->getElementType());
+    for (unsigned i = 0, e = NumElements; i != e; ++i) {
+      Constant *In = getAggregateConstantElement(Init,
+                    ConstantInt::get(Type::getInt32Ty(Init->getContext()), i));
+      assert(In && "Couldn't get element of initializer?");
+
+      GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false,
+                                               GlobalVariable::InternalLinkage,
+                                               In, GV->getName()+"."+Twine(i),
+                                               GV->isThreadLocal(),
+                                              GV->getType()->getAddressSpace());
+      Globals.insert(GV, NGV);
+      NewGlobals.push_back(NGV);
+
+      // Calculate the known alignment of the field.  If the original aggregate
+      // had 256 byte alignment for example, something might depend on that:
+      // propagate info to each field.
+      unsigned NewAlign = (unsigned)MinAlign(StartAlignment, EltSize*i);
+      if (NewAlign > EltAlign)
+        NGV->setAlignment(NewAlign);
+    }
+  }
+
+  if (NewGlobals.empty())
+    return 0;
+
+  DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV);
+
+  Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext()));
+
+  // Loop over all of the uses of the global, replacing the constantexpr geps,
+  // with smaller constantexpr geps or direct references.
+  while (!GV->use_empty()) {
+    User *GEP = GV->use_back();
+    assert(((isa<ConstantExpr>(GEP) &&
+             cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)||
+            isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!");
+
+    // Ignore the 1th operand, which has to be zero or else the program is quite
+    // broken (undefined).  Get the 2nd operand, which is the structure or array
+    // index.
+    unsigned Val = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
+    if (Val >= NewGlobals.size()) Val = 0; // Out of bound array access.
+
+    Value *NewPtr = NewGlobals[Val];
+
+    // Form a shorter GEP if needed.
+    if (GEP->getNumOperands() > 3) {
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEP)) {
+        SmallVector<Constant*, 8> Idxs;
+        Idxs.push_back(NullInt);
+        for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i)
+          Idxs.push_back(CE->getOperand(i));
+        NewPtr = ConstantExpr::getGetElementPtr(cast<Constant>(NewPtr),
+                                                &Idxs[0], Idxs.size());
+      } else {
+        GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP);
+        SmallVector<Value*, 8> Idxs;
+        Idxs.push_back(NullInt);
+        for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
+          Idxs.push_back(GEPI->getOperand(i));
+        NewPtr = GetElementPtrInst::Create(NewPtr, Idxs.begin(), Idxs.end(),
+                                           GEPI->getName()+"."+Twine(Val),GEPI);
+      }
+    }
+    GEP->replaceAllUsesWith(NewPtr);
+
+    if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(GEP))
+      GEPI->eraseFromParent();
+    else
+      cast<ConstantExpr>(GEP)->destroyConstant();
+  }
+
+  // Delete the old global, now that it is dead.
+  Globals.erase(GV);
+  ++NumSRA;
+
+  // Loop over the new globals array deleting any globals that are obviously
+  // dead.  This can arise due to scalarization of a structure or an array that
+  // has elements that are dead.
+  unsigned FirstGlobal = 0;
+  for (unsigned i = 0, e = NewGlobals.size(); i != e; ++i)
+    if (NewGlobals[i]->use_empty()) {
+      Globals.erase(NewGlobals[i]);
+      if (FirstGlobal == i) ++FirstGlobal;
+    }
+
+  return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : 0;
+}
+
+/// AllUsesOfValueWillTrapIfNull - Return true if all users of the specified
+/// value will trap if the value is dynamically null.  PHIs keeps track of any
+/// phi nodes we've seen to avoid reprocessing them.
+static bool AllUsesOfValueWillTrapIfNull(const Value *V,
+                                         SmallPtrSet<const PHINode*, 8> &PHIs) {
+  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
+       ++UI) {
+    const User *U = *UI;
+
+    if (isa<LoadInst>(U)) {
+      // Will trap.
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (SI->getOperand(0) == V) { + //cerr << "NONTRAPPING USE: " << *U; + return false; // Storing the value. + } + } else if (const CallInst *CI = dyn_cast<CallInst>(U)) { + if (CI->getCalledValue() != V) { + //cerr << "NONTRAPPING USE: " << *U; + return false; // Not calling the ptr + } + } else if (const InvokeInst *II = dyn_cast<InvokeInst>(U)) { + if (II->getCalledValue() != V) { + //cerr << "NONTRAPPING USE: " << *U; + return false; // Not calling the ptr + } + } else if (const BitCastInst *CI = dyn_cast<BitCastInst>(U)) { + if (!AllUsesOfValueWillTrapIfNull(CI, PHIs)) return false; + } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) { + if (!AllUsesOfValueWillTrapIfNull(GEPI, PHIs)) return false; + } else if (const PHINode *PN = dyn_cast<PHINode>(U)) { + // If we've already seen this phi node, ignore it, it has already been + // checked. + if (PHIs.insert(PN) && !AllUsesOfValueWillTrapIfNull(PN, PHIs)) + return false; + } else if (isa<ICmpInst>(U) && + isa<ConstantPointerNull>(UI->getOperand(1))) { + // Ignore icmp X, null + } else { + //cerr << "NONTRAPPING USE: " << *U; + return false; + } + } + return true; +} + +/// AllUsesOfLoadedValueWillTrapIfNull - Return true if all uses of any loads +/// from GV will trap if the loaded value is null. Note that this also permits +/// comparisons of the loaded value against null, as a special case. +static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) { + for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end(); + UI != E; ++UI) { + const User *U = *UI; + + if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { + SmallPtrSet<const PHINode*, 8> PHIs; + if (!AllUsesOfValueWillTrapIfNull(LI, PHIs)) + return false; + } else if (isa<StoreInst>(U)) { + // Ignore stores to the global. + } else { + // We don't know or understand this user, bail out. 
+      //cerr << "UNKNOWN USER OF GLOBAL!: " << *U;
+      return false;
+    }
+  }
+  return true;
+}
+
+/// OptimizeAwayTrappingUsesOfValue - V is a pointer known to be non-null at
+/// these uses; rewrite each use to refer to the constant NewV instead.  Loads
+/// and store addresses are redirected, indirect calls through V become direct
+/// calls to NewV, and casts/GEPs of V are handled recursively by building the
+/// corresponding constant expression.  Returns true if anything changed.
+static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
+  bool Changed = false;
+  // NOTE: uses are advanced before mutation (UI++) because rewriting a use
+  // removes it from V's use list.
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ) {
+    Instruction *I = cast<Instruction>(*UI++);
+    if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      LI->setOperand(0, NewV);
+      Changed = true;
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      // Only redirect the store *address*; storing V itself is left alone.
+      if (SI->getOperand(1) == V) {
+        SI->setOperand(1, NewV);
+        Changed = true;
+      }
+    } else if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+      CallSite CS(I);
+      if (CS.getCalledValue() == V) {
+        // Calling through the pointer!  Turn into a direct call, but be careful
+        // that the pointer is not also being passed as an argument.
+        CS.setCalledFunction(NewV);
+        Changed = true;
+        bool PassedAsArg = false;
+        for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+          if (CS.getArgument(i) == V) {
+            PassedAsArg = true;
+            CS.setArgument(i, NewV);
+          }
+
+        if (PassedAsArg) {
+          // Being passed as an argument also.  Be careful to not invalidate UI!
+          UI = V->use_begin();
+        }
+      }
+    } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+      // Recurse on the cast, substituting the equivalently-cast constant.
+      Changed |= OptimizeAwayTrappingUsesOfValue(CI,
+                                ConstantExpr::getCast(CI->getOpcode(),
+                                                      NewV, CI->getType()));
+      if (CI->use_empty()) {
+        Changed = true;
+        CI->eraseFromParent();
+      }
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+      // Should handle GEP here.
+      SmallVector<Constant*, 8> Idxs;
+      Idxs.reserve(GEPI->getNumOperands()-1);
+      for (User::op_iterator i = GEPI->op_begin() + 1, e = GEPI->op_end();
+           i != e; ++i)
+        if (Constant *C = dyn_cast<Constant>(*i))
+          Idxs.push_back(C);
+        else
+          break;
+      // Only recurse if every index was constant, so the replacement can be
+      // expressed as a GEP constant expression.
+      if (Idxs.size() == GEPI->getNumOperands()-1)
+        Changed |= OptimizeAwayTrappingUsesOfValue(GEPI,
+                          ConstantExpr::getGetElementPtr(NewV, &Idxs[0],
+                                                        Idxs.size()));
+      if (GEPI->use_empty()) {
+        Changed = true;
+        GEPI->eraseFromParent();
+      }
+    }
+  }
+
+  return Changed;
+}
+
+
+/// OptimizeAwayTrappingUsesOfLoads - The specified global has only one non-null
+/// value stored into it.  If there are uses of the loaded value that would trap
+/// if the loaded value is dynamically null, then we know that they cannot be
+/// reachable with a null pointer, and we can optimize away the load.
+static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV) {
+  bool Changed = false;
+
+  // Keep track of whether we are able to remove all the uses of the global
+  // other than the store that defines it.
+  bool AllNonStoreUsesGone = true;
+
+  // Replace all uses of loads with uses of uses of the stored value.
+  for (Value::use_iterator GUI = GV->use_begin(), E = GV->use_end(); GUI != E;){
+    User *GlobalUser = *GUI++;
+    if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) {
+      Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV);
+      // If we were able to delete all uses of the loads
+      if (LI->use_empty()) {
+        LI->eraseFromParent();
+        Changed = true;
+      } else {
+        AllNonStoreUsesGone = false;
+      }
+    } else if (isa<StoreInst>(GlobalUser)) {
+      // Ignore the store that stores "LV" to the global.
+      assert(GlobalUser->getOperand(1) == GV &&
+             "Must be storing *to* the global");
+    } else {
+      AllNonStoreUsesGone = false;
+
+      // If we get here we could have other crazy uses that are transitively
+      // loaded.
+      assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) ||
+              isa<ConstantExpr>(GlobalUser)) && "Only expect load and stores!");
+    }
+  }
+
+  if (Changed) {
+    DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV);
+    ++NumGlobUses;
+  }
+
+  // If we nuked all of the loads, then none of the stores are needed either,
+  // nor is the global.
+  if (AllNonStoreUsesGone) {
+    DEBUG(dbgs() << "  *** GLOBAL NOW DEAD!\n");
+    CleanupConstantGlobalUsers(GV, 0);
+    if (GV->use_empty()) {
+      GV->eraseFromParent();
+      ++NumDeleted;
+    }
+    Changed = true;
+  }
+  return Changed;
+}
+
+/// ConstantPropUsersOf - Walk the use list of V, constant folding all of the
+/// instructions that are foldable.
+static void ConstantPropUsersOf(Value *V) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; )
+    if (Instruction *I = dyn_cast<Instruction>(*UI++))
+      if (Constant *NewC = ConstantFoldInstruction(I)) {
+        I->replaceAllUsesWith(NewC);
+
+        // Advance UI to the next non-I use to avoid invalidating it!
+        // Instructions could multiply use V.
+        while (UI != E && *UI == I)
+          ++UI;
+        I->eraseFromParent();
+      }
+}
+
+/// OptimizeGlobalAddressOfMalloc - This function takes the specified global
+/// variable, and transforms the program as if it always contained the result of
+/// the specified malloc.  Because it is always the result of the specified
+/// malloc, there is no reason to actually DO the malloc.  Instead, turn the
+/// malloc into a global, and any loads of GV as uses of the new global.
+static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
+                                                     CallInst *CI,
+                                                     const Type *AllocTy,
+                                                     ConstantInt *NElements,
+                                                     TargetData* TD) {
+  DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << "  CALL = " << *CI << '\n');
+
+  const Type *GlobalType;
+  if (NElements->getZExtValue() == 1)
+    GlobalType = AllocTy;
+  else
+    // If we have an array allocation, the global variable is of an array.
+    GlobalType = ArrayType::get(AllocTy, NElements->getZExtValue());
+
+  // Create the new global variable.  The contents of the malloc'd memory is
+  // undefined, so initialize with an undef value.
+  GlobalVariable *NewGV = new GlobalVariable(*GV->getParent(),
+                                             GlobalType, false,
+                                             GlobalValue::InternalLinkage,
+                                             UndefValue::get(GlobalType),
+                                             GV->getName()+".body",
+                                             GV,
+                                             GV->isThreadLocal());
+
+  // If there are bitcast users of the malloc (which is typical, usually we have
+  // a malloc + bitcast) then replace them with uses of the new global.  Update
+  // other users to use the global as well.
+  BitCastInst *TheBC = 0;
+  while (!CI->use_empty()) {
+    Instruction *User = cast<Instruction>(CI->use_back());
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
+      if (BCI->getType() == NewGV->getType()) {
+        BCI->replaceAllUsesWith(NewGV);
+        BCI->eraseFromParent();
+      } else {
+        BCI->setOperand(0, NewGV);
+      }
+    } else {
+      // Lazily create a single bitcast of the new global to the malloc's
+      // result type, shared by all non-bitcast users.
+      if (TheBC == 0)
+        TheBC = new BitCastInst(NewGV, CI->getType(), "newgv", CI);
+      User->replaceUsesOfWith(CI, TheBC);
+    }
+  }
+
+  Constant *RepValue = NewGV;
+  if (NewGV->getType() != GV->getType()->getElementType())
+    RepValue = ConstantExpr::getBitCast(RepValue,
+                                        GV->getType()->getElementType());
+
+  // If there is a comparison against null, we will insert a global bool to
+  // keep track of whether the global was initialized yet or not.
+  GlobalVariable *InitBool =
+    new GlobalVariable(Type::getInt1Ty(GV->getContext()), false,
+                       GlobalValue::InternalLinkage,
+                       ConstantInt::getFalse(GV->getContext()),
+                       GV->getName()+".init", GV->isThreadLocal());
+  bool InitBoolUsed = false;
+
+  // Loop over all uses of GV, processing them in turn.
+  while (!GV->use_empty()) {
+    if (StoreInst *SI = dyn_cast<StoreInst>(GV->use_back())) {
+      // The global is initialized when the store to it occurs.
+      new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, SI);
+      SI->eraseFromParent();
+      continue;
+    }
+
+    LoadInst *LI = cast<LoadInst>(GV->use_back());
+    while (!LI->use_empty()) {
+      Use &LoadUse = LI->use_begin().getUse();
+      if (!isa<ICmpInst>(LoadUse.getUser())) {
+        // A non-compare use of the loaded pointer just becomes a use of the
+        // new global itself.
+        LoadUse = RepValue;
+        continue;
+      }
+
+      ICmpInst *ICI = cast<ICmpInst>(LoadUse.getUser());
+      // Replace the cmp X, 0 with a use of the bool value.
+      Value *LV = new LoadInst(InitBool, InitBool->getName()+".val", ICI);
+      InitBoolUsed = true;
+      switch (ICI->getPredicate()) {
+      default: llvm_unreachable("Unknown ICmp Predicate!");
+      case ICmpInst::ICMP_ULT:
+      case ICmpInst::ICMP_SLT:   // X < null -> always false
+        LV = ConstantInt::getFalse(GV->getContext());
+        break;
+      case ICmpInst::ICMP_ULE:
+      case ICmpInst::ICMP_SLE:
+      case ICmpInst::ICMP_EQ:
+        // X <= null / X == null -> "not initialized yet".
+        LV = BinaryOperator::CreateNot(LV, "notinit", ICI);
+        break;
+      case ICmpInst::ICMP_NE:
+      case ICmpInst::ICMP_UGE:
+      case ICmpInst::ICMP_SGE:
+      case ICmpInst::ICMP_UGT:
+      case ICmpInst::ICMP_SGT:
+        break;  // no change.
+      }
+      ICI->replaceAllUsesWith(LV);
+      ICI->eraseFromParent();
+    }
+    LI->eraseFromParent();
+  }
+
+  // If the initialization boolean was used, insert it, otherwise delete it.
+  if (!InitBoolUsed) {
+    while (!InitBool->use_empty())  // Delete initializations
+      cast<StoreInst>(InitBool->use_back())->eraseFromParent();
+    delete InitBool;
+  } else
+    GV->getParent()->getGlobalList().insert(GV, InitBool);
+
+  // Now the GV is dead, nuke it and the malloc..
+  GV->eraseFromParent();
+  CI->eraseFromParent();
+
+  // To further other optimizations, loop over all users of NewGV and try to
+  // constant prop them.  This will promote GEP instructions with constant
+  // indices into GEP constant-exprs, which will allow global-opt to hack on it.
+  ConstantPropUsersOf(NewGV);
+  if (RepValue != NewGV)
+    ConstantPropUsersOf(RepValue);
+
+  return NewGV;
+}
+
+/// ValueIsOnlyUsedLocallyOrStoredToOneGlobal - Scan the use-list of V checking
+/// to make sure that there are no complex uses of V.  We permit simple things
+/// like dereferencing the pointer, but not storing through the address, unless
+/// it is to the specified global.
+static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V,
+                                                      const GlobalVariable *GV,
+                                         SmallPtrSet<const PHINode*, 8> &PHIs) {
+  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end();
+       UI != E; ++UI) {
+    const Instruction *Inst = cast<Instruction>(*UI);
+
+    if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) {
+      continue; // Fine, ignore.
+    }
+
+    if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      if (SI->getOperand(0) == V && SI->getOperand(1) != GV)
+        return false;  // Storing the pointer itself... bad.
+      continue; // Otherwise, storing through it, or storing into GV... fine.
+    }
+
+    // Must index into the array and into the struct.
+    if (isa<GetElementPtrInst>(Inst) && Inst->getNumOperands() >= 3) {
+      if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Inst, GV, PHIs))
+        return false;
+      continue;
+    }
+
+    if (const PHINode *PN = dyn_cast<PHINode>(Inst)) {
+      // PHIs are ok if all uses are ok.  Don't infinitely recurse through PHI
+      // cycles.
+      if (PHIs.insert(PN))
+        if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(PN, GV, PHIs))
+          return false;
+      continue;
+    }
+
+    if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) {
+      if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV, PHIs))
+        return false;
+      continue;
+    }
+
+    // Any other kind of user is too complex to reason about.
+    return false;
+  }
+  return true;
+}
+
+/// ReplaceUsesOfMallocWithGlobal - The Alloc pointer is stored into GV
+/// somewhere.  Transform all uses of the allocation into loads from the
+/// global and uses of the resultant pointer.  Further, delete the store into
+/// GV.  This assumes that these values pass the
+/// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate.
+static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc,
+                                          GlobalVariable *GV) {
+  while (!Alloc->use_empty()) {
+    Instruction *U = cast<Instruction>(*Alloc->use_begin());
+    Instruction *InsertPt = U;
+    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      // If this is the store of the allocation into the global, remove it.
+      if (SI->getOperand(1) == GV) {
+        SI->eraseFromParent();
+        continue;
+      }
+    } else if (PHINode *PN = dyn_cast<PHINode>(U)) {
+      // Insert the load in the corresponding predecessor, not right before the
+      // PHI.
+      InsertPt = PN->getIncomingBlock(Alloc->use_begin())->getTerminator();
+    } else if (isa<BitCastInst>(U)) {
+      // Must be bitcast between the malloc and store to initialize the global.
+      ReplaceUsesOfMallocWithGlobal(U, GV);
+      U->eraseFromParent();
+      continue;
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+      // If this is a "GEP bitcast" and the user is a store to the global, then
+      // just process it as a bitcast.
+      if (GEPI->hasAllZeroIndices() && GEPI->hasOneUse())
+        if (StoreInst *SI = dyn_cast<StoreInst>(GEPI->use_back()))
+          if (SI->getOperand(1) == GV) {
+            // Must be bitcast GEP between the malloc and store to initialize
+            // the global.
+            ReplaceUsesOfMallocWithGlobal(GEPI, GV);
+            GEPI->eraseFromParent();
+            continue;
+          }
+    }
+
+    // Insert a load from the global, and use it instead of the malloc.
+    Value *NL = new LoadInst(GV, GV->getName()+".val", InsertPt);
+    U->replaceUsesOfWith(Alloc, NL);
+  }
+}
+
+/// LoadUsesSimpleEnoughForHeapSRA - Verify that all uses of V (a load, or a phi
+/// of a load) are simple enough to perform heap SRA on.  This permits GEP's
+/// that index through the array and struct field, icmps of null, and PHIs.
+static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V,
+                 SmallPtrSet<const PHINode*, 32> &LoadUsingPHIs,
+                 SmallPtrSet<const PHINode*, 32> &LoadUsingPHIsPerLoad) {
+  // We permit two users of the load: setcc comparing against the null
+  // pointer, and a getelementptr of a specific form.
+  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
+       ++UI) {
+    const Instruction *User = cast<Instruction>(*UI);
+
+    // Comparison against null is ok.
+    if (const ICmpInst *ICI = dyn_cast<ICmpInst>(User)) {
+      if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
+        return false;
+      continue;
+    }
+
+    // getelementptr is also ok, but only a simple form.
+    if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+      // Must index into the array and into the struct.
+      if (GEPI->getNumOperands() < 3)
+        return false;
+
+      // Otherwise the GEP is ok.
+      continue;
+    }
+
+    if (const PHINode *PN = dyn_cast<PHINode>(User)) {
+      if (!LoadUsingPHIsPerLoad.insert(PN))
+        // This means some phi nodes are dependent on each other.
+        // Avoid infinite looping!
+        return false;
+      if (!LoadUsingPHIs.insert(PN))
+        // If we have already analyzed this PHI, then it is safe.
+        continue;
+
+      // Make sure all uses of the PHI are simple enough to transform.
+      if (!LoadUsesSimpleEnoughForHeapSRA(PN,
+                                          LoadUsingPHIs, LoadUsingPHIsPerLoad))
+        return false;
+
+      continue;
+    }
+
+    // Otherwise we don't know what this is, not ok.
+    return false;
+  }
+
+  return true;
+}
+
+
+/// AllGlobalLoadUsesSimpleEnoughForHeapSRA - If all users of values loaded from
+/// GV are simple enough to perform HeapSRA, return true.
+static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV,
+                                                    Instruction *StoredVal) {
+  SmallPtrSet<const PHINode*, 32> LoadUsingPHIs;
+  SmallPtrSet<const PHINode*, 32> LoadUsingPHIsPerLoad;
+  for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end();
+       UI != E; ++UI)
+    if (const LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+      if (!LoadUsesSimpleEnoughForHeapSRA(LI, LoadUsingPHIs,
+                                          LoadUsingPHIsPerLoad))
+        return false;
+      // The per-load PHI set is reset between loads; LoadUsingPHIs
+      // accumulates across all of them.
+      LoadUsingPHIsPerLoad.clear();
+    }
+
+  // If we reach here, we know that all uses of the loads and transitive uses
+  // (through PHI nodes) are simple enough to transform.  However, we don't know
+  // that all inputs to the PHI nodes are in the same equivalence sets.
+  // Check to verify that all operands of the PHIs are either PHIS that can be
+  // transformed, loads from GV, or MI itself.
+  for (SmallPtrSet<const PHINode*, 32>::const_iterator I = LoadUsingPHIs.begin()
+       , E = LoadUsingPHIs.end(); I != E; ++I) {
+    const PHINode *PN = *I;
+    for (unsigned op = 0, e = PN->getNumIncomingValues(); op != e; ++op) {
+      Value *InVal = PN->getIncomingValue(op);
+
+      // PHI of the stored value itself is ok.
+      if (InVal == StoredVal) continue;
+
+      if (const PHINode *InPN = dyn_cast<PHINode>(InVal)) {
+        // One of the PHIs in our set is (optimistically) ok.
+        if (LoadUsingPHIs.count(InPN))
+          continue;
+        return false;
+      }
+
+      // Load from GV is ok.
+      if (const LoadInst *LI = dyn_cast<LoadInst>(InVal))
+        if (LI->getOperand(0) == GV)
+          continue;
+
+      // UNDEF? NULL?
+
+      // Anything else is rejected.
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/// GetHeapSROAValue - Return (creating it lazily if needed) the scalarized
+/// per-field value corresponding to V's field FieldNo.  V must be a load from
+/// the global being SRoA'd or a PHI of such values.
+static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
+               DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+                   std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+  std::vector<Value*> &FieldVals = InsertedScalarizedValues[V];
+
+  if (FieldNo >= FieldVals.size())
+    FieldVals.resize(FieldNo+1);
+
+  // If we already have this value, just reuse the previously scalarized
+  // version.
+  if (Value *FieldVal = FieldVals[FieldNo])
+    return FieldVal;
+
+  // Depending on what instruction this is, we have several cases.
+  Value *Result;
+  if (LoadInst *LI = dyn_cast<LoadInst>(V)) {
+    // This is a scalarized version of the load from the global.  Just create
+    // a new Load of the scalarized global.
+    Result = new LoadInst(GetHeapSROAValue(LI->getOperand(0), FieldNo,
+                                           InsertedScalarizedValues,
+                                           PHIsToRewrite),
+                          LI->getName()+".f"+Twine(FieldNo), LI);
+  } else if (PHINode *PN = dyn_cast<PHINode>(V)) {
+    // PN's type is pointer to struct.  Make a new PHI of pointer to struct
+    // field.
+    const StructType *ST =
+      cast<StructType>(cast<PointerType>(PN->getType())->getElementType());
+
+    // Incoming values are filled in later when PHIsToRewrite is drained.
+    Result =
+     PHINode::Create(PointerType::getUnqual(ST->getElementType(FieldNo)),
+                     PN->getName()+".f"+Twine(FieldNo), PN);
+    PHIsToRewrite.push_back(std::make_pair(PN, FieldNo));
+  } else {
+    llvm_unreachable("Unknown usable value");
+    Result = 0;
+  }
+
+  return FieldVals[FieldNo] = Result;
+}
+
+/// RewriteHeapSROALoadUser - Given a load instruction and a value derived from
+/// the load, rewrite the derived value to use the HeapSRoA'd load.
+static void RewriteHeapSROALoadUser(Instruction *LoadUser,
+             DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+                   std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+  // If this is a comparison against null, handle it.
+  if (ICmpInst *SCI = dyn_cast<ICmpInst>(LoadUser)) {
+    assert(isa<ConstantPointerNull>(SCI->getOperand(1)));
+    // If we have a setcc of the loaded pointer, we can use a setcc of any
+    // field.
+    Value *NPtr = GetHeapSROAValue(SCI->getOperand(0), 0,
+                                   InsertedScalarizedValues, PHIsToRewrite);
+
+    Value *New = new ICmpInst(SCI, SCI->getPredicate(), NPtr,
+                              Constant::getNullValue(NPtr->getType()),
+                              SCI->getName());
+    SCI->replaceAllUsesWith(New);
+    SCI->eraseFromParent();
+    return;
+  }
+
+  // Handle 'getelementptr Ptr, Idx, i32 FieldNo ...'
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(LoadUser)) {
+    assert(GEPI->getNumOperands() >= 3 && isa<ConstantInt>(GEPI->getOperand(2))
+           && "Unexpected GEPI!");
+
+    // Load the pointer for this field.
+    unsigned FieldNo = cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue();
+    Value *NewPtr = GetHeapSROAValue(GEPI->getOperand(0), FieldNo,
+                                     InsertedScalarizedValues, PHIsToRewrite);
+
+    // Create the new GEP idx vector.  The struct-field index (operand 2) is
+    // dropped: the per-field global already points at that field's array.
+    SmallVector<Value*, 8> GEPIdx;
+    GEPIdx.push_back(GEPI->getOperand(1));
+    GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end());
+
+    Value *NGEPI = GetElementPtrInst::Create(NewPtr,
+                                             GEPIdx.begin(), GEPIdx.end(),
+                                             GEPI->getName(), GEPI);
+    GEPI->replaceAllUsesWith(NGEPI);
+    GEPI->eraseFromParent();
+    return;
+  }
+
+  // Recursively transform the users of PHI nodes.  This will lazily create the
+  // PHIs that are needed for individual elements.  Keep track of what PHIs we
+  // see in InsertedScalarizedValues so that we don't get infinite loops (very
+  // antisocial).  If the PHI is already in InsertedScalarizedValues, it has
+  // already been seen first by another load, so its uses have already been
+  // processed.
+  PHINode *PN = cast<PHINode>(LoadUser);
+  bool Inserted;
+  DenseMap<Value*, std::vector<Value*> >::iterator InsertPos;
+  tie(InsertPos, Inserted) =
+    InsertedScalarizedValues.insert(std::make_pair(PN, std::vector<Value*>()));
+  if (!Inserted) return;
+
+  // If this is the first time we've seen this PHI, recursively process all
+  // users.
+  for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); UI != E; ) {
+    Instruction *User = cast<Instruction>(*UI++);
+    RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
+  }
+}
+
+/// RewriteUsesOfLoadForHeapSRoA - We are performing Heap SRoA on a global.  Ptr
+/// is a value loaded from the global.  Eliminate all uses of Ptr, making them
+/// use FieldGlobals instead.  All uses of loaded values satisfy
+/// AllGlobalLoadUsesSimpleEnoughForHeapSRA.
+static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
+               DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+                   std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+  for (Value::use_iterator UI = Load->use_begin(), E = Load->use_end();
+       UI != E; ) {
+    Instruction *User = cast<Instruction>(*UI++);
+    RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
+  }
+
+  // If every use was rewritten, the original load is dead; drop it and its
+  // bookkeeping entry.
+  if (Load->use_empty()) {
+    Load->eraseFromParent();
+    InsertedScalarizedValues.erase(Load);
+  }
+}
+
+/// PerformHeapAllocSRoA - CI is an allocation of an array of structures.  Break
+/// it up into multiple allocations of arrays of the fields.
+static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
+                                            Value* NElems, TargetData *TD) {
+  DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << "  MALLOC = " << *CI << '\n');
+  const Type* MAT = getMallocAllocatedType(CI);
+  const StructType *STy = cast<StructType>(MAT);
+
+  // There is guaranteed to be at least one use of the malloc (storing
+  // it into GV).  If there are other uses, change them to be uses of
+  // the global to simplify later code.  This also deletes the store
+  // into GV.
+  ReplaceUsesOfMallocWithGlobal(CI, GV);
+
+  // Okay, at this point, there are no users of the malloc.  Insert N
+  // new mallocs at the same place as CI, and N globals.
+  std::vector<Value*> FieldGlobals;
+  std::vector<Value*> FieldMallocs;
+
+  for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){
+    const Type *FieldTy = STy->getElementType(FieldNo);
+    const PointerType *PFieldTy = PointerType::getUnqual(FieldTy);
+
+    // One global per struct field, holding the pointer to that field's
+    // malloc'd array; initialized to null until the mallocs run.
+    GlobalVariable *NGV =
+      new GlobalVariable(*GV->getParent(),
+                         PFieldTy, false, GlobalValue::InternalLinkage,
+                         Constant::getNullValue(PFieldTy),
+                         GV->getName() + ".f" + Twine(FieldNo), GV,
+                         GV->isThreadLocal());
+    FieldGlobals.push_back(NGV);
+
+    unsigned TypeSize = TD->getTypeAllocSize(FieldTy);
+    if (const StructType *ST = dyn_cast<StructType>(FieldTy))
+      TypeSize = TD->getStructLayout(ST)->getSizeInBytes();
+    const Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
+    Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
+                                        ConstantInt::get(IntPtrTy, TypeSize),
+                                        NElems, 0,
+                                        CI->getName() + ".f" + Twine(FieldNo));
+    FieldMallocs.push_back(NMI);
+    new StoreInst(NMI, NGV, CI);
+  }
+
+  // The tricky aspect of this transformation is handling the case when malloc
+  // fails.  In the original code, malloc failing would set the result pointer
+  // of malloc to null.  In this case, some mallocs could succeed and others
+  // could fail.  As such, we emit code that looks like this:
+  //    F0 = malloc(field0)
+  //    F1 = malloc(field1)
+  //    F2 = malloc(field2)
+  //    if (F0 == 0 || F1 == 0 || F2 == 0) {
+  //      if (F0) { free(F0); F0 = 0; }
+  //      if (F1) { free(F1); F1 = 0; }
+  //      if (F2) { free(F2); F2 = 0; }
+  //    }
+  // The malloc can also fail if its argument is too large.
+  Constant *ConstantZero = ConstantInt::get(CI->getArgOperand(0)->getType(), 0);
+  // Seed the "any failure" OR-chain with an "argument is negative" check.
+  Value *RunningOr = new ICmpInst(CI, ICmpInst::ICMP_SLT, CI->getArgOperand(0),
+                                  ConstantZero, "isneg");
+  for (unsigned i = 0, e = FieldMallocs.size(); i != e; ++i) {
+    Value *Cond = new ICmpInst(CI, ICmpInst::ICMP_EQ, FieldMallocs[i],
+                             Constant::getNullValue(FieldMallocs[i]->getType()),
+                               "isnull");
+    RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", CI);
+  }
+
+  // Split the basic block at the old malloc.
+  BasicBlock *OrigBB = CI->getParent();
+  BasicBlock *ContBB = OrigBB->splitBasicBlock(CI, "malloc_cont");
+
+  // Create the block to check the first condition.  Put all these blocks at the
+  // end of the function as they are unlikely to be executed.
+  BasicBlock *NullPtrBlock = BasicBlock::Create(OrigBB->getContext(),
+                                                "malloc_ret_null",
+                                                OrigBB->getParent());
+
+  // Remove the uncond branch from OrigBB to ContBB, turning it into a cond
+  // branch on RunningOr.
+  OrigBB->getTerminator()->eraseFromParent();
+  BranchInst::Create(NullPtrBlock, ContBB, RunningOr, OrigBB);
+
+  // Within the NullPtrBlock, we need to emit a comparison and branch for each
+  // pointer, because some may be null while others are not.
+  for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
+    Value *GVVal = new LoadInst(FieldGlobals[i], "tmp", NullPtrBlock);
+    Value *Cmp = new ICmpInst(*NullPtrBlock, ICmpInst::ICMP_NE, GVVal,
+                              Constant::getNullValue(GVVal->getType()),
+                              "tmp");
+    BasicBlock *FreeBlock = BasicBlock::Create(Cmp->getContext(), "free_it",
+                                               OrigBB->getParent());
+    BasicBlock *NextBlock = BasicBlock::Create(Cmp->getContext(), "next",
+                                               OrigBB->getParent());
+    Instruction *BI = BranchInst::Create(FreeBlock, NextBlock,
+                                         Cmp, NullPtrBlock);
+
+    // Fill in FreeBlock.
+    CallInst::CreateFree(GVVal, BI);
+    new StoreInst(Constant::getNullValue(GVVal->getType()), FieldGlobals[i],
+                  FreeBlock);
+    BranchInst::Create(NextBlock, FreeBlock);
+
+    // Chain the next field's check off this one's fall-through block.
+    NullPtrBlock = NextBlock;
+  }
+
+  BranchInst::Create(ContBB, NullPtrBlock);
+
+  // CI is no longer needed, remove it.
+  CI->eraseFromParent();
+
+  /// InsertedScalarizedLoads - As we process loads, if we can't immediately
+  /// update all uses of the load, keep track of what scalarized loads are
+  /// inserted for a given load.
+  DenseMap<Value*, std::vector<Value*> > InsertedScalarizedValues;
+  InsertedScalarizedValues[GV] = FieldGlobals;
+
+  std::vector<std::pair<PHINode*, unsigned> > PHIsToRewrite;
+
+  // Okay, the malloc site is completely handled.  All of the uses of GV are now
+  // loads, and all uses of those loads are simple.  Rewrite them to use loads
+  // of the per-field globals instead.
+  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E;) {
+    Instruction *User = cast<Instruction>(*UI++);
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      RewriteUsesOfLoadForHeapSRoA(LI, InsertedScalarizedValues, PHIsToRewrite);
+      continue;
+    }
+
+    // Must be a store of null.
+    StoreInst *SI = cast<StoreInst>(User);
+    assert(isa<ConstantPointerNull>(SI->getOperand(0)) &&
+           "Unexpected heap-sra user!");
+
+    // Insert a store of null into each global.
+    for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
+      const PointerType *PT = cast<PointerType>(FieldGlobals[i]->getType());
+      Constant *Null = Constant::getNullValue(PT->getElementType());
+      new StoreInst(Null, FieldGlobals[i], SI);
+    }
+    // Erase the original store.
+    SI->eraseFromParent();
+  }
+
+  // While we have PHIs that are interesting to rewrite, do it.
+  while (!PHIsToRewrite.empty()) {
+    PHINode *PN = PHIsToRewrite.back().first;
+    unsigned FieldNo = PHIsToRewrite.back().second;
+    PHIsToRewrite.pop_back();
+    PHINode *FieldPN = cast<PHINode>(InsertedScalarizedValues[PN][FieldNo]);
+    assert(FieldPN->getNumIncomingValues() == 0 &&"Already processed this phi");
+
+    // Add all the incoming values.  This can materialize more phis.
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      Value *InVal = PN->getIncomingValue(i);
+      InVal = GetHeapSROAValue(InVal, FieldNo, InsertedScalarizedValues,
+                               PHIsToRewrite);
+      FieldPN->addIncoming(InVal, PN->getIncomingBlock(i));
+    }
+  }
+
+  // Drop all inter-phi links and any loads that made it this far.
+  for (DenseMap<Value*, std::vector<Value*> >::iterator
+       I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
+       I != E; ++I) {
+    if (PHINode *PN = dyn_cast<PHINode>(I->first))
+      PN->dropAllReferences();
+    else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
+      LI->dropAllReferences();
+  }
+
+  // Delete all the phis and loads now that inter-references are dead.
+  for (DenseMap<Value*, std::vector<Value*> >::iterator
+       I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
+       I != E; ++I) {
+    if (PHINode *PN = dyn_cast<PHINode>(I->first))
+      PN->eraseFromParent();
+    else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
+      LI->eraseFromParent();
+  }
+
+  // The old global is now dead, remove it.
+  GV->eraseFromParent();
+
+  ++NumHeapSRA;
+  return cast<GlobalVariable>(FieldGlobals[0]);
+}
+
+/// TryToOptimizeStoreOfMallocToGlobal - This function is called when we see a
+/// pointer global variable with a single value stored into it that is a malloc
+/// or cast of malloc.
+static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
+                                               CallInst *CI,
+                                               const Type *AllocTy,
+                                               Module::global_iterator &GVI,
+                                               TargetData *TD) {
+  // Without target data we cannot reason about allocation sizes.
+  if (!TD)
+    return false;
+
+  // If this is a malloc of an abstract type, don't touch it.
+ if (!AllocTy->isSized()) + return false; + + // We can't optimize this global unless all uses of it are *known* to be + // of the malloc value, not of the null initializer value (consider a use + // that compares the global's value against zero to see if the malloc has + // been reached). To do this, we check to see if all uses of the global + // would trap if the global were null: this proves that they must all + // happen after the malloc. + if (!AllUsesOfLoadedValueWillTrapIfNull(GV)) + return false; + + // We can't optimize this if the malloc itself is used in a complex way, + // for example, being stored into multiple globals. This allows the + // malloc to be stored into the specified global, loaded setcc'd, and + // GEP'd. These are all things we could transform to using the global + // for. + SmallPtrSet<const PHINode*, 8> PHIs; + if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV, PHIs)) + return false; + + // If we have a global that is only initialized with a fixed size malloc, + // transform the program to use global memory instead of malloc'd memory. + // This eliminates dynamic allocation, avoids an indirection accessing the + // data, and exposes the resultant global to further GlobalOpt. + // We cannot optimize the malloc if we cannot determine malloc array size. + Value *NElems = getMallocArraySize(CI, TD, true); + if (!NElems) + return false; + + if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems)) + // Restrict this transformation to only working on small allocations + // (2048 bytes currently), as we don't want to introduce a 16M global or + // something. + if (NElements->getZExtValue() * TD->getTypeAllocSize(AllocTy) < 2048) { + GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, TD); + return true; + } + + // If the allocation is an array of structures, consider transforming this + // into multiple malloc'd arrays, one for each field. This is basically + // SRoA for malloc'd memory. 
+ + // If this is an allocation of a fixed size array of structs, analyze as a + // variable size array. malloc [100 x struct],1 -> malloc struct, 100 + if (NElems == ConstantInt::get(CI->getArgOperand(0)->getType(), 1)) + if (const ArrayType *AT = dyn_cast<ArrayType>(AllocTy)) + AllocTy = AT->getElementType(); + + const StructType *AllocSTy = dyn_cast<StructType>(AllocTy); + if (!AllocSTy) + return false; + + // This the structure has an unreasonable number of fields, leave it + // alone. + if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 && + AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, CI)) { + + // If this is a fixed size array, transform the Malloc to be an alloc of + // structs. malloc [100 x struct],1 -> malloc struct, 100 + if (const ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI))) { + const Type *IntPtrTy = TD->getIntPtrType(CI->getContext()); + unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes(); + Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize); + Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements()); + Instruction *Malloc = CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, + AllocSize, NumElements, + 0, CI->getName()); + Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI); + CI->replaceAllUsesWith(Cast); + CI->eraseFromParent(); + CI = dyn_cast<BitCastInst>(Malloc) ? + extractMallocCallFromBitCast(Malloc) : cast<CallInst>(Malloc); + } + + GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, true),TD); + return true; + } + + return false; +} + +// OptimizeOnceStoredGlobal - Try to optimize globals based on the knowledge +// that only one value (besides its initializer) is ever stored to the global. +static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, + Module::global_iterator &GVI, + TargetData *TD) { + // Ignore no-op GEPs and bitcasts. 
+  StoredOnceVal = StoredOnceVal->stripPointerCasts();
+
+  // If we are dealing with a pointer global that is initialized to null and
+  // only has one (non-null) value stored into it, then we can optimize any
+  // users of the loaded value (often calls and loads) that would trap if the
+  // value was null.
+  if (GV->getInitializer()->getType()->isPointerTy() &&
+      GV->getInitializer()->isNullValue()) {
+    if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) {
+      if (GV->getInitializer()->getType() != SOVC->getType())
+        SOVC =
+         ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
+
+      // Optimize away any trapping uses of the loaded value.
+      if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC))
+        return true;
+    } else if (CallInst *CI = extractMallocCall(StoredOnceVal)) {
+      const Type* MallocType = getMallocAllocatedType(CI);
+      if (MallocType && TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType,
+                                                           GVI, TD))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+/// TryToShrinkGlobalToBoolean - At this point, we have learned that the only
+/// two values ever stored into GV are its initializer and OtherVal. See if we
+/// can shrink the global into a boolean and select between the two values
+/// whenever it is used. This exposes the values to other scalar optimizations.
+static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
+  const Type *GVElType = GV->getType()->getElementType();
+
+  // If GVElType is already i1, it is already shrunk. If the type of the GV is
+  // an FP value, pointer or vector, don't do this optimization because a select
+  // between them is very expensive and unlikely to lead to later
+  // simplification. In these cases, we typically end up with "cond ? v1 : v2"
+  // where v1 and v2 both require constant pool loads, a big loss.
+  if (GVElType == Type::getInt1Ty(GV->getContext()) ||
+      GVElType->isFloatingPointTy() ||
+      GVElType->isPointerTy() || GVElType->isVectorTy())
+    return false;
+
+  // Walk the use list of the global seeing if all the uses are load or store.
+  // If there is anything else, bail out.
+  for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E; ++I){
+    User *U = *I;
+    if (!isa<LoadInst>(U) && !isa<StoreInst>(U))
+      return false;
+  }
+
+  DEBUG(dbgs() << "   *** SHRINKING TO BOOL: " << *GV);
+
+  // Create the new global, initializing it to false.
+  GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()),
+                                             false,
+                                             GlobalValue::InternalLinkage,
+                                        ConstantInt::getFalse(GV->getContext()),
+                                             GV->getName()+".b",
+                                             GV->isThreadLocal());
+  // Insert the boolean global immediately before the original so module
+  // ordering is preserved.
+  GV->getParent()->getGlobalList().insert(GV, NewGV);
+
+  Constant *InitVal = GV->getInitializer();
+  assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) &&
+         "No reason to shrink to bool!");
+
+  // If initialized to zero and storing one into the global, we can use a cast
+  // instead of a select to synthesize the desired value.
+  bool IsOneZero = false;
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal))
+    IsOneZero = InitVal->isNullValue() && CI->isOne();
+
+  while (!GV->use_empty()) {
+    Instruction *UI = cast<Instruction>(GV->use_back());
+    if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
+      // Change the store into a boolean store.
+      bool StoringOther = SI->getOperand(0) == OtherVal;
+      // Only do this if we weren't storing a loaded value.
+      Value *StoreVal;
+      if (StoringOther || SI->getOperand(0) == InitVal)
+        StoreVal = ConstantInt::get(Type::getInt1Ty(GV->getContext()),
+                                    StoringOther);
+      else {
+        // Otherwise, we are storing a previously loaded copy. To do this,
+        // change the copy from copying the original value to just copying the
+        // bool.
+        Instruction *StoredVal = cast<Instruction>(SI->getOperand(0));
+
+        // If we've already replaced the input, StoredVal will be a cast or
+        // select instruction. If not, it will be a load of the original
+        // global.
+        if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
+          assert(LI->getOperand(0) == GV && "Not a copy!");
+          // Insert a new load, to preserve the saved value.
+          StoreVal = new LoadInst(NewGV, LI->getName()+".b", LI);
+        } else {
+          assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) &&
+                 "This is not a form that we understand!");
+          StoreVal = StoredVal->getOperand(0);
+          assert(isa<LoadInst>(StoreVal) && "Not a load of NewGV!");
+        }
+      }
+      new StoreInst(StoreVal, NewGV, SI);
+    } else {
+      // Change the load into a load of bool then a select.
+      LoadInst *LI = cast<LoadInst>(UI);
+      LoadInst *NLI = new LoadInst(NewGV, LI->getName()+".b", LI);
+      Value *NSI;
+      if (IsOneZero)
+        // 0/1 case: a zext of the bool reproduces the original value.
+        NSI = new ZExtInst(NLI, LI->getType(), "", LI);
+      else
+        NSI = SelectInst::Create(NLI, OtherVal, InitVal, "", LI);
+      NSI->takeName(LI);
+      LI->replaceAllUsesWith(NSI);
+    }
+    UI->eraseFromParent();
+  }
+
+  GV->eraseFromParent();
+  return true;
+}
+
+
+/// ProcessInternalGlobal - Analyze the specified global variable and optimize
+/// it if possible. If we make a change, return true.
+bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
+                              Module::global_iterator &GVI) {
+  if (!GV->hasLocalLinkage())
+    return false;
+
+  // Do more involved optimizations if the global is internal.
+  GV->removeDeadConstantUsers();
+
+  if (GV->use_empty()) {
+    DEBUG(dbgs() << "GLOBAL DEAD: " << *GV);
+    GV->eraseFromParent();
+    ++NumDeleted;
+    return true;
+  }
+
+  SmallPtrSet<const PHINode*, 16> PHIUsers;
+  GlobalStatus GS;
+
+  if (AnalyzeGlobal(GV, GS, PHIUsers))
+    return false;
+
+  // If no use compares the global's address, the address is not significant
+  // and we can mark it unnamed_addr.
+  if (!GS.isCompared && !GV->hasUnnamedAddr()) {
+    GV->setUnnamedAddr(true);
+    NumUnnamed++;
+  }
+
+  if (GV->isConstant() || !GV->hasInitializer())
+    return false;
+
+  return ProcessInternalGlobal(GV, GVI, PHIUsers, GS);
+}
+
+/// ProcessInternalGlobal - Analyze the specified global variable and optimize
+/// it if possible. If we make a change, return true.
+bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
+                                      Module::global_iterator &GVI,
+                       const SmallPtrSet<const PHINode*, 16> &PHIUsers,
+                                      const GlobalStatus &GS) {
+  // If this is a first class global and has only one accessing function
+  // and this function is main (which we know is not recursive we can make
+  // this global a local variable) we replace the global with a local alloca
+  // in this function.
+  //
+  // NOTE: It doesn't make sense to promote non single-value types since we
+  // are just replacing static memory to stack memory.
+  //
+  // If the global is in different address space, don't bring it to stack.
+  if (!GS.HasMultipleAccessingFunctions &&
+      GS.AccessingFunction && !GS.HasNonInstructionUser &&
+      GV->getType()->getElementType()->isSingleValueType() &&
+      GS.AccessingFunction->getName() == "main" &&
+      GS.AccessingFunction->hasExternalLinkage() &&
+      GV->getType()->getAddressSpace() == 0) {
+    DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV);
+    Instruction& FirstI = const_cast<Instruction&>(*GS.AccessingFunction
+                                                   ->getEntryBlock().begin());
+    const Type* ElemTy = GV->getType()->getElementType();
+    // FIXME: Pass Global's alignment when globals have alignment
+    AllocaInst* Alloca = new AllocaInst(ElemTy, NULL, GV->getName(), &FirstI);
+    if (!isa<UndefValue>(GV->getInitializer()))
+      new StoreInst(GV->getInitializer(), Alloca, &FirstI);
+
+    GV->replaceAllUsesWith(Alloca);
+    GV->eraseFromParent();
+    ++NumLocalized;
+    return true;
+  }
+
+  // If the global is never loaded (but may be stored to), it is dead.
+  // Delete it now.
+  if (!GS.isLoaded) {
+    DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV);
+
+    // Delete any stores we can find to the global. We may not be able to
+    // make it completely dead though.
+    bool Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer());
+
+    // If the global is dead now, delete it.
+    if (GV->use_empty()) {
+      GV->eraseFromParent();
+      ++NumDeleted;
+      Changed = true;
+    }
+    return Changed;
+
+  } else if (GS.StoredType <= GlobalStatus::isInitializerStored) {
+    // Only the initializer is ever stored: the global is effectively
+    // constant.
+    DEBUG(dbgs() << "MARKING CONSTANT: " << *GV);
+    GV->setConstant(true);
+
+    // Clean up any obviously simplifiable users now.
+    CleanupConstantGlobalUsers(GV, GV->getInitializer());
+
+    // If the global is dead now, just nuke it.
+    if (GV->use_empty()) {
+      DEBUG(dbgs() << "   *** Marking constant allowed us to simplify "
+            << "all users and delete global!\n");
+      GV->eraseFromParent();
+      ++NumDeleted;
+    }
+
+    ++NumMarked;
+    return true;
+  } else if (!GV->getInitializer()->getType()->isSingleValueType()) {
+    if (TargetData *TD = getAnalysisIfAvailable<TargetData>())
+      if (GlobalVariable *FirstNewGV = SRAGlobal(GV, *TD)) {
+        GVI = FirstNewGV; // Don't skip the newly produced globals!
+        return true;
+      }
+  } else if (GS.StoredType == GlobalStatus::isStoredOnce) {
+    // If the initial value for the global was an undef value, and if only
+    // one other value was stored into it, we can just change the
+    // initializer to be the stored value, then delete all stores to the
+    // global. This allows us to mark it constant.
+    if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
+      if (isa<UndefValue>(GV->getInitializer())) {
+        // Change the initial value here.
+        GV->setInitializer(SOVConstant);
+
+        // Clean up any obviously simplifiable users now.
+        CleanupConstantGlobalUsers(GV, GV->getInitializer());
+
+        if (GV->use_empty()) {
+          DEBUG(dbgs() << "   *** Substituting initializer allowed us to "
+                << "simplify all users and delete global!\n");
+          GV->eraseFromParent();
+          ++NumDeleted;
+        } else {
+          GVI = GV;
+        }
+        ++NumSubstitute;
+        return true;
+      }
+
+    // Try to optimize globals based on the knowledge that only one value
+    // (besides its initializer) is ever stored to the global.
+    if (OptimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GVI,
+                                 getAnalysisIfAvailable<TargetData>()))
+      return true;
+
+    // Otherwise, if the global was not a boolean, we can shrink it to be a
+    // boolean.
+    if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
+      if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
+        ++NumShrunkToBool;
+        return true;
+      }
+  }
+
+  return false;
+}
+
+/// ChangeCalleesToFastCall - Walk all of the direct calls of the specified
+/// function, changing them to FastCC.
+static void ChangeCalleesToFastCall(Function *F) {
+  for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){
+    CallSite User(cast<Instruction>(*UI));
+    User.setCallingConv(CallingConv::Fast);
+  }
+}
+
+/// StripNest - Return a copy of Attrs with any 'nest' attribute removed.
+static AttrListPtr StripNest(const AttrListPtr &Attrs) {
+  for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
+    if ((Attrs.getSlot(i).Attrs & Attribute::Nest) == 0)
+      continue;
+
+    // There can be only one.
+    return Attrs.removeAttr(Attrs.getSlot(i).Index, Attribute::Nest);
+  }
+
+  return Attrs;
+}
+
+/// RemoveNestAttribute - Strip 'nest' from the function itself and from
+/// every call site that calls it.
+static void RemoveNestAttribute(Function *F) {
+  F->setAttributes(StripNest(F->getAttributes()));
+  for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){
+    CallSite User(cast<Instruction>(*UI));
+    User.setAttributes(StripNest(User.getAttributes()));
+  }
+}
+
+bool GlobalOpt::OptimizeFunctions(Module &M) {
+  bool Changed = false;
+  // Optimize functions.
+  for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
+    Function *F = FI++;
+    // Functions without names cannot be referenced outside this module.
+    if (!F->hasName() && !F->isDeclaration())
+      F->setLinkage(GlobalValue::InternalLinkage);
+    F->removeDeadConstantUsers();
+    if (F->use_empty() && (F->hasLocalLinkage() || F->hasLinkOnceLinkage())) {
+      F->eraseFromParent();
+      Changed = true;
+      ++NumFnDeleted;
+    } else if (F->hasLocalLinkage()) {
+      if (F->getCallingConv() == CallingConv::C && !F->isVarArg() &&
+          !F->hasAddressTaken()) {
+        // If this function has C calling conventions, is not a varargs
+        // function, and is only called directly, promote it to use the Fast
+        // calling convention.
+        F->setCallingConv(CallingConv::Fast);
+        ChangeCalleesToFastCall(F);
+        ++NumFastCallFns;
+        Changed = true;
+      }
+
+      if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) &&
+          !F->hasAddressTaken()) {
+        // The function is not used by a trampoline intrinsic, so it is safe
+        // to remove the 'nest' attribute.
+        RemoveNestAttribute(F);
+        ++NumNestRemoved;
+        Changed = true;
+      }
+    }
+  }
+  return Changed;
+}
+
+bool GlobalOpt::OptimizeGlobalVars(Module &M) {
+  bool Changed = false;
+  for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+       GVI != E; ) {
+    GlobalVariable *GV = GVI++;
+    // Global variables without names cannot be referenced outside this module.
+    if (!GV->hasName() && !GV->isDeclaration())
+      GV->setLinkage(GlobalValue::InternalLinkage);
+    // Simplify the initializer.
+    if (GV->hasInitializer())
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GV->getInitializer())) {
+        TargetData *TD = getAnalysisIfAvailable<TargetData>();
+        Constant *New = ConstantFoldConstantExpression(CE, TD);
+        if (New && New != CE)
+          GV->setInitializer(New);
+      }
+
+    Changed |= ProcessGlobal(GV, GVI);
+  }
+  return Changed;
+}
+
+/// FindGlobalCtors - Find the llvm.global_ctors list, verifying that all
+/// initializers have an init priority of 65535.
+GlobalVariable *GlobalOpt::FindGlobalCtors(Module &M) {
+  GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+  if (GV == 0) return 0;
+
+  // Found it, verify it's an array of { int, void()* }.
+  const ArrayType *ATy =dyn_cast<ArrayType>(GV->getType()->getElementType());
+  if (!ATy) return 0;
+  const StructType *STy = dyn_cast<StructType>(ATy->getElementType());
+  if (!STy || STy->getNumElements() != 2 ||
+      !STy->getElementType(0)->isIntegerTy(32)) return 0;
+  const PointerType *PFTy = dyn_cast<PointerType>(STy->getElementType(1));
+  if (!PFTy) return 0;
+  const FunctionType *FTy = dyn_cast<FunctionType>(PFTy->getElementType());
+  if (!FTy || !FTy->getReturnType()->isVoidTy() ||
+      FTy->isVarArg() || FTy->getNumParams() != 0)
+    return 0;
+
+  // Verify that the initializer is simple enough for us to handle. We are
+  // only allowed to optimize the initializer if it is unique.
+  if (!GV->hasUniqueInitializer()) return 0;
+
+  ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
+  if (!CA) return 0;
+
+  for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) {
+    ConstantStruct *CS = dyn_cast<ConstantStruct>(*i);
+    if (CS == 0) return 0;
+
+    if (isa<ConstantPointerNull>(CS->getOperand(1)))
+      continue;
+
+    // Must have a function or null ptr.
+    if (!isa<Function>(CS->getOperand(1)))
+      return 0;
+
+    // Init priority must be standard.
+    ConstantInt *CI = dyn_cast<ConstantInt>(CS->getOperand(0));
+    if (!CI || CI->getZExtValue() != 65535)
+      return 0;
+  }
+
+  return GV;
+}
+
+/// ParseGlobalCtors - Given a llvm.global_ctors list that we can understand,
+/// return a list of the functions and null terminator as a vector.
+static std::vector<Function*> ParseGlobalCtors(GlobalVariable *GV) {
+  ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+  std::vector<Function*> Result;
+  Result.reserve(CA->getNumOperands());
+  for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) {
+    ConstantStruct *CS = cast<ConstantStruct>(*i);
+    Result.push_back(dyn_cast<Function>(CS->getOperand(1)));
+  }
+  return Result;
+}
+
+/// InstallGlobalCtors - Given a specified llvm.global_ctors list, install the
+/// specified array, returning the new global to use.
+static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
+                                          const std::vector<Function*> &Ctors) {
+  // If we made a change, reassemble the initializer list.
+  std::vector<Constant*> CSVals;
+  CSVals.push_back(ConstantInt::get(Type::getInt32Ty(GCL->getContext()),65535));
+  CSVals.push_back(0);
+
+  // Create the new init list. A null entry in Ctors (a removed ctor) is
+  // emitted as a null function pointer with the maximum init priority.
+  std::vector<Constant*> CAList;
+  for (unsigned i = 0, e = Ctors.size(); i != e; ++i) {
+    if (Ctors[i]) {
+      CSVals[1] = Ctors[i];
+    } else {
+      const Type *FTy = FunctionType::get(Type::getVoidTy(GCL->getContext()),
+                                          false);
+      const PointerType *PFTy = PointerType::getUnqual(FTy);
+      CSVals[1] = Constant::getNullValue(PFTy);
+      CSVals[0] = ConstantInt::get(Type::getInt32Ty(GCL->getContext()),
+                                   2147483647);
+    }
+    CAList.push_back(ConstantStruct::get(GCL->getContext(), CSVals, false));
+  }
+
+  // Create the array initializer.
+  const Type *StructTy =
+      cast<ArrayType>(GCL->getType()->getElementType())->getElementType();
+  Constant *CA = ConstantArray::get(ArrayType::get(StructTy,
+                                                   CAList.size()), CAList);
+
+  // If we didn't change the number of elements, don't create a new GV.
+  if (CA->getType() == GCL->getInitializer()->getType()) {
+    GCL->setInitializer(CA);
+    return GCL;
+  }
+
+  // Create the new global and insert it next to the existing list.
+  GlobalVariable *NGV = new GlobalVariable(CA->getType(), GCL->isConstant(),
+                                           GCL->getLinkage(), CA, "",
+                                           GCL->isThreadLocal());
+  GCL->getParent()->getGlobalList().insert(GCL, NGV);
+  NGV->takeName(GCL);
+
+  // Nuke the old list, replacing any uses with the new one.
+  if (!GCL->use_empty()) {
+    Constant *V = NGV;
+    if (V->getType() != GCL->getType())
+      V = ConstantExpr::getBitCast(V, GCL->getType());
+    GCL->replaceAllUsesWith(V);
+  }
+  GCL->eraseFromParent();
+
+  if (Ctors.size())
+    return NGV;
+  else
+    return 0;
+}
+
+
+/// getVal - Look up V in the interpreter's value map, or return it directly
+/// if it is already a constant. Asserts if V has not been computed yet.
+static Constant *getVal(DenseMap<Value*, Constant*> &ComputedValues, Value *V) {
+  if (Constant *CV = dyn_cast<Constant>(V)) return CV;
+  Constant *R = ComputedValues[V];
+  assert(R && "Reference to an uncomputed value!");
+  return R;
+}
+
+static inline bool
+isSimpleEnoughValueToCommit(Constant *C,
+                            SmallPtrSet<Constant*, 8> &SimpleConstants);
+
+
+/// isSimpleEnoughValueToCommit - Return true if the specified constant can be
+/// handled by the code generator. We don't want to generate something like:
+///   void *X = &X/42;
+/// because the code generator doesn't have a relocation that can handle that.
+///
+/// This function should be called if C was not found (but just got inserted)
+/// in SimpleConstants to avoid having to rescan the same constants all the
+/// time.
+static bool isSimpleEnoughValueToCommitHelper(Constant *C,
+                                   SmallPtrSet<Constant*, 8> &SimpleConstants) {
+  // Simple integer, undef, constant aggregate zero, global addresses, etc are
+  // all supported.
+  if (C->getNumOperands() == 0 || isa<BlockAddress>(C) ||
+      isa<GlobalValue>(C))
+    return true;
+
+  // Aggregate values are safe if all their elements are.
+  if (isa<ConstantArray>(C) || isa<ConstantStruct>(C) ||
+      isa<ConstantVector>(C)) {
+    for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) {
+      Constant *Op = cast<Constant>(C->getOperand(i));
+      if (!isSimpleEnoughValueToCommit(Op, SimpleConstants))
+        return false;
+    }
+    return true;
+  }
+
+  // We don't know exactly what relocations are allowed in constant expressions,
+  // so we allow &global+constantoffset, which is safe and uniformly supported
+  // across targets.
+  ConstantExpr *CE = cast<ConstantExpr>(C);
+  switch (CE->getOpcode()) {
+  case Instruction::BitCast:
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
+    // These casts are always fine if the casted value is.
+    return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants);
+
+  // GEP is fine if it is simple + constant offset.
+  case Instruction::GetElementPtr:
+    for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
+      if (!isa<ConstantInt>(CE->getOperand(i)))
+        return false;
+    return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants);
+
+  case Instruction::Add:
+    // We allow simple+cst.
+    if (!isa<ConstantInt>(CE->getOperand(1)))
+      return false;
+    return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants);
+  }
+  return false;
+}
+
+static inline bool
+isSimpleEnoughValueToCommit(Constant *C,
+                            SmallPtrSet<Constant*, 8> &SimpleConstants) {
+  // If we already checked this constant, we win.
+  if (!SimpleConstants.insert(C)) return true;
+  // Check the constant.
+  return isSimpleEnoughValueToCommitHelper(C, SimpleConstants);
+}
+
+
+/// isSimpleEnoughPointerToCommit - Return true if this constant is simple
+/// enough for us to understand. In particular, if it is a cast to anything
+/// other than from one pointer type to another pointer type, we punt.
+/// We basically just support direct accesses to globals and GEP's of
+/// globals. This should be kept up to date with CommitValueTo.
+static bool isSimpleEnoughPointerToCommit(Constant *C) {
+  // Conservatively, avoid aggregate types. This is because we don't
+  // want to worry about them partially overlapping other stores.
+  if (!cast<PointerType>(C->getType())->getElementType()->isSingleValueType())
+    return false;
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+    // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
+    // external globals.
+    return GV->hasUniqueInitializer();
+
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+    // Handle a constantexpr gep.
+    if (CE->getOpcode() == Instruction::GetElementPtr &&
+        isa<GlobalVariable>(CE->getOperand(0)) &&
+        cast<GEPOperator>(CE)->isInBounds()) {
+      GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+      // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
+      // external globals.
+      if (!GV->hasUniqueInitializer())
+        return false;
+
+      // The first index must be zero.
+      ConstantInt *CI = dyn_cast<ConstantInt>(*llvm::next(CE->op_begin()));
+      if (!CI || !CI->isZero()) return false;
+
+      // The remaining indices must be compile-time known integers within the
+      // notional bounds of the corresponding static array types.
+      if (!CE->isGEPWithNoNotionalOverIndexing())
+        return false;
+
+      return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+
+    // A constantexpr bitcast from a pointer to another pointer is a no-op,
+    // and we know how to evaluate it by moving the bitcast from the pointer
+    // operand to the value operand.
+    } else if (CE->getOpcode() == Instruction::BitCast &&
+               isa<GlobalVariable>(CE->getOperand(0))) {
+      // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
+      // external globals.
+      return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer();
+    }
+  }
+
+  return false;
+}
+
+/// EvaluateStoreInto - Evaluate a piece of a constantexpr store into a global
+/// initializer. This returns 'Init' modified to reflect 'Val' stored into it.
+/// At this point, the GEP operands of Addr [0, OpNo) have been stepped into.
+static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
+                                   ConstantExpr *Addr, unsigned OpNo) {
+  // Base case of the recursion.
+  if (OpNo == Addr->getNumOperands()) {
+    assert(Val->getType() == Init->getType() && "Type mismatch!");
+    return Val;
+  }
+
+  std::vector<Constant*> Elts;
+  if (const StructType *STy = dyn_cast<StructType>(Init->getType())) {
+
+    // Break up the constant into its elements.
+    if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Init)) {
+      for (User::op_iterator i = CS->op_begin(), e = CS->op_end(); i != e; ++i)
+        Elts.push_back(cast<Constant>(*i));
+    } else if (isa<ConstantAggregateZero>(Init)) {
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+        Elts.push_back(Constant::getNullValue(STy->getElementType(i)));
+    } else if (isa<UndefValue>(Init)) {
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+        Elts.push_back(UndefValue::get(STy->getElementType(i)));
+    } else {
+      llvm_unreachable("This code is out of sync with "
+                       " ConstantFoldLoadThroughGEPConstantExpr");
+    }
+
+    // Replace the element that we are supposed to.
+    ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo));
+    unsigned Idx = CU->getZExtValue();
+    assert(Idx < STy->getNumElements() && "Struct index out of range!");
+    Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1);
+
+    // Return the modified struct.
+    return ConstantStruct::get(Init->getContext(), &Elts[0], Elts.size(),
+                               STy->isPacked());
+  } else {
+    ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
+    const SequentialType *InitTy = cast<SequentialType>(Init->getType());
+
+    uint64_t NumElts;
+    if (const ArrayType *ATy = dyn_cast<ArrayType>(InitTy))
+      NumElts = ATy->getNumElements();
+    else
+      NumElts = cast<VectorType>(InitTy)->getNumElements();
+
+
+    // Break up the array into elements.
+    if (ConstantArray *CA = dyn_cast<ConstantArray>(Init)) {
+      for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i)
+        Elts.push_back(cast<Constant>(*i));
+    } else if (ConstantVector *CV = dyn_cast<ConstantVector>(Init)) {
+      for (User::op_iterator i = CV->op_begin(), e = CV->op_end(); i != e; ++i)
+        Elts.push_back(cast<Constant>(*i));
+    } else if (isa<ConstantAggregateZero>(Init)) {
+      Elts.assign(NumElts, Constant::getNullValue(InitTy->getElementType()));
+    } else {
+      assert(isa<UndefValue>(Init) && "This code is out of sync with "
+             " ConstantFoldLoadThroughGEPConstantExpr");
+      Elts.assign(NumElts, UndefValue::get(InitTy->getElementType()));
+    }
+
+    assert(CI->getZExtValue() < NumElts);
+    Elts[CI->getZExtValue()] =
+      EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1);
+
+    if (Init->getType()->isArrayTy())
+      return ConstantArray::get(cast<ArrayType>(InitTy), Elts);
+    return ConstantVector::get(Elts);
+  }
+}
+
+/// CommitValueTo - We have decided that Addr (which satisfies the predicate
+/// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen.
+static void CommitValueTo(Constant *Val, Constant *Addr) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+    assert(GV->hasInitializer());
+    GV->setInitializer(Val);
+    return;
+  }
+
+  // Addr is a GEP constantexpr; recurse into the initializer starting at
+  // operand 2 (operand 0 is the pointer, operand 1 the leading zero index).
+  ConstantExpr *CE = cast<ConstantExpr>(Addr);
+  GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+  GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2));
+}
+
+/// ComputeLoadResult - Return the value that would be computed by a load from
+/// P after the stores reflected by 'memory' have been performed. If we can't
+/// decide, return null.
+static Constant *ComputeLoadResult(Constant *P,
+                                const DenseMap<Constant*, Constant*> &Memory) {
+  // If this memory location has been recently stored, use the stored value: it
+  // is the most up-to-date.
+  DenseMap<Constant*, Constant*>::const_iterator I = Memory.find(P);
+  if (I != Memory.end()) return I->second;
+
+  // Access it.
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
+    if (GV->hasDefinitiveInitializer())
+      return GV->getInitializer();
+    return 0;
+  }
+
+  // Handle a constantexpr getelementptr.
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P))
+    if (CE->getOpcode() == Instruction::GetElementPtr &&
+        isa<GlobalVariable>(CE->getOperand(0))) {
+      GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+      if (GV->hasDefinitiveInitializer())
+        return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+    }
+
+  return 0;  // don't know how to evaluate.
+}
+
+/// EvaluateFunction - Evaluate a call to function F, returning true if
+/// successful, false if we can't evaluate it. ActualArgs contains the formal
+/// arguments for the function.
+static bool EvaluateFunction(Function *F, Constant *&RetVal,
+                             const SmallVectorImpl<Constant*> &ActualArgs,
+                             std::vector<Function*> &CallStack,
+                             DenseMap<Constant*, Constant*> &MutatedMemory,
+                             std::vector<GlobalVariable*> &AllocaTmps,
+                             SmallPtrSet<Constant*, 8> &SimpleConstants,
+                             const TargetData *TD) {
+  // Check to see if this function is already executing (recursion). If so,
+  // bail out. TODO: we might want to accept limited recursion.
+  if (std::find(CallStack.begin(), CallStack.end(), F) != CallStack.end())
+    return false;
+
+  CallStack.push_back(F);
+
+  /// Values - As we compute SSA register values, we store their contents here.
+  DenseMap<Value*, Constant*> Values;
+
+  // Initialize arguments to the incoming values specified.
+  unsigned ArgNo = 0;
+  for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
+       ++AI, ++ArgNo)
+    Values[AI] = ActualArgs[ArgNo];
+
+  /// ExecutedBlocks - We only handle non-looping, non-recursive code. As such,
+  /// we can only evaluate any one basic block at most once. This set keeps
+  /// track of what we have executed so we can detect recursive cases etc.
+  SmallPtrSet<BasicBlock*, 32> ExecutedBlocks;
+
+  // CurInst - The current instruction we're evaluating.
+  BasicBlock::iterator CurInst = F->begin()->begin();
+
+  // This is the main evaluation loop.
+  while (1) {
+    Constant *InstResult = 0;
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
+      if (SI->isVolatile()) return false;  // no volatile accesses.
+      Constant *Ptr = getVal(Values, SI->getOperand(1));
+      if (!isSimpleEnoughPointerToCommit(Ptr))
+        // If this is too complex for us to commit, reject it.
+        return false;
+
+      Constant *Val = getVal(Values, SI->getOperand(0));
+
+      // If this might be too difficult for the backend to handle (e.g. the addr
+      // of one global variable divided by another) then we can't commit it.
+      if (!isSimpleEnoughValueToCommit(Val, SimpleConstants))
+        return false;
+
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+        if (CE->getOpcode() == Instruction::BitCast) {
+          // If we're evaluating a store through a bitcast, then we need
+          // to pull the bitcast off the pointer type and push it onto the
+          // stored value.
+          Ptr = CE->getOperand(0);
+
+          const Type *NewTy=cast<PointerType>(Ptr->getType())->getElementType();
+
+          // In order to push the bitcast onto the stored value, a bitcast
+          // from NewTy to Val's type must be legal. If it's not, we can try
+          // introspecting NewTy to find a legal conversion.
+          while (!Val->getType()->canLosslesslyBitCastTo(NewTy)) {
+            // If NewTy is a struct, we can convert the pointer to the struct
+            // into a pointer to its first member.
+            // FIXME: This could be extended to support arrays as well.
+            if (const StructType *STy = dyn_cast<StructType>(NewTy)) {
+              NewTy = STy->getTypeAtIndex(0U);
+
+              const IntegerType *IdxTy =IntegerType::get(NewTy->getContext(), 32);
+              Constant *IdxZero = ConstantInt::get(IdxTy, 0, false);
+              Constant * const IdxList[] = {IdxZero, IdxZero};
+
+              Ptr = ConstantExpr::getGetElementPtr(Ptr, IdxList, 2);
+
+            // If we can't improve the situation by introspecting NewTy,
+            // we have to give up.
+            } else {
+              // NOTE(review): this 'return 0' is a bool 'false' — cannot
+              // evaluate this store.
+              return 0;
+            }
+          }
+
+          // If we found compatible types, go ahead and push the bitcast
+          // onto the stored value.
+          Val = ConstantExpr::getBitCast(Val, NewTy);
+        }
+
+      MutatedMemory[Ptr] = Val;
+    } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
+      InstResult = ConstantExpr::get(BO->getOpcode(),
+                                     getVal(Values, BO->getOperand(0)),
+                                     getVal(Values, BO->getOperand(1)));
+    } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
+      InstResult = ConstantExpr::getCompare(CI->getPredicate(),
+                                            getVal(Values, CI->getOperand(0)),
+                                            getVal(Values, CI->getOperand(1)));
+    } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
+      InstResult = ConstantExpr::getCast(CI->getOpcode(),
+                                         getVal(Values, CI->getOperand(0)),
+                                         CI->getType());
+    } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
+      InstResult = ConstantExpr::getSelect(getVal(Values, SI->getOperand(0)),
+                                           getVal(Values, SI->getOperand(1)),
+                                           getVal(Values, SI->getOperand(2)));
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
+      Constant *P = getVal(Values, GEP->getOperand(0));
+      SmallVector<Constant*, 8> GEPOps;
+      for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
+           i != e; ++i)
+        GEPOps.push_back(getVal(Values, *i));
+      InstResult = cast<GEPOperator>(GEP)->isInBounds() ?
+          ConstantExpr::getInBoundsGetElementPtr(P, &GEPOps[0], GEPOps.size()) :
+          ConstantExpr::getGetElementPtr(P, &GEPOps[0], GEPOps.size());
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
+      if (LI->isVolatile()) return false;  // no volatile accesses.
+      InstResult = ComputeLoadResult(getVal(Values, LI->getOperand(0)),
+                                     MutatedMemory);
+      if (InstResult == 0) return false; // Could not evaluate load.
+    } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
+      if (AI->isArrayAllocation()) return false;  // Cannot handle array allocs.
+      // Model the alloca as a temporary internal global holding undef.
+      const Type *Ty = AI->getType()->getElementType();
+      AllocaTmps.push_back(new GlobalVariable(Ty, false,
+                                              GlobalValue::InternalLinkage,
+                                              UndefValue::get(Ty),
+                                              AI->getName()));
+      InstResult = AllocaTmps.back();
+    } else if (CallInst *CI = dyn_cast<CallInst>(CurInst)) {
+
+      // Debug info can safely be ignored here.
+      if (isa<DbgInfoIntrinsic>(CI)) {
+        ++CurInst;
+        continue;
+      }
+
+      // Cannot handle inline asm.
+      if (isa<InlineAsm>(CI->getCalledValue())) return false;
+
+      // Resolve function pointers.
+      Function *Callee = dyn_cast<Function>(getVal(Values,
+                                                   CI->getCalledValue()));
+      if (!Callee) return false;  // Cannot resolve.
+
+      SmallVector<Constant*, 8> Formals;
+      CallSite CS(CI);
+      for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end();
+           i != e; ++i)
+        Formals.push_back(getVal(Values, *i));
+
+      if (Callee->isDeclaration()) {
+        // If this is a function we can constant fold, do it.
+        if (Constant *C = ConstantFoldCall(Callee, Formals.data(),
+                                           Formals.size())) {
+          InstResult = C;
+        } else {
+          return false;
+        }
+      } else {
+        if (Callee->getFunctionType()->isVarArg())
+          return false;
+
+        Constant *RetVal;
+        // Execute the call, if successful, use the return value.
+        if (!EvaluateFunction(Callee, RetVal, Formals, CallStack,
+                              MutatedMemory, AllocaTmps, SimpleConstants, TD))
+          return false;
+        InstResult = RetVal;
+      }
+    } else if (isa<TerminatorInst>(CurInst)) {
+      BasicBlock *NewBB = 0;
+      if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
+        if (BI->isUnconditional()) {
+          NewBB = BI->getSuccessor(0);
+        } else {
+          ConstantInt *Cond =
+            dyn_cast<ConstantInt>(getVal(Values, BI->getCondition()));
+          if (!Cond) return false;  // Cannot determine.
+
+          NewBB = BI->getSuccessor(!Cond->getZExtValue());
+        }
+      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurInst)) {
+        ConstantInt *Val =
+          dyn_cast<ConstantInt>(getVal(Values, SI->getCondition()));
+        if (!Val) return false;  // Cannot determine.
+        NewBB = SI->getSuccessor(SI->findCaseValue(Val));
+      } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) {
+        Value *Val = getVal(Values, IBI->getAddress())->stripPointerCasts();
+        if (BlockAddress *BA = dyn_cast<BlockAddress>(Val))
+          NewBB = BA->getBasicBlock();
+        else
+          return false;  // Cannot determine.
+      } else if (ReturnInst *RI = dyn_cast<ReturnInst>(CurInst)) {
+        if (RI->getNumOperands())
+          RetVal = getVal(Values, RI->getOperand(0));
+
+        CallStack.pop_back();  // return from fn.
+        return true;  // We succeeded at evaluating this ctor!
+      } else {
+        // invoke, unwind, unreachable.
+        return false;  // Cannot handle this terminator.
+      }
+
+      // Okay, we succeeded in evaluating this control flow. See if we have
+      // executed the new block before. If so, we have a looping function,
+      // which we cannot evaluate in reasonable time.
+      if (!ExecutedBlocks.insert(NewBB))
+        return false;  // looped!
+
+      // Okay, we have never been in this block before. Check to see if there
+      // are any PHI nodes. If so, evaluate them with information about where
+      // we came from.
+      BasicBlock *OldBB = CurInst->getParent();
+      CurInst = NewBB->begin();
+      PHINode *PN;
+      for (; (PN = dyn_cast<PHINode>(CurInst)); ++CurInst)
+        Values[PN] = getVal(Values, PN->getIncomingValueForBlock(OldBB));
+
+      // Do NOT increment CurInst. We know that the terminator had no value.
+      continue;
+    } else {
+      // Did not know how to evaluate this!
+      return false;
+    }
+
+    if (!CurInst->use_empty()) {
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult))
+        InstResult = ConstantFoldConstantExpression(CE, TD);
+
+      Values[CurInst] = InstResult;
+    }
+
+    // Advance program counter.
+    ++CurInst;
+  }
+}
+
+/// EvaluateStaticConstructor - Evaluate static constructors in the function, if
+/// we can. Return true if we can, false otherwise.
+static bool EvaluateStaticConstructor(Function *F, const TargetData *TD) {
+  /// MutatedMemory - For each store we execute, we update this map. Loads
+  /// check this to get the most up-to-date value. If evaluation is successful,
+  /// this state is committed to the process.
+  DenseMap<Constant*, Constant*> MutatedMemory;
+
+  /// AllocaTmps - To 'execute' an alloca, we create a temporary global variable
+  /// to represent its body. This vector is needed so we can delete the
+  /// temporary globals when we are done.
+  std::vector<GlobalVariable*> AllocaTmps;
+
+  /// CallStack - This is used to detect recursion. In pathological situations
+  /// we could hit exponential behavior, but at least there is nothing
+  /// unbounded.
+  std::vector<Function*> CallStack;
+
+  /// SimpleConstants - These are constants we have checked and know to be
+  /// simple enough to live in a static initializer of a global.
+  SmallPtrSet<Constant*, 8> SimpleConstants;
+
+  // Call the function.
+  Constant *RetValDummy;
+  bool EvalSuccess = EvaluateFunction(F, RetValDummy,
+                                      SmallVector<Constant*, 0>(), CallStack,
+                                      MutatedMemory, AllocaTmps,
+                                      SimpleConstants, TD);
+
+  if (EvalSuccess) {
+    // We succeeded at evaluation: commit the result.
+ DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '" + << F->getName() << "' to " << MutatedMemory.size() + << " stores.\n"); + for (DenseMap<Constant*, Constant*>::iterator I = MutatedMemory.begin(), + E = MutatedMemory.end(); I != E; ++I) + CommitValueTo(I->second, I->first); + } + + // At this point, we are done interpreting. If we created any 'alloca' + // temporaries, release them now. + while (!AllocaTmps.empty()) { + GlobalVariable *Tmp = AllocaTmps.back(); + AllocaTmps.pop_back(); + + // If there are still users of the alloca, the program is doing something + // silly, e.g. storing the address of the alloca somewhere and using it + // later. Since this is undefined, we'll just make it be null. + if (!Tmp->use_empty()) + Tmp->replaceAllUsesWith(Constant::getNullValue(Tmp->getType())); + delete Tmp; + } + + return EvalSuccess; +} + + + +/// OptimizeGlobalCtorsList - Simplify and evaluation global ctors if possible. +/// Return true if anything changed. +bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) { + std::vector<Function*> Ctors = ParseGlobalCtors(GCL); + bool MadeChange = false; + if (Ctors.empty()) return false; + + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + // Loop over global ctors, optimizing them when we can. + for (unsigned i = 0; i != Ctors.size(); ++i) { + Function *F = Ctors[i]; + // Found a null terminator in the middle of the list, prune off the rest of + // the list. + if (F == 0) { + if (i != Ctors.size()-1) { + Ctors.resize(i+1); + MadeChange = true; + } + break; + } + + // We cannot simplify external ctor functions. + if (F->empty()) continue; + + // If we can evaluate the ctor at compile time, do. 
+ if (EvaluateStaticConstructor(F, TD)) { + Ctors.erase(Ctors.begin()+i); + MadeChange = true; + --i; + ++NumCtorsEvaluated; + continue; + } + } + + if (!MadeChange) return false; + + GCL = InstallGlobalCtors(GCL, Ctors); + return true; +} + +bool GlobalOpt::OptimizeGlobalAliases(Module &M) { + bool Changed = false; + + for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); + I != E;) { + Module::alias_iterator J = I++; + // Aliases without names cannot be referenced outside this module. + if (!J->hasName() && !J->isDeclaration()) + J->setLinkage(GlobalValue::InternalLinkage); + // If the aliasee may change at link time, nothing can be done - bail out. + if (J->mayBeOverridden()) + continue; + + Constant *Aliasee = J->getAliasee(); + GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts()); + Target->removeDeadConstantUsers(); + bool hasOneUse = Target->hasOneUse() && Aliasee->hasOneUse(); + + // Make all users of the alias use the aliasee instead. + if (!J->use_empty()) { + J->replaceAllUsesWith(Aliasee); + ++NumAliasesResolved; + Changed = true; + } + + // If the alias is externally visible, we may still be able to simplify it. + if (!J->hasLocalLinkage()) { + // If the aliasee has internal linkage, give it the name and linkage + // of the alias, and delete the alias. This turns: + // define internal ... @f(...) + // @a = alias ... @f + // into: + // define ... @a(...) + if (!Target->hasLocalLinkage()) + continue; + + // Do not perform the transform if multiple aliases potentially target the + // aliasee. This check also ensures that it is safe to replace the section + // and other attributes of the aliasee with those of the alias. + if (!hasOneUse) + continue; + + // Give the aliasee the name, linkage and other attributes of the alias. + Target->takeName(J); + Target->setLinkage(J->getLinkage()); + Target->GlobalValue::copyAttributesFrom(J); + } + + // Delete the alias. 
+ M.getAliasList().erase(J); + ++NumAliasesRemoved; + Changed = true; + } + + return Changed; +} + +bool GlobalOpt::runOnModule(Module &M) { + bool Changed = false; + + // Try to find the llvm.globalctors list. + GlobalVariable *GlobalCtors = FindGlobalCtors(M); + + bool LocalChange = true; + while (LocalChange) { + LocalChange = false; + + // Delete functions that are trivially dead, ccc -> fastcc + LocalChange |= OptimizeFunctions(M); + + // Optimize global_ctors list. + if (GlobalCtors) + LocalChange |= OptimizeGlobalCtorsList(GlobalCtors); + + // Optimize non-address-taken globals. + LocalChange |= OptimizeGlobalVars(M); + + // Resolve aliases, when possible. + LocalChange |= OptimizeGlobalAliases(M); + Changed |= LocalChange; + } + + // TODO: Move all global ctors functions to the end of the module for code + // layout. + + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp b/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp new file mode 100644 index 0000000..c7c2939 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp @@ -0,0 +1,279 @@ +//===-- IPConstantPropagation.cpp - Propagate constants through calls -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements an _extremely_ simple interprocedural constant +// propagation pass. It could certainly be improved in many different ways, +// like using a worklist. This pass makes arguments dead, but does not remove +// them. The existing dead argument elimination pass should be run after this +// to clean up the mess. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "ipconstprop" +#include "llvm/Transforms/IPO.h" +#include "llvm/Constants.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/CallSite.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallVector.h" +using namespace llvm; + +STATISTIC(NumArgumentsProped, "Number of args turned into constants"); +STATISTIC(NumReturnValProped, "Number of return values turned into constants"); + +namespace { + /// IPCP - The interprocedural constant propagation pass + /// + struct IPCP : public ModulePass { + static char ID; // Pass identification, replacement for typeid + IPCP() : ModulePass(ID) { + initializeIPCPPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M); + private: + bool PropagateConstantsIntoArguments(Function &F); + bool PropagateConstantReturn(Function &F); + }; +} + +char IPCP::ID = 0; +INITIALIZE_PASS(IPCP, "ipconstprop", + "Interprocedural constant propagation", false, false) + +ModulePass *llvm::createIPConstantPropagationPass() { return new IPCP(); } + +bool IPCP::runOnModule(Module &M) { + bool Changed = false; + bool LocalChange = true; + + // FIXME: instead of using smart algorithms, we just iterate until we stop + // making changes. + while (LocalChange) { + LocalChange = false; + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) + if (!I->isDeclaration()) { + // Delete any klingons. + I->removeDeadConstantUsers(); + if (I->hasLocalLinkage()) + LocalChange |= PropagateConstantsIntoArguments(*I); + Changed |= PropagateConstantReturn(*I); + } + Changed |= LocalChange; + } + return Changed; +} + +/// PropagateConstantsIntoArguments - Look at all uses of the specified +/// function. 
If all uses are direct call sites, and all pass a particular +/// constant in for an argument, propagate that constant in as the argument. +/// +bool IPCP::PropagateConstantsIntoArguments(Function &F) { + if (F.arg_empty() || F.use_empty()) return false; // No arguments? Early exit. + + // For each argument, keep track of its constant value and whether it is a + // constant or not. The bool is driven to true when found to be non-constant. + SmallVector<std::pair<Constant*, bool>, 16> ArgumentConstants; + ArgumentConstants.resize(F.arg_size()); + + unsigned NumNonconstant = 0; + for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E; ++UI) { + User *U = *UI; + // Ignore blockaddress uses. + if (isa<BlockAddress>(U)) continue; + + // Used by a non-instruction, or not the callee of a function, do not + // transform. + if (!isa<CallInst>(U) && !isa<InvokeInst>(U)) + return false; + + CallSite CS(cast<Instruction>(U)); + if (!CS.isCallee(UI)) + return false; + + // Check out all of the potentially constant arguments. Note that we don't + // inspect varargs here. + CallSite::arg_iterator AI = CS.arg_begin(); + Function::arg_iterator Arg = F.arg_begin(); + for (unsigned i = 0, e = ArgumentConstants.size(); i != e; + ++i, ++AI, ++Arg) { + + // If this argument is known non-constant, ignore it. + if (ArgumentConstants[i].second) + continue; + + Constant *C = dyn_cast<Constant>(*AI); + if (C && ArgumentConstants[i].first == 0) { + ArgumentConstants[i].first = C; // First constant seen. + } else if (C && ArgumentConstants[i].first == C) { + // Still the constant value we think it is. + } else if (*AI == &*Arg) { + // Ignore recursive calls passing argument down. + } else { + // Argument became non-constant. If all arguments are non-constant now, + // give up on this function. + if (++NumNonconstant == ArgumentConstants.size()) + return false; + ArgumentConstants[i].second = true; + } + } + } + + // If we got to this point, there is a constant argument! 
+ assert(NumNonconstant != ArgumentConstants.size()); + bool MadeChange = false; + Function::arg_iterator AI = F.arg_begin(); + for (unsigned i = 0, e = ArgumentConstants.size(); i != e; ++i, ++AI) { + // Do we have a constant argument? + if (ArgumentConstants[i].second || AI->use_empty() || + (AI->hasByValAttr() && !F.onlyReadsMemory())) + continue; + + Value *V = ArgumentConstants[i].first; + if (V == 0) V = UndefValue::get(AI->getType()); + AI->replaceAllUsesWith(V); + ++NumArgumentsProped; + MadeChange = true; + } + return MadeChange; +} + + +// Check to see if this function returns one or more constants. If so, replace +// all callers that use those return values with the constant value. This will +// leave in the actual return values and instructions, but deadargelim will +// clean that up. +// +// Additionally if a function always returns one of its arguments directly, +// callers will be updated to use the value they pass in directly instead of +// using the return value. +bool IPCP::PropagateConstantReturn(Function &F) { + if (F.getReturnType()->isVoidTy()) + return false; // No return value. + + // If this function could be overridden later in the link stage, we can't + // propagate information about its results into callers. + if (F.mayBeOverridden()) + return false; + + // Check to see if this function returns a constant. + SmallVector<Value *,4> RetVals; + const StructType *STy = dyn_cast<StructType>(F.getReturnType()); + if (STy) + for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) + RetVals.push_back(UndefValue::get(STy->getElementType(i))); + else + RetVals.push_back(UndefValue::get(F.getReturnType())); + + unsigned NumNonConstant = 0; + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) + if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) { + for (unsigned i = 0, e = RetVals.size(); i != e; ++i) { + // Already found conflicting return values? 
+ Value *RV = RetVals[i]; + if (!RV) + continue; + + // Find the returned value + Value *V; + if (!STy) + V = RI->getOperand(i); + else + V = FindInsertedValue(RI->getOperand(0), i); + + if (V) { + // Ignore undefs, we can change them into anything + if (isa<UndefValue>(V)) + continue; + + // Try to see if all the rets return the same constant or argument. + if (isa<Constant>(V) || isa<Argument>(V)) { + if (isa<UndefValue>(RV)) { + // No value found yet? Try the current one. + RetVals[i] = V; + continue; + } + // Returning the same value? Good. + if (RV == V) + continue; + } + } + // Different or no known return value? Don't propagate this return + // value. + RetVals[i] = 0; + // All values non constant? Stop looking. + if (++NumNonConstant == RetVals.size()) + return false; + } + } + + // If we got here, the function returns at least one constant value. Loop + // over all users, replacing any uses of the return value with the returned + // constant. + bool MadeChange = false; + for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E; ++UI) { + CallSite CS(*UI); + Instruction* Call = CS.getInstruction(); + + // Not a call instruction or a call instruction that's not calling F + // directly? + if (!Call || !CS.isCallee(UI)) + continue; + + // Call result not used? + if (Call->use_empty()) + continue; + + MadeChange = true; + + if (STy == 0) { + Value* New = RetVals[0]; + if (Argument *A = dyn_cast<Argument>(New)) + // Was an argument returned? Then find the corresponding argument in + // the call instruction and use that. 
+ New = CS.getArgument(A->getArgNo()); + Call->replaceAllUsesWith(New); + continue; + } + + for (Value::use_iterator I = Call->use_begin(), E = Call->use_end(); + I != E;) { + Instruction *Ins = cast<Instruction>(*I); + + // Increment now, so we can remove the use + ++I; + + // Find the index of the retval to replace with + int index = -1; + if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Ins)) + if (EV->hasIndices()) + index = *EV->idx_begin(); + + // If this use uses a specific return value, and we have a replacement, + // replace it. + if (index != -1) { + Value *New = RetVals[index]; + if (New) { + if (Argument *A = dyn_cast<Argument>(New)) + // Was an argument returned? Then find the corresponding argument in + // the call instruction and use that. + New = CS.getArgument(A->getArgNo()); + Ins->replaceAllUsesWith(New); + Ins->eraseFromParent(); + } + } + } + } + + if (MadeChange) ++NumReturnValProped; + return MadeChange; +} diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp new file mode 100644 index 0000000..fbe90ce --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp @@ -0,0 +1,118 @@ +//===-- Scalar.cpp --------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the common infrastructure (including C bindings) for +// libLLVMIPO.a, which implements several transformations over the LLVM +// intermediate representation. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm-c/Transforms/IPO.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassManager.h" +#include "llvm/Transforms/IPO.h" + +using namespace llvm; + +void llvm::initializeIPO(PassRegistry &Registry) { + initializeArgPromotionPass(Registry); + initializeConstantMergePass(Registry); + initializeDAEPass(Registry); + initializeDAHPass(Registry); + initializeDTEPass(Registry); + initializeFunctionAttrsPass(Registry); + initializeGlobalDCEPass(Registry); + initializeGlobalOptPass(Registry); + initializeIPCPPass(Registry); + initializeAlwaysInlinerPass(Registry); + initializeSimpleInlinerPass(Registry); + initializeInternalizePassPass(Registry); + initializeLoopExtractorPass(Registry); + initializeBlockExtractorPassPass(Registry); + initializeSingleLoopExtractorPass(Registry); + initializeLowerSetJmpPass(Registry); + initializeMergeFunctionsPass(Registry); + initializePartialInlinerPass(Registry); + initializePruneEHPass(Registry); + initializeStripDeadPrototypesPassPass(Registry); + initializeStripSymbolsPass(Registry); + initializeStripDebugDeclarePass(Registry); + initializeStripDeadDebugInfoPass(Registry); + initializeStripNonDebugSymbolsPass(Registry); + initializeSRETPromotionPass(Registry); +} + +void LLVMInitializeIPO(LLVMPassRegistryRef R) { + initializeIPO(*unwrap(R)); +} + +void LLVMAddArgumentPromotionPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createArgumentPromotionPass()); +} + +void LLVMAddConstantMergePass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createConstantMergePass()); +} + +void LLVMAddDeadArgEliminationPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createDeadArgEliminationPass()); +} + +void LLVMAddDeadTypeEliminationPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createDeadTypeEliminationPass()); +} + +void LLVMAddFunctionAttrsPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createFunctionAttrsPass()); +} + +void 
LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createFunctionInliningPass()); +} + +void LLVMAddGlobalDCEPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createGlobalDCEPass()); +} + +void LLVMAddGlobalOptimizerPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createGlobalOptimizerPass()); +} + +void LLVMAddIPConstantPropagationPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createIPConstantPropagationPass()); +} + +void LLVMAddLowerSetJmpPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLowerSetJmpPass()); +} + +void LLVMAddPruneEHPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createPruneEHPass()); +} + +void LLVMAddIPSCCPPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createIPSCCPPass()); +} + +void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) { + unwrap(PM)->add(createInternalizePass(AllButMain != 0)); +} + + +void LLVMAddRaiseAllocationsPass(LLVMPassManagerRef PM) { + // FIXME: Remove in LLVM 3.0. +} + +void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createStripDeadPrototypesPass()); +} + +void LLVMAddStripSymbolsPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createStripSymbolsPass()); +} diff --git a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp new file mode 100644 index 0000000..ce795b7 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp @@ -0,0 +1,85 @@ +//===- InlineAlways.cpp - Code to inline always_inline functions ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a custom inliner that handles only functions that +// are marked as "always inline". 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "inline" +#include "llvm/CallingConv.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Module.h" +#include "llvm/Type.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/InlineCost.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/InlinerPass.h" +#include "llvm/ADT/SmallPtrSet.h" + +using namespace llvm; + +namespace { + + // AlwaysInliner only inlines functions that are mark as "always inline". + class AlwaysInliner : public Inliner { + // Functions that are never inlined + SmallPtrSet<const Function*, 16> NeverInline; + InlineCostAnalyzer CA; + public: + // Use extremely low threshold. + AlwaysInliner() : Inliner(ID, -2000000000) { + initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry()); + } + static char ID; // Pass identification, replacement for typeid + InlineCost getInlineCost(CallSite CS) { + return CA.getInlineCost(CS, NeverInline); + } + float getInlineFudgeFactor(CallSite CS) { + return CA.getInlineFudgeFactor(CS); + } + void resetCachedCostInfo(Function *Caller) { + CA.resetCachedCostInfo(Caller); + } + void growCachedCostInfo(Function* Caller, Function* Callee) { + CA.growCachedCostInfo(Caller, Callee); + } + virtual bool doFinalization(CallGraph &CG) { + return removeDeadFunctions(CG, &NeverInline); + } + virtual bool doInitialization(CallGraph &CG); + void releaseMemory() { + CA.clear(); + } + }; +} + +char AlwaysInliner::ID = 0; +INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline", + "Inliner for always_inline functions", false, false) +INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_END(AlwaysInliner, "always-inline", + "Inliner for always_inline functions", false, false) + +Pass *llvm::createAlwaysInlinerPass() { return new AlwaysInliner(); } + +// doInitialization - Initializes the vector of functions that have not +// been 
annotated with the "always inline" attribute. +bool AlwaysInliner::doInitialization(CallGraph &CG) { + Module &M = CG.getModule(); + + for (Module::iterator I = M.begin(), E = M.end(); + I != E; ++I) + if (!I->isDeclaration() && !I->hasFnAttr(Attribute::AlwaysInline)) + NeverInline.insert(I); + + return false; +} diff --git a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp new file mode 100644 index 0000000..0c5b3be --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp @@ -0,0 +1,118 @@ +//===- InlineSimple.cpp - Code to perform simple function inlining --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements bottom-up inlining of functions into callees. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "inline" +#include "llvm/CallingConv.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Module.h" +#include "llvm/Type.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/InlineCost.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/InlinerPass.h" +#include "llvm/ADT/SmallPtrSet.h" + +using namespace llvm; + +namespace { + + class SimpleInliner : public Inliner { + // Functions that are never inlined + SmallPtrSet<const Function*, 16> NeverInline; + InlineCostAnalyzer CA; + public: + SimpleInliner() : Inliner(ID) { + initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); + } + SimpleInliner(int Threshold) : Inliner(ID, Threshold) { + initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); + } + static char ID; // Pass identification, replacement for typeid + InlineCost getInlineCost(CallSite CS) { + 
return CA.getInlineCost(CS, NeverInline); + } + float getInlineFudgeFactor(CallSite CS) { + return CA.getInlineFudgeFactor(CS); + } + void resetCachedCostInfo(Function *Caller) { + CA.resetCachedCostInfo(Caller); + } + void growCachedCostInfo(Function* Caller, Function* Callee) { + CA.growCachedCostInfo(Caller, Callee); + } + virtual bool doInitialization(CallGraph &CG); + void releaseMemory() { + CA.clear(); + } + }; +} + +char SimpleInliner::ID = 0; +INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", + "Function Integration/Inlining", false, false) +INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_END(SimpleInliner, "inline", + "Function Integration/Inlining", false, false) + +Pass *llvm::createFunctionInliningPass() { return new SimpleInliner(); } + +Pass *llvm::createFunctionInliningPass(int Threshold) { + return new SimpleInliner(Threshold); +} + +// doInitialization - Initializes the vector of functions that have been +// annotated with the noinline attribute. +bool SimpleInliner::doInitialization(CallGraph &CG) { + + Module &M = CG.getModule(); + + for (Module::iterator I = M.begin(), E = M.end(); + I != E; ++I) + if (!I->isDeclaration() && I->hasFnAttr(Attribute::NoInline)) + NeverInline.insert(I); + + // Get llvm.noinline + GlobalVariable *GV = M.getNamedGlobal("llvm.noinline"); + + if (GV == 0) + return false; + + // Don't crash on invalid code + if (!GV->hasDefinitiveInitializer()) + return false; + + const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer()); + + if (InitList == 0) + return false; + + // Iterate over each element and add to the NeverInline set + for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) { + + // Get Source + const Constant *Elt = InitList->getOperand(i); + + if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(Elt)) + if (CE->getOpcode() == Instruction::BitCast) + Elt = CE->getOperand(0); + + // Insert into set of functions to never inline + if (const Function *F = dyn_cast<Function>(Elt)) + 
NeverInline.insert(F); + } + + return false; +} + diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp new file mode 100644 index 0000000..37eafd7 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp @@ -0,0 +1,572 @@ +//===- Inliner.cpp - Code common to all inliners --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the mechanics required to implement inlining without +// missing any calls and updating the call graph. The decisions of which calls +// are profitable to inline are implemented elsewhere. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "inline" +#include "llvm/Module.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/InlineCost.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/IPO/InlinerPass.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include <set> +using namespace llvm; + +STATISTIC(NumInlined, "Number of functions inlined"); +STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined"); +STATISTIC(NumDeleted, "Number of functions deleted because all callers found"); +STATISTIC(NumMergedAllocas, "Number of allocas merged together"); + +static cl::opt<int> +InlineLimit("inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore, + cl::desc("Control the amount of inlining to perform (default = 225)")); + +static 
cl::opt<int>
HintThreshold("inlinehint-threshold", cl::Hidden, cl::init(325),
              cl::desc("Threshold for inlining functions with inline hint"));

// Threshold to use when optsize is specified (and there is no -inline-limit).
const int OptSizeThreshold = 75;

Inliner::Inliner(char &ID)
  : CallGraphSCCPass(ID), InlineThreshold(InlineLimit) {}

// An explicit -inline-threshold on the command line overrides the Threshold
// that the subclass pass requested.
Inliner::Inliner(char &ID, int Threshold)
  : CallGraphSCCPass(ID), InlineThreshold(InlineLimit.getNumOccurrences() > 0 ?
                                          InlineLimit : Threshold) {}

/// getAnalysisUsage - For this class, we declare that we require and preserve
/// the call graph.  If the derived class implements this method, it should
/// always explicitly call the implementation here.
void Inliner::getAnalysisUsage(AnalysisUsage &Info) const {
  CallGraphSCCPass::getAnalysisUsage(Info);
}

// Map from an array type to the allocas of that type already inlined into the
// current caller and therefore available for reuse by later inlines.
typedef DenseMap<const ArrayType*, std::vector<AllocaInst*> >
InlinedArrayAllocasTy;

/// InlineCallIfPossible - If it is possible to inline the specified call site,
/// do so and update the CallGraph for this operation.
///
/// This function also does some basic book-keeping to update the IR.  The
/// InlinedArrayAllocas map keeps track of any allocas that are already
/// available from other functions inlined into the caller.  If we are able to
/// inline this call site we attempt to reuse already available allocas or add
/// any new allocas to the set if not possible.
static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
                                 InlinedArrayAllocasTy &InlinedArrayAllocas,
                                 int InlineHistory) {
  Function *Callee = CS.getCalledFunction();
  Function *Caller = CS.getCaller();

  // Try to inline the function.  Get the list of static allocas that were
  // inlined.
  if (!InlineFunction(CS, IFI))
    return false;

  // If the inlined function had a higher stack protection level than the
  // calling function, then bump up the caller's stack protection level.
  if (Callee->hasFnAttr(Attribute::StackProtectReq))
    Caller->addFnAttr(Attribute::StackProtectReq);
  else if (Callee->hasFnAttr(Attribute::StackProtect) &&
           !Caller->hasFnAttr(Attribute::StackProtectReq))
    Caller->addFnAttr(Attribute::StackProtect);

  // Look at all of the allocas that we inlined through this call site.  If we
  // have already inlined other allocas through other calls into this function,
  // then we know that they have disjoint lifetimes and that we can merge them.
  //
  // There are many heuristics possible for merging these allocas, and the
  // different options have different tradeoffs.  One thing that we *really*
  // don't want to hurt is SRoA: once inlining happens, often allocas are no
  // longer address taken and so they can be promoted.
  //
  // Our "solution" for that is to only merge allocas whose outermost type is an
  // array type.  These are usually not promoted because someone is using a
  // variable index into them.  These are also often the most important ones to
  // merge.
  //
  // A better solution would be to have real memory lifetime markers in the IR
  // and not have the inliner do any merging of allocas at all.  This would
  // allow the backend to do proper stack slot coloring of all allocas that
  // *actually make it to the backend*, which is really what we want.
  //
  // Because we don't have this information, we do this simple and useful hack.
  //
  SmallPtrSet<AllocaInst*, 16> UsedAllocas;

  // When processing our SCC, check to see if CS was inlined from some other
  // call site.  For example, if we're processing "A" in this code:
  //   A() { B() }
  //   B() { x = alloca ... C() }
  //   C() { y = alloca ... }
  // Assume that C was not inlined into B initially, and so we're processing A
  // and decide to inline B into A.  Doing this makes an alloca available for
  // reuse and makes a callsite (C) available for inlining.  When we process
  // the C call site we don't want to do any alloca merging between X and Y
  // because their scopes are not disjoint.  We could make this smarter by
  // keeping track of the inline history for each alloca in the
  // InlinedArrayAllocas but this isn't likely to be a significant win.
  if (InlineHistory != -1)  // Only do merging for top-level call sites in SCC.
    return true;

  // Loop over all the allocas we have so far and see if they can be merged with
  // a previously inlined alloca.  If not, remember that we had it.
  for (unsigned AllocaNo = 0, e = IFI.StaticAllocas.size();
       AllocaNo != e; ++AllocaNo) {
    AllocaInst *AI = IFI.StaticAllocas[AllocaNo];

    // Don't bother trying to merge array allocations (they will usually be
    // canonicalized to be an allocation *of* an array), or allocations whose
    // type is not itself an array (because we're afraid of pessimizing SRoA).
    const ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType());
    if (ATy == 0 || AI->isArrayAllocation())
      continue;

    // Get the list of all available allocas for this array type.
    std::vector<AllocaInst*> &AllocasForType = InlinedArrayAllocas[ATy];

    // Loop over the allocas in AllocasForType to see if we can reuse one.  Note
    // that we have to be careful not to reuse the same "available" alloca for
    // multiple different allocas that we just inlined, we use the 'UsedAllocas'
    // set to keep track of which "available" allocas are being used by this
    // function.  Also, AllocasForType can be empty of course!
    bool MergedAwayAlloca = false;
    for (unsigned i = 0, e = AllocasForType.size(); i != e; ++i) {
      AllocaInst *AvailableAlloca = AllocasForType[i];

      // The available alloca has to be in the right function, not in some other
      // function in this SCC.
      if (AvailableAlloca->getParent() != AI->getParent())
        continue;

      // If the inlined function already uses this alloca then we can't reuse
      // it.  (insert returns false when the element was already present.)
      if (!UsedAllocas.insert(AvailableAlloca))
        continue;

      // Otherwise, we *can* reuse it, RAUW AI into AvailableAlloca and declare
      // success!
      DEBUG(dbgs() << " ***MERGED ALLOCA: " << *AI << "\n\t\tINTO: "
                   << *AvailableAlloca << '\n');

      AI->replaceAllUsesWith(AvailableAlloca);
      AI->eraseFromParent();
      MergedAwayAlloca = true;
      ++NumMergedAllocas;
      // Null out the slot so later passes over StaticAllocas skip it.
      IFI.StaticAllocas[AllocaNo] = 0;
      break;
    }

    // If we already nuked the alloca, we're done with it.
    if (MergedAwayAlloca)
      continue;

    // If we were unable to merge away the alloca either because there are no
    // allocas of the right type available or because we reused them all
    // already, remember that this alloca came from an inlined function and mark
    // it used so we don't reuse it for other allocas from this inline
    // operation.
    AllocasForType.push_back(AI);
    UsedAllocas.insert(AI);
  }

  return true;
}

// getInlineThreshold - Compute the threshold for this call site, taking the
// optsize caller attribute and the callee's inlinehint attribute into account.
// NOTE(review): computed as a signed int but returned as unsigned; all the
// threshold sources here are non-negative, so the conversion is benign.
unsigned Inliner::getInlineThreshold(CallSite CS) const {
  int thres = InlineThreshold;

  // Listen to optsize when -inline-limit is not given.
  Function *Caller = CS.getCaller();
  if (Caller && !Caller->isDeclaration() &&
      Caller->hasFnAttr(Attribute::OptimizeForSize) &&
      InlineLimit.getNumOccurrences() == 0)
    thres = OptSizeThreshold;

  // Listen to inlinehint when it would increase the threshold.
  Function *Callee = CS.getCalledFunction();
  if (HintThreshold > thres && Callee && !Callee->isDeclaration() &&
      Callee->hasFnAttr(Attribute::InlineHint))
    thres = HintThreshold;

  return thres;
}

/// shouldInline - Return true if the inliner should attempt to inline
/// at the given CallSite.
bool Inliner::shouldInline(CallSite CS) {
  InlineCost IC = getInlineCost(CS);

  // "always" and "never" short-circuit the threshold comparison entirely.
  if (IC.isAlways()) {
    DEBUG(dbgs() << " Inlining: cost=always"
          << ", Call: " << *CS.getInstruction() << "\n");
    return true;
  }

  if (IC.isNever()) {
    DEBUG(dbgs() << " NOT Inlining: cost=never"
          << ", Call: " << *CS.getInstruction() << "\n");
    return false;
  }

  // Otherwise compare the numeric cost against the fudge-adjusted threshold.
  int Cost = IC.getValue();
  Function *Caller = CS.getCaller();
  int CurrentThreshold = getInlineThreshold(CS);
  float FudgeFactor = getInlineFudgeFactor(CS);
  int AdjThreshold = (int)(CurrentThreshold * FudgeFactor);
  if (Cost >= AdjThreshold) {
    DEBUG(dbgs() << " NOT Inlining: cost=" << Cost
          << ", thres=" << AdjThreshold
          << ", Call: " << *CS.getInstruction() << "\n");
    return false;
  }

  // Try to detect the case where the current inlining candidate caller
  // (call it B) is a static function and is an inlining candidate elsewhere,
  // and the current candidate callee (call it C) is large enough that
  // inlining it into B would make B too big to inline later.  In these
  // circumstances it may be best not to inline C into B, but to inline B
  // into its callers.
  if (Caller->hasLocalLinkage()) {
    int TotalSecondaryCost = 0;
    bool outerCallsFound = false;
    // This bool tracks what happens if we do NOT inline C into B.
    bool callerWillBeRemoved = true;
    // This bool tracks what happens if we DO inline C into B.
    bool inliningPreventsSomeOuterInline = false;
    for (Value::use_iterator I = Caller->use_begin(), E =Caller->use_end();
         I != E; ++I) {
      CallSite CS2(*I);

      // If this isn't a call to Caller (it could be some other sort
      // of reference) skip it.  Such references will prevent the caller
      // from being removed.
      if (!CS2 || CS2.getCalledFunction() != Caller) {
        callerWillBeRemoved = false;
        continue;
      }

      InlineCost IC2 = getInlineCost(CS2);
      if (IC2.isNever())
        callerWillBeRemoved = false;
      if (IC2.isAlways() || IC2.isNever())
        continue;

      outerCallsFound = true;
      int Cost2 = IC2.getValue();
      int CurrentThreshold2 = getInlineThreshold(CS2);
      float FudgeFactor2 = getInlineFudgeFactor(CS2);

      if (Cost2 >= (int)(CurrentThreshold2 * FudgeFactor2))
        callerWillBeRemoved = false;

      // See if we have this case.  We subtract off the penalty
      // for the call instruction, which we would be deleting.
      if (Cost2 < (int)(CurrentThreshold2 * FudgeFactor2) &&
          Cost2 + Cost - (InlineConstants::CallPenalty + 1) >=
              (int)(CurrentThreshold2 * FudgeFactor2)) {
        inliningPreventsSomeOuterInline = true;
        TotalSecondaryCost += Cost2;
      }
    }
    // If all outer calls to Caller would get inlined, the cost for the last
    // one is set very low by getInlineCost, in anticipation that Caller will
    // be removed entirely.  We did not account for this above unless there
    // is only one caller of Caller.
    if (callerWillBeRemoved && Caller->use_begin() != Caller->use_end())
      TotalSecondaryCost += InlineConstants::LastCallToStaticBonus;

    if (outerCallsFound && inliningPreventsSomeOuterInline &&
        TotalSecondaryCost < Cost) {
      DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction() <<
           " Cost = " << Cost <<
           ", outer Cost = " << TotalSecondaryCost << '\n');
      return false;
    }
  }

  DEBUG(dbgs() << " Inlining: cost=" << Cost
        << ", thres=" << AdjThreshold
        << ", Call: " << *CS.getInstruction() << '\n');
  return true;
}

/// InlineHistoryIncludes - Return true if the specified inline history ID
/// indicates an inline history that includes the specified function.
static bool InlineHistoryIncludes(Function *F, int InlineHistoryID,
            const SmallVectorImpl<std::pair<Function*, int> > &InlineHistory) {
  // Walk the parent chain of history entries until the root (-1) is reached.
  while (InlineHistoryID != -1) {
    assert(unsigned(InlineHistoryID) < InlineHistory.size() &&
           "Invalid inline history ID");
    if (InlineHistory[InlineHistoryID].first == F)
      return true;
    InlineHistoryID = InlineHistory[InlineHistoryID].second;
  }
  return false;
}

// runOnSCC - Collect all inlinable call sites in the SCC up front, then
// repeatedly inline (or delete dead) call sites until no more changes occur,
// deleting functions whose last use is removed along the way.
bool Inliner::runOnSCC(CallGraphSCC &SCC) {
  CallGraph &CG = getAnalysis<CallGraph>();
  const TargetData *TD = getAnalysisIfAvailable<TargetData>();

  SmallPtrSet<Function*, 8> SCCFunctions;
  DEBUG(dbgs() << "Inliner visiting SCC:");
  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
    Function *F = (*I)->getFunction();
    if (F) SCCFunctions.insert(F);
    DEBUG(dbgs() << " " << (F ? F->getName() : "INDIRECTNODE"));
  }

  // Scan through and identify all call sites ahead of time so that we only
  // inline call sites in the original functions, not call sites that result
  // from inlining other functions.
  SmallVector<std::pair<CallSite, int>, 16> CallSites;

  // When inlining a callee produces new call sites, we want to keep track of
  // the fact that they were inlined from the callee.  This allows us to avoid
  // infinite inlining in some obscure cases.  To represent this, we use an
  // index into the InlineHistory vector.
  SmallVector<std::pair<Function*, int>, 8> InlineHistory;

  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
    Function *F = (*I)->getFunction();
    if (!F) continue;

    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
        CallSite CS(cast<Value>(I));
        // If this isn't a call, or it is a call to an intrinsic, it can
        // never be inlined.
        if (!CS || isa<IntrinsicInst>(I))
          continue;

        // If this is a direct call to an external function, we can never inline
        // it.  If it is an indirect call, inlining may resolve it to be a
        // direct call, so we keep it.
        if (CS.getCalledFunction() && CS.getCalledFunction()->isDeclaration())
          continue;

        // -1 marks a top-level call site with no inline history.
        CallSites.push_back(std::make_pair(CS, -1));
      }
  }

  DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n");

  // If there are no calls in this function, exit early.
  if (CallSites.empty())
    return false;

  // Now that we have all of the call sites, move the ones to functions in the
  // current SCC to the end of the list.
  unsigned FirstCallInSCC = CallSites.size();
  for (unsigned i = 0; i < FirstCallInSCC; ++i)
    if (Function *F = CallSites[i].first.getCalledFunction())
      if (SCCFunctions.count(F))
        // i-- so the element swapped into slot i is re-examined.
        std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);


  InlinedArrayAllocasTy InlinedArrayAllocas;
  InlineFunctionInfo InlineInfo(&CG, TD);

  // Now that we have all of the call sites, loop over them and inline them if
  // it looks profitable to do so.
  bool Changed = false;
  bool LocalChange;
  do {
    LocalChange = false;
    // Iterate over the outer loop because inlining functions can cause indirect
    // calls to become direct calls.
    for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
      CallSite CS = CallSites[CSi].first;

      Function *Caller = CS.getCaller();
      Function *Callee = CS.getCalledFunction();

      // If this call site is dead and it is to a readonly function, we should
      // just delete the call instead of trying to inline it, regardless of
      // size.  This happens because IPSCCP propagates the result out of the
      // call and then we're left with the dead call.
      if (isInstructionTriviallyDead(CS.getInstruction())) {
        DEBUG(dbgs() << " -> Deleting dead call: "
              << *CS.getInstruction() << "\n");
        // Update the call graph by deleting the edge from Callee to Caller.
        CG[Caller]->removeCallEdgeFor(CS);
        CS.getInstruction()->eraseFromParent();
        ++NumCallsDeleted;
        // Update the cached cost info with the missing call
        growCachedCostInfo(Caller, NULL);
      } else {
        // We can only inline direct calls to non-declarations.
        if (Callee == 0 || Callee->isDeclaration()) continue;

        // If this call site was obtained by inlining another function, verify
        // that the include path for the function did not include the callee
        // itself.  If so, we'd be recursively inlining the same function,
        // which would provide the same callsites, which would cause us to
        // infinitely inline.
        int InlineHistoryID = CallSites[CSi].second;
        if (InlineHistoryID != -1 &&
            InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory))
          continue;


        // If the policy determines that we should inline this function,
        // try to do so.
        if (!shouldInline(CS))
          continue;

        // Attempt to inline the function.
        if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas,
                                  InlineHistoryID))
          continue;
        ++NumInlined;

        // If inlining this function gave us any new call sites, throw them
        // onto our worklist to process.  They are useful inline candidates.
        if (!InlineInfo.InlinedCalls.empty()) {
          // Create a new inline history entry for this, so that we remember
          // that these new callsites came about due to inlining Callee.
          int NewHistoryID = InlineHistory.size();
          InlineHistory.push_back(std::make_pair(Callee, InlineHistoryID));

          for (unsigned i = 0, e = InlineInfo.InlinedCalls.size();
               i != e; ++i) {
            Value *Ptr = InlineInfo.InlinedCalls[i];
            CallSites.push_back(std::make_pair(CallSite(Ptr), NewHistoryID));
          }
        }

        // Update the cached cost info with the inlined call.
        growCachedCostInfo(Caller, Callee);
      }

      // If we inlined or deleted the last possible call site to the function,
      // delete the function body now.
      if (Callee && Callee->use_empty() && Callee->hasLocalLinkage() &&
          // TODO: Can remove if in SCC now.
          !SCCFunctions.count(Callee) &&

          // The function may be apparently dead, but if there are indirect
          // callgraph references to the node, we cannot delete it yet, this
          // could invalidate the CGSCC iterator.
          CG[Callee]->getNumReferences() == 0) {
        DEBUG(dbgs() << " -> Deleting dead function: "
              << Callee->getName() << "\n");
        CallGraphNode *CalleeNode = CG[Callee];

        // Remove any call graph edges from the callee to its callees.
        CalleeNode->removeAllCalledFunctions();

        resetCachedCostInfo(Callee);

        // Removing the node for callee from the call graph and delete it.
        delete CG.removeFunctionFromModule(CalleeNode);
        ++NumDeleted;
      }

      // Remove this call site from the list.  If possible, use
      // swap/pop_back for efficiency, but do not use it if doing so would
      // move a call site to a function in this SCC before the
      // 'FirstCallInSCC' barrier.
      if (SCC.isSingular()) {
        CallSites[CSi] = CallSites.back();
        CallSites.pop_back();
      } else {
        CallSites.erase(CallSites.begin()+CSi);
      }
      // Re-examine slot CSi on the next iteration, since a different call
      // site (or none) now occupies it.
      --CSi;

      Changed = true;
      LocalChange = true;
    }
  } while (LocalChange);

  return Changed;
}

// doFinalization - Remove now-dead linkonce functions at the end of
// processing to avoid breaking the SCC traversal.
bool Inliner::doFinalization(CallGraph &CG) {
  return removeDeadFunctions(CG);
}

/// removeDeadFunctions - Remove dead functions that are not included in
/// DNR (Do Not Remove) list.
bool Inliner::removeDeadFunctions(CallGraph &CG,
                                  SmallPtrSet<const Function *, 16> *DNR) {
  SmallPtrSet<CallGraphNode*, 16> FunctionsToRemove;

  // Scan for all of the functions, looking for ones that should now be removed
  // from the program.  Insert the dead ones in the FunctionsToRemove set.
  for (CallGraph::iterator I = CG.begin(), E = CG.end(); I != E; ++I) {
    CallGraphNode *CGN = I->second;
    if (CGN->getFunction() == 0)
      continue;

    Function *F = CGN->getFunction();

    // If the only remaining users of the function are dead constants, remove
    // them.
    F->removeDeadConstantUsers();

    if (DNR && DNR->count(F))
      continue;
    // Only discardable linkages may be deleted when unreferenced.
    if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() &&
        !F->hasAvailableExternallyLinkage())
      continue;
    if (!F->use_empty())
      continue;

    // Remove any call graph edges from the function to its callees.
    CGN->removeAllCalledFunctions();

    // Remove any edges from the external node to the function's call graph
    // node.  These edges might have been made irrelevant due to
    // optimization of the program.
    CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN);

    // Removing the node for callee from the call graph and delete it.
    FunctionsToRemove.insert(CGN);
  }

  // Now that we know which functions to delete, do so.  We didn't want to do
  // this inline, because that would invalidate our CallGraph::iterator
  // objects. :(
  //
  // Note that it doesn't matter that we are iterating over a non-stable set
  // here to do this, it doesn't matter which order the functions are deleted
  // in.
  bool Changed = false;
  for (SmallPtrSet<CallGraphNode*, 16>::iterator I = FunctionsToRemove.begin(),
       E = FunctionsToRemove.end(); I != E; ++I) {
    resetCachedCostInfo((*I)->getFunction());
    delete CG.removeFunctionFromModule(*I);
    ++NumDeleted;
    Changed = true;
  }

  return Changed;
}

// diff --git a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
// new file mode 100644 index 0000000..9b9ebad --- /dev/null
// @@ -0,0 +1,192 @@
//===-- Internalize.cpp - Mark functions internal -------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass loops over all of the functions in the input module, looking for a
// main function.  If a main function is found, all other functions and all
// global variables with initializers are marked as internal.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "internalize"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Pass.h"
#include "llvm/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/Statistic.h"
#include <fstream>
#include <set>
using namespace llvm;

STATISTIC(NumAliases  , "Number of aliases internalized");
STATISTIC(NumFunctions, "Number of functions internalized");
STATISTIC(NumGlobals  , "Number of global vars internalized");

// APIFile - A file which contains a list of symbols that should not be marked
// external.
static cl::opt<std::string>
APIFile("internalize-public-api-file", cl::value_desc("filename"),
        cl::desc("A file containing list of symbol names to preserve"));

// APIList - A list of symbols that should not be marked internal.
static cl::list<std::string>
APIList("internalize-public-api-list", cl::value_desc("list"),
        cl::desc("A list of symbol names to preserve"),
        cl::CommaSeparated);

namespace {
  class InternalizePass : public ModulePass {
    // Names that must keep their external linkage.
    std::set<std::string> ExternalNames;
    /// If no api symbols were specified and a main function is defined,
    /// assume the main function is the only API
    bool AllButMain;
  public:
    static char ID; // Pass identification, replacement for typeid
    explicit InternalizePass(bool AllButMain = true);
    explicit InternalizePass(const std::vector <const char *>& exportList);
    void LoadFile(const char *Filename);
    virtual bool runOnModule(Module &M);

    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.setPreservesCFG();
      AU.addPreserved<CallGraph>();
    }
  };
} // end anonymous namespace

char InternalizePass::ID = 0;
INITIALIZE_PASS(InternalizePass, "internalize",
                "Internalize Global Symbols", false, false)

InternalizePass::InternalizePass(bool AllButMain)
  : ModulePass(ID), AllButMain(AllButMain){
  initializeInternalizePassPass(*PassRegistry::getPassRegistry());
  if (!APIFile.empty())           // If a filename is specified, use it.
    LoadFile(APIFile.c_str());
  if (!APIList.empty())           // If a list is specified, use it as well.
    ExternalNames.insert(APIList.begin(), APIList.end());
}

// This constructor takes an explicit export list, so 'all but main' mode is
// disabled.
InternalizePass::InternalizePass(const std::vector<const char *>&exportList)
  : ModulePass(ID), AllButMain(false){
  initializeInternalizePassPass(*PassRegistry::getPassRegistry());
  for(std::vector<const char *>::const_iterator itr = exportList.begin();
      itr != exportList.end(); itr++) {
    ExternalNames.insert(*itr);
  }
}

// LoadFile - Read whitespace-separated symbol names from Filename into
// ExternalNames; an unreadable file is treated as empty (with a warning).
void InternalizePass::LoadFile(const char *Filename) {
  // Load the APIFile...
  std::ifstream In(Filename);
  if (!In.good()) {
    errs() << "WARNING: Internalize couldn't load file '" << Filename
           << "'! Continuing as if it's empty.\n";
    return; // Just continue as if the file were empty
  }
  while (In) {
    std::string Symbol;
    In >> Symbol;
    if (!Symbol.empty())
      ExternalNames.insert(Symbol);
  }
}

bool InternalizePass::runOnModule(Module &M) {
  CallGraph *CG = getAnalysisIfAvailable<CallGraph>();
  CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0;

  if (ExternalNames.empty()) {
    // Return if we're not in 'all but main' mode and have no external api
    if (!AllButMain)
      return false;
    // If no list or file of symbols was specified, check to see if there is a
    // "main" symbol defined in the module.  If so, use it, otherwise do not
    // internalize the module, it must be a library or something.
    //
    Function *MainFunc = M.getFunction("main");
    if (MainFunc == 0 || MainFunc->isDeclaration())
      return false;  // No main found, must be a library...

    // Preserve main, internalize all else.
    ExternalNames.insert(MainFunc->getName());
  }

  bool Changed = false;

  // Mark all functions not in the api as internal.
  // FIXME: maybe use private linkage?
  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
    if (!I->isDeclaration() &&         // Function must be defined here
        !I->hasLocalLinkage() &&  // Can't already have internal linkage
        !ExternalNames.count(I->getName())) {// Not marked to keep external?
      I->setLinkage(GlobalValue::InternalLinkage);
      // Remove a callgraph edge from the external node to this function.
      if (ExternalNode) ExternalNode->removeOneAbstractEdgeTo((*CG)[I]);
      Changed = true;
      ++NumFunctions;
      DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n");
    }

  // Never internalize the llvm.used symbol.  It is used to implement
  // attribute((used)).
  // FIXME: Shouldn't this just filter on llvm.metadata section??
  // NOTE(review): these names are inserted after the function loop above, so
  // they only guard the global-variable and alias loops below — presumably
  // intentional since these are all globals, not functions; verify.
  ExternalNames.insert("llvm.used");
  ExternalNames.insert("llvm.compiler.used");

  // Never internalize anchors used by the machine module info, else the info
  // won't find them.  (see MachineModuleInfo.)
  ExternalNames.insert("llvm.dbg.compile_units");
  ExternalNames.insert("llvm.dbg.global_variables");
  ExternalNames.insert("llvm.dbg.subprograms");
  ExternalNames.insert("llvm.global_ctors");
  ExternalNames.insert("llvm.global_dtors");
  ExternalNames.insert("llvm.noinline");
  ExternalNames.insert("llvm.global.annotations");

  // Mark all global variables with initializers that are not in the api as
  // internal as well.
  // FIXME: maybe use private linkage?
  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
       I != E; ++I)
    if (!I->isDeclaration() && !I->hasLocalLinkage() &&
        // Available externally is really just a "declaration with a body".
        !I->hasAvailableExternallyLinkage() &&
        !ExternalNames.count(I->getName())) {
      I->setLinkage(GlobalValue::InternalLinkage);
      Changed = true;
      ++NumGlobals;
      DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n");
    }

  // Mark all aliases that are not in the api as internal as well.
  for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
       I != E; ++I)
    if (!I->isDeclaration() && !I->hasInternalLinkage() &&
        // Available externally is really just a "declaration with a body".
        !I->hasAvailableExternallyLinkage() &&
        !ExternalNames.count(I->getName())) {
      I->setLinkage(GlobalValue::InternalLinkage);
      Changed = true;
      ++NumAliases;
      DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n");
    }

  return Changed;
}

ModulePass *llvm::createInternalizePass(bool AllButMain) {
  return new InternalizePass(AllButMain);
}

ModulePass *llvm::createInternalizePass(const std::vector <const char *> &el) {
  return new InternalizePass(el);
}

// diff --git a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp
// new file mode 100644 index 0000000..848944d --- /dev/null
// @@ -0,0 +1,248 @@
//===- LoopExtractor.cpp - Extract each loop into a new function ----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// A pass wrapper around the ExtractLoop() scalar transformation to extract each
// top-level loop into its own new function.  If the loop is the ONLY loop in a
// given function, it is not touched.  This is a pass most useful for debugging
// via bugpoint.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "loop-extract"
#include "llvm/Transforms/IPO.h"
#include "llvm/Instructions.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/FunctionUtils.h"
#include "llvm/ADT/Statistic.h"
#include <fstream>
#include <set>
using namespace llvm;

STATISTIC(NumExtracted, "Number of loops extracted");

namespace {
  struct LoopExtractor : public LoopPass {
    static char ID; // Pass identification, replacement for typeid
    // Remaining number of loops we are allowed to extract; the default of ~0
    // effectively means "no limit".
    unsigned NumLoops;

    explicit LoopExtractor(unsigned numLoops = ~0)
      : LoopPass(ID), NumLoops(numLoops) {
      initializeLoopExtractorPass(*PassRegistry::getPassRegistry());
    }

    virtual bool runOnLoop(Loop *L, LPPassManager &LPM);

    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.addRequiredID(BreakCriticalEdgesID);
      AU.addRequiredID(LoopSimplifyID);
      AU.addRequired<DominatorTree>();
    }
  };
}

char LoopExtractor::ID = 0;
INITIALIZE_PASS_BEGIN(LoopExtractor, "loop-extract",
                      "Extract loops into new functions", false, false)
INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_END(LoopExtractor, "loop-extract",
                    "Extract loops into new functions", false, false)

namespace {
  /// SingleLoopExtractor - For bugpoint.  Extracts at most one loop.
  struct SingleLoopExtractor : public LoopExtractor {
    static char ID; // Pass identification, replacement for typeid
    SingleLoopExtractor() : LoopExtractor(1) {}
  };
} // End anonymous namespace

char SingleLoopExtractor::ID = 0;
INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single",
                "Extract at most one loop into a new function", false, false)

// createLoopExtractorPass - This pass extracts all natural loops from the
// program into a function if it can.
//
Pass *llvm::createLoopExtractorPass() { return new LoopExtractor(); }

bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
  // Only visit top-level loops.
  if (L->getParentLoop())
    return false;

  // If LoopSimplify form is not available, stay out of trouble.
  if (!L->isLoopSimplifyForm())
    return false;

  DominatorTree &DT = getAnalysis<DominatorTree>();
  bool Changed = false;

  // If there is more than one top-level loop in this function, extract all of
  // the loops.  Otherwise there is exactly one top-level loop; in this case if
  // this function is more than a minimal wrapper around the loop, extract
  // the loop.
  bool ShouldExtractLoop = false;

  // Extract the loop if the entry block doesn't branch to the loop header.
  TerminatorInst *EntryTI =
    L->getHeader()->getParent()->getEntryBlock().getTerminator();
  if (!isa<BranchInst>(EntryTI) ||
      !cast<BranchInst>(EntryTI)->isUnconditional() ||
      EntryTI->getSuccessor(0) != L->getHeader())
    ShouldExtractLoop = true;
  else {
    // Check to see if any exits from the loop are more than just return
    // blocks.
    SmallVector<BasicBlock*, 8> ExitBlocks;
    L->getExitBlocks(ExitBlocks);
    for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
      if (!isa<ReturnInst>(ExitBlocks[i]->getTerminator())) {
        ShouldExtractLoop = true;
        break;
      }
  }
  if (ShouldExtractLoop) {
    if (NumLoops == 0) return Changed;
    --NumLoops;
    if (ExtractLoop(DT, L) != 0) {
      Changed = true;
      // After extraction, the loop is replaced by a function call, so
      // we shouldn't try to run any more loop passes on it.
      LPM.deleteLoopFromQueue(L);
    }
    // NOTE(review): NumExtracted is incremented even when ExtractLoop returns
    // null, so the statistic counts attempts rather than successes — confirm
    // whether this is intentional.
    ++NumExtracted;
  }

  return Changed;
}

// createSingleLoopExtractorPass - This pass extracts one natural loop from the
// program into a function if it can.  This is used by bugpoint.
//
Pass *llvm::createSingleLoopExtractorPass() {
  return new SingleLoopExtractor();
}


// BlockFile - A file which contains a list of blocks that should not be
// extracted.
static cl::opt<std::string>
BlockFile("extract-blocks-file", cl::value_desc("filename"),
          cl::desc("A file containing list of basic blocks to not extract"),
          cl::Hidden);

namespace {
  /// BlockExtractorPass - This pass is used by bugpoint to extract all blocks
  /// from the module into their own functions except for those specified by the
  /// BlocksToNotExtract list.
  class BlockExtractorPass : public ModulePass {
    void LoadFile(const char *Filename);

    std::vector<BasicBlock*> BlocksToNotExtract;
    // (function name, block name) pairs loaded from BlockFile; resolved to
    // actual blocks lazily in runOnModule.
    std::vector<std::pair<std::string, std::string> > BlocksToNotExtractByName;
  public:
    static char ID; // Pass identification, replacement for typeid
    BlockExtractorPass() : ModulePass(ID) {
      if (!BlockFile.empty())
        LoadFile(BlockFile.c_str());
    }

    bool runOnModule(Module &M);
  };
}

char BlockExtractorPass::ID = 0;
INITIALIZE_PASS(BlockExtractorPass, "extract-blocks",
                "Extract Basic Blocks From Module (for bugpoint use)",
                false, false)

// createBlockExtractorPass - This pass extracts all blocks (except those
// specified in the argument list) from the functions in the module.
//
ModulePass *llvm::createBlockExtractorPass()
{
  return new BlockExtractorPass();
}

// LoadFile - Read (function name, block name) pairs from Filename into
// BlocksToNotExtractByName.
void BlockExtractorPass::LoadFile(const char *Filename) {
  // Load the BlockFile...
  std::ifstream In(Filename);
  if (!In.good()) {
    errs() << "WARNING: BlockExtractor couldn't load file '" << Filename
           << "'!\n";
    return;
  }
  while (In) {
    std::string FunctionName, BlockName;
    In >> FunctionName;
    In >> BlockName;
    if (!BlockName.empty())
      BlocksToNotExtractByName.push_back(
          std::make_pair(FunctionName, BlockName));
  }
}

// runOnModule - Resolve the do-not-extract lists to blocks in this module,
// then extract every remaining basic block into its own function.
bool BlockExtractorPass::runOnModule(Module &M) {
  std::set<BasicBlock*> TranslatedBlocksToNotExtract;
  for (unsigned i = 0, e = BlocksToNotExtract.size(); i != e; ++i) {
    BasicBlock *BB = BlocksToNotExtract[i];
    Function *F = BB->getParent();

    // Map the corresponding function in this module.
    Function *MF = M.getFunction(F->getName());
    assert(MF->getFunctionType() == F->getFunctionType() && "Wrong function?");

    // Figure out which index the basic block is in its function.
    Function::iterator BBI = MF->begin();
    std::advance(BBI, std::distance(F->begin(), Function::iterator(BB)));
    TranslatedBlocksToNotExtract.insert(BBI);
  }

  while (!BlocksToNotExtractByName.empty()) {
    // There's no way to find BBs by name without looking at every BB inside
    // every Function.  Fortunately, this is always empty except when used by
    // bugpoint in which case correctness is more important than performance.

    std::string &FuncName  = BlocksToNotExtractByName.back().first;
    std::string &BlockName = BlocksToNotExtractByName.back().second;

    for (Module::iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
      Function &F = *FI;
      if (F.getName() != FuncName) continue;

      for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
        BasicBlock &BB = *BI;
        if (BB.getName() != BlockName) continue;

        TranslatedBlocksToNotExtract.insert(BI);
      }
    }

    BlocksToNotExtractByName.pop_back();
  }

  // Now that we know which blocks to not extract, figure out which ones we WANT
  // to extract.
  std::vector<BasicBlock*> BlocksToExtract;
  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
      if (!TranslatedBlocksToNotExtract.count(BB))
        BlocksToExtract.push_back(BB);

  for (unsigned i = 0, e = BlocksToExtract.size(); i != e; ++i)
    ExtractBasicBlock(BlocksToExtract[i]);

  return !BlocksToExtract.empty();
}

// diff --git a/contrib/llvm/lib/Transforms/IPO/LowerSetJmp.cpp b/contrib/llvm/lib/Transforms/IPO/LowerSetJmp.cpp
// new file mode 100644 index 0000000..b545f0b --- /dev/null
// @@ -0,0 +1,547 @@
//===- LowerSetJmp.cpp - Code pertaining to lowering set/long jumps -------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// This file implements the lowering of setjmp and longjmp to use the +// LLVM invoke and unwind instructions as necessary. +// +// Lowering of longjmp is fairly trivial. We replace the call with a +// call to the LLVM library function "__llvm_sjljeh_throw_longjmp()". +// This unwinds the stack for us calling all of the destructors for +// objects allocated on the stack. +// +// At a setjmp call, the basic block is split and the setjmp removed. +// The calls in a function that have a setjmp are converted to invoke +// where the except part checks to see if it's a longjmp exception and, +// if so, if it's handled in the function. If it is, then it gets the +// value returned by the longjmp and goes to where the basic block was +// split. Invoke instructions are handled in a similar fashion with the +// original except block being executed if it isn't a longjmp except +// that is handled by that function. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// FIXME: This pass doesn't deal with PHI statements just yet. That is, +// we expect this to occur before SSAification is done. This would seem +// to make sense, but in general, it might be a good idea to make this +// pass invokable via the "opt" command at will. 
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "lowersetjmp"
#include "llvm/Transforms/IPO.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/InstVisitor.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/Statistic.h"
#include <map>
using namespace llvm;

STATISTIC(LongJmpsTransformed, "Number of longjmps transformed");
STATISTIC(SetJmpsTransformed , "Number of setjmps transformed");
STATISTIC(CallsTransformed   , "Number of calls invokified");
STATISTIC(InvokesTransformed , "Number of invokes modified");

namespace {
  //===--------------------------------------------------------------------===//
  // LowerSetJmp pass implementation.
  class LowerSetJmp : public ModulePass, public InstVisitor<LowerSetJmp> {
    // LLVM library functions (declarations inserted by doInitialization)...
    Constant *InitSJMap;        // __llvm_sjljeh_init_setjmpmap
    Constant *DestroySJMap;     // __llvm_sjljeh_destroy_setjmpmap
    Constant *AddSJToMap;       // __llvm_sjljeh_add_setjmp_to_map
    Constant *ThrowLongJmp;     // __llvm_sjljeh_throw_longjmp
    Constant *TryCatchLJ;       // __llvm_sjljeh_try_catching_longjmp_exception
    Constant *IsLJException;    // __llvm_sjljeh_is_longjmp_exception
    Constant *GetLJValue;       // __llvm_sjljeh_get_longjmp_value

    typedef std::pair<SwitchInst*, CallInst*> SwitchValuePair;

    // Keep track of those basic blocks reachable via a depth-first search of
    // the CFG from a setjmp call. We only need to transform those "call" and
    // "invoke" instructions that are reachable from the setjmp call site.
    std::set<BasicBlock*> DFSBlocks;

    // The setjmp map is going to hold information about which setjmps
    // were called (each setjmp gets its own number) and with which
    // buffer it was called.
    std::map<Function*, AllocaInst*> SJMap;

    // The rethrow basic block map holds the basic block to branch to if
    // the exception isn't handled in the current function and needs to
    // be rethrown.
    std::map<const Function*, BasicBlock*> RethrowBBMap;

    // The preliminary basic block map holds a basic block that grabs the
    // exception and determines if it's handled by the current function.
    std::map<const Function*, BasicBlock*> PrelimBBMap;

    // The switch/value map holds a switch inst/call inst pair. The
    // switch inst controls which handler (if any) gets called and the
    // value is the value returned to that handler by the call to
    // __llvm_sjljeh_get_longjmp_value.
    std::map<const Function*, SwitchValuePair> SwitchValMap;

    // A map of which setjmps we've seen so far in a function; used to hand
    // each setjmp within a function its own sequential ID.
    std::map<const Function*, unsigned> SetJmpIDMap;

    AllocaInst* GetSetJmpMap(Function* Func);
    BasicBlock* GetRethrowBB(Function* Func);
    SwitchValuePair GetSJSwitch(Function* Func, BasicBlock* Rethrow);

    void TransformLongJmpCall(CallInst* Inst);
    void TransformSetJmpCall(CallInst* Inst);

    bool IsTransformableFunction(StringRef Name);
  public:
    static char ID; // Pass identification, replacement for typeid
    LowerSetJmp() : ModulePass(ID) {
      initializeLowerSetJmpPass(*PassRegistry::getPassRegistry());
    }

    void visitCallInst(CallInst& CI);
    void visitInvokeInst(InvokeInst& II);
    void visitReturnInst(ReturnInst& RI);
    void visitUnwindInst(UnwindInst& UI);

    bool runOnModule(Module& M);
    bool doInitialization(Module& M);
  };
} // end anonymous namespace

char LowerSetJmp::ID = 0;
INITIALIZE_PASS(LowerSetJmp, "lowersetjmp", "Lower Set Jump", false, false)

// run - Run the transformation on the program.
// We grab the function
// prototypes for longjmp and setjmp.  If they are used in the program,
// then we can go directly to the places they're at and transform them.
bool LowerSetJmp::runOnModule(Module& M) {
  bool Changed = false;

  // These are what the functions are called.
  Function* SetJmp = M.getFunction("llvm.setjmp");
  Function* LongJmp = M.getFunction("llvm.longjmp");

  // This program doesn't have longjmp and setjmp calls.
  if ((!LongJmp || LongJmp->use_empty()) &&
      (!SetJmp || SetJmp->use_empty())) return false;

  // Initialize some values and functions we'll need to transform the
  // setjmp/longjmp functions.
  doInitialization(M);

  if (SetJmp) {
    // Populate DFSBlocks with every block reachable from a setjmp call
    // site.  df_ext_iterator records each visited block into DFSBlocks as
    // it walks, so the inner loop body is intentionally empty.
    for (Value::use_iterator B = SetJmp->use_begin(), E = SetJmp->use_end();
         B != E; ++B) {
      BasicBlock* BB = cast<Instruction>(*B)->getParent();
      for (df_ext_iterator<BasicBlock*> I = df_ext_begin(BB, DFSBlocks),
             E = df_ext_end(BB, DFSBlocks); I != E; ++I)
        /* empty */;
    }

    // Each transformation removes the use it processes, so loop until the
    // use list drains rather than iterating over it while mutating it.
    while (!SetJmp->use_empty()) {
      assert(isa<CallInst>(SetJmp->use_back()) &&
             "User of setjmp intrinsic not a call?");
      TransformSetJmpCall(cast<CallInst>(SetJmp->use_back()));
      Changed = true;
    }
  }

  if (LongJmp)
    while (!LongJmp->use_empty()) {
      assert(isa<CallInst>(LongJmp->use_back()) &&
             "User of longjmp intrinsic not a call?");
      TransformLongJmpCall(cast<CallInst>(LongJmp->use_back()));
      Changed = true;
    }

  // Now go through the affected functions and convert calls and invokes
  // to new invokes...
  for (std::map<Function*, AllocaInst*>::iterator
      B = SJMap.begin(), E = SJMap.end(); B != E; ++B) {
    Function* F = B->first;
    for (Function::iterator BB = F->begin(), BE = F->end(); BB != BE; ++BB)
      for (BasicBlock::iterator IB = BB->begin(), IE = BB->end(); IB != IE; ) {
        // Advance before visiting: visit() may split the block and move the
        // remaining instructions elsewhere.
        visit(*IB++);
        if (IB != BB->end() && IB->getParent() != BB)
          break;  // The next instruction got moved to a different block!
      }
  }

  // Drop all per-module state so a subsequent run starts from scratch.
  DFSBlocks.clear();
  SJMap.clear();
  RethrowBBMap.clear();
  PrelimBBMap.clear();
  SwitchValMap.clear();
  SetJmpIDMap.clear();

  return Changed;
}

// doInitialization - For the lower long/setjmp pass, this ensures that a
// module contains a declaration for the intrinsic functions we are going
// to call to convert longjmp and setjmp calls.
//
// Always returns true; the declarations are inserted unconditionally.
bool LowerSetJmp::doInitialization(Module& M)
{
  const Type *SBPTy = Type::getInt8PtrTy(M.getContext());
  const Type *SBPPTy = PointerType::getUnqual(SBPTy);

  // N.B. See llvm/runtime/GCCLibraries/libexception/SJLJ-Exception.h for
  // a description of the following library functions.

  // void __llvm_sjljeh_init_setjmpmap(void**)
  InitSJMap = M.getOrInsertFunction("__llvm_sjljeh_init_setjmpmap",
                                    Type::getVoidTy(M.getContext()),
                                    SBPPTy, (Type *)0);
  // void __llvm_sjljeh_destroy_setjmpmap(void**)
  DestroySJMap = M.getOrInsertFunction("__llvm_sjljeh_destroy_setjmpmap",
                                       Type::getVoidTy(M.getContext()),
                                       SBPPTy, (Type *)0);

  // void __llvm_sjljeh_add_setjmp_to_map(void**, void*, unsigned)
  AddSJToMap = M.getOrInsertFunction("__llvm_sjljeh_add_setjmp_to_map",
                                     Type::getVoidTy(M.getContext()),
                                     SBPPTy, SBPTy,
                                     Type::getInt32Ty(M.getContext()),
                                     (Type *)0);

  // void __llvm_sjljeh_throw_longjmp(int*, int)
  ThrowLongJmp = M.getOrInsertFunction("__llvm_sjljeh_throw_longjmp",
                                       Type::getVoidTy(M.getContext()), SBPTy,
                                       Type::getInt32Ty(M.getContext()),
                                       (Type *)0);

  // unsigned __llvm_sjljeh_try_catching_longjmp_exception(void **)
  TryCatchLJ =
    M.getOrInsertFunction("__llvm_sjljeh_try_catching_longjmp_exception",
                          Type::getInt32Ty(M.getContext()), SBPPTy, (Type *)0);

  // bool __llvm_sjljeh_is_longjmp_exception()
  IsLJException = M.getOrInsertFunction("__llvm_sjljeh_is_longjmp_exception",
                                        Type::getInt1Ty(M.getContext()),
                                        (Type *)0);

  // int __llvm_sjljeh_get_longjmp_value()
  GetLJValue =
    M.getOrInsertFunction("__llvm_sjljeh_get_longjmp_value",
                          Type::getInt32Ty(M.getContext()),
                          (Type *)0);
  return true;
}

// IsTransformableFunction - Return true if the function name isn't one
// of the ones we don't want transformed. Currently, don't transform any
// "llvm.{setjmp,longjmp}" functions and none of the setjmp/longjmp error
// handling functions (beginning with __llvm_sjljeh_...they don't throw
// exceptions).
bool LowerSetJmp::IsTransformableFunction(StringRef Name) {
  return !Name.startswith("__llvm_sjljeh_");
}

// TransformLongJmpCall - Transform a longjmp call into a call to the
// internal __llvm_sjljeh_throw_longjmp function. It then takes care of
// throwing the exception for us.
void LowerSetJmp::TransformLongJmpCall(CallInst* Inst)
{
  const Type* SBPTy = Type::getInt8PtrTy(Inst->getContext());

  // Create the call to "__llvm_sjljeh_throw_longjmp". This takes the
  // same parameters as "longjmp", except that the buffer is cast to a
  // char*. It returns "void", so it doesn't need to replace any of
  // Inst's uses and doesn't get a name.
  CastInst* CI =
    new BitCastInst(Inst->getArgOperand(0), SBPTy, "LJBuf", Inst);
  Value *Args[] = { CI, Inst->getArgOperand(1) };
  CallInst::Create(ThrowLongJmp, Args, Args + 2, "", Inst);

  SwitchValuePair& SVP = SwitchValMap[Inst->getParent()->getParent()];

  // If the function has a setjmp call in it (they are transformed first)
  // we should branch to the basic block that determines if this longjmp
  // is applicable here. Otherwise, issue an unwind.
  if (SVP.first)
    BranchInst::Create(SVP.first->getParent(), Inst);
  else
    new UnwindInst(Inst->getContext(), Inst);

  // Remove all insts after the branch/unwind inst.  Go from back to front to
  // avoid replaceAllUsesWith if possible: an instruction's users always come
  // after it in the block, so they are erased before it is.
  BasicBlock *BB = Inst->getParent();
  Instruction *Removed;
  do {
    Removed = &BB->back();
    // If the removed instructions have any users, replace them now.
    if (!Removed->use_empty())
      Removed->replaceAllUsesWith(UndefValue::get(Removed->getType()));
    Removed->eraseFromParent();
  } while (Removed != Inst);

  ++LongJmpsTransformed;
}

// GetSetJmpMap - Retrieve (create and initialize, if necessary) the
// setjmp map. This map is going to hold information about which setjmps
// were called (each setjmp gets its own number) and with which buffer it
// was called. There can be only one!
AllocaInst* LowerSetJmp::GetSetJmpMap(Function* Func)
{
  if (SJMap[Func]) return SJMap[Func];

  // Insert the setjmp map initialization before the first instruction in
  // the function.
  Instruction* Inst = Func->getEntryBlock().begin();
  assert(Inst && "Couldn't find even ONE instruction in entry block!");

  // Fill in the alloca and call to initialize the SJ map.
  const Type *SBPTy =
    Type::getInt8PtrTy(Func->getContext());
  AllocaInst* Map = new AllocaInst(SBPTy, 0, "SJMap", Inst);
  CallInst::Create(InitSJMap, Map, "", Inst);
  return SJMap[Func] = Map;
}

// GetRethrowBB - Only one rethrow basic block is needed per function.
// If this is a longjmp exception but not handled in this block, this BB
// performs the rethrow.
BasicBlock* LowerSetJmp::GetRethrowBB(Function* Func)
{
  if (RethrowBBMap[Func]) return RethrowBBMap[Func];

  // The basic block we're going to jump to if we need to rethrow the
  // exception.
  BasicBlock* Rethrow =
    BasicBlock::Create(Func->getContext(), "RethrowExcept", Func);

  // Fill in the "Rethrow" BB with a call to rethrow the exception. This
  // is the last instruction in the BB since at this point the runtime
  // should exit this function and go to the next function.
  new UnwindInst(Func->getContext(), Rethrow);
  return RethrowBBMap[Func] = Rethrow;
}

// GetSJSwitch - Return the switch statement that controls which handler
// (if any) gets called and the value returned to that handler.
LowerSetJmp::SwitchValuePair LowerSetJmp::GetSJSwitch(Function* Func,
                                                      BasicBlock* Rethrow)
{
  // Memoized: one switch per function.
  if (SwitchValMap[Func].first) return SwitchValMap[Func];

  BasicBlock* LongJmpPre =
    BasicBlock::Create(Func->getContext(), "LongJmpBlkPre", Func);

  // Keep track of the preliminary basic block for some of the other
  // transformations.
  PrelimBBMap[Func] = LongJmpPre;

  // Grab the exception.
  CallInst* Cond = CallInst::Create(IsLJException, "IsLJExcept", LongJmpPre);

  // The "decision basic block" gets the number associated with the
  // setjmp call returning to switch on and the value returned by
  // longjmp.
  BasicBlock* DecisionBB =
    BasicBlock::Create(Func->getContext(), "LJDecisionBB", Func);

  // Not a longjmp exception => rethrow to the caller.
  BranchInst::Create(DecisionBB, Rethrow, Cond, LongJmpPre);

  // Fill in the "decision" basic block.
  CallInst* LJVal = CallInst::Create(GetLJValue, "LJVal", DecisionBB);
  CallInst* SJNum = CallInst::Create(TryCatchLJ, GetSetJmpMap(Func), "SJNum",
                                     DecisionBB);

  // Default destination is Rethrow; cases are added per setjmp site later.
  SwitchInst* SI = SwitchInst::Create(SJNum, Rethrow, 0, DecisionBB);
  return SwitchValMap[Func] = SwitchValuePair(SI, LJVal);
}

// TransformSetJmpCall - The setjmp call is a bit trickier to transform.
// We're going to convert all setjmp calls to nops. Then all "call" and
// "invoke" instructions in the function are converted to "invoke" where
// the "except" branch is used when returning from a longjmp call.
void LowerSetJmp::TransformSetJmpCall(CallInst* Inst)
{
  BasicBlock* ABlock = Inst->getParent();
  Function* Func = ABlock->getParent();

  // Add this setjmp to the setjmp map.  Each setjmp in the function gets
  // the next sequential ID from SetJmpIDMap.
  const Type* SBPTy =
    Type::getInt8PtrTy(Inst->getContext());
  CastInst* BufPtr =
    new BitCastInst(Inst->getArgOperand(0), SBPTy, "SBJmpBuf", Inst);
  Value *Args[] = {
    GetSetJmpMap(Func), BufPtr,
    ConstantInt::get(Type::getInt32Ty(Inst->getContext()), SetJmpIDMap[Func]++)
  };
  CallInst::Create(AddSJToMap, Args, Args + 3, "", Inst);

  // We are guaranteed that there are no values live across basic blocks
  // (because we are "not in SSA form" yet), but there can still be values live
  // in basic blocks. Because of this, splitting the setjmp block can cause
  // values above the setjmp to not dominate uses which are after the setjmp
  // call. For all of these occasions, we must spill the value to the stack.
  //
  std::set<Instruction*> InstrsAfterCall;

  // The call is probably very close to the end of the basic block, for the
  // common usage pattern of: 'if (setjmp(...))', so keep track of the
  // instructions after the call.
  for (BasicBlock::iterator I = ++BasicBlock::iterator(Inst), E = ABlock->end();
       I != E; ++I)
    InstrsAfterCall.insert(I);

  for (BasicBlock::iterator II = ABlock->begin();
       II != BasicBlock::iterator(Inst); ++II)
    // Loop over all of the uses of instruction.  If any of them are after the
    // call, "spill" the value to the stack.
    for (Value::use_iterator UI = II->use_begin(), E = II->use_end();
         UI != E; ++UI) {
      User *U = *UI;
      if (cast<Instruction>(U)->getParent() != ABlock ||
          InstrsAfterCall.count(cast<Instruction>(U))) {
        // DemoteRegToStack rewrites the use list, so stop scanning this
        // instruction's uses once it has been demoted.
        DemoteRegToStack(*II);
        break;
      }
    }
  InstrsAfterCall.clear();

  // Change the setjmp call into a branch statement. We'll remove the
  // setjmp call in a little bit. No worries.
  BasicBlock* SetJmpContBlock = ABlock->splitBasicBlock(Inst);
  assert(SetJmpContBlock && "Couldn't split setjmp BB!!");

  SetJmpContBlock->setName(ABlock->getName()+"SetJmpCont");

  // Add the SetJmpContBlock to the set of blocks reachable from a setjmp.
  DFSBlocks.insert(SetJmpContBlock);

  // This PHI node will be in the new block created from the
  // splitBasicBlock call.
  PHINode* PHI = PHINode::Create(Type::getInt32Ty(Inst->getContext()),
                                 "SetJmpReturn", Inst);

  // Coming from a call to setjmp, the return is 0.
  PHI->addIncoming(Constant::getNullValue(Type::getInt32Ty(Inst->getContext())),
                   ABlock);

  // Add the case for this setjmp's number...
  SwitchValuePair SVP = GetSJSwitch(Func, GetRethrowBB(Func));
  SVP.first->addCase(ConstantInt::get(Type::getInt32Ty(Inst->getContext()),
                                      SetJmpIDMap[Func] - 1),
                     SetJmpContBlock);

  // Value coming from the handling of the exception.
  PHI->addIncoming(SVP.second, SVP.second->getParent());

  // Replace all uses of this instruction with the PHI node created by
  // the eradication of setjmp.
  Inst->replaceAllUsesWith(PHI);
  Inst->eraseFromParent();

  ++SetJmpsTransformed;
}

// visitCallInst - This converts all LLVM call instructions into invoke
// instructions. The except part of the invoke goes to the "LongJmpBlkPre"
// that grabs the exception and proceeds to determine if it's a longjmp
// exception or not.
void LowerSetJmp::visitCallInst(CallInst& CI)
{
  // Never invokify intrinsics or the SJLJ runtime's own helpers (they
  // cannot throw a longjmp exception).
  if (CI.getCalledFunction())
    if (!IsTransformableFunction(CI.getCalledFunction()->getName()) ||
        CI.getCalledFunction()->isIntrinsic()) return;

  BasicBlock* OldBB = CI.getParent();

  // If not reachable from a setjmp call, don't transform.
  if (!DFSBlocks.count(OldBB)) return;

  BasicBlock* NewBB = OldBB->splitBasicBlock(CI);
  assert(NewBB && "Couldn't split BB of \"call\" instruction!!");
  DFSBlocks.insert(NewBB);
  NewBB->setName("Call2Invoke");

  Function* Func = OldBB->getParent();

  // Construct the new "invoke" instruction, using the split's unconditional
  // branch as the insertion point; it is erased once the invoke replaces it.
  TerminatorInst* Term = OldBB->getTerminator();
  CallSite CS(&CI);
  std::vector<Value*> Params(CS.arg_begin(), CS.arg_end());
  InvokeInst* II =
    InvokeInst::Create(CI.getCalledValue(), NewBB, PrelimBBMap[Func],
                       Params.begin(), Params.end(), CI.getName(), Term);
  II->setCallingConv(CI.getCallingConv());
  II->setAttributes(CI.getAttributes());

  // Replace the old call inst with the invoke inst and remove the call.
  CI.replaceAllUsesWith(II);
  CI.eraseFromParent();

  // The old terminator is useless now that we have the invoke inst.
  Term->eraseFromParent();
  ++CallsTransformed;
}

// visitInvokeInst - Converting the "invoke" instruction is fairly
// straight-forward. The old exception part is replaced by a query asking
// if this is a longjmp exception. If it is, then it goes to the longjmp
// exception blocks. Otherwise, control is passed the old exception.
void LowerSetJmp::visitInvokeInst(InvokeInst& II)
{
  if (II.getCalledFunction())
    if (!IsTransformableFunction(II.getCalledFunction()->getName()) ||
        II.getCalledFunction()->isIntrinsic()) return;

  BasicBlock* BB = II.getParent();

  // If not reachable from a setjmp call, don't transform.
  if (!DFSBlocks.count(BB)) return;

  BasicBlock* ExceptBB = II.getUnwindDest();

  Function* Func = BB->getParent();
  BasicBlock* NewExceptBB = BasicBlock::Create(II.getContext(),
                                               "InvokeExcept", Func);

  // If this is a longjmp exception, then branch to the preliminary BB of
  // the longjmp exception handling. Otherwise, go to the old exception.
  CallInst* IsLJExcept = CallInst::Create(IsLJException, "IsLJExcept",
                                          NewExceptBB);

  BranchInst::Create(PrelimBBMap[Func], ExceptBB, IsLJExcept, NewExceptBB);

  II.setUnwindDest(NewExceptBB);
  ++InvokesTransformed;
}

// visitReturnInst - We want to destroy the setjmp map upon exit from the
// function.
void LowerSetJmp::visitReturnInst(ReturnInst &RI) {
  // Tear down the per-function setjmp map before returning normally.
  Function* Func = RI.getParent()->getParent();
  CallInst::Create(DestroySJMap, GetSetJmpMap(Func), "", &RI);
}

// visitUnwindInst - We want to destroy the setjmp map upon exit from the
// function.
void LowerSetJmp::visitUnwindInst(UnwindInst &UI) {
  // Tear down the per-function setjmp map before unwinding out.
  Function* Func = UI.getParent()->getParent();
  CallInst::Create(DestroySJMap, GetSetJmpMap(Func), "", &UI);
}

// Public factory used by the pass manager / opt.
ModulePass *llvm::createLowerSetJmpPass() {
  return new LowerSetJmp();
}

diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
new file mode 100644
index 0000000..cccffca
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -0,0 +1,868 @@
+//===- MergeFunctions.cpp - Merge identical functions ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for equivalent functions that are mergable and folds them.
+//
+// A hash is computed from the function, based on its type and number of
+// basic blocks.
+//
+// Once all hashes are computed, we perform an expensive equality comparison
+// on each function pair. This takes n^2/2 comparisons per bucket, so it's
+// important that the hash function be high quality. The equality comparison
+// iterates through each instruction in each basic block.
+//
+// When a match is found the functions are folded. If both functions are
+// overridable, we move the functionality into a new internal function and
+// leave two overridable thunks to it.
+//
+//===----------------------------------------------------------------------===//
+//
+// Future work:
+//
+// * virtual functions.
+// +// Many functions have their address taken by the virtual function table for +// the object they belong to. However, as long as it's only used for a lookup +// and call, this is irrelevant, and we'd like to fold such functions. +// +// * switch from n^2 pair-wise comparisons to an n-way comparison for each +// bucket. +// +// * be smarter about bitcasts. +// +// In order to fold functions, we will sometimes add either bitcast instructions +// or bitcast constant expressions. Unfortunately, this can confound further +// analysis since the two functions differ where one has a bitcast and the +// other doesn't. We should learn to look through bitcasts. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mergefunc" +#include "llvm/Transforms/IPO.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Constants.h" +#include "llvm/InlineAsm.h" +#include "llvm/Instructions.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/ValueHandle.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include <vector> +using namespace llvm; + +STATISTIC(NumFunctionsMerged, "Number of functions merged"); +STATISTIC(NumThunksWritten, "Number of thunks generated"); +STATISTIC(NumAliasesWritten, "Number of aliases generated"); +STATISTIC(NumDoubleWeak, "Number of new functions created"); + +/// Creates a hash-code for the function which is the same for any two +/// functions that will compare equal, without looking at the instructions +/// inside the function. 
+static unsigned profileFunction(const Function *F) { + const FunctionType *FTy = F->getFunctionType(); + + FoldingSetNodeID ID; + ID.AddInteger(F->size()); + ID.AddInteger(F->getCallingConv()); + ID.AddBoolean(F->hasGC()); + ID.AddBoolean(FTy->isVarArg()); + ID.AddInteger(FTy->getReturnType()->getTypeID()); + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + ID.AddInteger(FTy->getParamType(i)->getTypeID()); + return ID.ComputeHash(); +} + +namespace { + +/// ComparableFunction - A struct that pairs together functions with a +/// TargetData so that we can keep them together as elements in the DenseSet. +class ComparableFunction { +public: + static const ComparableFunction EmptyKey; + static const ComparableFunction TombstoneKey; + static TargetData * const LookupOnly; + + ComparableFunction(Function *Func, TargetData *TD) + : Func(Func), Hash(profileFunction(Func)), TD(TD) {} + + Function *getFunc() const { return Func; } + unsigned getHash() const { return Hash; } + TargetData *getTD() const { return TD; } + + // Drops AssertingVH reference to the function. Outside of debug mode, this + // does nothing. 
+ void release() { + assert(Func && + "Attempted to release function twice, or release empty/tombstone!"); + Func = NULL; + } + +private: + explicit ComparableFunction(unsigned Hash) + : Func(NULL), Hash(Hash), TD(NULL) {} + + AssertingVH<Function> Func; + unsigned Hash; + TargetData *TD; +}; + +const ComparableFunction ComparableFunction::EmptyKey = ComparableFunction(0); +const ComparableFunction ComparableFunction::TombstoneKey = + ComparableFunction(1); +TargetData * const ComparableFunction::LookupOnly = (TargetData*)(-1); + +} + +namespace llvm { + template <> + struct DenseMapInfo<ComparableFunction> { + static ComparableFunction getEmptyKey() { + return ComparableFunction::EmptyKey; + } + static ComparableFunction getTombstoneKey() { + return ComparableFunction::TombstoneKey; + } + static unsigned getHashValue(const ComparableFunction &CF) { + return CF.getHash(); + } + static bool isEqual(const ComparableFunction &LHS, + const ComparableFunction &RHS); + }; +} + +namespace { + +/// FunctionComparator - Compares two functions to determine whether or not +/// they will generate machine code with the same behaviour. TargetData is +/// used if available. The comparator always fails conservatively (erring on the +/// side of claiming that two functions are different). +class FunctionComparator { +public: + FunctionComparator(const TargetData *TD, const Function *F1, + const Function *F2) + : F1(F1), F2(F2), TD(TD) {} + + /// Test whether the two functions have equivalent behaviour. + bool compare(); + +private: + /// Test whether two basic blocks have equivalent behaviour. + bool compare(const BasicBlock *BB1, const BasicBlock *BB2); + + /// Assign or look up previously assigned numbers for the two values, and + /// return whether the numbers are equal. Numbers are assigned in the order + /// visited. 
+ bool enumerate(const Value *V1, const Value *V2); + + /// Compare two Instructions for equivalence, similar to + /// Instruction::isSameOperationAs but with modifications to the type + /// comparison. + bool isEquivalentOperation(const Instruction *I1, + const Instruction *I2) const; + + /// Compare two GEPs for equivalent pointer arithmetic. + bool isEquivalentGEP(const GEPOperator *GEP1, const GEPOperator *GEP2); + bool isEquivalentGEP(const GetElementPtrInst *GEP1, + const GetElementPtrInst *GEP2) { + return isEquivalentGEP(cast<GEPOperator>(GEP1), cast<GEPOperator>(GEP2)); + } + + /// Compare two Types, treating all pointer types as equal. + bool isEquivalentType(const Type *Ty1, const Type *Ty2) const; + + // The two functions undergoing comparison. + const Function *F1, *F2; + + const TargetData *TD; + + DenseMap<const Value *, const Value *> id_map; + DenseSet<const Value *> seen_values; +}; + +} + +// Any two pointers in the same address space are equivalent, intptr_t and +// pointers are equivalent. Otherwise, standard type equivalence rules apply. +bool FunctionComparator::isEquivalentType(const Type *Ty1, + const Type *Ty2) const { + if (Ty1 == Ty2) + return true; + if (Ty1->getTypeID() != Ty2->getTypeID()) { + if (TD) { + LLVMContext &Ctx = Ty1->getContext(); + if (isa<PointerType>(Ty1) && Ty2 == TD->getIntPtrType(Ctx)) return true; + if (isa<PointerType>(Ty2) && Ty1 == TD->getIntPtrType(Ctx)) return true; + } + return false; + } + + switch(Ty1->getTypeID()) { + default: + llvm_unreachable("Unknown type!"); + // Fall through in Release mode. + case Type::IntegerTyID: + case Type::OpaqueTyID: + case Type::VectorTyID: + // Ty1 == Ty2 would have returned true earlier. 
+ return false; + + case Type::VoidTyID: + case Type::FloatTyID: + case Type::DoubleTyID: + case Type::X86_FP80TyID: + case Type::FP128TyID: + case Type::PPC_FP128TyID: + case Type::LabelTyID: + case Type::MetadataTyID: + return true; + + case Type::PointerTyID: { + const PointerType *PTy1 = cast<PointerType>(Ty1); + const PointerType *PTy2 = cast<PointerType>(Ty2); + return PTy1->getAddressSpace() == PTy2->getAddressSpace(); + } + + case Type::StructTyID: { + const StructType *STy1 = cast<StructType>(Ty1); + const StructType *STy2 = cast<StructType>(Ty2); + if (STy1->getNumElements() != STy2->getNumElements()) + return false; + + if (STy1->isPacked() != STy2->isPacked()) + return false; + + for (unsigned i = 0, e = STy1->getNumElements(); i != e; ++i) { + if (!isEquivalentType(STy1->getElementType(i), STy2->getElementType(i))) + return false; + } + return true; + } + + case Type::FunctionTyID: { + const FunctionType *FTy1 = cast<FunctionType>(Ty1); + const FunctionType *FTy2 = cast<FunctionType>(Ty2); + if (FTy1->getNumParams() != FTy2->getNumParams() || + FTy1->isVarArg() != FTy2->isVarArg()) + return false; + + if (!isEquivalentType(FTy1->getReturnType(), FTy2->getReturnType())) + return false; + + for (unsigned i = 0, e = FTy1->getNumParams(); i != e; ++i) { + if (!isEquivalentType(FTy1->getParamType(i), FTy2->getParamType(i))) + return false; + } + return true; + } + + case Type::ArrayTyID: { + const ArrayType *ATy1 = cast<ArrayType>(Ty1); + const ArrayType *ATy2 = cast<ArrayType>(Ty2); + return ATy1->getNumElements() == ATy2->getNumElements() && + isEquivalentType(ATy1->getElementType(), ATy2->getElementType()); + } + } +} + +// Determine whether the two operations are the same except that pointer-to-A +// and pointer-to-B are equivalent. This should be kept in sync with +// Instruction::isSameOperationAs. 
bool FunctionComparator::isEquivalentOperation(const Instruction *I1,
                                               const Instruction *I2) const {
  // Differences from Instruction::isSameOperationAs:
  // * replace type comparison with calls to isEquivalentType.
  // * we test for I->hasSameSubclassOptionalData (nuw/nsw/tail) at the top
  // * because of the above, we don't test for the tail bit on calls later on
  if (I1->getOpcode() != I2->getOpcode() ||
      I1->getNumOperands() != I2->getNumOperands() ||
      !isEquivalentType(I1->getType(), I2->getType()) ||
      !I1->hasSameSubclassOptionalData(I2))
    return false;

  // We have two instructions of identical opcode and #operands. Check to see
  // if all operands are the same type
  for (unsigned i = 0, e = I1->getNumOperands(); i != e; ++i)
    if (!isEquivalentType(I1->getOperand(i)->getType(),
                          I2->getOperand(i)->getType()))
      return false;

  // Check special state that is a part of some instructions.
  if (const LoadInst *LI = dyn_cast<LoadInst>(I1))
    return LI->isVolatile() == cast<LoadInst>(I2)->isVolatile() &&
           LI->getAlignment() == cast<LoadInst>(I2)->getAlignment();
  if (const StoreInst *SI = dyn_cast<StoreInst>(I1))
    return SI->isVolatile() == cast<StoreInst>(I2)->isVolatile() &&
           SI->getAlignment() == cast<StoreInst>(I2)->getAlignment();
  if (const CmpInst *CI = dyn_cast<CmpInst>(I1))
    return CI->getPredicate() == cast<CmpInst>(I2)->getPredicate();
  if (const CallInst *CI = dyn_cast<CallInst>(I1))
    return CI->getCallingConv() == cast<CallInst>(I2)->getCallingConv() &&
           CI->getAttributes() == cast<CallInst>(I2)->getAttributes();
  if (const InvokeInst *CI = dyn_cast<InvokeInst>(I1))
    return CI->getCallingConv() == cast<InvokeInst>(I2)->getCallingConv() &&
           CI->getAttributes() == cast<InvokeInst>(I2)->getAttributes();
  if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(I1)) {
    // Aggregate index lists must match element-by-element.
    if (IVI->getNumIndices() != cast<InsertValueInst>(I2)->getNumIndices())
      return false;
    for (unsigned i = 0, e = IVI->getNumIndices(); i != e; ++i)
      if (IVI->idx_begin()[i] != cast<InsertValueInst>(I2)->idx_begin()[i])
        return false;
    return true;
  }
  if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I1)) {
    if (EVI->getNumIndices() != cast<ExtractValueInst>(I2)->getNumIndices())
      return false;
    for (unsigned i = 0, e = EVI->getNumIndices(); i != e; ++i)
      if (EVI->idx_begin()[i] != cast<ExtractValueInst>(I2)->idx_begin()[i])
        return false;
    return true;
  }

  return true;
}

// Determine whether two GEP operations perform the same underlying arithmetic.
bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1,
                                         const GEPOperator *GEP2) {
  // When we have target data, we can reduce the GEP down to the value in bytes
  // added to the address.
  if (TD && GEP1->hasAllConstantIndices() && GEP2->hasAllConstantIndices()) {
    SmallVector<Value *, 8> Indices1(GEP1->idx_begin(), GEP1->idx_end());
    SmallVector<Value *, 8> Indices2(GEP2->idx_begin(), GEP2->idx_end());
    uint64_t Offset1 = TD->getIndexedOffset(GEP1->getPointerOperandType(),
                                            Indices1.data(), Indices1.size());
    uint64_t Offset2 = TD->getIndexedOffset(GEP2->getPointerOperandType(),
                                            Indices2.data(), Indices2.size());
    return Offset1 == Offset2;
  }

  // Without TargetData, fall back to a structural comparison: same pointer
  // operand type, same arity, and pairwise-equivalent operands.
  if (GEP1->getPointerOperand()->getType() !=
      GEP2->getPointerOperand()->getType())
    return false;

  if (GEP1->getNumOperands() != GEP2->getNumOperands())
    return false;

  for (unsigned i = 0, e = GEP1->getNumOperands(); i != e; ++i) {
    if (!enumerate(GEP1->getOperand(i), GEP2->getOperand(i)))
      return false;
  }

  return true;
}

// Compare two values used by the two functions under pair-wise comparison. If
// this is the first time the values are seen, they're added to the mapping so
// that we will detect mismatches on next use.
+bool FunctionComparator::enumerate(const Value *V1, const Value *V2) { + // Check for function @f1 referring to itself and function @f2 referring to + // itself, or referring to each other, or both referring to either of them. + // They're all equivalent if the two functions are otherwise equivalent. + if (V1 == F1 && V2 == F2) + return true; + if (V1 == F2 && V2 == F1) + return true; + + if (const Constant *C1 = dyn_cast<Constant>(V1)) { + if (V1 == V2) return true; + const Constant *C2 = dyn_cast<Constant>(V2); + if (!C2) return false; + // TODO: constant expressions with GEP or references to F1 or F2. + if (C1->isNullValue() && C2->isNullValue() && + isEquivalentType(C1->getType(), C2->getType())) + return true; + // Try bitcasting C2 to C1's type. If the bitcast is legal and returns C1 + // then they must have equal bit patterns. + return C1->getType()->canLosslesslyBitCastTo(C2->getType()) && + C1 == ConstantExpr::getBitCast(const_cast<Constant*>(C2), C1->getType()); + } + + if (isa<InlineAsm>(V1) || isa<InlineAsm>(V2)) + return V1 == V2; + + // Check that V1 maps to V2. If we find a value that V1 maps to then we simply + // check whether it's equal to V2. When there is no mapping then we need to + // ensure that V2 isn't already equivalent to something else. For this + // purpose, we track the V2 values in a set. + + const Value *&map_elem = id_map[V1]; + if (map_elem) + return map_elem == V2; + if (!seen_values.insert(V2).second) + return false; + map_elem = V2; + return true; +} + +// Test whether two basic blocks have equivalent behaviour. 
bool FunctionComparator::compare(const BasicBlock *BB1, const BasicBlock *BB2) {
  BasicBlock::const_iterator F1I = BB1->begin(), F1E = BB1->end();
  BasicBlock::const_iterator F2I = BB2->begin(), F2E = BB2->end();

  // Walk the two blocks in lock-step, instruction by instruction.
  do {
    if (!enumerate(F1I, F2I))
      return false;

    if (const GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(F1I)) {
      const GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(F2I);
      if (!GEP2)
        return false;

      if (!enumerate(GEP1->getPointerOperand(), GEP2->getPointerOperand()))
        return false;

      // GEPs get their own arithmetic-aware comparison.
      if (!isEquivalentGEP(GEP1, GEP2))
        return false;
    } else {
      if (!isEquivalentOperation(F1I, F2I))
        return false;

      assert(F1I->getNumOperands() == F2I->getNumOperands());
      for (unsigned i = 0, e = F1I->getNumOperands(); i != e; ++i) {
        Value *OpF1 = F1I->getOperand(i);
        Value *OpF2 = F2I->getOperand(i);

        if (!enumerate(OpF1, OpF2))
          return false;

        if (OpF1->getValueID() != OpF2->getValueID() ||
            !isEquivalentType(OpF1->getType(), OpF2->getType()))
          return false;
      }
    }

    ++F1I, ++F2I;
  } while (F1I != F1E && F2I != F2E);

  // Both iterators must be exhausted together for the blocks to match.
  return F1I == F1E && F2I == F2E;
}

// Test whether the two functions have equivalent behaviour.
bool FunctionComparator::compare() {
  // We need to recheck everything, but check the things that weren't included
  // in the hash first.

  if (F1->getAttributes() != F2->getAttributes())
    return false;

  if (F1->hasGC() != F2->hasGC())
    return false;

  if (F1->hasGC() && F1->getGC() != F2->getGC())
    return false;

  if (F1->hasSection() != F2->hasSection())
    return false;

  if (F1->hasSection() && F1->getSection() != F2->getSection())
    return false;

  if (F1->isVarArg() != F2->isVarArg())
    return false;

  // TODO: if it's internal and only used in direct calls, we could handle this
  // case too.
  if (F1->getCallingConv() != F2->getCallingConv())
    return false;

  if (!isEquivalentType(F1->getFunctionType(), F2->getFunctionType()))
    return false;

  assert(F1->arg_size() == F2->arg_size() &&
         "Identically typed functions have different numbers of args!");

  // Visit the arguments so that they get enumerated in the order they're
  // passed in.
  for (Function::const_arg_iterator f1i = F1->arg_begin(),
         f2i = F2->arg_begin(), f1e = F1->arg_end(); f1i != f1e; ++f1i, ++f2i) {
    if (!enumerate(f1i, f2i))
      llvm_unreachable("Arguments repeat!");
  }

  // We do a CFG-ordered walk since the actual ordering of the blocks in the
  // linked list is immaterial. Our walk starts at the entry block for both
  // functions, then takes each block from each terminator in order. As an
  // artifact, this also means that unreachable blocks are ignored.
  SmallVector<const BasicBlock *, 8> F1BBs, F2BBs;
  SmallSet<const BasicBlock *, 128> VisitedBBs; // in terms of F1.

  F1BBs.push_back(&F1->getEntryBlock());
  F2BBs.push_back(&F2->getEntryBlock());

  VisitedBBs.insert(F1BBs[0]);
  while (!F1BBs.empty()) {
    const BasicBlock *F1BB = F1BBs.pop_back_val();
    const BasicBlock *F2BB = F2BBs.pop_back_val();

    if (!enumerate(F1BB, F2BB) || !compare(F1BB, F2BB))
      return false;

    const TerminatorInst *F1TI = F1BB->getTerminator();
    const TerminatorInst *F2TI = F2BB->getTerminator();

    // Successor counts match because the terminators already compared equal.
    assert(F1TI->getNumSuccessors() == F2TI->getNumSuccessors());
    for (unsigned i = 0, e = F1TI->getNumSuccessors(); i != e; ++i) {
      if (!VisitedBBs.insert(F1TI->getSuccessor(i)))
        continue;

      F1BBs.push_back(F1TI->getSuccessor(i));
      F2BBs.push_back(F2TI->getSuccessor(i));
    }
  }
  return true;
}

namespace {

/// MergeFunctions finds functions which will generate identical machine code,
/// by considering all pointer types to be equivalent. Once identified,
/// MergeFunctions will fold them by replacing a call to one to a call to a
/// bitcast of the other.
///
class MergeFunctions : public ModulePass {
public:
  static char ID;
  MergeFunctions()
    : ModulePass(ID), HasGlobalAliases(false) {
    initializeMergeFunctionsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnModule(Module &M);

private:
  typedef DenseSet<ComparableFunction> FnSetType;

  /// A work queue of functions that may have been modified and should be
  /// analyzed again.
  std::vector<WeakVH> Deferred;

  /// Insert a ComparableFunction into the FnSet, or merge it away if it's
  /// equal to one that's already present.
  bool insert(ComparableFunction &NewF);

  /// Remove a Function from the FnSet and queue it up for a second sweep of
  /// analysis.
  void remove(Function *F);

  /// Find the functions that use this Value and remove them from FnSet and
  /// queue the functions.
  void removeUsers(Value *V);

  /// Replace all direct calls of Old with calls of New. Will bitcast New if
  /// necessary to make types match.
  void replaceDirectCallers(Function *Old, Function *New);

  /// Merge two equivalent functions. Upon completion, G may be deleted, or may
  /// be converted into a thunk. In either case, it should never be visited
  /// again.
  void mergeTwoFunctions(Function *F, Function *G);

  /// Replace G with a thunk or an alias to F. Deletes G.
  void writeThunkOrAlias(Function *F, Function *G);

  /// Replace G with a simple tail call to bitcast(F). Also replace direct uses
  /// of G with bitcast(F). Deletes G.
  void writeThunk(Function *F, Function *G);

  /// Replace G with an alias to F. Deletes G.
  void writeAlias(Function *F, Function *G);

  /// The set of all distinct functions. Use the insert() and remove() methods
  /// to modify it.
  FnSetType FnSet;

  /// TargetData for more accurate GEP comparisons. May be NULL.
  TargetData *TD;

  /// Whether or not the target supports global aliases.
  bool HasGlobalAliases;
};

} // end anonymous namespace

char MergeFunctions::ID = 0;
INITIALIZE_PASS(MergeFunctions, "mergefunc", "Merge Functions", false, false)

ModulePass *llvm::createMergeFunctionsPass() {
  return new MergeFunctions();
}

bool MergeFunctions::runOnModule(Module &M) {
  bool Changed = false;
  TD = getAnalysisIfAvailable<TargetData>();

  // Seed the worklist with every function definition that could be merged.
  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
    if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage())
      Deferred.push_back(WeakVH(I));
  }
  FnSet.resize(Deferred.size());

  // Iterate to a fixed point: merging can requeue functions via remove().
  do {
    std::vector<WeakVH> Worklist;
    Deferred.swap(Worklist);

    DEBUG(dbgs() << "size of module: " << M.size() << '\n');
    DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n');

    // Insert only strong functions and merge them. Strong function merging
    // always deletes one of them.
    for (std::vector<WeakVH>::iterator I = Worklist.begin(),
           E = Worklist.end(); I != E; ++I) {
      if (!*I) continue;
      Function *F = cast<Function>(*I);
      if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage() &&
          !F->mayBeOverridden()) {
        ComparableFunction CF = ComparableFunction(F, TD);
        Changed |= insert(CF);
      }
    }

    // Insert only weak functions and merge them. By doing these second we
    // create thunks to the strong function when possible. When two weak
    // functions are identical, we create a new strong function with two
    // weak thunks to it which are identical but not mergeable.
+ for (std::vector<WeakVH>::iterator I = Worklist.begin(), + E = Worklist.end(); I != E; ++I) { + if (!*I) continue; + Function *F = cast<Function>(*I); + if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage() && + F->mayBeOverridden()) { + ComparableFunction CF = ComparableFunction(F, TD); + Changed |= insert(CF); + } + } + DEBUG(dbgs() << "size of FnSet: " << FnSet.size() << '\n'); + } while (!Deferred.empty()); + + FnSet.clear(); + + return Changed; +} + +bool DenseMapInfo<ComparableFunction>::isEqual(const ComparableFunction &LHS, + const ComparableFunction &RHS) { + if (LHS.getFunc() == RHS.getFunc() && + LHS.getHash() == RHS.getHash()) + return true; + if (!LHS.getFunc() || !RHS.getFunc()) + return false; + + // One of these is a special "underlying pointer comparison only" object. + if (LHS.getTD() == ComparableFunction::LookupOnly || + RHS.getTD() == ComparableFunction::LookupOnly) + return false; + + assert(LHS.getTD() == RHS.getTD() && + "Comparing functions for different targets"); + + return FunctionComparator(LHS.getTD(), LHS.getFunc(), + RHS.getFunc()).compare(); +} + +// Replace direct callers of Old with New. +void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { + Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType()); + for (Value::use_iterator UI = Old->use_begin(), UE = Old->use_end(); + UI != UE;) { + Value::use_iterator TheIter = UI; + ++UI; + CallSite CS(*TheIter); + if (CS && CS.isCallee(TheIter)) { + remove(CS.getInstruction()->getParent()->getParent()); + TheIter.getUse().set(BitcastNew); + } + } +} + +// Replace G with an alias to F if possible, or else a thunk to F. Deletes G. +void MergeFunctions::writeThunkOrAlias(Function *F, Function *G) { + if (HasGlobalAliases && G->hasUnnamedAddr()) { + if (G->hasExternalLinkage() || G->hasLocalLinkage() || + G->hasWeakLinkage()) { + writeAlias(F, G); + return; + } + } + + writeThunk(F, G); +} + +// Replace G with a simple tail call to bitcast(F). 
Also replace direct uses +// of G with bitcast(F). Deletes G. +void MergeFunctions::writeThunk(Function *F, Function *G) { + if (!G->mayBeOverridden()) { + // Redirect direct callers of G to F. + replaceDirectCallers(G, F); + } + + // If G was internal then we may have replaced all uses of G with F. If so, + // stop here and delete G. There's no need for a thunk. + if (G->hasLocalLinkage() && G->use_empty()) { + G->eraseFromParent(); + return; + } + + Function *NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "", + G->getParent()); + BasicBlock *BB = BasicBlock::Create(F->getContext(), "", NewG); + IRBuilder<false> Builder(BB); + + SmallVector<Value *, 16> Args; + unsigned i = 0; + const FunctionType *FFTy = F->getFunctionType(); + for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end(); + AI != AE; ++AI) { + Args.push_back(Builder.CreateBitCast(AI, FFTy->getParamType(i))); + ++i; + } + + CallInst *CI = Builder.CreateCall(F, Args.begin(), Args.end()); + CI->setTailCall(); + CI->setCallingConv(F->getCallingConv()); + if (NewG->getReturnType()->isVoidTy()) { + Builder.CreateRetVoid(); + } else { + Builder.CreateRet(Builder.CreateBitCast(CI, NewG->getReturnType())); + } + + NewG->copyAttributesFrom(G); + NewG->takeName(G); + removeUsers(G); + G->replaceAllUsesWith(NewG); + G->eraseFromParent(); + + DEBUG(dbgs() << "writeThunk: " << NewG->getName() << '\n'); + ++NumThunksWritten; +} + +// Replace G with an alias to F and delete G. 
void MergeFunctions::writeAlias(Function *F, Function *G) {
  Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
  GlobalAlias *GA = new GlobalAlias(G->getType(), G->getLinkage(), "",
                                    BitcastF, G->getParent());
  // Keep the stricter of the two alignments on the surviving function.
  F->setAlignment(std::max(F->getAlignment(), G->getAlignment()));
  GA->takeName(G);
  GA->setVisibility(G->getVisibility());
  removeUsers(G);
  G->replaceAllUsesWith(GA);
  G->eraseFromParent();

  DEBUG(dbgs() << "writeAlias: " << GA->getName() << '\n');
  ++NumAliasesWritten;
}

// Merge two equivalent functions. Upon completion, Function G is deleted.
void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
  if (F->mayBeOverridden()) {
    assert(G->mayBeOverridden());

    if (HasGlobalAliases) {
      // Make them both thunks to the same internal function.
      Function *H = Function::Create(F->getFunctionType(), F->getLinkage(), "",
                                     F->getParent());
      H->copyAttributesFrom(F);
      H->takeName(F);
      removeUsers(F);
      F->replaceAllUsesWith(H);

      unsigned MaxAlignment = std::max(G->getAlignment(), H->getAlignment());

      writeAlias(F, G);
      writeAlias(F, H);

      F->setAlignment(MaxAlignment);
      F->setLinkage(GlobalValue::PrivateLinkage);
    } else {
      // We can't merge them. Instead, pick one and update all direct callers
      // to call it and hope that we improve the instruction cache hit rate.
      replaceDirectCallers(G, F);
    }

    ++NumDoubleWeak;
  } else {
    writeThunkOrAlias(F, G);
  }

  ++NumFunctionsMerged;
}

// Insert a ComparableFunction into the FnSet, or merge it away if equal to one
// that was already inserted.
bool MergeFunctions::insert(ComparableFunction &NewF) {
  std::pair<FnSetType::iterator, bool> Result = FnSet.insert(NewF);
  if (Result.second) {
    DEBUG(dbgs() << "Inserting as unique: " << NewF.getFunc()->getName() << '\n');
    return false;
  }

  const ComparableFunction &OldF = *Result.first;

  // Never thunk a strong function to a weak function.
  assert(!OldF.getFunc()->mayBeOverridden() ||
         NewF.getFunc()->mayBeOverridden());

  DEBUG(dbgs() << "  " << OldF.getFunc()->getName() << " == "
               << NewF.getFunc()->getName() << '\n');

  // NewF loses; release it from the set wrapper before deleting its function.
  Function *DeleteF = NewF.getFunc();
  NewF.release();
  mergeTwoFunctions(OldF.getFunc(), DeleteF);
  return true;
}

// Remove a function from FnSet. If it was already in FnSet, add it to Deferred
// so that we'll look at it in the next round.
void MergeFunctions::remove(Function *F) {
  // We need to make sure we remove F, not a function "equal" to F per the
  // function equality comparator.
  //
  // The special "lookup only" ComparableFunction bypasses the expensive
  // function comparison in favour of a pointer comparison on the underlying
  // Function*'s.
  ComparableFunction CF = ComparableFunction(F, ComparableFunction::LookupOnly);
  if (FnSet.erase(CF)) {
    DEBUG(dbgs() << "Removed " << F->getName() << " from set and deferred it.\n");
    Deferred.push_back(F);
  }
}

// For each instruction used by the value, remove() the function that contains
// the instruction. This should happen right before a call to RAUW.
+void MergeFunctions::removeUsers(Value *V) { + std::vector<Value *> Worklist; + Worklist.push_back(V); + while (!Worklist.empty()) { + Value *V = Worklist.back(); + Worklist.pop_back(); + + for (Value::use_iterator UI = V->use_begin(), UE = V->use_end(); + UI != UE; ++UI) { + Use &U = UI.getUse(); + if (Instruction *I = dyn_cast<Instruction>(U.getUser())) { + remove(I->getParent()->getParent()); + } else if (isa<GlobalValue>(U.getUser())) { + // do nothing + } else if (Constant *C = dyn_cast<Constant>(U.getUser())) { + for (Value::use_iterator CUI = C->use_begin(), CUE = C->use_end(); + CUI != CUE; ++CUI) + Worklist.push_back(*CUI); + } + } + } +} diff --git a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp new file mode 100644 index 0000000..2afd029 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -0,0 +1,182 @@ +//===- PartialInlining.cpp - Inline parts of functions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass performs partial inlining, typically by inlining an if statement +// that surrounds the body of the function. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "partialinlining" +#include "llvm/Transforms/IPO.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/FunctionUtils.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CFG.h" +using namespace llvm; + +STATISTIC(NumPartialInlined, "Number of functions partially inlined"); + +namespace { + struct PartialInliner : public ModulePass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const { } + static char ID; // Pass identification, replacement for typeid + PartialInliner() : ModulePass(ID) { + initializePartialInlinerPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module& M); + + private: + Function* unswitchFunction(Function* F); + }; +} + +char PartialInliner::ID = 0; +INITIALIZE_PASS(PartialInliner, "partial-inliner", + "Partial Inliner", false, false) + +ModulePass* llvm::createPartialInliningPass() { return new PartialInliner(); } + +Function* PartialInliner::unswitchFunction(Function* F) { + // First, verify that this function is an unswitching candidate... + BasicBlock* entryBlock = F->begin(); + BranchInst *BR = dyn_cast<BranchInst>(entryBlock->getTerminator()); + if (!BR || BR->isUnconditional()) + return 0; + + BasicBlock* returnBlock = 0; + BasicBlock* nonReturnBlock = 0; + unsigned returnCount = 0; + for (succ_iterator SI = succ_begin(entryBlock), SE = succ_end(entryBlock); + SI != SE; ++SI) + if (isa<ReturnInst>((*SI)->getTerminator())) { + returnBlock = *SI; + returnCount++; + } else + nonReturnBlock = *SI; + + if (returnCount != 1) + return 0; + + // Clone the function, so that we can hack away on it. 
  ValueToValueMapTy VMap;
  Function* duplicateFunction = CloneFunction(F, VMap,
                                              /*ModuleLevelChanges=*/false);
  duplicateFunction->setLinkage(GlobalValue::InternalLinkage);
  F->getParent()->getFunctionList().push_back(duplicateFunction);
  BasicBlock* newEntryBlock = cast<BasicBlock>(VMap[entryBlock]);
  BasicBlock* newReturnBlock = cast<BasicBlock>(VMap[returnBlock]);
  BasicBlock* newNonReturnBlock = cast<BasicBlock>(VMap[nonReturnBlock]);

  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(duplicateFunction);

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block. For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock* preReturn = newReturnBlock;
  newReturnBlock = newReturnBlock->splitBasicBlock(
      newReturnBlock->getFirstNonPHI());
  BasicBlock::iterator I = preReturn->begin();
  BasicBlock::iterator Ins = newReturnBlock->begin();
  while (I != preReturn->end()) {
    PHINode* OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi) break;

    // Outer PHI merges the entry-block input with the extracted region's
    // combined value; the original PHI stays inside the extracted region.
    PHINode* retPhi = PHINode::Create(OldPhi->getType(), "", Ins);
    OldPhi->replaceAllUsesWith(retPhi);
    Ins = newReturnBlock->getFirstNonPHI();

    retPhi->addIncoming(I, preReturn);
    retPhi->addIncoming(OldPhi->getIncomingValueForBlock(newEntryBlock),
                        newEntryBlock);
    OldPhi->removeIncomingValue(newEntryBlock);

    ++I;
  }
  newEntryBlock->getTerminator()->replaceUsesOfWith(preReturn, newReturnBlock);

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock*> toExtract;
  toExtract.push_back(newNonReturnBlock);
  for (Function::iterator FI = duplicateFunction->begin(),
         FE = duplicateFunction->end(); FI != FE; ++FI)
    if (&*FI != newEntryBlock && &*FI != newReturnBlock &&
        &*FI != newNonReturnBlock)
      toExtract.push_back(FI);

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.runOnFunction(*duplicateFunction);

  // Extract the body of the if.
  Function* extractedFunction = ExtractCodeRegion(DT, toExtract);

  InlineFunctionInfo IFI;

  // Inline the top-level if test into all callers.
  // Snapshot the users first: inlining mutates the use list.
  std::vector<User*> Users(duplicateFunction->use_begin(),
                           duplicateFunction->use_end());
  for (std::vector<User*>::iterator UI = Users.begin(), UE = Users.end();
       UI != UE; ++UI)
    if (CallInst *CI = dyn_cast<CallInst>(*UI))
      InlineFunction(CI, IFI);
    else if (InvokeInst *II = dyn_cast<InvokeInst>(*UI))
      InlineFunction(II, IFI);

  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  duplicateFunction->replaceAllUsesWith(F);
  duplicateFunction->eraseFromParent();

  ++NumPartialInlined;

  return extractedFunction;
}

bool PartialInliner::runOnModule(Module& M) {
  // Snapshot candidate functions up front; unswitchFunction mutates the
  // module while we iterate.
  std::vector<Function*> worklist;
  worklist.reserve(M.size());
  for (Module::iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI)
    if (!FI->use_empty() && !FI->isDeclaration())
      worklist.push_back(&*FI);

  bool changed = false;
  while (!worklist.empty()) {
    Function* currFunc = worklist.back();
    worklist.pop_back();

    if (currFunc->use_empty()) continue;

    // Skip functions that (directly) call themselves.
    bool recursive = false;
    for (Function::use_iterator UI = currFunc->use_begin(),
           UE = currFunc->use_end(); UI != UE; ++UI)
      if (Instruction* I = dyn_cast<Instruction>(*UI))
        if (I->getParent()->getParent() == currFunc) {
          recursive = true;
          break;
        }
    if (recursive) continue;


    if (Function* newFunc = unswitchFunction(currFunc)) {
      // The extracted function may itself be a candidate.
      worklist.push_back(newFunc);
      changed = true;
    }

  }

  return changed;
}
diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
new file mode 100644
index 0000000..d91c2c4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
@@ -0,0 +1,257 @@
//===- PruneEH.cpp - Pass which deletes unused exception handlers ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements a simple interprocedural pass which walks the
// call-graph, turning invoke instructions into calls, iff the callee cannot
// throw an exception, and marking functions 'nounwind' if they cannot throw.
// It implements this as a bottom-up traversal of the call-graph.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "prune-eh"
#include "llvm/Transforms/IPO.h"
#include "llvm/CallGraphSCCPass.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
#include "llvm/LLVMContext.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/CFG.h"
#include <set>
#include <algorithm>
using namespace llvm;

STATISTIC(NumRemoved, "Number of invokes removed");
STATISTIC(NumUnreach, "Number of noreturn calls optimized");

namespace {
  struct PruneEH : public CallGraphSCCPass {
    static char ID; // Pass identification, replacement for typeid
    PruneEH() : CallGraphSCCPass(ID) {
      initializePruneEHPass(*PassRegistry::getPassRegistry());
    }

    // runOnSCC - Analyze the SCC, performing the transformation if possible.
    bool runOnSCC(CallGraphSCC &SCC);

    // Turn invokes of nounwind callees into calls and truncate code that
    // follows noreturn calls.
    bool SimplifyFunction(Function *F);
    // Remove a dead block, updating callgraph edges for calls it contained.
    void DeleteBasicBlock(BasicBlock *BB);
  };
}

char PruneEH::ID = 0;
INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh",
                "Remove unused exception handling info", false, false)
INITIALIZE_AG_DEPENDENCY(CallGraph)
INITIALIZE_PASS_END(PruneEH, "prune-eh",
                "Remove unused exception handling info", false, false)

Pass *llvm::createPruneEHPass() { return new PruneEH(); }


bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
  SmallPtrSet<CallGraphNode *, 8> SCCNodes;
  CallGraph &CG = getAnalysis<CallGraph>();
  bool MadeChange = false;

  // Fill SCCNodes with the elements of the SCC. Used for quickly
  // looking up whether a given CallGraphNode is in this SCC.
  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I)
    SCCNodes.insert(*I);

  // First pass, scan all of the functions in the SCC, simplifying them
  // according to what we know.
  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I)
    if (Function *F = (*I)->getFunction())
      MadeChange |= SimplifyFunction(F);

  // Next, check to see if any callees might throw or if there are any external
  // functions in this SCC: if so, we cannot prune any functions in this SCC.
  // Definitions that are weak and not declared non-throwing might be
  // overridden at linktime with something that throws, so assume that.
  // If this SCC includes the unwind instruction, we KNOW it throws, so
  // obviously the SCC might throw.
  //
  bool SCCMightUnwind = false, SCCMightReturn = false;
  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end();
       (!SCCMightUnwind || !SCCMightReturn) && I != E; ++I) {
    Function *F = (*I)->getFunction();
    if (F == 0) {
      // External or indirect callgraph node: assume the worst.
      SCCMightUnwind = true;
      SCCMightReturn = true;
    } else if (F->isDeclaration() || F->mayBeOverridden()) {
      SCCMightUnwind |= !F->doesNotThrow();
      SCCMightReturn |= !F->doesNotReturn();
    } else {
      bool CheckUnwind = !SCCMightUnwind && !F->doesNotThrow();
      bool CheckReturn = !SCCMightReturn && !F->doesNotReturn();

      if (!CheckUnwind && !CheckReturn)
        continue;

      // Check to see if this function performs an unwind or calls an
      // unwinding function.
      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
        if (CheckUnwind && isa<UnwindInst>(BB->getTerminator())) {
          // Uses unwind!
          SCCMightUnwind = true;
        } else if (CheckReturn && isa<ReturnInst>(BB->getTerminator())) {
          SCCMightReturn = true;
        }

        // Invoke instructions don't allow unwinding to continue, so we are
        // only interested in call instructions.
        if (CheckUnwind && !SCCMightUnwind)
          for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
            if (CallInst *CI = dyn_cast<CallInst>(I)) {
              if (CI->doesNotThrow()) {
                // This call cannot throw.
              } else if (Function *Callee = CI->getCalledFunction()) {
                CallGraphNode *CalleeNode = CG[Callee];
                // If the callee is outside our current SCC then we may
                // throw because it might.
                if (!SCCNodes.count(CalleeNode)) {
                  SCCMightUnwind = true;
                  break;
                }
              } else {
                // Indirect call, it might throw.
                SCCMightUnwind = true;
                break;
              }
            }
        if (SCCMightUnwind && SCCMightReturn) break;
      }
    }
  }

  // If the SCC doesn't unwind or doesn't throw, note this fact.
  if (!SCCMightUnwind || !SCCMightReturn)
    for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
      Attributes NewAttributes = Attribute::None;

      if (!SCCMightUnwind)
        NewAttributes |= Attribute::NoUnwind;
      if (!SCCMightReturn)
        NewAttributes |= Attribute::NoReturn;

      // ~0 index applies the attributes to the function itself.
      Function *F = (*I)->getFunction();
      const AttrListPtr &PAL = F->getAttributes();
      const AttrListPtr &NPAL = PAL.addAttr(~0, NewAttributes);
      if (PAL != NPAL) {
        MadeChange = true;
        F->setAttributes(NPAL);
      }
    }

  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
    // Convert any invoke instructions to non-throwing functions in this node
    // into call instructions with a branch. This makes the exception blocks
    // dead.
    if (Function *F = (*I)->getFunction())
      MadeChange |= SimplifyFunction(F);
  }

  return MadeChange;
}


// SimplifyFunction - Given information about callees, simplify the specified
// function if we have invokes to non-unwinding functions or code after calls to
// no-return functions.
bool PruneEH::SimplifyFunction(Function *F) {
  bool MadeChange = false;
  for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
      if (II->doesNotThrow()) {
        // Drop the last 3 operands (callee and the two destination blocks)
        // to recover just the call arguments.
        SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
        // Insert a call instruction before the invoke.
+ CallInst *Call = CallInst::Create(II->getCalledValue(), + Args.begin(), Args.end(), "", II); + Call->takeName(II); + Call->setCallingConv(II->getCallingConv()); + Call->setAttributes(II->getAttributes()); + + // Anything that used the value produced by the invoke instruction + // now uses the value produced by the call instruction. Note that we + // do this even for void functions and calls with no uses so that the + // callgraph edge is updated. + II->replaceAllUsesWith(Call); + BasicBlock *UnwindBlock = II->getUnwindDest(); + UnwindBlock->removePredecessor(II->getParent()); + + // Insert a branch to the normal destination right before the + // invoke. + BranchInst::Create(II->getNormalDest(), II); + + // Finally, delete the invoke instruction! + BB->getInstList().pop_back(); + + // If the unwind block is now dead, nuke it. + if (pred_begin(UnwindBlock) == pred_end(UnwindBlock)) + DeleteBasicBlock(UnwindBlock); // Delete the new BB. + + ++NumRemoved; + MadeChange = true; + } + + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) + if (CallInst *CI = dyn_cast<CallInst>(I++)) + if (CI->doesNotReturn() && !isa<UnreachableInst>(I)) { + // This call calls a function that cannot return. Insert an + // unreachable instruction after it and simplify the code. Do this + // by splitting the BB, adding the unreachable, then deleting the + // new BB. + BasicBlock *New = BB->splitBasicBlock(I); + + // Remove the uncond branch and add an unreachable. + BB->getInstList().pop_back(); + new UnreachableInst(BB->getContext(), BB); + + DeleteBasicBlock(New); // Delete the new BB. + MadeChange = true; + ++NumUnreach; + break; + } + } + + return MadeChange; +} + +/// DeleteBasicBlock - remove the specified basic block from the program, +/// updating the callgraph to reflect any now-obsolete edges due to calls that +/// exist in the BB. 
+// DeleteBasicBlock - Remove the specified (dead) basic block from the
+// program, updating the call graph so that edges for calls/invokes inside
+// the block are dropped before the instructions themselves are destroyed.
+void PruneEH::DeleteBasicBlock(BasicBlock *BB) {
+  assert(pred_begin(BB) == pred_end(BB) && "BB is not dead!");
+  CallGraph &CG = getAnalysis<CallGraph>();
+
+  CallGraphNode *CGN = CG[BB->getParent()];
+  // Walk the block in reverse so each instruction's uses (by later, equally
+  // dead instructions) are cleaned up before the def is visited.
+  for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; ) {
+    --I;
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      // Debug intrinsics carry no call graph edge, so only remove edges for
+      // real calls.
+      if (!isa<DbgInfoIntrinsic>(I))
+        CGN->removeCallEdgeFor(CI);
+    } else if (InvokeInst *II = dyn_cast<InvokeInst>(I))
+      CGN->removeCallEdgeFor(II);
+    // Any remaining uses live in other (unreachable) blocks; replace them
+    // with undef so this block can be erased safely.
+    if (!I->use_empty())
+      I->replaceAllUsesWith(UndefValue::get(I->getType()));
+  }
+
+  // Get the list of successors of this block.
+  std::vector<BasicBlock*> Succs(succ_begin(BB), succ_end(BB));
+
+  // Tell each successor's PHI nodes that this predecessor is going away.
+  for (unsigned i = 0, e = Succs.size(); i != e; ++i)
+    Succs[i]->removePredecessor(BB);
+
+  BB->eraseFromParent();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
new file mode 100644
index 0000000..b5f09ec
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
@@ -0,0 +1,73 @@
+//===-- StripDeadPrototypes.cpp - Remove unused function declarations ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loops over all of the functions in the input module, looking for
+// dead declarations and removes them. Dead declarations are declarations of
+// functions for which no implementation is available (i.e., declarations for
+// unused library functions).
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "strip-dead-prototypes" +#include "llvm/Transforms/IPO.h" +#include "llvm/Pass.h" +#include "llvm/Module.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed"); + +namespace { + +/// @brief Pass to remove unused function declarations. +class StripDeadPrototypesPass : public ModulePass { +public: + static char ID; // Pass identification, replacement for typeid + StripDeadPrototypesPass() : ModulePass(ID) { + initializeStripDeadPrototypesPassPass(*PassRegistry::getPassRegistry()); + } + virtual bool runOnModule(Module &M); +}; + +} // end anonymous namespace + +char StripDeadPrototypesPass::ID = 0; +INITIALIZE_PASS(StripDeadPrototypesPass, "strip-dead-prototypes", + "Strip Unused Function Prototypes", false, false) + +bool StripDeadPrototypesPass::runOnModule(Module &M) { + bool MadeChange = false; + + // Erase dead function prototypes. + for (Module::iterator I = M.begin(), E = M.end(); I != E; ) { + Function *F = I++; + // Function must be a prototype and unused. + if (F->isDeclaration() && F->use_empty()) { + F->eraseFromParent(); + ++NumDeadPrototypes; + MadeChange = true; + } + } + + // Erase dead global var prototypes. + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ) { + GlobalVariable *GV = I++; + // Global must be a prototype and unused. + if (GV->isDeclaration() && GV->use_empty()) + GV->eraseFromParent(); + } + + // Return an indication of whether we changed anything or not. 
+ return MadeChange; +} + +ModulePass *llvm::createStripDeadPrototypesPass() { + return new StripDeadPrototypesPass(); +} diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp new file mode 100644 index 0000000..a690765 --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -0,0 +1,408 @@ +//===- StripSymbols.cpp - Strip symbols and debug info from a module ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The StripSymbols transformation implements code stripping. Specifically, it +// can delete: +// +// * names for virtual registers +// * symbols for internal globals and functions +// * debug information +// +// Note that this transformation makes code much less readable, so it should +// only be used in situations where the 'strip' utility would be used, such as +// reducing code size or making it harder to reverse engineer code. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/DebugInfo.h" +#include "llvm/ValueSymbolTable.h" +#include "llvm/TypeSymbolTable.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/ADT/SmallPtrSet.h" +using namespace llvm; + +namespace { + class StripSymbols : public ModulePass { + bool OnlyDebugInfo; + public: + static char ID; // Pass identification, replacement for typeid + explicit StripSymbols(bool ODI = false) + : ModulePass(ID), OnlyDebugInfo(ODI) { + initializeStripSymbolsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + } + }; + + class StripNonDebugSymbols : public ModulePass { + public: + static char ID; // Pass identification, replacement for typeid + explicit StripNonDebugSymbols() + : ModulePass(ID) { + initializeStripNonDebugSymbolsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + } + }; + + class StripDebugDeclare : public ModulePass { + public: + static char ID; // Pass identification, replacement for typeid + explicit StripDebugDeclare() + : ModulePass(ID) { + initializeStripDebugDeclarePass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + } + }; + + class StripDeadDebugInfo : public ModulePass { + public: + static char ID; // Pass identification, replacement for typeid + explicit StripDeadDebugInfo() + : ModulePass(ID) { + initializeStripDeadDebugInfoPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + + virtual 
void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + } + }; +} + +char StripSymbols::ID = 0; +INITIALIZE_PASS(StripSymbols, "strip", + "Strip all symbols from a module", false, false) + +ModulePass *llvm::createStripSymbolsPass(bool OnlyDebugInfo) { + return new StripSymbols(OnlyDebugInfo); +} + +char StripNonDebugSymbols::ID = 0; +INITIALIZE_PASS(StripNonDebugSymbols, "strip-nondebug", + "Strip all symbols, except dbg symbols, from a module", + false, false) + +ModulePass *llvm::createStripNonDebugSymbolsPass() { + return new StripNonDebugSymbols(); +} + +char StripDebugDeclare::ID = 0; +INITIALIZE_PASS(StripDebugDeclare, "strip-debug-declare", + "Strip all llvm.dbg.declare intrinsics", false, false) + +ModulePass *llvm::createStripDebugDeclarePass() { + return new StripDebugDeclare(); +} + +char StripDeadDebugInfo::ID = 0; +INITIALIZE_PASS(StripDeadDebugInfo, "strip-dead-debug-info", + "Strip debug info for unused symbols", false, false) + +ModulePass *llvm::createStripDeadDebugInfoPass() { + return new StripDeadDebugInfo(); +} + +/// OnlyUsedBy - Return true if V is only used by Usr. +static bool OnlyUsedBy(Value *V, Value *Usr) { + for(Value::use_iterator I = V->use_begin(), E = V->use_end(); I != E; ++I) { + User *U = *I; + if (U != Usr) + return false; + } + return true; +} + +static void RemoveDeadConstant(Constant *C) { + assert(C->use_empty() && "Constant is not dead!"); + SmallPtrSet<Constant*, 4> Operands; + for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) + if (isa<DerivedType>(C->getOperand(i)->getType()) && + OnlyUsedBy(C->getOperand(i), C)) + Operands.insert(cast<Constant>(C->getOperand(i))); + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) { + if (!GV->hasLocalLinkage()) return; // Don't delete non static globals. + GV->eraseFromParent(); + } + else if (!isa<Function>(C)) + if (isa<CompositeType>(C->getType())) + C->destroyConstant(); + + // If the constant referenced anything, see if we can delete it as well. 
+ for (SmallPtrSet<Constant*, 4>::iterator OI = Operands.begin(), + OE = Operands.end(); OI != OE; ++OI) + RemoveDeadConstant(*OI); +} + +// Strip the symbol table of its names. +// +static void StripSymtab(ValueSymbolTable &ST, bool PreserveDbgInfo) { + for (ValueSymbolTable::iterator VI = ST.begin(), VE = ST.end(); VI != VE; ) { + Value *V = VI->getValue(); + ++VI; + if (!isa<GlobalValue>(V) || cast<GlobalValue>(V)->hasLocalLinkage()) { + if (!PreserveDbgInfo || !V->getName().startswith("llvm.dbg")) + // Set name to "", removing from symbol table! + V->setName(""); + } + } +} + +// Strip the symbol table of its names. +static void StripTypeSymtab(TypeSymbolTable &ST, bool PreserveDbgInfo) { + for (TypeSymbolTable::iterator TI = ST.begin(), E = ST.end(); TI != E; ) { + if (PreserveDbgInfo && StringRef(TI->first).startswith("llvm.dbg")) + ++TI; + else + ST.remove(TI++); + } +} + +/// Find values that are marked as llvm.used. +static void findUsedValues(GlobalVariable *LLVMUsed, + SmallPtrSet<const GlobalValue*, 8> &UsedValues) { + if (LLVMUsed == 0) return; + UsedValues.insert(LLVMUsed); + + ConstantArray *Inits = dyn_cast<ConstantArray>(LLVMUsed->getInitializer()); + if (Inits == 0) return; + + for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) + if (GlobalValue *GV = + dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts())) + UsedValues.insert(GV); +} + +/// StripSymbolNames - Strip symbol names. 
+// Clears the names of all internal globals and functions (and their local
+// values) except those listed in llvm.used/llvm.compiler.used, optionally
+// preserving names that begin with "llvm.dbg".
+static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
+
+  SmallPtrSet<const GlobalValue*, 8> llvmUsedValues;
+  findUsedValues(M.getGlobalVariable("llvm.used"), llvmUsedValues);
+  findUsedValues(M.getGlobalVariable("llvm.compiler.used"), llvmUsedValues);
+
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0)
+      if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
+        I->setName("");     // Internal symbols can't participate in linkage
+  }
+
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0)
+      if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
+        I->setName("");     // Internal symbols can't participate in linkage
+    // Also strip the names of all local values inside each function.
+    StripSymtab(I->getValueSymbolTable(), PreserveDbgInfo);
+  }
+
+  // Remove all names from types.
+  StripTypeSymtab(M.getTypeSymbolTable(), PreserveDbgInfo);
+
+  // NOTE(review): unconditionally reports a change, even if nothing was
+  // actually renamed.
+  return true;
+}
+
+// StripDebugInfo - Strip debug info in the module if it exists.
+// To do this, we remove llvm.dbg.func.start, llvm.dbg.stoppoint, and
+// llvm.dbg.region.end calls, and any globals they point to if now dead.
+static bool StripDebugInfo(Module &M) {
+
+  bool Changed = false;
+
+  // Remove all of the calls to the debugger intrinsics, and remove them from
+  // the module.
+ if (Function *Declare = M.getFunction("llvm.dbg.declare")) { + while (!Declare->use_empty()) { + CallInst *CI = cast<CallInst>(Declare->use_back()); + CI->eraseFromParent(); + } + Declare->eraseFromParent(); + Changed = true; + } + + if (Function *DbgVal = M.getFunction("llvm.dbg.value")) { + while (!DbgVal->use_empty()) { + CallInst *CI = cast<CallInst>(DbgVal->use_back()); + CI->eraseFromParent(); + } + DbgVal->eraseFromParent(); + Changed = true; + } + + for (Module::named_metadata_iterator NMI = M.named_metadata_begin(), + NME = M.named_metadata_end(); NMI != NME;) { + NamedMDNode *NMD = NMI; + ++NMI; + if (NMD->getName().startswith("llvm.dbg.")) { + NMD->eraseFromParent(); + Changed = true; + } + } + + for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI) + for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE; + ++FI) + for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; + ++BI) { + if (!BI->getDebugLoc().isUnknown()) { + Changed = true; + BI->setDebugLoc(DebugLoc()); + } + } + + return Changed; +} + +bool StripSymbols::runOnModule(Module &M) { + bool Changed = false; + Changed |= StripDebugInfo(M); + if (!OnlyDebugInfo) + Changed |= StripSymbolNames(M, false); + return Changed; +} + +bool StripNonDebugSymbols::runOnModule(Module &M) { + return StripSymbolNames(M, true); +} + +bool StripDebugDeclare::runOnModule(Module &M) { + + Function *Declare = M.getFunction("llvm.dbg.declare"); + std::vector<Constant*> DeadConstants; + + if (Declare) { + while (!Declare->use_empty()) { + CallInst *CI = cast<CallInst>(Declare->use_back()); + Value *Arg1 = CI->getArgOperand(0); + Value *Arg2 = CI->getArgOperand(1); + assert(CI->use_empty() && "llvm.dbg intrinsic should have void result"); + CI->eraseFromParent(); + if (Arg1->use_empty()) { + if (Constant *C = dyn_cast<Constant>(Arg1)) + DeadConstants.push_back(C); + else + RecursivelyDeleteTriviallyDeadInstructions(Arg1); + } + if (Arg2->use_empty()) + if (Constant *C = 
dyn_cast<Constant>(Arg2)) + DeadConstants.push_back(C); + } + Declare->eraseFromParent(); + } + + while (!DeadConstants.empty()) { + Constant *C = DeadConstants.back(); + DeadConstants.pop_back(); + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) { + if (GV->hasLocalLinkage()) + RemoveDeadConstant(GV); + } else + RemoveDeadConstant(C); + } + + return true; +} + +/// getRealLinkageName - If special LLVM prefix that is used to inform the asm +/// printer to not emit usual symbol prefix before the symbol name is used then +/// return linkage name after skipping this special LLVM prefix. +static StringRef getRealLinkageName(StringRef LinkageName) { + char One = '\1'; + if (LinkageName.startswith(StringRef(&One, 1))) + return LinkageName.substr(1); + return LinkageName; +} + +bool StripDeadDebugInfo::runOnModule(Module &M) { + bool Changed = false; + + // Debugging infomration is encoded in llvm IR using metadata. This is designed + // such a way that debug info for symbols preserved even if symbols are + // optimized away by the optimizer. This special pass removes debug info for + // such symbols. + + // llvm.dbg.gv keeps track of debug info for global variables. + if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.gv")) { + SmallVector<MDNode *, 8> MDs; + for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) + if (DIGlobalVariable(NMD->getOperand(i)).Verify()) + MDs.push_back(NMD->getOperand(i)); + else + Changed = true; + NMD->eraseFromParent(); + NMD = NULL; + + for (SmallVector<MDNode *, 8>::iterator I = MDs.begin(), + E = MDs.end(); I != E; ++I) { + GlobalVariable *GV = DIGlobalVariable(*I).getGlobal(); + if (GV && M.getGlobalVariable(GV->getName(), true)) { + if (!NMD) + NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv"); + NMD->addOperand(*I); + } + else + Changed = true; + } + } + + // llvm.dbg.sp keeps track of debug info for subprograms. 
+ if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.sp")) { + SmallVector<MDNode *, 8> MDs; + for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) + if (DISubprogram(NMD->getOperand(i)).Verify()) + MDs.push_back(NMD->getOperand(i)); + else + Changed = true; + NMD->eraseFromParent(); + NMD = NULL; + + for (SmallVector<MDNode *, 8>::iterator I = MDs.begin(), + E = MDs.end(); I != E; ++I) { + bool FnIsLive = false; + if (Function *F = DISubprogram(*I).getFunction()) + if (M.getFunction(F->getName())) + FnIsLive = true; + if (FnIsLive) { + if (!NMD) + NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp"); + NMD->addOperand(*I); + } else { + // Remove llvm.dbg.lv.fnname named mdnode which may have been used + // to hold debug info for dead function's local variables. + StringRef FName = DISubprogram(*I).getLinkageName(); + if (FName.empty()) + FName = DISubprogram(*I).getName(); + if (NamedMDNode *LVNMD = + M.getNamedMetadata(Twine("llvm.dbg.lv.", + getRealLinkageName(FName)))) + LVNMD->eraseFromParent(); + } + } + } + + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/IPO/StructRetPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/StructRetPromotion.cpp new file mode 100644 index 0000000..584deac --- /dev/null +++ b/contrib/llvm/lib/Transforms/IPO/StructRetPromotion.cpp @@ -0,0 +1,357 @@ +//===-- StructRetPromotion.cpp - Promote sret arguments -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass finds functions that return a struct (using a pointer to the struct +// as the first argument of the function, marked with the 'sret' attribute) and +// replaces them with a new function that simply returns each of the elements of +// that struct (using multiple return values). 
+// +// This pass works under a number of conditions: +// 1. The returned struct must not contain other structs +// 2. The returned struct must only be used to load values from +// 3. The placeholder struct passed in is the result of an alloca +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sretpromotion" +#include "llvm/Transforms/IPO.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/CallGraphSCCPass.h" +#include "llvm/Instructions.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +STATISTIC(NumRejectedSRETUses , "Number of sret rejected due to unexpected uses"); +STATISTIC(NumSRET , "Number of sret promoted"); +namespace { + /// SRETPromotion - This pass removes sret parameter and updates + /// function to use multiple return value. 
+ /// + struct SRETPromotion : public CallGraphSCCPass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + CallGraphSCCPass::getAnalysisUsage(AU); + } + + virtual bool runOnSCC(CallGraphSCC &SCC); + static char ID; // Pass identification, replacement for typeid + SRETPromotion() : CallGraphSCCPass(ID) { + initializeSRETPromotionPass(*PassRegistry::getPassRegistry()); + } + + private: + CallGraphNode *PromoteReturn(CallGraphNode *CGN); + bool isSafeToUpdateAllCallers(Function *F); + Function *cloneFunctionBody(Function *F, const StructType *STy); + CallGraphNode *updateCallSites(Function *F, Function *NF); + }; +} + +char SRETPromotion::ID = 0; +INITIALIZE_PASS_BEGIN(SRETPromotion, "sretpromotion", + "Promote sret arguments to multiple ret values", false, false) +INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_END(SRETPromotion, "sretpromotion", + "Promote sret arguments to multiple ret values", false, false) + +Pass *llvm::createStructRetPromotionPass() { + return new SRETPromotion(); +} + +bool SRETPromotion::runOnSCC(CallGraphSCC &SCC) { + bool Changed = false; + + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) + if (CallGraphNode *NewNode = PromoteReturn(*I)) { + SCC.ReplaceNode(*I, NewNode); + Changed = true; + } + + return Changed; +} + +/// PromoteReturn - This method promotes function that uses StructRet paramater +/// into a function that uses multiple return values. +CallGraphNode *SRETPromotion::PromoteReturn(CallGraphNode *CGN) { + Function *F = CGN->getFunction(); + + if (!F || F->isDeclaration() || !F->hasLocalLinkage()) + return 0; + + // Make sure that function returns struct. 
+ if (F->arg_size() == 0 || !F->hasStructRetAttr() || F->doesNotReturn()) + return 0; + + DEBUG(dbgs() << "SretPromotion: Looking at sret function " + << F->getName() << "\n"); + + assert(F->getReturnType()->isVoidTy() && "Invalid function return type"); + Function::arg_iterator AI = F->arg_begin(); + const llvm::PointerType *FArgType = dyn_cast<PointerType>(AI->getType()); + assert(FArgType && "Invalid sret parameter type"); + const llvm::StructType *STy = + dyn_cast<StructType>(FArgType->getElementType()); + assert(STy && "Invalid sret parameter element type"); + + // Check if it is ok to perform this promotion. + if (isSafeToUpdateAllCallers(F) == false) { + DEBUG(dbgs() << "SretPromotion: Not all callers can be updated\n"); + ++NumRejectedSRETUses; + return 0; + } + + DEBUG(dbgs() << "SretPromotion: sret argument will be promoted\n"); + ++NumSRET; + // [1] Replace use of sret parameter + AllocaInst *TheAlloca = new AllocaInst(STy, NULL, "mrv", + F->getEntryBlock().begin()); + Value *NFirstArg = F->arg_begin(); + NFirstArg->replaceAllUsesWith(TheAlloca); + + // [2] Find and replace ret instructions + for (Function::iterator FI = F->begin(), FE = F->end(); FI != FE; ++FI) + for(BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) { + Instruction *I = BI; + ++BI; + if (isa<ReturnInst>(I)) { + Value *NV = new LoadInst(TheAlloca, "mrv.ld", I); + ReturnInst *NR = ReturnInst::Create(F->getContext(), NV, I); + I->replaceAllUsesWith(NR); + I->eraseFromParent(); + } + } + + // [3] Create the new function body and insert it into the module. + Function *NF = cloneFunctionBody(F, STy); + + // [4] Update all call sites to use new function + CallGraphNode *NF_CFN = updateCallSites(F, NF); + + CallGraph &CG = getAnalysis<CallGraph>(); + NF_CFN->stealCalledFunctionsFrom(CG[F]); + + delete CG.removeFunctionFromModule(F); + return NF_CFN; +} + +// Check if it is ok to perform this promotion. 
+// isSafeToUpdateAllCallers - Return true only if every use of F is a direct
+// call/invoke whose first (sret) argument is an alloca that is read back
+// exclusively through same-block GEP+load pairs, so every call site can be
+// rewritten to consume the promoted multiple-return value.
+bool SRETPromotion::isSafeToUpdateAllCallers(Function *F) {
+
+  if (F->use_empty())
+    // No users. OK to modify signature.
+    return true;
+
+  for (Value::use_iterator FnUseI = F->use_begin(), FnUseE = F->use_end();
+       FnUseI != FnUseE; ++FnUseI) {
+    // The function is passed in as an argument to (possibly) another function,
+    // we can't change it!
+    CallSite CS(*FnUseI);
+    Instruction *Call = CS.getInstruction();
+    // The function is used by something else than a call or invoke instruction,
+    // we can't change it!
+    if (!Call || !CS.isCallee(FnUseI))
+      return false;
+    // The sret pointer is always the first call argument.
+    CallSite::arg_iterator AI = CS.arg_begin();
+    Value *FirstArg = *AI;
+
+    // Only promote when the sret buffer is a local alloca at the call site.
+    if (!isa<AllocaInst>(FirstArg))
+      return false;
+
+    // Check FirstArg's users.
+    for (Value::use_iterator ArgI = FirstArg->use_begin(),
+         ArgE = FirstArg->use_end(); ArgI != ArgE; ++ArgI) {
+      User *U = *ArgI;
+      // If FirstArg user is a CallInst that does not correspond to current
+      // call site then this function F is not suitable for sret promotion.
+      if (CallInst *CI = dyn_cast<CallInst>(U)) {
+        if (CI != Call)
+          return false;
+      }
+      // If FirstArg user is a GEP whose all users are not LoadInst then
+      // this function F is not suitable for sret promotion.
+      else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+        // TODO : Use dom info and insert PHINodes to collect get results
+        // from multiple call sites for this GEP.
+        if (GEP->getParent() != Call->getParent())
+          return false;
+        for (Value::use_iterator GEPI = GEP->use_begin(), GEPE = GEP->use_end();
+             GEPI != GEPE; ++GEPI)
+          if (!isa<LoadInst>(*GEPI))
+            return false;
+      }
+      // Any other FirstArg users make this function unsuitable for sret
+      // promotion.
+      else
+        return false;
+    }
+  }
+
+  return true;
+}
+
+/// cloneFunctionBody - Create a new function based on F and
+/// insert it into module. Remove first argument. Use STy as
+/// the return type for new function.
+// Builds NF: identical to F except the leading sret pointer parameter is
+// dropped and the return type becomes STy.  F's body is spliced into NF and
+// the remaining arguments are rewired; F itself is left empty (the caller
+// deletes it).
+Function *SRETPromotion::cloneFunctionBody(Function *F,
+                                           const StructType *STy) {
+
+  const FunctionType *FTy = F->getFunctionType();
+  std::vector<const Type*> Params;
+
+  // Attributes - Keep track of the parameter attributes for the arguments.
+  SmallVector<AttributeWithIndex, 8> AttributesVec;
+  const AttrListPtr &PAL = F->getAttributes();
+
+  // Add any return attributes.
+  if (Attributes attrs = PAL.getRetAttributes())
+    AttributesVec.push_back(AttributeWithIndex::get(0, attrs));
+
+  // Skip first argument.
+  Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+  ++I;
+  // 0th parameter attribute is reserved for return type.
+  // 1th parameter attribute is for first 1st sret argument.
+  unsigned ParamIndex = 2;
+  while (I != E) {
+    Params.push_back(I->getType());
+    // Attribute indices shift down by one because the sret arg is removed.
+    if (Attributes Attrs = PAL.getParamAttributes(ParamIndex))
+      AttributesVec.push_back(AttributeWithIndex::get(ParamIndex - 1, Attrs));
+    ++I;
+    ++ParamIndex;
+  }
+
+  // Add any fn attributes.
+  if (Attributes attrs = PAL.getFnAttributes())
+    AttributesVec.push_back(AttributeWithIndex::get(~0, attrs));
+
+
+  FunctionType *NFTy = FunctionType::get(STy, Params, FTy->isVarArg());
+  Function *NF = Function::Create(NFTy, F->getLinkage());
+  NF->takeName(F);
+  NF->copyAttributesFrom(F);
+  NF->setAttributes(AttrListPtr::get(AttributesVec.begin(), AttributesVec.end()));
+  // Insert NF before F in the module, then move F's entire body into NF.
+  F->getParent()->getFunctionList().insert(F, NF);
+  NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+  // Replace arguments: skip F's sret arg, then map each old arg to the
+  // corresponding new arg.
+  I = F->arg_begin();
+  E = F->arg_end();
+  Function::arg_iterator NI = NF->arg_begin();
+  ++I;
+  while (I != E) {
+    I->replaceAllUsesWith(NI);
+    NI->takeName(I);
+    ++I;
+    ++NI;
+  }
+
+  return NF;
+}
+
+/// updateCallSites - Update all sites that call F to use NF.
+// Rewrites every call/invoke of F into a call of NF (dropping the sret
+// argument), replaces the former GEP+load reads of the sret buffer with
+// extractvalue on the new aggregate result, and keeps the call graph in
+// sync.  Returns NF's call graph node.
+CallGraphNode *SRETPromotion::updateCallSites(Function *F, Function *NF) {
+  CallGraph &CG = getAnalysis<CallGraph>();
+  SmallVector<Value*, 16> Args;
+
+  // Attributes - Keep track of the parameter attributes for the arguments.
+  SmallVector<AttributeWithIndex, 8> ArgAttrsVec;
+
+  // Get a new callgraph node for NF.
+  CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF);
+
+  // Each iteration erases one call site, so the use list shrinks to empty.
+  while (!F->use_empty()) {
+    CallSite CS(*F->use_begin());
+    Instruction *Call = CS.getInstruction();
+
+    const AttrListPtr &PAL = F->getAttributes();
+    // Add any return attributes.
+    if (Attributes attrs = PAL.getRetAttributes())
+      ArgAttrsVec.push_back(AttributeWithIndex::get(0, attrs));
+
+    // Copy arguments, however skip first one.
+    CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
+    Value *FirstCArg = *AI;
+    ++AI;
+    // 0th parameter attribute is reserved for return type.
+    // 1th parameter attribute is for first 1st sret argument.
+    unsigned ParamIndex = 2;
+    while (AI != AE) {
+      Args.push_back(*AI);
+      if (Attributes Attrs = PAL.getParamAttributes(ParamIndex))
+        ArgAttrsVec.push_back(AttributeWithIndex::get(ParamIndex - 1, Attrs));
+      ++ParamIndex;
+      ++AI;
+    }
+
+    // Add any function attributes.
+    if (Attributes attrs = PAL.getFnAttributes())
+      ArgAttrsVec.push_back(AttributeWithIndex::get(~0, attrs));
+
+    AttrListPtr NewPAL = AttrListPtr::get(ArgAttrsVec.begin(), ArgAttrsVec.end());
+
+    // Build new call instruction.
+    Instruction *New;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+      New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+                               Args.begin(), Args.end(), "", Call);
+      cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<InvokeInst>(New)->setAttributes(NewPAL);
+    } else {
+      New = CallInst::Create(NF, Args.begin(), Args.end(), "", Call);
+      cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<CallInst>(New)->setAttributes(NewPAL);
+      if (cast<CallInst>(Call)->isTailCall())
+        cast<CallInst>(New)->setTailCall();
+    }
+    // Reset the scratch vectors for the next call site.
+    Args.clear();
+    ArgAttrsVec.clear();
+    New->takeName(Call);
+
+    // Update the callgraph to know that the callsite has been transformed.
+    // NOTE(review): despite its name, CalleeNode is the *calling* function's
+    // call graph node (the parent of the call instruction).
+    CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()];
+    CalleeNode->removeCallEdgeFor(Call);
+    CalleeNode->addCalledFunction(New, NF_CGN);
+
+    // Update all users of sret parameter to extract value using extractvalue.
+    // isSafeToUpdateAllCallers guarantees the only users are this call and
+    // GEPs whose users are all loads.
+    for (Value::use_iterator UI = FirstCArg->use_begin(),
+         UE = FirstCArg->use_end(); UI != UE; ) {
+      User *U2 = *UI++;
+      CallInst *C2 = dyn_cast<CallInst>(U2);
+      if (C2 && (C2 == Call))
+        continue;
+
+      // Operand 2 of the GEP is the constant field index into the struct.
+      GetElementPtrInst *UGEP = cast<GetElementPtrInst>(U2);
+      ConstantInt *Idx = cast<ConstantInt>(UGEP->getOperand(2));
+      Value *GR = ExtractValueInst::Create(New, Idx->getZExtValue(),
+                                           "evi", UGEP);
+      while(!UGEP->use_empty()) {
+        // isSafeToUpdateAllCallers has checked that all GEP uses are
+        // LoadInsts
+        LoadInst *L = cast<LoadInst>(*UGEP->use_begin());
+        L->replaceAllUsesWith(GR);
+        L->eraseFromParent();
+      }
+      UGEP->eraseFromParent();
+      continue;
+    }
+    Call->eraseFromParent();
+  }
+
+  return NF_CGN;
+}
+
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h
new file mode 100644
index 0000000..9c2969c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h
@@ -0,0 +1,356 @@
+//===- InstCombine.h - Main InstCombine pass definition -------------------===//
+//
+// The
LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef INSTCOMBINE_INSTCOMBINE_H +#define INSTCOMBINE_INSTCOMBINE_H + +#include "InstCombineWorklist.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/InstVisitor.h" +#include "llvm/Support/TargetFolder.h" + +namespace llvm { + class CallSite; + class TargetData; + class DbgDeclareInst; + class MemIntrinsic; + class MemSetInst; + +/// SelectPatternFlavor - We can match a variety of different patterns for +/// select operations. +enum SelectPatternFlavor { + SPF_UNKNOWN = 0, + SPF_SMIN, SPF_UMIN, + SPF_SMAX, SPF_UMAX + //SPF_ABS - TODO. +}; + +/// getComplexity: Assign a complexity or rank value to LLVM Values... +/// 0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst +static inline unsigned getComplexity(Value *V) { + if (isa<Instruction>(V)) { + if (BinaryOperator::isNeg(V) || + BinaryOperator::isFNeg(V) || + BinaryOperator::isNot(V)) + return 3; + return 4; + } + if (isa<Argument>(V)) return 3; + return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2; +} + + +/// InstCombineIRInserter - This is an IRBuilder insertion helper that works +/// just like the normal insertion helper, but also adds any new instructions +/// to the instcombine worklist. +class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter + : public IRBuilderDefaultInserter<true> { + InstCombineWorklist &Worklist; +public: + InstCombineIRInserter(InstCombineWorklist &WL) : Worklist(WL) {} + + void InsertHelper(Instruction *I, const Twine &Name, + BasicBlock *BB, BasicBlock::iterator InsertPt) const { + IRBuilderDefaultInserter<true>::InsertHelper(I, Name, BB, InsertPt); + Worklist.Add(I); + } +}; + +/// InstCombiner - The -instcombine pass. 
class LLVM_LIBRARY_VISIBILITY InstCombiner
    : public FunctionPass,
      public InstVisitor<InstCombiner, Instruction*> {
  TargetData *TD;              // Target layout info, null if unavailable.
  bool MustPreserveLCSSA;
  bool MadeIRChange;           // True once any IR mutation has happened.
public:
  /// Worklist - All of the instructions that need to be simplified.
  InstCombineWorklist Worklist;

  /// Builder - This is an IRBuilder that automatically inserts new
  /// instructions into the worklist when they are created.
  typedef IRBuilder<true, TargetFolder, InstCombineIRInserter> BuilderTy;
  BuilderTy *Builder;

  static char ID; // Pass identification, replacement for typeid
  InstCombiner() : FunctionPass(ID), TD(0), Builder(0) {
    initializeInstCombinerPass(*PassRegistry::getPassRegistry());
  }

public:
  virtual bool runOnFunction(Function &F);

  /// DoOneIteration - Run one pass of the combiner over F; ItNum is the
  /// iteration count (used for diagnostics/limits by the implementation).
  bool DoOneIteration(Function &F, unsigned ItNum);

  virtual void getAnalysisUsage(AnalysisUsage &AU) const;

  TargetData *getTargetData() const { return TD; }

  // Visitation implementation - Implement instruction combining for different
  // instruction types.  The semantics are as follows:
  // Return Value:
  //    null        - No change was made
  //     I          - Change was made, I is still valid, I may be dead though
  //   otherwise    - Change was made, replace I with returned instruction
  //
  Instruction *visitAdd(BinaryOperator &I);
  Instruction *visitFAdd(BinaryOperator &I);
  Value *OptimizePointerDifference(Value *LHS, Value *RHS, const Type *Ty);
  Instruction *visitSub(BinaryOperator &I);
  Instruction *visitFSub(BinaryOperator &I);
  Instruction *visitMul(BinaryOperator &I);
  Instruction *visitFMul(BinaryOperator &I);
  Instruction *visitURem(BinaryOperator &I);
  Instruction *visitSRem(BinaryOperator &I);
  Instruction *visitFRem(BinaryOperator &I);
  bool SimplifyDivRemOfSelect(BinaryOperator &I);
  Instruction *commonRemTransforms(BinaryOperator &I);
  Instruction *commonIRemTransforms(BinaryOperator &I);
  Instruction *commonDivTransforms(BinaryOperator &I);
  Instruction *commonIDivTransforms(BinaryOperator &I);
  Instruction *visitUDiv(BinaryOperator &I);
  Instruction *visitSDiv(BinaryOperator &I);
  Instruction *visitFDiv(BinaryOperator &I);
  Value *FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS);
  Value *FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
  Instruction *visitAnd(BinaryOperator &I);
  Value *FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS);
  Value *FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
  Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op,
                                   Value *A, Value *B, Value *C);
  Instruction *visitOr (BinaryOperator &I);
  Instruction *visitXor(BinaryOperator &I);
  Instruction *visitShl(BinaryOperator &I);
  Instruction *visitAShr(BinaryOperator &I);
  Instruction *visitLShr(BinaryOperator &I);
  Instruction *commonShiftTransforms(BinaryOperator &I);
  Instruction *FoldFCmp_IntToFP_Cst(FCmpInst &I, Instruction *LHSI,
                                    Constant *RHSC);
  Instruction *FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
                                            GlobalVariable *GV, CmpInst &ICI,
                                            ConstantInt *AndCst = 0);
  Instruction *visitFCmpInst(FCmpInst &I);
  Instruction *visitICmpInst(ICmpInst &I);
  Instruction *visitICmpInstWithCastAndCast(ICmpInst &ICI);
  Instruction *visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
                                              Instruction *LHS,
                                              ConstantInt *RHS);
  Instruction *FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
                              ConstantInt *DivRHS);
  Instruction *FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *DivI,
                              ConstantInt *DivRHS);
  Instruction *FoldICmpAddOpCst(ICmpInst &ICI, Value *X, ConstantInt *CI,
                                ICmpInst::Predicate Pred, Value *TheAdd);
  Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
                           ICmpInst::Predicate Cond, Instruction &I);
  Instruction *FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
                                   BinaryOperator &I);
  Instruction *commonCastTransforms(CastInst &CI);
  Instruction *commonPointerCastTransforms(CastInst &CI);
  Instruction *visitTrunc(TruncInst &CI);
  Instruction *visitZExt(ZExtInst &CI);
  Instruction *visitSExt(SExtInst &CI);
  Instruction *visitFPTrunc(FPTruncInst &CI);
  Instruction *visitFPExt(CastInst &CI);
  Instruction *visitFPToUI(FPToUIInst &FI);
  Instruction *visitFPToSI(FPToSIInst &FI);
  Instruction *visitUIToFP(CastInst &CI);
  Instruction *visitSIToFP(CastInst &CI);
  Instruction *visitPtrToInt(PtrToIntInst &CI);
  Instruction *visitIntToPtr(IntToPtrInst &CI);
  Instruction *visitBitCast(BitCastInst &CI);
  Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI,
                              Instruction *FI);
  Instruction *FoldSelectIntoOp(SelectInst &SI, Value*, Value*);
  Instruction *FoldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1,
                            Value *A, Value *B, Instruction &Outer,
                            SelectPatternFlavor SPF2, Value *C);
  Instruction *visitSelectInst(SelectInst &SI);
  Instruction *visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
  Instruction *visitCallInst(CallInst &CI);
  Instruction *visitInvokeInst(InvokeInst &II);

  Instruction *SliceUpIllegalIntegerPHI(PHINode &PN);
  Instruction *visitPHINode(PHINode &PN);
  Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
  Instruction *visitAllocaInst(AllocaInst &AI);
  Instruction *visitMalloc(Instruction &FI);
  Instruction *visitFree(CallInst &FI);
  Instruction *visitLoadInst(LoadInst &LI);
  Instruction *visitStoreInst(StoreInst &SI);
  Instruction *visitBranchInst(BranchInst &BI);
  Instruction *visitSwitchInst(SwitchInst &SI);
  Instruction *visitInsertElementInst(InsertElementInst &IE);
  Instruction *visitExtractElementInst(ExtractElementInst &EI);
  Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
  Instruction *visitExtractValueInst(ExtractValueInst &EV);

  // visitInstruction - Specify what to return for unhandled instructions...
  Instruction *visitInstruction(Instruction &I) { return 0; }

private:
  bool ShouldChangeType(const Type *From, const Type *To) const;
  Value *dyn_castNegVal(Value *V) const;
  Value *dyn_castFNegVal(Value *V) const;
  const Type *FindElementAtOffset(const Type *Ty, int64_t Offset,
                                  SmallVectorImpl<Value*> &NewIndices);
  Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);

  /// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually
  /// results in any code being generated and is interesting to optimize out. If
  /// the cast can be eliminated by some other simple transformation, we prefer
  /// to do the simplification first.
  bool ShouldOptimizeCast(Instruction::CastOps opcode,const Value *V,
                          const Type *Ty);

  Instruction *visitCallSite(CallSite CS);
  Instruction *tryOptimizeCall(CallInst *CI, const TargetData *TD);
  bool transformConstExprCastCall(CallSite CS);
  Instruction *transformCallThroughTrampoline(CallSite CS);
  Instruction *transformZExtICmp(ICmpInst *ICI, Instruction &CI,
                                 bool DoXform = true);
  bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS);
  DbgDeclareInst *hasOneUsePlusDeclare(Value *V);
  Value *EmitGEPOffset(User *GEP);

public:
  // InsertNewInstBefore - insert an instruction New before instruction Old
  // in the program.  Add the new instruction to the worklist.
  //
  Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) {
    assert(New && New->getParent() == 0 &&
           "New instruction already inserted into a basic block!");
    BasicBlock *BB = Old.getParent();
    BB->getInstList().insert(&Old, New);  // Insert inst
    Worklist.Add(New);
    return New;
  }

  // ReplaceInstUsesWith - This method is to be used when an instruction is
  // found to be dead, replacable with another preexisting expression.  Here
  // we add all uses of I to the worklist, replace all uses of I with the new
  // value, then return I, so that the inst combiner will know that I was
  // modified.
  //
  Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) {
    Worklist.AddUsersToWorkList(I);   // Add all modified instrs to worklist.

    // If we are replacing the instruction with itself, this must be in a
    // segment of unreachable code, so just clobber the instruction.
    if (&I == V)
      V = UndefValue::get(I.getType());

    I.replaceAllUsesWith(V);
    return &I;
  }

  // EraseInstFromFunction - When dealing with an instruction that has side
  // effects or produces a void value, we can't rely on DCE to delete the
  // instruction.  Instead, visit methods should return the value returned by
  // this function.
  Instruction *EraseInstFromFunction(Instruction &I) {
    DEBUG(errs() << "IC: ERASE " << I << '\n');

    assert(I.use_empty() && "Cannot erase instruction that is used!");
    // Make sure that we reprocess all operands now that we reduced their
    // use counts.  (Cap at 8 operands so pathological instructions, e.g.
    // huge PHIs, don't flood the worklist.)
    if (I.getNumOperands() < 8) {
      for (User::op_iterator i = I.op_begin(), e = I.op_end(); i != e; ++i)
        if (Instruction *Op = dyn_cast<Instruction>(*i))
          Worklist.Add(Op);
    }
    Worklist.Remove(&I);
    I.eraseFromParent();
    MadeIRChange = true;
    return 0;  // Don't do anything with FI
  }

  // Thin wrappers over the ValueTracking helpers that thread TD through.
  void ComputeMaskedBits(Value *V, const APInt &Mask, APInt &KnownZero,
                         APInt &KnownOne, unsigned Depth = 0) const {
    return llvm::ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD, Depth);
  }

  bool MaskedValueIsZero(Value *V, const APInt &Mask,
                         unsigned Depth = 0) const {
    return llvm::MaskedValueIsZero(V, Mask, TD, Depth);
  }
  unsigned ComputeNumSignBits(Value *Op, unsigned Depth = 0) const {
    return llvm::ComputeNumSignBits(Op, TD, Depth);
  }

private:

  /// SimplifyAssociativeOrCommutative - This performs a few simplifications for
  /// operators which are associative or commutative.
  bool SimplifyAssociativeOrCommutative(BinaryOperator &I);

  /// SimplifyUsingDistributiveLaws - This tries to simplify binary operations
  /// which some other binary operation distributes over either by factorizing
  /// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this
  /// results in simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is
  /// a win).  Returns the simplified value, or null if it didn't simplify.
  Value *SimplifyUsingDistributiveLaws(BinaryOperator &I);

  /// SimplifyDemandedUseBits - Attempts to replace V with a simpler value
  /// based on the demanded bits.
  Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
                                 APInt& KnownZero, APInt& KnownOne,
                                 unsigned Depth);
  bool SimplifyDemandedBits(Use &U, APInt DemandedMask,
                            APInt& KnownZero, APInt& KnownOne,
                            unsigned Depth=0);

  /// SimplifyDemandedInstructionBits - Inst is an integer instruction that
  /// SimplifyDemandedBits knows about.  See if the instruction has any
  /// properties that allow us to simplify its operands.
  bool SimplifyDemandedInstructionBits(Instruction &Inst);

  Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
                                    APInt& UndefElts, unsigned Depth = 0);

  // FoldOpIntoPhi - Given a binary operator, cast instruction, or select
  // which has a PHI node as operand #0, see if we can fold the instruction
  // into the PHI (which is only possible if all operands to the PHI are
  // constants).
  //
  Instruction *FoldOpIntoPhi(Instruction &I);

  // FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary"
  // operator and they all are only used by the PHI, PHI together their
  // inputs, and do the operation once, to the result of the PHI.
  Instruction *FoldPHIArgOpIntoPHI(PHINode &PN);
  Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN);
  Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN);
  Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN);


  Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,
                        ConstantInt *AndRHS, BinaryOperator &TheAnd);

  Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask,
                            bool isSub, Instruction &I);
  Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
                         bool isSigned, bool Inside);
  Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
  Instruction *MatchBSwap(BinaryOperator &I);
  bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
  Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
  Instruction *SimplifyMemSet(MemSetInst *MI);


  Value *EvaluateInDifferentType(Value *V, const Type *Ty, bool isSigned);
};



} // end namespace llvm.

#endif
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
new file mode 100644
index 0000000..c36a955
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -0,0 +1,697 @@
//===- InstCombineAddSub.cpp ----------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the visit functions for add, fadd, sub, and fsub.
//
//===----------------------------------------------------------------------===//

#include "InstCombine.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/PatternMatch.h"
using namespace llvm;
using namespace PatternMatch;

/// AddOne - Add one to a ConstantInt.
static Constant *AddOne(Constant *C) {
  return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
}
/// SubOne - Subtract one from a ConstantInt.
static Constant *SubOne(ConstantInt *C) {
  return ConstantInt::get(C->getContext(), C->getValue()-1);
}


// dyn_castFoldableMul - If this value is a multiply that can be folded into
// other computations (because it has a constant operand), return the
// non-constant operand of the multiply, and set CST to point to the multiplier.
// Otherwise, return null.  A single-use "shl X, C" is treated as a multiply
// by 1<<C and CST is rewritten accordingly.
//
static inline Value *dyn_castFoldableMul(Value *V, ConstantInt *&CST) {
  // Only fold single-use integer values; folding a multi-use multiply would
  // duplicate work rather than eliminate it.
  if (!V->hasOneUse() || !V->getType()->isIntegerTy())
    return 0;

  Instruction *I = dyn_cast<Instruction>(V);
  if (I == 0) return 0;

  if (I->getOpcode() == Instruction::Mul)
    if ((CST = dyn_cast<ConstantInt>(I->getOperand(1))))
      return I->getOperand(0);
  if (I->getOpcode() == Instruction::Shl)
    if ((CST = dyn_cast<ConstantInt>(I->getOperand(1)))) {
      // The multiplier is really 1 << CST.
      uint32_t BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
      uint32_t CSTVal = CST->getLimitedValue(BitWidth);
      CST = ConstantInt::get(V->getType()->getContext(),
                             APInt(BitWidth, 1).shl(CSTVal));
      return I->getOperand(0);
    }
  return 0;
}


/// WillNotOverflowSignedAdd - Return true if we can prove that:
///    (sext (add LHS, RHS))  === (add (sext LHS), (sext RHS))
/// This basically requires proving that the add in the original type would not
/// overflow to change the sign bit or have a carry out.
bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS) {
  // There are different heuristics we can use for this.  Here are some simple
  // ones.

  // Add has the property that adding any two 2's complement numbers can only
  // have one carry bit which can change a sign.  As such, if LHS and RHS each
  // have at least two sign bits, we know that the addition of the two values
  // will sign extend fine.
  if (ComputeNumSignBits(LHS) > 1 && ComputeNumSignBits(RHS) > 1)
    return true;


  // If one of the operands only has one non-zero bit, and if the other operand
  // has a known-zero bit in a more significant place than it (not including the
  // sign bit) the ripple may go up to and fill the zero, but won't change the
  // sign.  For example, (X & ~4) + 1.

  // TODO: Implement.

  return false;
}

/// visitAdd - Combine integer add instructions.  Each early return hands a
/// replacement (or the mutated instruction) back to the driver; the order of
/// the folds below establishes the canonicalization priority.
Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
  bool Changed = SimplifyAssociativeOrCommutative(I);
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);

  if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(),
                                 I.hasNoUnsignedWrap(), TD))
    return ReplaceInstUsesWith(I, V);

  // (A*B)+(A*C) -> A*(B+C) etc
  if (Value *V = SimplifyUsingDistributiveLaws(I))
    return ReplaceInstUsesWith(I, V);

  if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
    // X + (signbit) --> X ^ signbit
    const APInt &Val = CI->getValue();
    if (Val.isSignBit())
      return BinaryOperator::CreateXor(LHS, RHS);

    // See if SimplifyDemandedBits can simplify this.  This handles stuff like
    // (X & 254)+1 -> (X&254)|1
    if (SimplifyDemandedInstructionBits(I))
      return &I;

    // zext(bool) + C -> bool ? C + 1 : C
    if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
      if (ZI->getSrcTy()->isIntegerTy(1))
        return SelectInst::Create(ZI->getOperand(0), AddOne(CI), CI);

    Value *XorLHS = 0; ConstantInt *XorRHS = 0;
    if (match(LHS, m_Xor(m_Value(XorLHS), m_ConstantInt(XorRHS)))) {
      uint32_t TySizeBits = I.getType()->getScalarSizeInBits();
      const APInt &RHSVal = CI->getValue();
      unsigned ExtendAmt = 0;
      // If we have ADD(XOR(AND(X, 0xFF), 0x80), 0xF..F80), it's a sext.
      // If we have ADD(XOR(AND(X, 0xFF), 0xF..F80), 0x80), it's a sext.
      if (XorRHS->getValue() == -RHSVal) {
        if (RHSVal.isPowerOf2())
          ExtendAmt = TySizeBits - RHSVal.logBase2() - 1;
        else if (XorRHS->getValue().isPowerOf2())
          ExtendAmt = TySizeBits - XorRHS->getValue().logBase2() - 1;
      }

      // The rewrite is only valid if the bits above the implied narrow width
      // are already known zero; otherwise cancel it.
      if (ExtendAmt) {
        APInt Mask = APInt::getHighBitsSet(TySizeBits, ExtendAmt);
        if (!MaskedValueIsZero(XorLHS, Mask))
          ExtendAmt = 0;
      }

      if (ExtendAmt) {
        // Materialize the sign extension as shl+ashr by ExtendAmt.
        Constant *ShAmt = ConstantInt::get(I.getType(), ExtendAmt);
        Value *NewShl = Builder->CreateShl(XorLHS, ShAmt, "sext");
        return BinaryOperator::CreateAShr(NewShl, ShAmt);
      }
    }
  }

  if (isa<Constant>(RHS) && isa<PHINode>(LHS))
    if (Instruction *NV = FoldOpIntoPhi(I))
      return NV;

  // i1 add is just xor (mod-2 arithmetic).
  if (I.getType()->isIntegerTy(1))
    return BinaryOperator::CreateXor(LHS, RHS);

  // X + X --> X << 1
  if (LHS == RHS) {
    BinaryOperator *New =
      BinaryOperator::CreateShl(LHS, ConstantInt::get(I.getType(), 1));
    // Shift-by-one preserves the add's wrap flags.
    New->setHasNoSignedWrap(I.hasNoSignedWrap());
    New->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
    return New;
  }

  // -A + B  -->  B - A
  // -A + -B  -->  -(A + B)
  if (Value *LHSV = dyn_castNegVal(LHS)) {
    if (Value *RHSV = dyn_castNegVal(RHS)) {
      Value *NewAdd = Builder->CreateAdd(LHSV, RHSV, "sum");
      return BinaryOperator::CreateNeg(NewAdd);
    }

    return BinaryOperator::CreateSub(RHS, LHSV);
  }

  // A + -B  -->  A - B
  if (!isa<Constant>(RHS))
    if (Value *V = dyn_castNegVal(RHS))
      return BinaryOperator::CreateSub(LHS, V);


  ConstantInt *C2;
  if (Value *X = dyn_castFoldableMul(LHS, C2)) {
    if (X == RHS)   // X*C + X --> X * (C+1)
      return BinaryOperator::CreateMul(RHS, AddOne(C2));

    // X*C1 + X*C2 --> X * (C1+C2)
    ConstantInt *C1;
    if (X == dyn_castFoldableMul(RHS, C1))
      return BinaryOperator::CreateMul(X, ConstantExpr::getAdd(C1, C2));
  }

  // X + X*C --> X * (C+1)
  if (dyn_castFoldableMul(RHS, C2) == LHS)
    return BinaryOperator::CreateMul(LHS, AddOne(C2));

  // A+B --> A|B iff A and B have no bits set in common.
  if (const IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
    APInt Mask = APInt::getAllOnesValue(IT->getBitWidth());
    APInt LHSKnownOne(IT->getBitWidth(), 0);
    APInt LHSKnownZero(IT->getBitWidth(), 0);
    ComputeMaskedBits(LHS, Mask, LHSKnownZero, LHSKnownOne);
    if (LHSKnownZero != 0) {
      APInt RHSKnownOne(IT->getBitWidth(), 0);
      APInt RHSKnownZero(IT->getBitWidth(), 0);
      ComputeMaskedBits(RHS, Mask, RHSKnownZero, RHSKnownOne);

      // No bits in common -> bitwise or.
      if ((LHSKnownZero|RHSKnownZero).isAllOnesValue())
        return BinaryOperator::CreateOr(LHS, RHS);
    }
  }

  // W*X + Y*Z --> W * (X+Z)  iff W == Y
  {
    Value *W, *X, *Y, *Z;
    if (match(LHS, m_Mul(m_Value(W), m_Value(X))) &&
        match(RHS, m_Mul(m_Value(Y), m_Value(Z)))) {
      // Canonicalize the operand pairs so the shared factor ends up in W/Y.
      if (W != Y) {
        if (W == Z) {
          std::swap(Y, Z);
        } else if (Y == X) {
          std::swap(W, X);
        } else if (X == Z) {
          std::swap(Y, Z);
          std::swap(W, X);
        }
      }

      if (W == Y) {
        Value *NewAdd = Builder->CreateAdd(X, Z, LHS->getName());
        return BinaryOperator::CreateMul(W, NewAdd);
      }
    }
  }

  if (ConstantInt *CRHS = dyn_cast<ConstantInt>(RHS)) {
    Value *X = 0;
    if (match(LHS, m_Not(m_Value(X))))    // ~X + C --> (C-1) - X
      return BinaryOperator::CreateSub(SubOne(CRHS), X);

    // (X & FF00) + xx00  -> (X+xx00) & FF00
    if (LHS->hasOneUse() &&
        match(LHS, m_And(m_Value(X), m_ConstantInt(C2))) &&
        CRHS->getValue() == (CRHS->getValue() & C2->getValue())) {
      // See if all bits from the first bit set in the Add RHS up are included
      // in the mask.  First, get the rightmost bit.
      const APInt &AddRHSV = CRHS->getValue();

      // Form a mask of all bits from the lowest bit added through the top.
      APInt AddRHSHighBits(~((AddRHSV & -AddRHSV)-1));

      // See if the and mask includes all of these bits.
      APInt AddRHSHighBitsAnd(AddRHSHighBits & C2->getValue());

      if (AddRHSHighBits == AddRHSHighBitsAnd) {
        // Okay, the xform is safe.  Insert the new add pronto.
        Value *NewAdd = Builder->CreateAdd(X, CRHS, LHS->getName());
        return BinaryOperator::CreateAnd(NewAdd, C2);
      }
    }

    // Try to fold constant add into select arguments.
    if (SelectInst *SI = dyn_cast<SelectInst>(LHS))
      if (Instruction *R = FoldOpIntoSelect(I, SI))
        return R;
  }

  // add (select X 0 (sub n A)) A  -->  select X A n
  {
    // The select may be on either side of the add; normalize so SI is the
    // select and A is the other operand.
    SelectInst *SI = dyn_cast<SelectInst>(LHS);
    Value *A = RHS;
    if (!SI) {
      SI = dyn_cast<SelectInst>(RHS);
      A = LHS;
    }
    if (SI && SI->hasOneUse()) {
      Value *TV = SI->getTrueValue();
      Value *FV = SI->getFalseValue();
      Value *N;

      // Can we fold the add into the argument of the select?
      // We check both true and false select arguments for a matching subtract.
      if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A))))
        // Fold the add into the true select value.
        return SelectInst::Create(SI->getCondition(), N, A);

      if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A))))
        // Fold the add into the false select value.
        return SelectInst::Create(SI->getCondition(), A, N);
    }
  }

  // Check for (add (sext x), y), see if we can merge this into an
  // integer add followed by a sext.
  if (SExtInst *LHSConv = dyn_cast<SExtInst>(LHS)) {
    // (add (sext x), cst) --> (sext (add x, cst'))
    if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS)) {
      Constant *CI =
        ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
      // Round-trip check (sext(trunc(C)) == C) proves the constant fits in
      // the narrow type.
      if (LHSConv->hasOneUse() &&
          ConstantExpr::getSExt(CI, I.getType()) == RHSC &&
          WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) {
        // Insert the new, smaller add.
        Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
                                              CI, "addconv");
        return new SExtInst(NewAdd, I.getType());
      }
    }

    // (add (sext x), (sext y)) --> (sext (add int x, y))
    if (SExtInst *RHSConv = dyn_cast<SExtInst>(RHS)) {
      // Only do this if x/y have the same type, if at last one of them has a
      // single use (so we don't increase the number of sexts), and if the
      // integer add will not overflow.
      if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&&
          (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
          WillNotOverflowSignedAdd(LHSConv->getOperand(0),
                                   RHSConv->getOperand(0))) {
        // Insert the new integer add.
        Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
                                              RHSConv->getOperand(0), "addconv");
        return new SExtInst(NewAdd, I.getType());
      }
    }
  }

  return Changed ? &I : 0;
}

/// visitFAdd - Combine floating-point add instructions.  Mirrors visitAdd but
/// only applies folds that are safe under IEEE semantics (e.g. X + 0.0 is only
/// X when X cannot be -0.0).
Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
  bool Changed = SimplifyAssociativeOrCommutative(I);
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);

  if (Constant *RHSC = dyn_cast<Constant>(RHS)) {
    // X + 0 --> X
    // Only -0.0 is an identity for fadd (x + -0.0 == x for all x), which is
    // why the constant is compared against negative zero here.
    if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) {
      if (CFP->isExactlyValue(ConstantFP::getNegativeZero
                              (I.getType())->getValueAPF()))
        return ReplaceInstUsesWith(I, LHS);
    }

    if (isa<PHINode>(LHS))
      if (Instruction *NV = FoldOpIntoPhi(I))
        return NV;
  }

  // -A + B  -->  B - A
  // -A + -B  -->  -(A + B)
  if (Value *LHSV = dyn_castFNegVal(LHS))
    return BinaryOperator::CreateFSub(RHS, LHSV);

  // A + -B  -->  A - B
  if (!isa<Constant>(RHS))
    if (Value *V = dyn_castFNegVal(RHS))
      return BinaryOperator::CreateFSub(LHS, V);

  // Check for X+0.0.  Simplify it to X if we know X is not -0.0.
  if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS))
    if (CFP->getValueAPF().isPosZero() && CannotBeNegativeZero(LHS))
      return ReplaceInstUsesWith(I, LHS);

  // Check for (fadd double (sitofp x), y), see if we can merge this into an
  // integer add followed by a promotion.
  if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
    // (fadd double (sitofp x), fpcst) --> (sitofp (add int x, intcst))
    // ... if the constant fits in the integer value.  This is useful for things
    // like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer
    // requires a constant pool load, and generally allows the add to be better
    // instcombined.
    if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) {
      Constant *CI =
      ConstantExpr::getFPToSI(CFP, LHSConv->getOperand(0)->getType());
      // Round-trip check (sitofp(fptosi(C)) == C) proves the FP constant is
      // exactly representable as an integer of the source type.
      if (LHSConv->hasOneUse() &&
          ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
          WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) {
        // Insert the new integer add.
        Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
                                              CI, "addconv");
        return new SIToFPInst(NewAdd, I.getType());
      }
    }

    // (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
    if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
      // Only do this if x/y have the same type, if at last one of them has a
      // single use (so we don't increase the number of int->fp conversions),
      // and if the integer add will not overflow.
      if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&&
          (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
          WillNotOverflowSignedAdd(LHSConv->getOperand(0),
                                   RHSConv->getOperand(0))) {
        // Insert the new integer add.
        Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
                                              RHSConv->getOperand(0),"addconv");
        return new SIToFPInst(NewAdd, I.getType());
      }
    }
  }

  return Changed ? &I : 0;
}


/// EmitGEPOffset - Given a getelementptr instruction/constantexpr, emit the
/// code necessary to compute the offset from the base pointer (without adding
/// in the base pointer).  Return the result as a signed integer of intptr size.
Value *InstCombiner::EmitGEPOffset(User *GEP) {
  TargetData &TD = *getTargetData();
  gep_type_iterator GTI = gep_type_begin(GEP);
  const Type *IntPtrTy = TD.getIntPtrType(GEP->getContext());
  // Accumulate the byte offset into Result, starting from zero.
  Value *Result = Constant::getNullValue(IntPtrTy);

  // If the GEP is inbounds, we know that none of the addressing operations will
  // overflow in an unsigned sense.
  bool isInBounds = cast<GEPOperator>(GEP)->isInBounds();

  // Build a mask for high order bits.
  unsigned IntPtrWidth = TD.getPointerSizeInBits();
  uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);

  // Walk the indices in parallel with the indexed types; each index
  // contributes either a struct field offset or index*elementsize.
  for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); i != e;
       ++i, ++GTI) {
    Value *Op = *i;
    uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()) & PtrSizeMask;
    if (ConstantInt *OpC = dyn_cast<ConstantInt>(Op)) {
      if (OpC->isZero()) continue;

      // Handle a struct index, which adds its field offset to the pointer.
      if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
        Size = TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());

        if (Size)
          Result = Builder->CreateAdd(Result, ConstantInt::get(IntPtrTy, Size),
                                      GEP->getName()+".offs");
        continue;
      }

      // Constant array index: fold index*size at compile time.
      Constant *Scale = ConstantInt::get(IntPtrTy, Size);
      Constant *OC =
              ConstantExpr::getIntegerCast(OpC, IntPtrTy, true /*SExt*/);
      Scale = ConstantExpr::getMul(OC, Scale, isInBounds/*NUW*/);
      // Emit an add instruction.
      Result = Builder->CreateAdd(Result, Scale, GEP->getName()+".offs");
      continue;
    }
    // Convert to correct type.
    if (Op->getType() != IntPtrTy)
      Op = Builder->CreateIntCast(Op, IntPtrTy, true, Op->getName()+".c");
    if (Size != 1) {
      // We'll let instcombine(mul) convert this to a shl if possible.
      Op = Builder->CreateMul(Op, ConstantInt::get(IntPtrTy, Size),
                              GEP->getName()+".idx", isInBounds /*NUW*/);
    }

    // Emit an add instruction.
    Result = Builder->CreateAdd(Op, Result, GEP->getName()+".offs");
  }
  return Result;
}




/// Optimize pointer differences into the same array into a size.  Consider:
///  &A[10] - &A[0]: we should compile this to "10".  LHS/RHS are the pointer
/// operands to the ptrtoint instructions for the LHS/RHS of the subtract.
///
Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
                                               const Type *Ty) {
  assert(TD && "Must have target data info for this");

  // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize
  // this.
  // On exit from the matching below: GEP is the gep whose offset we emit,
  // CstGEP (if set) is a constant-expression gep on the other side sharing
  // the same base, and Swapped records whether GEP came from the RHS (in
  // which case the computed difference must be negated).
  bool Swapped = false;
  GetElementPtrInst *GEP = 0;
  ConstantExpr *CstGEP = 0;

  // TODO: Could also optimize &A[i] - &A[j] -> "i-j", and "&A.foo[i] - &A.foo".
  // For now we require one side to be the base pointer "A" or a constant
  // expression derived from it.
  if (GetElementPtrInst *LHSGEP = dyn_cast<GetElementPtrInst>(LHS)) {
    // (gep X, ...) - X
    if (LHSGEP->getOperand(0) == RHS) {
      GEP = LHSGEP;
      Swapped = false;
    } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(RHS)) {
      // (gep X, ...) - (ce_gep X, ...)
      if (CE->getOpcode() == Instruction::GetElementPtr &&
          LHSGEP->getOperand(0) == CE->getOperand(0)) {
        CstGEP = CE;
        GEP = LHSGEP;
        Swapped = false;
      }
    }
  }

  if (GetElementPtrInst *RHSGEP = dyn_cast<GetElementPtrInst>(RHS)) {
    // X - (gep X, ...)
    if (RHSGEP->getOperand(0) == LHS) {
      GEP = RHSGEP;
      Swapped = true;
    } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(LHS)) {
      // (ce_gep X, ...) - (gep X, ...)
      if (CE->getOpcode() == Instruction::GetElementPtr &&
          RHSGEP->getOperand(0) == CE->getOperand(0)) {
        CstGEP = CE;
        GEP = RHSGEP;
        Swapped = true;
      }
    }
  }

  // Neither side matched the required shape; give up.
  if (GEP == 0)
    return 0;

  // Emit the offset of the GEP and an intptr_t.
  Value *Result = EmitGEPOffset(GEP);

  // If we had a constant expression GEP on the other side offsetting the
  // pointer, subtract it from the offset we have.
  if (CstGEP) {
    Value *CstOffset = EmitGEPOffset(CstGEP);
    Result = Builder->CreateSub(Result, CstOffset);
  }


  // If we have p - gep(p, ...)  then we have to negate the result.
  if (Swapped)
    Result = Builder->CreateNeg(Result, "diff.neg");

  // Truncate or extend the byte difference to the subtract's result type.
  return Builder->CreateIntCast(Result, Ty, true);
}


/// visitSub - Combine integer sub instructions.  Each early return hands a
/// replacement back to the driver; the order of the folds establishes the
/// canonicalization priority.
Instruction *InstCombiner::visitSub(BinaryOperator &I) {
  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);

  if (Value *V = SimplifySubInst(Op0, Op1, I.hasNoSignedWrap(),
                                 I.hasNoUnsignedWrap(), TD))
    return ReplaceInstUsesWith(I, V);

  // (A*B)-(A*C) -> A*(B-C) etc
  if (Value *V = SimplifyUsingDistributiveLaws(I))
    return ReplaceInstUsesWith(I, V);

  // If this is a 'B = x-(-A)', change to B = x+A.  This preserves NSW/NUW.
  if (Value *V = dyn_castNegVal(Op1)) {
    BinaryOperator *Res = BinaryOperator::CreateAdd(Op0, V);
    Res->setHasNoSignedWrap(I.hasNoSignedWrap());
    Res->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
    return Res;
  }

  // i1 sub is just xor (mod-2 arithmetic).
  if (I.getType()->isIntegerTy(1))
    return BinaryOperator::CreateXor(Op0, Op1);

  // Replace (-1 - A) with (~A).
  if (match(Op0, m_AllOnes()))
    return BinaryOperator::CreateNot(Op1);

  if (ConstantInt *C = dyn_cast<ConstantInt>(Op0)) {
    // C - ~X == X + (1+C)
    Value *X = 0;
    if (match(Op1, m_Not(m_Value(X))))
      return BinaryOperator::CreateAdd(X, AddOne(C));

    // -(X >>u 31) -> (X >>s 31)
    // -(X >>s 31) -> (X >>u 31)
    if (C->isZero()) {
      Value *X; ConstantInt *CI;
      if (match(Op1, m_LShr(m_Value(X), m_ConstantInt(CI))) &&
          // Verify we are shifting out everything but the sign bit.
          CI->getValue() == I.getType()->getPrimitiveSizeInBits()-1)
        return BinaryOperator::CreateAShr(X, CI);

      if (match(Op1, m_AShr(m_Value(X), m_ConstantInt(CI))) &&
          // Verify we are shifting out everything but the sign bit.
          CI->getValue() == I.getType()->getPrimitiveSizeInBits()-1)
        return BinaryOperator::CreateLShr(X, CI);
    }

    // Try to fold constant sub into select arguments.
    if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
      if (Instruction *R = FoldOpIntoSelect(I, SI))
        return R;

    // C - zext(bool) -> bool ? C - 1 : C
    if (ZExtInst *ZI = dyn_cast<ZExtInst>(Op1))
      if (ZI->getSrcTy()->isIntegerTy(1))
        return SelectInst::Create(ZI->getOperand(0), SubOne(C), C);

    // C-(X+C2) --> (C-C2)-X
    ConstantInt *C2;
    if (match(Op1, m_Add(m_Value(X), m_ConstantInt(C2))))
      return BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X);
  }


  { Value *Y;
    // X-(X+Y) == -Y    X-(Y+X) == -Y
    if (match(Op1, m_Add(m_Specific(Op0), m_Value(Y))) ||
        match(Op1, m_Add(m_Value(Y), m_Specific(Op0))))
      return BinaryOperator::CreateNeg(Y);

    // (X-Y)-X == -Y
    if (match(Op0, m_Sub(m_Specific(Op1), m_Value(Y))))
      return BinaryOperator::CreateNeg(Y);
  }

  // The folds below all rewrite Op1, so they are only profitable when Op1
  // has no other users.
  if (Op1->hasOneUse()) {
    Value *X = 0, *Y = 0, *Z = 0;
    Constant *C = 0;
    ConstantInt *CI = 0;

    // (X - (Y - Z))  -->  (X + (Z - Y)).
    if (match(Op1, m_Sub(m_Value(Y), m_Value(Z))))
      return BinaryOperator::CreateAdd(Op0,
                                      Builder->CreateSub(Z, Y, Op1->getName()));

    // (X - (X & Y))   -->   (X & ~Y)
    //
    if (match(Op1, m_And(m_Value(Y), m_Specific(Op0))) ||
        match(Op1, m_And(m_Specific(Op0), m_Value(Y))))
      return BinaryOperator::CreateAnd(Op0,
                                  Builder->CreateNot(Y, Y->getName() + ".not"));

    // 0 - (X sdiv C)  -> (X sdiv -C)
    if (match(Op1, m_SDiv(m_Value(X), m_Constant(C))) &&
        match(Op0, m_Zero()))
      return BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(C));

    // 0 - (X << Y)  -> (-X << Y)   when X is freely negatable.
    if (match(Op1, m_Shl(m_Value(X), m_Value(Y))) && match(Op0, m_Zero()))
      if (Value *XNeg = dyn_castNegVal(X))
        return BinaryOperator::CreateShl(XNeg, Y);

    // X - X*C --> X * (1-C)
    if (match(Op1, m_Mul(m_Specific(Op0), m_ConstantInt(CI)))) {
      Constant *CP1 = ConstantExpr::getSub(ConstantInt::get(I.getType(),1), CI);
      return BinaryOperator::CreateMul(Op0, CP1);
    }

    // X - X<<C --> X * (1-(1<<C))
    if (match(Op1, m_Shl(m_Specific(Op0), m_ConstantInt(CI)))) {
      Constant *One = ConstantInt::get(I.getType(), 1);
      C = ConstantExpr::getSub(One, ConstantExpr::getShl(One, CI));
      return BinaryOperator::CreateMul(Op0, C);
    }

    // X - A*-B -> X + A*B
    // X - -A*B -> X + A*B
    Value *A, *B;
    if (match(Op1, m_Mul(m_Value(A), m_Neg(m_Value(B)))) ||
        match(Op1, m_Mul(m_Neg(m_Value(A)), m_Value(B))))
      return BinaryOperator::CreateAdd(Op0, Builder->CreateMul(A, B));

    // X - A*CI -> X + A*-CI
    // X - CI*A -> X + A*-CI
    if (match(Op1, m_Mul(m_Value(A), m_ConstantInt(CI))) ||
        match(Op1, m_Mul(m_ConstantInt(CI), m_Value(A)))) {
      Value *NewMul = Builder->CreateMul(A, ConstantExpr::getNeg(CI));
      return BinaryOperator::CreateAdd(Op0, NewMul);
    }
  }

  ConstantInt *C1;
  if (Value *X = dyn_castFoldableMul(Op0, C1)) {
    if (X == Op1)  // X*C - X --> X * (C-1)
      return BinaryOperator::CreateMul(Op1, SubOne(C1));

    ConstantInt *C2;   // X*C1 - X*C2 -> X * (C1-C2)
    if (X == dyn_castFoldableMul(Op1, C2))
      return BinaryOperator::CreateMul(X, ConstantExpr::getSub(C1, C2));
  }

  // Optimize pointer differences into the same array into a size.  Consider:
  //  &A[10] - &A[0]: we should compile this to "10".
+ if (TD) { + Value *LHSOp, *RHSOp; + if (match(Op0, m_PtrToInt(m_Value(LHSOp))) && + match(Op1, m_PtrToInt(m_Value(RHSOp)))) + if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) + return ReplaceInstUsesWith(I, Res); + + // trunc(p)-trunc(q) -> trunc(p-q) + if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) && + match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp))))) + if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) + return ReplaceInstUsesWith(I, Res); + } + + return 0; +} + +Instruction *InstCombiner::visitFSub(BinaryOperator &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + // If this is a 'B = x-(-A)', change to B = x+A... + if (Value *V = dyn_castFNegVal(Op1)) + return BinaryOperator::CreateFAdd(Op0, V); + + return 0; +} diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp new file mode 100644 index 0000000..b6b6b84 --- /dev/null +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -0,0 +1,2203 @@ +//===- InstCombineAndOrXor.cpp --------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the visitAnd, visitOr, and visitXor functions. +// +//===----------------------------------------------------------------------===// + +#include "InstCombine.h" +#include "llvm/Intrinsics.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Support/PatternMatch.h" +using namespace llvm; +using namespace PatternMatch; + + +/// AddOne - Add one to a ConstantInt. +static Constant *AddOne(Constant *C) { + return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); +} +/// SubOne - Subtract one from a ConstantInt. 
+static Constant *SubOne(ConstantInt *C) { + return ConstantInt::get(C->getContext(), C->getValue()-1); +} + +/// isFreeToInvert - Return true if the specified value is free to invert (apply +/// ~ to). This happens in cases where the ~ can be eliminated. +static inline bool isFreeToInvert(Value *V) { + // ~(~(X)) -> X. + if (BinaryOperator::isNot(V)) + return true; + + // Constants can be considered to be not'ed values. + if (isa<ConstantInt>(V)) + return true; + + // Compares can be inverted if they have a single use. + if (CmpInst *CI = dyn_cast<CmpInst>(V)) + return CI->hasOneUse(); + + return false; +} + +static inline Value *dyn_castNotVal(Value *V) { + // If this is not(not(x)) don't return that this is a not: we want the two + // not's to be folded first. + if (BinaryOperator::isNot(V)) { + Value *Operand = BinaryOperator::getNotArgument(V); + if (!isFreeToInvert(Operand)) + return Operand; + } + + // Constants can be considered to be not'ed values... + if (ConstantInt *C = dyn_cast<ConstantInt>(V)) + return ConstantInt::get(C->getType(), ~C->getValue()); + return 0; +} + + +/// getICmpCode - Encode a icmp predicate into a three bit mask. These bits +/// are carefully arranged to allow folding of expressions such as: +/// +/// (A < B) | (A > B) --> (A != B) +/// +/// Note that this is only valid if the first and second predicates have the +/// same sign. 
Is illegal to do: (A u< B) | (A s> B) +/// +/// Three bits are used to represent the condition, as follows: +/// 0 A > B +/// 1 A == B +/// 2 A < B +/// +/// <=> Value Definition +/// 000 0 Always false +/// 001 1 A > B +/// 010 2 A == B +/// 011 3 A >= B +/// 100 4 A < B +/// 101 5 A != B +/// 110 6 A <= B +/// 111 7 Always true +/// +static unsigned getICmpCode(const ICmpInst *ICI) { + switch (ICI->getPredicate()) { + // False -> 0 + case ICmpInst::ICMP_UGT: return 1; // 001 + case ICmpInst::ICMP_SGT: return 1; // 001 + case ICmpInst::ICMP_EQ: return 2; // 010 + case ICmpInst::ICMP_UGE: return 3; // 011 + case ICmpInst::ICMP_SGE: return 3; // 011 + case ICmpInst::ICMP_ULT: return 4; // 100 + case ICmpInst::ICMP_SLT: return 4; // 100 + case ICmpInst::ICMP_NE: return 5; // 101 + case ICmpInst::ICMP_ULE: return 6; // 110 + case ICmpInst::ICMP_SLE: return 6; // 110 + // True -> 7 + default: + llvm_unreachable("Invalid ICmp predicate!"); + return 0; + } +} + +/// getFCmpCode - Similar to getICmpCode but for FCmpInst. This encodes a fcmp +/// predicate into a three bit mask. It also returns whether it is an ordered +/// predicate by reference. 
+static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) { + isOrdered = false; + switch (CC) { + case FCmpInst::FCMP_ORD: isOrdered = true; return 0; // 000 + case FCmpInst::FCMP_UNO: return 0; // 000 + case FCmpInst::FCMP_OGT: isOrdered = true; return 1; // 001 + case FCmpInst::FCMP_UGT: return 1; // 001 + case FCmpInst::FCMP_OEQ: isOrdered = true; return 2; // 010 + case FCmpInst::FCMP_UEQ: return 2; // 010 + case FCmpInst::FCMP_OGE: isOrdered = true; return 3; // 011 + case FCmpInst::FCMP_UGE: return 3; // 011 + case FCmpInst::FCMP_OLT: isOrdered = true; return 4; // 100 + case FCmpInst::FCMP_ULT: return 4; // 100 + case FCmpInst::FCMP_ONE: isOrdered = true; return 5; // 101 + case FCmpInst::FCMP_UNE: return 5; // 101 + case FCmpInst::FCMP_OLE: isOrdered = true; return 6; // 110 + case FCmpInst::FCMP_ULE: return 6; // 110 + // True -> 7 + default: + // Not expecting FCMP_FALSE and FCMP_TRUE; + llvm_unreachable("Unexpected FCmp predicate!"); + return 0; + } +} + +/// getICmpValue - This is the complement of getICmpCode, which turns an +/// opcode and two operands into either a constant true or false, or a brand +/// new ICmp instruction. The sign is passed in to determine which kind +/// of predicate to use in the new icmp instruction. +static Value *getICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS, + InstCombiner::BuilderTy *Builder) { + CmpInst::Predicate Pred; + switch (Code) { + default: assert(0 && "Illegal ICmp code!"); + case 0: // False. + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); + case 1: Pred = Sign ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break; + case 2: Pred = ICmpInst::ICMP_EQ; break; + case 3: Pred = Sign ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break; + case 4: Pred = Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break; + case 5: Pred = ICmpInst::ICMP_NE; break; + case 6: Pred = Sign ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break; + case 7: // True. 
+ return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1); + } + return Builder->CreateICmp(Pred, LHS, RHS); +} + +/// getFCmpValue - This is the complement of getFCmpCode, which turns an +/// opcode and two operands into either a FCmp instruction. isordered is passed +/// in to determine which kind of predicate to use in the new fcmp instruction. +static Value *getFCmpValue(bool isordered, unsigned code, + Value *LHS, Value *RHS, + InstCombiner::BuilderTy *Builder) { + CmpInst::Predicate Pred; + switch (code) { + default: assert(0 && "Illegal FCmp code!"); + case 0: Pred = isordered ? FCmpInst::FCMP_ORD : FCmpInst::FCMP_UNO; break; + case 1: Pred = isordered ? FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT; break; + case 2: Pred = isordered ? FCmpInst::FCMP_OEQ : FCmpInst::FCMP_UEQ; break; + case 3: Pred = isordered ? FCmpInst::FCMP_OGE : FCmpInst::FCMP_UGE; break; + case 4: Pred = isordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT; break; + case 5: Pred = isordered ? FCmpInst::FCMP_ONE : FCmpInst::FCMP_UNE; break; + case 6: Pred = isordered ? FCmpInst::FCMP_OLE : FCmpInst::FCMP_ULE; break; + case 7: + if (!isordered) return ConstantInt::getTrue(LHS->getContext()); + Pred = FCmpInst::FCMP_ORD; break; + } + return Builder->CreateFCmp(Pred, LHS, RHS); +} + +/// PredicatesFoldable - Return true if both predicates match sign or if at +/// least one of them is an equality comparison (which is signless). +static bool PredicatesFoldable(ICmpInst::Predicate p1, ICmpInst::Predicate p2) { + return (CmpInst::isSigned(p1) == CmpInst::isSigned(p2)) || + (CmpInst::isSigned(p1) && ICmpInst::isEquality(p2)) || + (CmpInst::isSigned(p2) && ICmpInst::isEquality(p1)); +} + +// OptAndOp - This handles expressions of the form ((val OP C1) & C2). Where +// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is +// guaranteed to be a binary operator. 
+Instruction *InstCombiner::OptAndOp(Instruction *Op, + ConstantInt *OpRHS, + ConstantInt *AndRHS, + BinaryOperator &TheAnd) { + Value *X = Op->getOperand(0); + Constant *Together = 0; + if (!Op->isShift()) + Together = ConstantExpr::getAnd(AndRHS, OpRHS); + + switch (Op->getOpcode()) { + case Instruction::Xor: + if (Op->hasOneUse()) { + // (X ^ C1) & C2 --> (X & C2) ^ (C1&C2) + Value *And = Builder->CreateAnd(X, AndRHS); + And->takeName(Op); + return BinaryOperator::CreateXor(And, Together); + } + break; + case Instruction::Or: + if (Op->hasOneUse()){ + if (Together != OpRHS) { + // (X | C1) & C2 --> (X | (C1&C2)) & C2 + Value *Or = Builder->CreateOr(X, Together); + Or->takeName(Op); + return BinaryOperator::CreateAnd(Or, AndRHS); + } + + ConstantInt *TogetherCI = dyn_cast<ConstantInt>(Together); + if (TogetherCI && !TogetherCI->isZero()){ + // (X | C1) & C2 --> (X & (C2^(C1&C2))) | C1 + // NOTE: This reduces the number of bits set in the & mask, which + // can expose opportunities for store narrowing. + Together = ConstantExpr::getXor(AndRHS, Together); + Value *And = Builder->CreateAnd(X, Together); + And->takeName(Op); + return BinaryOperator::CreateOr(And, OpRHS); + } + } + + break; + case Instruction::Add: + if (Op->hasOneUse()) { + // Adding a one to a single bit bit-field should be turned into an XOR + // of the bit. First thing to check is to see if this AND is with a + // single bit constant. + const APInt &AndRHSV = cast<ConstantInt>(AndRHS)->getValue(); + + // If there is only one bit set. + if (AndRHSV.isPowerOf2()) { + // Ok, at this point, we know that we are masking the result of the + // ADD down to exactly one bit. If the constant we are adding has + // no bits set below this bit, then we can eliminate the ADD. + const APInt& AddRHS = cast<ConstantInt>(OpRHS)->getValue(); + + // Check to see if any bits below the one bit set in AndRHSV are set. 
+ if ((AddRHS & (AndRHSV-1)) == 0) { + // If not, the only thing that can effect the output of the AND is + // the bit specified by AndRHSV. If that bit is set, the effect of + // the XOR is to toggle the bit. If it is clear, then the ADD has + // no effect. + if ((AddRHS & AndRHSV) == 0) { // Bit is not set, noop + TheAnd.setOperand(0, X); + return &TheAnd; + } else { + // Pull the XOR out of the AND. + Value *NewAnd = Builder->CreateAnd(X, AndRHS); + NewAnd->takeName(Op); + return BinaryOperator::CreateXor(NewAnd, AndRHS); + } + } + } + } + break; + + case Instruction::Shl: { + // We know that the AND will not produce any of the bits shifted in, so if + // the anded constant includes them, clear them now! + // + uint32_t BitWidth = AndRHS->getType()->getBitWidth(); + uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); + APInt ShlMask(APInt::getHighBitsSet(BitWidth, BitWidth-OpRHSVal)); + ConstantInt *CI = ConstantInt::get(AndRHS->getContext(), + AndRHS->getValue() & ShlMask); + + if (CI->getValue() == ShlMask) + // Masking out bits that the shift already masks. + return ReplaceInstUsesWith(TheAnd, Op); // No need for the and. + + if (CI != AndRHS) { // Reducing bits set in and. + TheAnd.setOperand(1, CI); + return &TheAnd; + } + break; + } + case Instruction::LShr: { + // We know that the AND will not produce any of the bits shifted in, so if + // the anded constant includes them, clear them now! This only applies to + // unsigned shifts, because a signed shr may bring in set bits! + // + uint32_t BitWidth = AndRHS->getType()->getBitWidth(); + uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); + APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal)); + ConstantInt *CI = ConstantInt::get(Op->getContext(), + AndRHS->getValue() & ShrMask); + + if (CI->getValue() == ShrMask) + // Masking out bits that the shift already masks. + return ReplaceInstUsesWith(TheAnd, Op); + + if (CI != AndRHS) { + TheAnd.setOperand(1, CI); // Reduce bits set in and cst. 
+ return &TheAnd; + } + break; + } + case Instruction::AShr: + // Signed shr. + // See if this is shifting in some sign extension, then masking it out + // with an and. + if (Op->hasOneUse()) { + uint32_t BitWidth = AndRHS->getType()->getBitWidth(); + uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth); + APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal)); + Constant *C = ConstantInt::get(Op->getContext(), + AndRHS->getValue() & ShrMask); + if (C == AndRHS) { // Masking out bits shifted in. + // (Val ashr C1) & C2 -> (Val lshr C1) & C2 + // Make the argument unsigned. + Value *ShVal = Op->getOperand(0); + ShVal = Builder->CreateLShr(ShVal, OpRHS, Op->getName()); + return BinaryOperator::CreateAnd(ShVal, AndRHS, TheAnd.getName()); + } + } + break; + } + return 0; +} + + +/// InsertRangeTest - Emit a computation of: (V >= Lo && V < Hi) if Inside is +/// true, otherwise (V < Lo || V >= Hi). In pratice, we emit the more efficient +/// (V-Lo) <u Hi-Lo. This method expects that Lo <= Hi. isSigned indicates +/// whether to treat the V, Lo and HI as signed or not. IB is the location to +/// insert new instructions. +Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi, + bool isSigned, bool Inside) { + assert(cast<ConstantInt>(ConstantExpr::getICmp((isSigned ? + ICmpInst::ICMP_SLE:ICmpInst::ICMP_ULE), Lo, Hi))->getZExtValue() && + "Lo is not <= Hi in range emission code!"); + + if (Inside) { + if (Lo == Hi) // Trivially false. + return ConstantInt::getFalse(V->getContext()); + + // V >= Min && V < Hi --> V < Hi + if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) { + ICmpInst::Predicate pred = (isSigned ? 
+ ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT); + return Builder->CreateICmp(pred, V, Hi); + } + + // Emit V-Lo <u Hi-Lo + Constant *NegLo = ConstantExpr::getNeg(Lo); + Value *Add = Builder->CreateAdd(V, NegLo, V->getName()+".off"); + Constant *UpperBound = ConstantExpr::getAdd(NegLo, Hi); + return Builder->CreateICmpULT(Add, UpperBound); + } + + if (Lo == Hi) // Trivially true. + return ConstantInt::getTrue(V->getContext()); + + // V < Min || V >= Hi -> V > Hi-1 + Hi = SubOne(cast<ConstantInt>(Hi)); + if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) { + ICmpInst::Predicate pred = (isSigned ? + ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT); + return Builder->CreateICmp(pred, V, Hi); + } + + // Emit V-Lo >u Hi-1-Lo + // Note that Hi has already had one subtracted from it, above. + ConstantInt *NegLo = cast<ConstantInt>(ConstantExpr::getNeg(Lo)); + Value *Add = Builder->CreateAdd(V, NegLo, V->getName()+".off"); + Constant *LowerBound = ConstantExpr::getAdd(NegLo, Hi); + return Builder->CreateICmpUGT(Add, LowerBound); +} + +// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s with +// any number of 0s on either side. The 1s are allowed to wrap from LSB to +// MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is +// not, since all 1s are not contiguous. +static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) { + const APInt& V = Val->getValue(); + uint32_t BitWidth = Val->getType()->getBitWidth(); + if (!APIntOps::isShiftedMask(BitWidth, V)) return false; + + // look for the first zero bit after the run of ones + MB = BitWidth - ((V - 1) ^ V).countLeadingZeros(); + // look for the first non-zero bit + ME = V.getActiveBits(); + return true; +} + +/// FoldLogicalPlusAnd - This is part of an expression (LHS +/- RHS) & Mask, +/// where isSub determines whether the operator is a sub. 
If we can fold one of +/// the following xforms: +/// +/// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask +/// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0 +/// ((A ^ N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0 +/// +/// return (A +/- B). +/// +Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS, + ConstantInt *Mask, bool isSub, + Instruction &I) { + Instruction *LHSI = dyn_cast<Instruction>(LHS); + if (!LHSI || LHSI->getNumOperands() != 2 || + !isa<ConstantInt>(LHSI->getOperand(1))) return 0; + + ConstantInt *N = cast<ConstantInt>(LHSI->getOperand(1)); + + switch (LHSI->getOpcode()) { + default: return 0; + case Instruction::And: + if (ConstantExpr::getAnd(N, Mask) == Mask) { + // If the AndRHS is a power of two minus one (0+1+), this is simple. + if ((Mask->getValue().countLeadingZeros() + + Mask->getValue().countPopulation()) == + Mask->getValue().getBitWidth()) + break; + + // Otherwise, if Mask is 0+1+0+, and if B is known to have the low 0+ + // part, we don't need any explicit masks to take them out of A. If that + // is all N is, ignore it. + uint32_t MB = 0, ME = 0; + if (isRunOfOnes(Mask, MB, ME)) { // begin/end bit of run, inclusive + uint32_t BitWidth = cast<IntegerType>(RHS->getType())->getBitWidth(); + APInt Mask(APInt::getLowBitsSet(BitWidth, MB-1)); + if (MaskedValueIsZero(RHS, Mask)) + break; + } + } + return 0; + case Instruction::Or: + case Instruction::Xor: + // If the AndRHS is a power of two minus one (0+1+), and N&Mask == 0 + if ((Mask->getValue().countLeadingZeros() + + Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth() + && ConstantExpr::getAnd(N, Mask)->isNullValue()) + break; + return 0; + } + + if (isSub) + return Builder->CreateSub(LHSI->getOperand(0), RHS, "fold"); + return Builder->CreateAdd(LHSI->getOperand(0), RHS, "fold"); +} + +/// enum for classifying (icmp eq (A & B), C) and (icmp ne (A & B), C) +/// One of A and B is considered the mask, the other the value. 
This is
+/// described as the "AMask" or "BMask" part of the enum. If the enum
+/// contains only "Mask", then both A and B can be considered masks.
+/// If A is the mask, then it was proven that (A & C) == C. This
+/// is trivial if C == A or C == 0. If both A and C are constants, this
+/// proof is also easy.
+/// For the following explanations we assume that A is the mask.
+/// The part "AllOnes" declares that the comparison is true only
+/// if (A & B) == A, or all bits of A are set in B.
+/// Example: (icmp eq (A & 3), 3) -> FoldMskICmp_AMask_AllOnes
+/// The part "AllZeroes" declares that the comparison is true only
+/// if (A & B) == 0, or all bits of A are cleared in B.
+/// Example: (icmp eq (A & 3), 0) -> FoldMskICmp_Mask_AllZeroes
+/// The part "Mixed" declares that (A & B) == C, where C might or might not
+/// contain any number of one bits and zero bits.
+/// Example: (icmp eq (A & 3), 1) -> FoldMskICmp_AMask_Mixed
+/// The part "Not" means that in the above descriptions "==" should be replaced
+/// by "!=".
+/// Example: (icmp ne (A & 3), 3) -> FoldMskICmp_AMask_NotAllOnes +/// If the mask A contains a single bit, then the following is equivalent: +/// (icmp eq (A & B), A) equals (icmp ne (A & B), 0) +/// (icmp ne (A & B), A) equals (icmp eq (A & B), 0) +enum MaskedICmpType { + FoldMskICmp_AMask_AllOnes = 1, + FoldMskICmp_AMask_NotAllOnes = 2, + FoldMskICmp_BMask_AllOnes = 4, + FoldMskICmp_BMask_NotAllOnes = 8, + FoldMskICmp_Mask_AllZeroes = 16, + FoldMskICmp_Mask_NotAllZeroes = 32, + FoldMskICmp_AMask_Mixed = 64, + FoldMskICmp_AMask_NotMixed = 128, + FoldMskICmp_BMask_Mixed = 256, + FoldMskICmp_BMask_NotMixed = 512 +}; + +/// return the set of pattern classes (from MaskedICmpType) +/// that (icmp SCC (A & B), C) satisfies +static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C, + ICmpInst::Predicate SCC) +{ + ConstantInt *ACst = dyn_cast<ConstantInt>(A); + ConstantInt *BCst = dyn_cast<ConstantInt>(B); + ConstantInt *CCst = dyn_cast<ConstantInt>(C); + bool icmp_eq = (SCC == ICmpInst::ICMP_EQ); + bool icmp_abit = (ACst != 0 && !ACst->isZero() && + ACst->getValue().isPowerOf2()); + bool icmp_bbit = (BCst != 0 && !BCst->isZero() && + BCst->getValue().isPowerOf2()); + unsigned result = 0; + if (CCst != 0 && CCst->isZero()) { + // if C is zero, then both A and B qualify as mask + result |= (icmp_eq ? (FoldMskICmp_Mask_AllZeroes | + FoldMskICmp_Mask_AllZeroes | + FoldMskICmp_AMask_Mixed | + FoldMskICmp_BMask_Mixed) + : (FoldMskICmp_Mask_NotAllZeroes | + FoldMskICmp_Mask_NotAllZeroes | + FoldMskICmp_AMask_NotMixed | + FoldMskICmp_BMask_NotMixed)); + if (icmp_abit) + result |= (icmp_eq ? (FoldMskICmp_AMask_NotAllOnes | + FoldMskICmp_AMask_NotMixed) + : (FoldMskICmp_AMask_AllOnes | + FoldMskICmp_AMask_Mixed)); + if (icmp_bbit) + result |= (icmp_eq ? (FoldMskICmp_BMask_NotAllOnes | + FoldMskICmp_BMask_NotMixed) + : (FoldMskICmp_BMask_AllOnes | + FoldMskICmp_BMask_Mixed)); + return result; + } + if (A == C) { + result |= (icmp_eq ? 
(FoldMskICmp_AMask_AllOnes | + FoldMskICmp_AMask_Mixed) + : (FoldMskICmp_AMask_NotAllOnes | + FoldMskICmp_AMask_NotMixed)); + if (icmp_abit) + result |= (icmp_eq ? (FoldMskICmp_Mask_NotAllZeroes | + FoldMskICmp_AMask_NotMixed) + : (FoldMskICmp_Mask_AllZeroes | + FoldMskICmp_AMask_Mixed)); + } + else if (ACst != 0 && CCst != 0 && + ConstantExpr::getAnd(ACst, CCst) == CCst) { + result |= (icmp_eq ? FoldMskICmp_AMask_Mixed + : FoldMskICmp_AMask_NotMixed); + } + if (B == C) + { + result |= (icmp_eq ? (FoldMskICmp_BMask_AllOnes | + FoldMskICmp_BMask_Mixed) + : (FoldMskICmp_BMask_NotAllOnes | + FoldMskICmp_BMask_NotMixed)); + if (icmp_bbit) + result |= (icmp_eq ? (FoldMskICmp_Mask_NotAllZeroes | + FoldMskICmp_BMask_NotMixed) + : (FoldMskICmp_Mask_AllZeroes | + FoldMskICmp_BMask_Mixed)); + } + else if (BCst != 0 && CCst != 0 && + ConstantExpr::getAnd(BCst, CCst) == CCst) { + result |= (icmp_eq ? FoldMskICmp_BMask_Mixed + : FoldMskICmp_BMask_NotMixed); + } + return result; +} + +/// foldLogOpOfMaskedICmpsHelper: +/// handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) +/// return the set of pattern classes (from MaskedICmpType) +/// that both LHS and RHS satisfy +static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A, + Value*& B, Value*& C, + Value*& D, Value*& E, + ICmpInst *LHS, ICmpInst *RHS) { + ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); + if (LHSCC != ICmpInst::ICMP_EQ && LHSCC != ICmpInst::ICMP_NE) return 0; + if (RHSCC != ICmpInst::ICMP_EQ && RHSCC != ICmpInst::ICMP_NE) return 0; + if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType()) return 0; + // vectors are not (yet?) supported + if (LHS->getOperand(0)->getType()->isVectorTy()) return 0; + + // Here comes the tricky part: + // LHS might be of the form L11 & L12 == X, X == L21 & L22, + // and L11 & L12 == L21 & L22. The same goes for RHS. 
+ // Now we must find those components L** and R**, that are equal, so + // that we can extract the parameters A, B, C, D, and E for the canonical + // above. + Value *L1 = LHS->getOperand(0); + Value *L2 = LHS->getOperand(1); + Value *L11,*L12,*L21,*L22; + if (match(L1, m_And(m_Value(L11), m_Value(L12)))) { + if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) + L21 = L22 = 0; + } + else { + if (!match(L2, m_And(m_Value(L11), m_Value(L12)))) + return 0; + std::swap(L1, L2); + L21 = L22 = 0; + } + + Value *R1 = RHS->getOperand(0); + Value *R2 = RHS->getOperand(1); + Value *R11,*R12; + bool ok = false; + if (match(R1, m_And(m_Value(R11), m_Value(R12)))) { + if (R11 != 0 && (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22)) { + A = R11; D = R12; E = R2; ok = true; + } + else + if (R12 != 0 && (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22)) { + A = R12; D = R11; E = R2; ok = true; + } + } + if (!ok && match(R2, m_And(m_Value(R11), m_Value(R12)))) { + if (R11 != 0 && (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22)) { + A = R11; D = R12; E = R1; ok = true; + } + else + if (R12 != 0 && (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22)) { + A = R12; D = R11; E = R1; ok = true; + } + else + return 0; + } + if (!ok) + return 0; + + if (L11 == A) { + B = L12; C = L2; + } + else if (L12 == A) { + B = L11; C = L2; + } + else if (L21 == A) { + B = L22; C = L1; + } + else if (L22 == A) { + B = L21; C = L1; + } + + unsigned left_type = getTypeOfMaskedICmp(A, B, C, LHSCC); + unsigned right_type = getTypeOfMaskedICmp(A, D, E, RHSCC); + return left_type & right_type; +} +/// foldLogOpOfMaskedICmps: +/// try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) +/// into a single (icmp(A & X) ==/!= Y) +static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, + ICmpInst::Predicate NEWCC, + llvm::InstCombiner::BuilderTy* Builder) { + Value *A = 0, *B = 0, *C = 0, *D = 0, *E = 0; + unsigned mask = foldLogOpOfMaskedICmpsHelper(A, B, C, D, E, LHS, RHS); 
+ if (mask == 0) return 0; + + if (NEWCC == ICmpInst::ICMP_NE) + mask >>= 1; // treat "Not"-states as normal states + + if (mask & FoldMskICmp_Mask_AllZeroes) { + // (icmp eq (A & B), 0) & (icmp eq (A & D), 0) + // -> (icmp eq (A & (B|D)), 0) + Value* newOr = Builder->CreateOr(B, D); + Value* newAnd = Builder->CreateAnd(A, newOr); + // we can't use C as zero, because we might actually handle + // (icmp ne (A & B), B) & (icmp ne (A & D), D) + // with B and D, having a single bit set + Value* zero = Constant::getNullValue(A->getType()); + return Builder->CreateICmp(NEWCC, newAnd, zero); + } + else if (mask & FoldMskICmp_BMask_AllOnes) { + // (icmp eq (A & B), B) & (icmp eq (A & D), D) + // -> (icmp eq (A & (B|D)), (B|D)) + Value* newOr = Builder->CreateOr(B, D); + Value* newAnd = Builder->CreateAnd(A, newOr); + return Builder->CreateICmp(NEWCC, newAnd, newOr); + } + else if (mask & FoldMskICmp_AMask_AllOnes) { + // (icmp eq (A & B), A) & (icmp eq (A & D), A) + // -> (icmp eq (A & (B&D)), A) + Value* newAnd1 = Builder->CreateAnd(B, D); + Value* newAnd = Builder->CreateAnd(A, newAnd1); + return Builder->CreateICmp(NEWCC, newAnd, A); + } + else if (mask & FoldMskICmp_BMask_Mixed) { + // (icmp eq (A & B), C) & (icmp eq (A & D), E) + // We already know that B & C == C && D & E == E. + // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of + // C and E, which are shared by both the mask B and the mask D, don't + // contradict, then we can transform to + // -> (icmp eq (A & (B|D)), (C|E)) + // Currently, we only handle the case of B, C, D, and E being constant. 
+ ConstantInt *BCst = dyn_cast<ConstantInt>(B); + if (BCst == 0) return 0; + ConstantInt *DCst = dyn_cast<ConstantInt>(D); + if (DCst == 0) return 0; + // we can't simply use C and E, because we might actually handle + // (icmp ne (A & B), B) & (icmp eq (A & D), D) + // with B and D, having a single bit set + + ConstantInt *CCst = dyn_cast<ConstantInt>(C); + if (CCst == 0) return 0; + if (LHS->getPredicate() != NEWCC) + CCst = dyn_cast<ConstantInt>( ConstantExpr::getXor(BCst, CCst) ); + ConstantInt *ECst = dyn_cast<ConstantInt>(E); + if (ECst == 0) return 0; + if (RHS->getPredicate() != NEWCC) + ECst = dyn_cast<ConstantInt>( ConstantExpr::getXor(DCst, ECst) ); + ConstantInt* MCst = dyn_cast<ConstantInt>( + ConstantExpr::getAnd(ConstantExpr::getAnd(BCst, DCst), + ConstantExpr::getXor(CCst, ECst)) ); + // if there is a conflict we should actually return a false for the + // whole construct + if (!MCst->isZero()) + return 0; + Value *newOr1 = Builder->CreateOr(B, D); + Value *newOr2 = ConstantExpr::getOr(CCst, ECst); + Value *newAnd = Builder->CreateAnd(A, newOr1); + return Builder->CreateICmp(NEWCC, newAnd, newOr2); + } + return 0; +} + +/// FoldAndOfICmps - Fold (icmp)&(icmp) if possible. 
+Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { + ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); + + // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B) + if (PredicatesFoldable(LHSCC, RHSCC)) { + if (LHS->getOperand(0) == RHS->getOperand(1) && + LHS->getOperand(1) == RHS->getOperand(0)) + LHS->swapOperands(); + if (LHS->getOperand(0) == RHS->getOperand(0) && + LHS->getOperand(1) == RHS->getOperand(1)) { + Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); + unsigned Code = getICmpCode(LHS) & getICmpCode(RHS); + bool isSigned = LHS->isSigned() || RHS->isSigned(); + return getICmpValue(isSigned, Code, Op0, Op1, Builder); + } + } + + // handle (roughly): (icmp eq (A & B), C) & (icmp eq (A & D), E) + if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_EQ, Builder)) + return V; + + // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2). + Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0); + ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1)); + ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1)); + if (LHSCst == 0 || RHSCst == 0) return 0; + + if (LHSCst == RHSCst && LHSCC == RHSCC) { + // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C) + // where C is a power of 2 + if (LHSCC == ICmpInst::ICMP_ULT && + LHSCst->getValue().isPowerOf2()) { + Value *NewOr = Builder->CreateOr(Val, Val2); + return Builder->CreateICmp(LHSCC, NewOr, LHSCst); + } + + // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0) + if (LHSCC == ICmpInst::ICMP_EQ && LHSCst->isZero()) { + Value *NewOr = Builder->CreateOr(Val, Val2); + return Builder->CreateICmp(LHSCC, NewOr, LHSCst); + } + } + + // From here on, we only handle: + // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler. + if (Val != Val2) return 0; + + // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere. 
+ if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE || + RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE || + LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE || + RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE) + return 0; + + // We can't fold (ugt x, C) & (sgt x, C2). + if (!PredicatesFoldable(LHSCC, RHSCC)) + return 0; + + // Ensure that the larger constant is on the RHS. + bool ShouldSwap; + if (CmpInst::isSigned(LHSCC) || + (ICmpInst::isEquality(LHSCC) && + CmpInst::isSigned(RHSCC))) + ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue()); + else + ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue()); + + if (ShouldSwap) { + std::swap(LHS, RHS); + std::swap(LHSCst, RHSCst); + std::swap(LHSCC, RHSCC); + } + + // At this point, we know we have two icmp instructions + // comparing a value against two constants and and'ing the result + // together. Because of the above check, we know that we only have + // icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. 
We also know + // (from the icmp folding check above), that the two constants + // are not equal and that the larger constant is on the RHS + assert(LHSCst != RHSCst && "Compares not folded above?"); + + switch (LHSCC) { + default: llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: + switch (RHSCC) { + default: llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: // (X == 13 & X == 15) -> false + case ICmpInst::ICMP_UGT: // (X == 13 & X > 15) -> false + case ICmpInst::ICMP_SGT: // (X == 13 & X > 15) -> false + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); + case ICmpInst::ICMP_NE: // (X == 13 & X != 15) -> X == 13 + case ICmpInst::ICMP_ULT: // (X == 13 & X < 15) -> X == 13 + case ICmpInst::ICMP_SLT: // (X == 13 & X < 15) -> X == 13 + return LHS; + } + case ICmpInst::ICMP_NE: + switch (RHSCC) { + default: llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_ULT: + if (LHSCst == SubOne(RHSCst)) // (X != 13 & X u< 14) -> X < 13 + return Builder->CreateICmpULT(Val, LHSCst); + break; // (X != 13 & X u< 15) -> no change + case ICmpInst::ICMP_SLT: + if (LHSCst == SubOne(RHSCst)) // (X != 13 & X s< 14) -> X < 13 + return Builder->CreateICmpSLT(Val, LHSCst); + break; // (X != 13 & X s< 15) -> no change + case ICmpInst::ICMP_EQ: // (X != 13 & X == 15) -> X == 15 + case ICmpInst::ICMP_UGT: // (X != 13 & X u> 15) -> X u> 15 + case ICmpInst::ICMP_SGT: // (X != 13 & X s> 15) -> X s> 15 + return RHS; + case ICmpInst::ICMP_NE: + if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1 + Constant *AddCST = ConstantExpr::getNeg(LHSCst); + Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off"); + return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1)); + } + break; // (X != 13 & X != 15) -> no change + } + break; + case ICmpInst::ICMP_ULT: + switch (RHSCC) { + default: llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: 
// (X u< 13 & X == 15) -> false + case ICmpInst::ICMP_UGT: // (X u< 13 & X u> 15) -> false + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); + case ICmpInst::ICMP_SGT: // (X u< 13 & X s> 15) -> no change + break; + case ICmpInst::ICMP_NE: // (X u< 13 & X != 15) -> X u< 13 + case ICmpInst::ICMP_ULT: // (X u< 13 & X u< 15) -> X u< 13 + return LHS; + case ICmpInst::ICMP_SLT: // (X u< 13 & X s< 15) -> no change + break; + } + break; + case ICmpInst::ICMP_SLT: + switch (RHSCC) { + default: llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: // (X s< 13 & X == 15) -> false + case ICmpInst::ICMP_SGT: // (X s< 13 & X s> 15) -> false + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); + case ICmpInst::ICMP_UGT: // (X s< 13 & X u> 15) -> no change + break; + case ICmpInst::ICMP_NE: // (X s< 13 & X != 15) -> X < 13 + case ICmpInst::ICMP_SLT: // (X s< 13 & X s< 15) -> X < 13 + return LHS; + case ICmpInst::ICMP_ULT: // (X s< 13 & X u< 15) -> no change + break; + } + break; + case ICmpInst::ICMP_UGT: + switch (RHSCC) { + default: llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: // (X u> 13 & X == 15) -> X == 15 + case ICmpInst::ICMP_UGT: // (X u> 13 & X u> 15) -> X u> 15 + return RHS; + case ICmpInst::ICMP_SGT: // (X u> 13 & X s> 15) -> no change + break; + case ICmpInst::ICMP_NE: + if (RHSCst == AddOne(LHSCst)) // (X u> 13 & X != 14) -> X u> 14 + return Builder->CreateICmp(LHSCC, Val, RHSCst); + break; // (X u> 13 & X != 15) -> no change + case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) <u 1 + return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, false, true); + case ICmpInst::ICMP_SLT: // (X u> 13 & X s< 15) -> no change + break; + } + break; + case ICmpInst::ICMP_SGT: + switch (RHSCC) { + default: llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: // (X s> 13 & X == 15) -> X == 15 + case ICmpInst::ICMP_SGT: // (X s> 13 & X s> 15) -> X s> 
15 + return RHS; + case ICmpInst::ICMP_UGT: // (X s> 13 & X u> 15) -> no change + break; + case ICmpInst::ICMP_NE: + if (RHSCst == AddOne(LHSCst)) // (X s> 13 & X != 14) -> X s> 14 + return Builder->CreateICmp(LHSCC, Val, RHSCst); + break; // (X s> 13 & X != 15) -> no change + case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) s< 1 + return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, true, true); + case ICmpInst::ICMP_ULT: // (X s> 13 & X u< 15) -> no change + break; + } + break; + } + + return 0; +} + +/// FoldAndOfFCmps - Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of +/// instcombine, this returns a Value which should already be inserted into the +/// function. +Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { + if (LHS->getPredicate() == FCmpInst::FCMP_ORD && + RHS->getPredicate() == FCmpInst::FCMP_ORD) { + // (fcmp ord x, c) & (fcmp ord y, c) -> (fcmp ord x, y) + if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1))) + if (ConstantFP *RHSC = dyn_cast<ConstantFP>(RHS->getOperand(1))) { + // If either of the constants are nans, then the whole thing returns + // false. + if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN()) + return ConstantInt::getFalse(LHS->getContext()); + return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0)); + } + + // Handle vector zeros. This occurs because the canonical form of + // "fcmp ord x,x" is "fcmp ord x, 0". + if (isa<ConstantAggregateZero>(LHS->getOperand(1)) && + isa<ConstantAggregateZero>(RHS->getOperand(1))) + return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0)); + return 0; + } + + Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1); + Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1); + FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate(); + + + if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) { + // Swap RHS operands to match LHS. 
+ Op1CC = FCmpInst::getSwappedPredicate(Op1CC); + std::swap(Op1LHS, Op1RHS); + } + + if (Op0LHS == Op1LHS && Op0RHS == Op1RHS) { + // Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y). + if (Op0CC == Op1CC) + return Builder->CreateFCmp((FCmpInst::Predicate)Op0CC, Op0LHS, Op0RHS); + if (Op0CC == FCmpInst::FCMP_FALSE || Op1CC == FCmpInst::FCMP_FALSE) + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); + if (Op0CC == FCmpInst::FCMP_TRUE) + return RHS; + if (Op1CC == FCmpInst::FCMP_TRUE) + return LHS; + + bool Op0Ordered; + bool Op1Ordered; + unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered); + unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered); + if (Op1Pred == 0) { + std::swap(LHS, RHS); + std::swap(Op0Pred, Op1Pred); + std::swap(Op0Ordered, Op1Ordered); + } + if (Op0Pred == 0) { + // uno && ueq -> uno && (uno || eq) -> ueq + // ord && olt -> ord && (ord && lt) -> olt + if (Op0Ordered == Op1Ordered) + return RHS; + + // uno && oeq -> uno && (ord && eq) -> false + // uno && ord -> false + if (!Op0Ordered) + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); + // ord && ueq -> ord && (uno || eq) -> oeq + return getFCmpValue(true, Op1Pred, Op0LHS, Op0RHS, Builder); + } + } + + return 0; +} + + +Instruction *InstCombiner::visitAnd(BinaryOperator &I) { + bool Changed = SimplifyAssociativeOrCommutative(I); + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + if (Value *V = SimplifyAndInst(Op0, Op1, TD)) + return ReplaceInstUsesWith(I, V); + + // (A|B)&(A|C) -> A|(B&C) etc + if (Value *V = SimplifyUsingDistributiveLaws(I)) + return ReplaceInstUsesWith(I, V); + + // See if we can simplify any instructions used by the instruction whose sole + // purpose is to compute bits we don't care about. + if (SimplifyDemandedInstructionBits(I)) + return &I; + + if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) { + const APInt &AndRHSMask = AndRHS->getValue(); + + // Optimize a variety of ((val OP C1) & C2) combinations... 
+ if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) { + Value *Op0LHS = Op0I->getOperand(0); + Value *Op0RHS = Op0I->getOperand(1); + switch (Op0I->getOpcode()) { + default: break; + case Instruction::Xor: + case Instruction::Or: { + // If the mask is only needed on one incoming arm, push it up. + if (!Op0I->hasOneUse()) break; + + APInt NotAndRHS(~AndRHSMask); + if (MaskedValueIsZero(Op0LHS, NotAndRHS)) { + // Not masking anything out for the LHS, move to RHS. + Value *NewRHS = Builder->CreateAnd(Op0RHS, AndRHS, + Op0RHS->getName()+".masked"); + return BinaryOperator::Create(Op0I->getOpcode(), Op0LHS, NewRHS); + } + if (!isa<Constant>(Op0RHS) && + MaskedValueIsZero(Op0RHS, NotAndRHS)) { + // Not masking anything out for the RHS, move to LHS. + Value *NewLHS = Builder->CreateAnd(Op0LHS, AndRHS, + Op0LHS->getName()+".masked"); + return BinaryOperator::Create(Op0I->getOpcode(), NewLHS, Op0RHS); + } + + break; + } + case Instruction::Add: + // ((A & N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == AndRHS. + // ((A | N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0 + // ((A ^ N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0 + if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, false, I)) + return BinaryOperator::CreateAnd(V, AndRHS); + if (Value *V = FoldLogicalPlusAnd(Op0RHS, Op0LHS, AndRHS, false, I)) + return BinaryOperator::CreateAnd(V, AndRHS); // Add commutes + break; + + case Instruction::Sub: + // ((A & N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == AndRHS. + // ((A | N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0 + // ((A ^ N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0 + if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I)) + return BinaryOperator::CreateAnd(V, AndRHS); + + // (A - N) & AndRHS -> -N & AndRHS iff A&AndRHS==0 and AndRHS + // has 1's for all bits that the subtraction with A might affect. 
+ if (Op0I->hasOneUse() && !match(Op0LHS, m_Zero())) { + uint32_t BitWidth = AndRHSMask.getBitWidth(); + uint32_t Zeros = AndRHSMask.countLeadingZeros(); + APInt Mask = APInt::getLowBitsSet(BitWidth, BitWidth - Zeros); + + if (MaskedValueIsZero(Op0LHS, Mask)) { + Value *NewNeg = Builder->CreateNeg(Op0RHS); + return BinaryOperator::CreateAnd(NewNeg, AndRHS); + } + } + break; + + case Instruction::Shl: + case Instruction::LShr: + // (1 << x) & 1 --> zext(x == 0) + // (1 >> x) & 1 --> zext(x == 0) + if (AndRHSMask == 1 && Op0LHS == AndRHS) { + Value *NewICmp = + Builder->CreateICmpEQ(Op0RHS, Constant::getNullValue(I.getType())); + return new ZExtInst(NewICmp, I.getType()); + } + break; + } + + if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) + if (Instruction *Res = OptAndOp(Op0I, Op0CI, AndRHS, I)) + return Res; + } + + // If this is an integer truncation, and if the source is an 'and' with + // immediate, transform it. This frequently occurs for bitfield accesses. + { + Value *X = 0; ConstantInt *YC = 0; + if (match(Op0, m_Trunc(m_And(m_Value(X), m_ConstantInt(YC))))) { + // Change: and (trunc (and X, YC) to T), C2 + // into : and (trunc X to T), trunc(YC) & C2 + // This will fold the two constants together, which may allow + // other simplifications. + Value *NewCast = Builder->CreateTrunc(X, I.getType(), "and.shrunk"); + Constant *C3 = ConstantExpr::getTrunc(YC, I.getType()); + C3 = ConstantExpr::getAnd(C3, AndRHS); + return BinaryOperator::CreateAnd(NewCast, C3); + } + } + + // Try to fold constant and into select arguments. 
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) + if (Instruction *R = FoldOpIntoSelect(I, SI)) + return R; + if (isa<PHINode>(Op0)) + if (Instruction *NV = FoldOpIntoPhi(I)) + return NV; + } + + + // (~A & ~B) == (~(A | B)) - De Morgan's Law + if (Value *Op0NotVal = dyn_castNotVal(Op0)) + if (Value *Op1NotVal = dyn_castNotVal(Op1)) + if (Op0->hasOneUse() && Op1->hasOneUse()) { + Value *Or = Builder->CreateOr(Op0NotVal, Op1NotVal, + I.getName()+".demorgan"); + return BinaryOperator::CreateNot(Or); + } + + { + Value *A = 0, *B = 0, *C = 0, *D = 0; + // (A|B) & ~(A&B) -> A^B + if (match(Op0, m_Or(m_Value(A), m_Value(B))) && + match(Op1, m_Not(m_And(m_Value(C), m_Value(D)))) && + ((A == C && B == D) || (A == D && B == C))) + return BinaryOperator::CreateXor(A, B); + + // ~(A&B) & (A|B) -> A^B + if (match(Op1, m_Or(m_Value(A), m_Value(B))) && + match(Op0, m_Not(m_And(m_Value(C), m_Value(D)))) && + ((A == C && B == D) || (A == D && B == C))) + return BinaryOperator::CreateXor(A, B); + + if (Op0->hasOneUse() && + match(Op0, m_Xor(m_Value(A), m_Value(B)))) { + if (A == Op1) { // (A^B)&A -> A&(A^B) + I.swapOperands(); // Simplify below + std::swap(Op0, Op1); + } else if (B == Op1) { // (A^B)&B -> B&(B^A) + cast<BinaryOperator>(Op0)->swapOperands(); + I.swapOperands(); // Simplify below + std::swap(Op0, Op1); + } + } + + if (Op1->hasOneUse() && + match(Op1, m_Xor(m_Value(A), m_Value(B)))) { + if (B == Op0) { // B&(A^B) -> B&(B^A) + cast<BinaryOperator>(Op1)->swapOperands(); + std::swap(A, B); + } + // Notice that the patten (A&(~B)) is actually (A&(-1^B)), so if + // A is originally -1 (or a vector of -1 and undefs), then we enter + // an endless loop. By checking that A is non-constant we ensure that + // we will never get to the loop. 
+ if (A == Op0 && !isa<Constant>(A)) // A&(A^B) -> A & ~B + return BinaryOperator::CreateAnd(A, Builder->CreateNot(B, "tmp")); + } + + // (A&((~A)|B)) -> A&B + if (match(Op0, m_Or(m_Not(m_Specific(Op1)), m_Value(A))) || + match(Op0, m_Or(m_Value(A), m_Not(m_Specific(Op1))))) + return BinaryOperator::CreateAnd(A, Op1); + if (match(Op1, m_Or(m_Not(m_Specific(Op0)), m_Value(A))) || + match(Op1, m_Or(m_Value(A), m_Not(m_Specific(Op0))))) + return BinaryOperator::CreateAnd(A, Op0); + } + + if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1)) + if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0)) + if (Value *Res = FoldAndOfICmps(LHS, RHS)) + return ReplaceInstUsesWith(I, Res); + + // If and'ing two fcmp, try combine them into one. + if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0))) + if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1))) + if (Value *Res = FoldAndOfFCmps(LHS, RHS)) + return ReplaceInstUsesWith(I, Res); + + + // fold (and (cast A), (cast B)) -> (cast (and A, B)) + if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) + if (CastInst *Op1C = dyn_cast<CastInst>(Op1)) { + const Type *SrcTy = Op0C->getOperand(0)->getType(); + if (Op0C->getOpcode() == Op1C->getOpcode() && // same cast kind ? + SrcTy == Op1C->getOperand(0)->getType() && + SrcTy->isIntOrIntVectorTy()) { + Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0); + + // Only do this if the casts both really cause code to be generated. + if (ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) && + ShouldOptimizeCast(Op1C->getOpcode(), Op1COp, I.getType())) { + Value *NewOp = Builder->CreateAnd(Op0COp, Op1COp, I.getName()); + return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType()); + } + + // If this is and(cast(icmp), cast(icmp)), try to fold this even if the + // cast is otherwise not optimizable. This happens for vector sexts. 
+ if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1COp))
+ if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0COp))
+ if (Value *Res = FoldAndOfICmps(LHS, RHS))
+ return CastInst::Create(Op0C->getOpcode(), Res, I.getType());
+
+ // If this is and(cast(fcmp), cast(fcmp)), try to fold this even if the
+ // cast is otherwise not optimizable. This happens for vector sexts.
+ if (FCmpInst *RHS = dyn_cast<FCmpInst>(Op1COp))
+ if (FCmpInst *LHS = dyn_cast<FCmpInst>(Op0COp))
+ if (Value *Res = FoldAndOfFCmps(LHS, RHS))
+ return CastInst::Create(Op0C->getOpcode(), Res, I.getType());
+ }
+ }
+
+ // (X >> Z) & (Y >> Z) -> (X&Y) >> Z for all shifts.
+ if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) {
+ if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0))
+ if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() &&
+ SI0->getOperand(1) == SI1->getOperand(1) &&
+ (SI0->hasOneUse() || SI1->hasOneUse())) {
+ Value *NewOp =
+ Builder->CreateAnd(SI0->getOperand(0), SI1->getOperand(0),
+ SI0->getName());
+ return BinaryOperator::Create(SI1->getOpcode(), NewOp,
+ SI1->getOperand(1));
+ }
+ }
+
+ return Changed ? &I : 0;
+}
+
+/// CollectBSwapParts - Analyze the specified subexpression and see if it is
+/// capable of providing pieces of a bswap. The subexpression provides pieces
+/// of a bswap if it is proven that each of the non-zero bytes in the output of
+/// the expression came from the corresponding "byte swapped" byte in some other
+/// value. For example, if the current subexpression is "(shl i32 %X, 24)" then
+/// we know that the expression deposits the low byte of %X into the high byte
+/// of the bswap result and that all other bytes are zero. If this expression is
+/// accepted, the high byte of ByteValues is set to X to indicate a correct
+/// match.
+///
+/// This function returns true if the match was unsuccessful and false if the
+/// match was successful.
+/// On entry to the function the "OverallLeftShift" is a signed integer value +/// indicating the number of bytes that the subexpression is later shifted. For +/// example, if the expression is later right shifted by 16 bits, the +/// OverallLeftShift value would be -2 on entry. This is used to specify which +/// byte of ByteValues is actually being set. +/// +/// Similarly, ByteMask is a bitmask where a bit is clear if its corresponding +/// byte is masked to zero by a user. For example, in (X & 255), X will be +/// processed with a bytemask of 1. Because bytemask is 32-bits, this limits +/// this function to working on up to 32-byte (256 bit) values. ByteMask is +/// always in the local (OverallLeftShift) coordinate space. +/// +static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask, + SmallVector<Value*, 8> &ByteValues) { + if (Instruction *I = dyn_cast<Instruction>(V)) { + // If this is an or instruction, it may be an inner node of the bswap. + if (I->getOpcode() == Instruction::Or) { + return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, + ByteValues) || + CollectBSwapParts(I->getOperand(1), OverallLeftShift, ByteMask, + ByteValues); + } + + // If this is a logical shift by a constant multiple of 8, recurse with + // OverallLeftShift and ByteMask adjusted. + if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) { + unsigned ShAmt = + cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U); + // Ensure the shift amount is defined and of a byte value. 
+ if ((ShAmt & 7) || (ShAmt > 8*ByteValues.size())) + return true; + + unsigned ByteShift = ShAmt >> 3; + if (I->getOpcode() == Instruction::Shl) { + // X << 2 -> collect(X, +2) + OverallLeftShift += ByteShift; + ByteMask >>= ByteShift; + } else { + // X >>u 2 -> collect(X, -2) + OverallLeftShift -= ByteShift; + ByteMask <<= ByteShift; + ByteMask &= (~0U >> (32-ByteValues.size())); + } + + if (OverallLeftShift >= (int)ByteValues.size()) return true; + if (OverallLeftShift <= -(int)ByteValues.size()) return true; + + return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, + ByteValues); + } + + // If this is a logical 'and' with a mask that clears bytes, clear the + // corresponding bytes in ByteMask. + if (I->getOpcode() == Instruction::And && + isa<ConstantInt>(I->getOperand(1))) { + // Scan every byte of the and mask, seeing if the byte is either 0 or 255. + unsigned NumBytes = ByteValues.size(); + APInt Byte(I->getType()->getPrimitiveSizeInBits(), 255); + const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue(); + + for (unsigned i = 0; i != NumBytes; ++i, Byte <<= 8) { + // If this byte is masked out by a later operation, we don't care what + // the and mask is. + if ((ByteMask & (1 << i)) == 0) + continue; + + // If the AndMask is all zeros for this byte, clear the bit. + APInt MaskB = AndMask & Byte; + if (MaskB == 0) { + ByteMask &= ~(1U << i); + continue; + } + + // If the AndMask is not all ones for this byte, it's not a bytezap. + if (MaskB != Byte) + return true; + + // Otherwise, this byte is kept. + } + + return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask, + ByteValues); + } + } + + // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be + // the input value to the bswap. Some observations: 1) if more than one byte + // is demanded from this input, then it could not be successfully assembled + // into a byteswap. 
At least one of the two bytes would not be aligned with + // their ultimate destination. + if (!isPowerOf2_32(ByteMask)) return true; + unsigned InputByteNo = CountTrailingZeros_32(ByteMask); + + // 2) The input and ultimate destinations must line up: if byte 3 of an i32 + // is demanded, it needs to go into byte 0 of the result. This means that the + // byte needs to be shifted until it lands in the right byte bucket. The + // shift amount depends on the position: if the byte is coming from the high + // part of the value (e.g. byte 3) then it must be shifted right. If from the + // low part, it must be shifted left. + unsigned DestByteNo = InputByteNo + OverallLeftShift; + if (InputByteNo < ByteValues.size()/2) { + if (ByteValues.size()-1-DestByteNo != InputByteNo) + return true; + } else { + if (ByteValues.size()-1-DestByteNo != InputByteNo) + return true; + } + + // If the destination byte value is already defined, the values are or'd + // together, which isn't a bswap (unless it's an or of the same bits). + if (ByteValues[DestByteNo] && ByteValues[DestByteNo] != V) + return true; + ByteValues[DestByteNo] = V; + return false; +} + +/// MatchBSwap - Given an OR instruction, check to see if this is a bswap idiom. +/// If so, insert the new bswap intrinsic and return it. +Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) { + const IntegerType *ITy = dyn_cast<IntegerType>(I.getType()); + if (!ITy || ITy->getBitWidth() % 16 || + // ByteMask only allows up to 32-byte values. + ITy->getBitWidth() > 32*8) + return 0; // Can only bswap pairs of bytes. Can't do vectors. + + /// ByteValues - For each byte of the result, we keep track of which value + /// defines each byte. + SmallVector<Value*, 8> ByteValues; + ByteValues.resize(ITy->getBitWidth()/8); + + // Try to find all the pieces corresponding to the bswap. 
+ uint32_t ByteMask = ~0U >> (32-ByteValues.size());
+ if (CollectBSwapParts(&I, 0, ByteMask, ByteValues))
+ return 0;
+
+ // Check to see if all of the bytes come from the same value.
+ Value *V = ByteValues[0];
+ if (V == 0) return 0; // Didn't find a byte? Must be zero.
+
+ // Check to make sure that all of the bytes come from the same value.
+ for (unsigned i = 1, e = ByteValues.size(); i != e; ++i)
+ if (ByteValues[i] != V)
+ return 0;
+ const Type *Tys[] = { ITy };
+ Module *M = I.getParent()->getParent()->getParent();
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
+ return CallInst::Create(F, V);
+}
+
+/// MatchSelectFromAndOr - We have an expression of the form (A&C)|(B&D). Check
+/// if A is (cond?-1:0) and either B or D is ~(cond?-1:0) or (cond?0:-1); then
+/// we can simplify this expression to "cond ? C : D" or "cond ? C : B".
+static Instruction *MatchSelectFromAndOr(Value *A, Value *B,
+ Value *C, Value *D) {
+ // If A is not a select of -1/0, this cannot match.
+ Value *Cond = 0;
+ if (!match(A, m_SExt(m_Value(Cond))) ||
+ !Cond->getType()->isIntegerTy(1))
+ return 0;
+
+ // ((cond?-1:0)&C) | (B&(cond?0:-1)) -> cond ? C : B.
+ if (match(D, m_Not(m_SExt(m_Specific(Cond)))))
+ return SelectInst::Create(Cond, C, B);
+ if (match(D, m_SExt(m_Not(m_Specific(Cond)))))
+ return SelectInst::Create(Cond, C, B);
+
+ // ((cond?-1:0)&C) | ((cond?0:-1)&D) -> cond ? C : D.
+ if (match(B, m_Not(m_SExt(m_Specific(Cond)))))
+ return SelectInst::Create(Cond, C, D);
+ if (match(B, m_SExt(m_Not(m_Specific(Cond)))))
+ return SelectInst::Create(Cond, C, D);
+ return 0;
+}
+
+/// FoldOrOfICmps - Fold (icmp)|(icmp) if possible.
+Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { + ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate(); + + // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B) + if (PredicatesFoldable(LHSCC, RHSCC)) { + if (LHS->getOperand(0) == RHS->getOperand(1) && + LHS->getOperand(1) == RHS->getOperand(0)) + LHS->swapOperands(); + if (LHS->getOperand(0) == RHS->getOperand(0) && + LHS->getOperand(1) == RHS->getOperand(1)) { + Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); + unsigned Code = getICmpCode(LHS) | getICmpCode(RHS); + bool isSigned = LHS->isSigned() || RHS->isSigned(); + return getICmpValue(isSigned, Code, Op0, Op1, Builder); + } + } + + // handle (roughly): + // (icmp ne (A & B), C) | (icmp ne (A & D), E) + if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_NE, Builder)) + return V; + + // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2). + Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0); + ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1)); + ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1)); + if (LHSCst == 0 || RHSCst == 0) return 0; + + if (LHSCst == RHSCst && LHSCC == RHSCC) { + // (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0) + if (LHSCC == ICmpInst::ICMP_NE && LHSCst->isZero()) { + Value *NewOr = Builder->CreateOr(Val, Val2); + return Builder->CreateICmp(LHSCC, NewOr, LHSCst); + } + } + + // (icmp ult (X + CA), C1) | (icmp eq X, C2) -> (icmp ule (X + CA), C1) + // iff C2 + CA == C1. + if (LHSCC == ICmpInst::ICMP_ULT && RHSCC == ICmpInst::ICMP_EQ) { + ConstantInt *AddCst; + if (match(Val, m_Add(m_Specific(Val2), m_ConstantInt(AddCst)))) + if (RHSCst->getValue() + AddCst->getValue() == LHSCst->getValue()) + return Builder->CreateICmpULE(Val, LHSCst); + } + + // From here on, we only handle: + // (icmp1 A, C1) | (icmp2 A, C2) --> something simpler. 
+ if (Val != Val2) return 0; + + // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere. + if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE || + RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE || + LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE || + RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE) + return 0; + + // We can't fold (ugt x, C) | (sgt x, C2). + if (!PredicatesFoldable(LHSCC, RHSCC)) + return 0; + + // Ensure that the larger constant is on the RHS. + bool ShouldSwap; + if (CmpInst::isSigned(LHSCC) || + (ICmpInst::isEquality(LHSCC) && + CmpInst::isSigned(RHSCC))) + ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue()); + else + ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue()); + + if (ShouldSwap) { + std::swap(LHS, RHS); + std::swap(LHSCst, RHSCst); + std::swap(LHSCC, RHSCC); + } + + // At this point, we know we have two icmp instructions + // comparing a value against two constants and or'ing the result + // together. Because of the above check, we know that we only have + // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the + // icmp folding check above), that the two constants are not + // equal. 
+ assert(LHSCst != RHSCst && "Compares not folded above?"); + + switch (LHSCC) { + default: llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: + switch (RHSCC) { + default: llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: + if (LHSCst == SubOne(RHSCst)) { + // (X == 13 | X == 14) -> X-13 <u 2 + Constant *AddCST = ConstantExpr::getNeg(LHSCst); + Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off"); + AddCST = ConstantExpr::getSub(AddOne(RHSCst), LHSCst); + return Builder->CreateICmpULT(Add, AddCST); + } + break; // (X == 13 | X == 15) -> no change + case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change + case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change + break; + case ICmpInst::ICMP_NE: // (X == 13 | X != 15) -> X != 15 + case ICmpInst::ICMP_ULT: // (X == 13 | X u< 15) -> X u< 15 + case ICmpInst::ICMP_SLT: // (X == 13 | X s< 15) -> X s< 15 + return RHS; + } + break; + case ICmpInst::ICMP_NE: + switch (RHSCC) { + default: llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: // (X != 13 | X == 15) -> X != 13 + case ICmpInst::ICMP_UGT: // (X != 13 | X u> 15) -> X != 13 + case ICmpInst::ICMP_SGT: // (X != 13 | X s> 15) -> X != 13 + return LHS; + case ICmpInst::ICMP_NE: // (X != 13 | X != 15) -> true + case ICmpInst::ICMP_ULT: // (X != 13 | X u< 15) -> true + case ICmpInst::ICMP_SLT: // (X != 13 | X s< 15) -> true + return ConstantInt::getTrue(LHS->getContext()); + } + break; + case ICmpInst::ICMP_ULT: + switch (RHSCC) { + default: llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change + break; + case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2 + // If RHSCst is [us]MAXINT, it is always false. Not handling + // this can cause overflow. 
+ if (RHSCst->isMaxValue(false)) + return LHS; + return InsertRangeTest(Val, LHSCst, AddOne(RHSCst), false, false); + case ICmpInst::ICMP_SGT: // (X u< 13 | X s> 15) -> no change + break; + case ICmpInst::ICMP_NE: // (X u< 13 | X != 15) -> X != 15 + case ICmpInst::ICMP_ULT: // (X u< 13 | X u< 15) -> X u< 15 + return RHS; + case ICmpInst::ICMP_SLT: // (X u< 13 | X s< 15) -> no change + break; + } + break; + case ICmpInst::ICMP_SLT: + switch (RHSCC) { + default: llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: // (X s< 13 | X == 14) -> no change + break; + case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) s> 2 + // If RHSCst is [us]MAXINT, it is always false. Not handling + // this can cause overflow. + if (RHSCst->isMaxValue(true)) + return LHS; + return InsertRangeTest(Val, LHSCst, AddOne(RHSCst), true, false); + case ICmpInst::ICMP_UGT: // (X s< 13 | X u> 15) -> no change + break; + case ICmpInst::ICMP_NE: // (X s< 13 | X != 15) -> X != 15 + case ICmpInst::ICMP_SLT: // (X s< 13 | X s< 15) -> X s< 15 + return RHS; + case ICmpInst::ICMP_ULT: // (X s< 13 | X u< 15) -> no change + break; + } + break; + case ICmpInst::ICMP_UGT: + switch (RHSCC) { + default: llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: // (X u> 13 | X == 15) -> X u> 13 + case ICmpInst::ICMP_UGT: // (X u> 13 | X u> 15) -> X u> 13 + return LHS; + case ICmpInst::ICMP_SGT: // (X u> 13 | X s> 15) -> no change + break; + case ICmpInst::ICMP_NE: // (X u> 13 | X != 15) -> true + case ICmpInst::ICMP_ULT: // (X u> 13 | X u< 15) -> true + return ConstantInt::getTrue(LHS->getContext()); + case ICmpInst::ICMP_SLT: // (X u> 13 | X s< 15) -> no change + break; + } + break; + case ICmpInst::ICMP_SGT: + switch (RHSCC) { + default: llvm_unreachable("Unknown integer condition code!"); + case ICmpInst::ICMP_EQ: // (X s> 13 | X == 15) -> X > 13 + case ICmpInst::ICMP_SGT: // (X s> 13 | X s> 15) -> X > 13 + return LHS; + case ICmpInst::ICMP_UGT: // (X s> 
13 | X u> 15) -> no change + break; + case ICmpInst::ICMP_NE: // (X s> 13 | X != 15) -> true + case ICmpInst::ICMP_SLT: // (X s> 13 | X s< 15) -> true + return ConstantInt::getTrue(LHS->getContext()); + case ICmpInst::ICMP_ULT: // (X s> 13 | X u< 15) -> no change + break; + } + break; + } + return 0; +} + +/// FoldOrOfFCmps - Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of +/// instcombine, this returns a Value which should already be inserted into the +/// function. +Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { + if (LHS->getPredicate() == FCmpInst::FCMP_UNO && + RHS->getPredicate() == FCmpInst::FCMP_UNO && + LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType()) { + if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1))) + if (ConstantFP *RHSC = dyn_cast<ConstantFP>(RHS->getOperand(1))) { + // If either of the constants are nans, then the whole thing returns + // true. + if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN()) + return ConstantInt::getTrue(LHS->getContext()); + + // Otherwise, no need to compare the two constants, compare the + // rest. + return Builder->CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0)); + } + + // Handle vector zeros. This occurs because the canonical form of + // "fcmp uno x,x" is "fcmp uno x, 0". + if (isa<ConstantAggregateZero>(LHS->getOperand(1)) && + isa<ConstantAggregateZero>(RHS->getOperand(1))) + return Builder->CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0)); + + return 0; + } + + Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1); + Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1); + FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate(); + + if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) { + // Swap RHS operands to match LHS. + Op1CC = FCmpInst::getSwappedPredicate(Op1CC); + std::swap(Op1LHS, Op1RHS); + } + if (Op0LHS == Op1LHS && Op0RHS == Op1RHS) { + // Simplify (fcmp cc0 x, y) | (fcmp cc1 x, y). 
+ if (Op0CC == Op1CC) + return Builder->CreateFCmp((FCmpInst::Predicate)Op0CC, Op0LHS, Op0RHS); + if (Op0CC == FCmpInst::FCMP_TRUE || Op1CC == FCmpInst::FCMP_TRUE) + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1); + if (Op0CC == FCmpInst::FCMP_FALSE) + return RHS; + if (Op1CC == FCmpInst::FCMP_FALSE) + return LHS; + bool Op0Ordered; + bool Op1Ordered; + unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered); + unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered); + if (Op0Ordered == Op1Ordered) { + // If both are ordered or unordered, return a new fcmp with + // or'ed predicates. + return getFCmpValue(Op0Ordered, Op0Pred|Op1Pred, Op0LHS, Op0RHS, Builder); + } + } + return 0; +} + +/// FoldOrWithConstants - This helper function folds: +/// +/// ((A | B) & C1) | (B & C2) +/// +/// into: +/// +/// (A & C1) | B +/// +/// when the XOR of the two constants is "all ones" (-1). +Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op, + Value *A, Value *B, Value *C) { + ConstantInt *CI1 = dyn_cast<ConstantInt>(C); + if (!CI1) return 0; + + Value *V1 = 0; + ConstantInt *CI2 = 0; + if (!match(Op, m_And(m_Value(V1), m_ConstantInt(CI2)))) return 0; + + APInt Xor = CI1->getValue() ^ CI2->getValue(); + if (!Xor.isAllOnesValue()) return 0; + + if (V1 == A || V1 == B) { + Value *NewOp = Builder->CreateAnd((V1 == A) ? B : A, CI1); + return BinaryOperator::CreateOr(NewOp, V1); + } + + return 0; +} + +Instruction *InstCombiner::visitOr(BinaryOperator &I) { + bool Changed = SimplifyAssociativeOrCommutative(I); + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + if (Value *V = SimplifyOrInst(Op0, Op1, TD)) + return ReplaceInstUsesWith(I, V); + + // (A&B)|(A&C) -> A&(B|C) etc + if (Value *V = SimplifyUsingDistributiveLaws(I)) + return ReplaceInstUsesWith(I, V); + + // See if we can simplify any instructions used by the instruction whose sole + // purpose is to compute bits we don't care about. 
  // Kill any bits of the operands that no user of this 'or' cares about.
  if (SimplifyDemandedInstructionBits(I))
    return &I;

  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
    ConstantInt *C1 = 0; Value *X = 0;
    // (X & C1) | C2 --> (X | C2) & (C1|C2)
    // The identity holds for any constants; the (RHS & C1) != 0 guard only
    // fires the transform when C1 and C2 overlap, i.e. when the new mask
    // actually differs from the old one.
    if (match(Op0, m_And(m_Value(X), m_ConstantInt(C1))) &&
        (RHS->getValue() & C1->getValue()) != 0 &&
        Op0->hasOneUse()) {
      Value *Or = Builder->CreateOr(X, RHS);
      Or->takeName(Op0);
      return BinaryOperator::CreateAnd(Or,
                         ConstantInt::get(I.getContext(),
                                          RHS->getValue() | C1->getValue()));
    }

    // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
    // Bits in C2 are forced to 1 by the 'or', so they may be dropped from the
    // xor constant.
    if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) &&
        Op0->hasOneUse()) {
      Value *Or = Builder->CreateOr(X, RHS);
      Or->takeName(Op0);
      return BinaryOperator::CreateXor(Or,
                 ConstantInt::get(I.getContext(),
                                  C1->getValue() & ~RHS->getValue()));
    }

    // Try to fold constant and into select arguments.
    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
      if (Instruction *R = FoldOpIntoSelect(I, SI))
        return R;

    // Push the constant through a phi whose inputs fold with it.
    if (isa<PHINode>(Op0))
      if (Instruction *NV = FoldOpIntoPhi(I))
        return NV;
  }

  Value *A = 0, *B = 0;
  ConstantInt *C1 = 0, *C2 = 0;

  // (A | B) | C  and  A | (B | C)              -> bswap if possible.
  // (A >> B) | (C << D)  and  (A << B) | (C >> D) -> bswap if possible.
+ if (match(Op0, m_Or(m_Value(), m_Value())) || + match(Op1, m_Or(m_Value(), m_Value())) || + (match(Op0, m_LogicalShift(m_Value(), m_Value())) && + match(Op1, m_LogicalShift(m_Value(), m_Value())))) { + if (Instruction *BSwap = MatchBSwap(I)) + return BSwap; + } + + // (X^C)|Y -> (X|Y)^C iff Y&C == 0 + if (Op0->hasOneUse() && + match(Op0, m_Xor(m_Value(A), m_ConstantInt(C1))) && + MaskedValueIsZero(Op1, C1->getValue())) { + Value *NOr = Builder->CreateOr(A, Op1); + NOr->takeName(Op0); + return BinaryOperator::CreateXor(NOr, C1); + } + + // Y|(X^C) -> (X|Y)^C iff Y&C == 0 + if (Op1->hasOneUse() && + match(Op1, m_Xor(m_Value(A), m_ConstantInt(C1))) && + MaskedValueIsZero(Op0, C1->getValue())) { + Value *NOr = Builder->CreateOr(A, Op0); + NOr->takeName(Op0); + return BinaryOperator::CreateXor(NOr, C1); + } + + // (A & C)|(B & D) + Value *C = 0, *D = 0; + if (match(Op0, m_And(m_Value(A), m_Value(C))) && + match(Op1, m_And(m_Value(B), m_Value(D)))) { + Value *V1 = 0, *V2 = 0; + C1 = dyn_cast<ConstantInt>(C); + C2 = dyn_cast<ConstantInt>(D); + if (C1 && C2) { // (A & C1)|(B & C2) + // If we have: ((V + N) & C1) | (V & C2) + // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 + // replace with V+N. + if (C1->getValue() == ~C2->getValue()) { + if ((C2->getValue() & (C2->getValue()+1)) == 0 && // C2 == 0+1+ + match(A, m_Add(m_Value(V1), m_Value(V2)))) { + // Add commutes, try both ways. + if (V1 == B && MaskedValueIsZero(V2, C2->getValue())) + return ReplaceInstUsesWith(I, A); + if (V2 == B && MaskedValueIsZero(V1, C2->getValue())) + return ReplaceInstUsesWith(I, A); + } + // Or commutes, try both ways. + if ((C1->getValue() & (C1->getValue()+1)) == 0 && + match(B, m_Add(m_Value(V1), m_Value(V2)))) { + // Add commutes, try both ways. 
+ if (V1 == A && MaskedValueIsZero(V2, C1->getValue())) + return ReplaceInstUsesWith(I, B); + if (V2 == A && MaskedValueIsZero(V1, C1->getValue())) + return ReplaceInstUsesWith(I, B); + } + } + + if ((C1->getValue() & C2->getValue()) == 0) { + // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2) + // iff (C1&C2) == 0 and (N&~C1) == 0 + if (match(A, m_Or(m_Value(V1), m_Value(V2))) && + ((V1 == B && MaskedValueIsZero(V2, ~C1->getValue())) || // (V|N) + (V2 == B && MaskedValueIsZero(V1, ~C1->getValue())))) // (N|V) + return BinaryOperator::CreateAnd(A, + ConstantInt::get(A->getContext(), + C1->getValue()|C2->getValue())); + // Or commutes, try both ways. + if (match(B, m_Or(m_Value(V1), m_Value(V2))) && + ((V1 == A && MaskedValueIsZero(V2, ~C2->getValue())) || // (V|N) + (V2 == A && MaskedValueIsZero(V1, ~C2->getValue())))) // (N|V) + return BinaryOperator::CreateAnd(B, + ConstantInt::get(B->getContext(), + C1->getValue()|C2->getValue())); + + // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2) + // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0. + ConstantInt *C3 = 0, *C4 = 0; + if (match(A, m_Or(m_Value(V1), m_ConstantInt(C3))) && + (C3->getValue() & ~C1->getValue()) == 0 && + match(B, m_Or(m_Specific(V1), m_ConstantInt(C4))) && + (C4->getValue() & ~C2->getValue()) == 0) { + V2 = Builder->CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield"); + return BinaryOperator::CreateAnd(V2, + ConstantInt::get(B->getContext(), + C1->getValue()|C2->getValue())); + } + } + } + + // (A & (C0?-1:0)) | (B & ~(C0?-1:0)) -> C0 ? A : B, and commuted variants. + // Don't do this for vector select idioms, the code generator doesn't handle + // them well yet. 
    // Try select-formation on all four operand orderings;
    // MatchSelectFromAndOr looks for the (A & (C0?-1:0)) | (B & ~(C0?-1:0))
    // idiom described above.  Skipped for vectors because the code generator
    // doesn't handle vector selects well yet.
    if (!I.getType()->isVectorTy()) {
      if (Instruction *Match = MatchSelectFromAndOr(A, B, C, D))
        return Match;
      if (Instruction *Match = MatchSelectFromAndOr(B, A, D, C))
        return Match;
      if (Instruction *Match = MatchSelectFromAndOr(C, B, A, D))
        return Match;
      if (Instruction *Match = MatchSelectFromAndOr(D, A, B, C))
        return Match;
    }

    // At this point Op0 = A&C and Op1 = B&D (matched by the enclosing if);
    // recognize the four commuted spellings of (X & ~Y) | (~X & Y) -> X^Y.
    // ((A&~B)|(~A&B)) -> A^B
    if ((match(C, m_Not(m_Specific(D))) &&
         match(B, m_Not(m_Specific(A)))))
      return BinaryOperator::CreateXor(A, D);
    // ((~B&A)|(~A&B)) -> A^B
    if ((match(A, m_Not(m_Specific(D))) &&
         match(B, m_Not(m_Specific(C)))))
      return BinaryOperator::CreateXor(C, D);
    // ((A&~B)|(B&~A)) -> A^B
    if ((match(C, m_Not(m_Specific(B))) &&
         match(D, m_Not(m_Specific(A)))))
      return BinaryOperator::CreateXor(A, B);
    // ((~B&A)|(B&~A)) -> A^B
    if ((match(A, m_Not(m_Specific(B))) &&
         match(D, m_Not(m_Specific(C)))))
      return BinaryOperator::CreateXor(C, B);

    // ((A|B)&1)|(B&-2) -> (A&1) | B
    // FoldOrWithConstants verifies that the two mask constants xor to -1.
    if (match(A, m_Or(m_Value(V1), m_Specific(B))) ||
        match(A, m_Or(m_Specific(B), m_Value(V1)))) {
      Instruction *Ret = FoldOrWithConstants(I, Op1, V1, B, C);
      if (Ret) return Ret;
    }
    // (B&-2)|((A|B)&1) -> (A&1) | B
    if (match(B, m_Or(m_Specific(A), m_Value(V1))) ||
        match(B, m_Or(m_Value(V1), m_Specific(A)))) {
      Instruction *Ret = FoldOrWithConstants(I, Op0, A, V1, D);
      if (Ret) return Ret;
    }
  }

  // (X >> Z) | (Y >> Z)  -> (X|Y) >> Z  for all shifts.
+ if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) { + if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0)) + if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() && + SI0->getOperand(1) == SI1->getOperand(1) && + (SI0->hasOneUse() || SI1->hasOneUse())) { + Value *NewOp = Builder->CreateOr(SI0->getOperand(0), SI1->getOperand(0), + SI0->getName()); + return BinaryOperator::Create(SI1->getOpcode(), NewOp, + SI1->getOperand(1)); + } + } + + // (~A | ~B) == (~(A & B)) - De Morgan's Law + if (Value *Op0NotVal = dyn_castNotVal(Op0)) + if (Value *Op1NotVal = dyn_castNotVal(Op1)) + if (Op0->hasOneUse() && Op1->hasOneUse()) { + Value *And = Builder->CreateAnd(Op0NotVal, Op1NotVal, + I.getName()+".demorgan"); + return BinaryOperator::CreateNot(And); + } + + if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1))) + if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0))) + if (Value *Res = FoldOrOfICmps(LHS, RHS)) + return ReplaceInstUsesWith(I, Res); + + // (fcmp uno x, c) | (fcmp uno y, c) -> (fcmp uno x, y) + if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0))) + if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1))) + if (Value *Res = FoldOrOfFCmps(LHS, RHS)) + return ReplaceInstUsesWith(I, Res); + + // fold (or (cast A), (cast B)) -> (cast (or A, B)) + if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) { + CastInst *Op1C = dyn_cast<CastInst>(Op1); + if (Op1C && Op0C->getOpcode() == Op1C->getOpcode()) {// same cast kind ? + const Type *SrcTy = Op0C->getOperand(0)->getType(); + if (SrcTy == Op1C->getOperand(0)->getType() && + SrcTy->isIntOrIntVectorTy()) { + Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0); + + if ((!isa<ICmpInst>(Op0COp) || !isa<ICmpInst>(Op1COp)) && + // Only do this if the casts both really cause code to be + // generated. 
+ ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) && + ShouldOptimizeCast(Op1C->getOpcode(), Op1COp, I.getType())) { + Value *NewOp = Builder->CreateOr(Op0COp, Op1COp, I.getName()); + return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType()); + } + + // If this is or(cast(icmp), cast(icmp)), try to fold this even if the + // cast is otherwise not optimizable. This happens for vector sexts. + if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1COp)) + if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0COp)) + if (Value *Res = FoldOrOfICmps(LHS, RHS)) + return CastInst::Create(Op0C->getOpcode(), Res, I.getType()); + + // If this is or(cast(fcmp), cast(fcmp)), try to fold this even if the + // cast is otherwise not optimizable. This happens for vector sexts. + if (FCmpInst *RHS = dyn_cast<FCmpInst>(Op1COp)) + if (FCmpInst *LHS = dyn_cast<FCmpInst>(Op0COp)) + if (Value *Res = FoldOrOfFCmps(LHS, RHS)) + return CastInst::Create(Op0C->getOpcode(), Res, I.getType()); + } + } + } + + // Note: If we've gotten to the point of visiting the outer OR, then the + // inner one couldn't be simplified. If it was a constant, then it won't + // be simplified by a later pass either, so we try swapping the inner/outer + // ORs in the hopes that we'll be able to simplify it this way. + // (X|C) | V --> (X|V) | C + if (Op0->hasOneUse() && !isa<ConstantInt>(Op1) && + match(Op0, m_Or(m_Value(A), m_ConstantInt(C1)))) { + Value *Inner = Builder->CreateOr(A, Op1); + Inner->takeName(Op0); + return BinaryOperator::CreateOr(Inner, C1); + } + + return Changed ? 
&I : 0; +} + +Instruction *InstCombiner::visitXor(BinaryOperator &I) { + bool Changed = SimplifyAssociativeOrCommutative(I); + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + if (Value *V = SimplifyXorInst(Op0, Op1, TD)) + return ReplaceInstUsesWith(I, V); + + // (A&B)^(A&C) -> A&(B^C) etc + if (Value *V = SimplifyUsingDistributiveLaws(I)) + return ReplaceInstUsesWith(I, V); + + // See if we can simplify any instructions used by the instruction whose sole + // purpose is to compute bits we don't care about. + if (SimplifyDemandedInstructionBits(I)) + return &I; + + // Is this a ~ operation? + if (Value *NotOp = dyn_castNotVal(&I)) { + if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(NotOp)) { + if (Op0I->getOpcode() == Instruction::And || + Op0I->getOpcode() == Instruction::Or) { + // ~(~X & Y) --> (X | ~Y) - De Morgan's Law + // ~(~X | Y) === (X & ~Y) - De Morgan's Law + if (dyn_castNotVal(Op0I->getOperand(1))) + Op0I->swapOperands(); + if (Value *Op0NotVal = dyn_castNotVal(Op0I->getOperand(0))) { + Value *NotY = + Builder->CreateNot(Op0I->getOperand(1), + Op0I->getOperand(1)->getName()+".not"); + if (Op0I->getOpcode() == Instruction::And) + return BinaryOperator::CreateOr(Op0NotVal, NotY); + return BinaryOperator::CreateAnd(Op0NotVal, NotY); + } + + // ~(X & Y) --> (~X | ~Y) - De Morgan's Law + // ~(X | Y) === (~X & ~Y) - De Morgan's Law + if (isFreeToInvert(Op0I->getOperand(0)) && + isFreeToInvert(Op0I->getOperand(1))) { + Value *NotX = + Builder->CreateNot(Op0I->getOperand(0), "notlhs"); + Value *NotY = + Builder->CreateNot(Op0I->getOperand(1), "notrhs"); + if (Op0I->getOpcode() == Instruction::And) + return BinaryOperator::CreateOr(NotX, NotY); + return BinaryOperator::CreateAnd(NotX, NotY); + } + + } else if (Op0I->getOpcode() == Instruction::AShr) { + // ~(~X >>s Y) --> (X >>s Y) + if (Value *Op0NotVal = dyn_castNotVal(Op0I->getOperand(0))) + return BinaryOperator::CreateAShr(Op0NotVal, Op0I->getOperand(1)); + } + } + } + + + if (ConstantInt 
*RHS = dyn_cast<ConstantInt>(Op1)) { + if (RHS->isOne() && Op0->hasOneUse()) + // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B + if (CmpInst *CI = dyn_cast<CmpInst>(Op0)) + return CmpInst::Create(CI->getOpcode(), + CI->getInversePredicate(), + CI->getOperand(0), CI->getOperand(1)); + + // fold (xor(zext(cmp)), 1) and (xor(sext(cmp)), -1) to ext(!cmp). + if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) { + if (CmpInst *CI = dyn_cast<CmpInst>(Op0C->getOperand(0))) { + if (CI->hasOneUse() && Op0C->hasOneUse()) { + Instruction::CastOps Opcode = Op0C->getOpcode(); + if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) && + (RHS == ConstantExpr::getCast(Opcode, + ConstantInt::getTrue(I.getContext()), + Op0C->getDestTy()))) { + CI->setPredicate(CI->getInversePredicate()); + return CastInst::Create(Opcode, CI, Op0C->getType()); + } + } + } + } + + if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) { + // ~(c-X) == X-c-1 == X+(-c-1) + if (Op0I->getOpcode() == Instruction::Sub && RHS->isAllOnesValue()) + if (Constant *Op0I0C = dyn_cast<Constant>(Op0I->getOperand(0))) { + Constant *NegOp0I0C = ConstantExpr::getNeg(Op0I0C); + Constant *ConstantRHS = ConstantExpr::getSub(NegOp0I0C, + ConstantInt::get(I.getType(), 1)); + return BinaryOperator::CreateAdd(Op0I->getOperand(1), ConstantRHS); + } + + if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) { + if (Op0I->getOpcode() == Instruction::Add) { + // ~(X-c) --> (-c-1)-X + if (RHS->isAllOnesValue()) { + Constant *NegOp0CI = ConstantExpr::getNeg(Op0CI); + return BinaryOperator::CreateSub( + ConstantExpr::getSub(NegOp0CI, + ConstantInt::get(I.getType(), 1)), + Op0I->getOperand(0)); + } else if (RHS->getValue().isSignBit()) { + // (X + C) ^ signbit -> (X + C + signbit) + Constant *C = ConstantInt::get(I.getContext(), + RHS->getValue() + Op0CI->getValue()); + return BinaryOperator::CreateAdd(Op0I->getOperand(0), C); + + } + } else if (Op0I->getOpcode() == Instruction::Or) { + // (X|C1)^C2 -> 
X^(C1|C2) iff X&~C1 == 0 + if (MaskedValueIsZero(Op0I->getOperand(0), Op0CI->getValue())) { + Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHS); + // Anything in both C1 and C2 is known to be zero, remove it from + // NewRHS. + Constant *CommonBits = ConstantExpr::getAnd(Op0CI, RHS); + NewRHS = ConstantExpr::getAnd(NewRHS, + ConstantExpr::getNot(CommonBits)); + Worklist.Add(Op0I); + I.setOperand(0, Op0I->getOperand(0)); + I.setOperand(1, NewRHS); + return &I; + } + } + } + } + + // Try to fold constant and into select arguments. + if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) + if (Instruction *R = FoldOpIntoSelect(I, SI)) + return R; + if (isa<PHINode>(Op0)) + if (Instruction *NV = FoldOpIntoPhi(I)) + return NV; + } + + BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1); + if (Op1I) { + Value *A, *B; + if (match(Op1I, m_Or(m_Value(A), m_Value(B)))) { + if (A == Op0) { // B^(B|A) == (A|B)^B + Op1I->swapOperands(); + I.swapOperands(); + std::swap(Op0, Op1); + } else if (B == Op0) { // B^(A|B) == (A|B)^B + I.swapOperands(); // Simplified below. + std::swap(Op0, Op1); + } + } else if (match(Op1I, m_And(m_Value(A), m_Value(B))) && + Op1I->hasOneUse()){ + if (A == Op0) { // A^(A&B) -> A^(B&A) + Op1I->swapOperands(); + std::swap(A, B); + } + if (B == Op0) { // A^(B&A) -> (B&A)^A + I.swapOperands(); // Simplified below. 
+ std::swap(Op0, Op1); + } + } + } + + BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0); + if (Op0I) { + Value *A, *B; + if (match(Op0I, m_Or(m_Value(A), m_Value(B))) && + Op0I->hasOneUse()) { + if (A == Op1) // (B|A)^B == (A|B)^B + std::swap(A, B); + if (B == Op1) // (A|B)^B == A & ~B + return BinaryOperator::CreateAnd(A, Builder->CreateNot(Op1, "tmp")); + } else if (match(Op0I, m_And(m_Value(A), m_Value(B))) && + Op0I->hasOneUse()){ + if (A == Op1) // (A&B)^A -> (B&A)^A + std::swap(A, B); + if (B == Op1 && // (B&A)^A == ~B & A + !isa<ConstantInt>(Op1)) { // Canonical form is (B&C)^C + return BinaryOperator::CreateAnd(Builder->CreateNot(A, "tmp"), Op1); + } + } + } + + // (X >> Z) ^ (Y >> Z) -> (X^Y) >> Z for all shifts. + if (Op0I && Op1I && Op0I->isShift() && + Op0I->getOpcode() == Op1I->getOpcode() && + Op0I->getOperand(1) == Op1I->getOperand(1) && + (Op1I->hasOneUse() || Op1I->hasOneUse())) { + Value *NewOp = + Builder->CreateXor(Op0I->getOperand(0), Op1I->getOperand(0), + Op0I->getName()); + return BinaryOperator::Create(Op1I->getOpcode(), NewOp, + Op1I->getOperand(1)); + } + + if (Op0I && Op1I) { + Value *A, *B, *C, *D; + // (A & B)^(A | B) -> A ^ B + if (match(Op0I, m_And(m_Value(A), m_Value(B))) && + match(Op1I, m_Or(m_Value(C), m_Value(D)))) { + if ((A == C && B == D) || (A == D && B == C)) + return BinaryOperator::CreateXor(A, B); + } + // (A | B)^(A & B) -> A ^ B + if (match(Op0I, m_Or(m_Value(A), m_Value(B))) && + match(Op1I, m_And(m_Value(C), m_Value(D)))) { + if ((A == C && B == D) || (A == D && B == C)) + return BinaryOperator::CreateXor(A, B); + } + } + + // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B) + if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1))) + if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0))) + if (PredicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) { + if (LHS->getOperand(0) == RHS->getOperand(1) && + LHS->getOperand(1) == RHS->getOperand(0)) + LHS->swapOperands(); + if (LHS->getOperand(0) == 
RHS->getOperand(0) && + LHS->getOperand(1) == RHS->getOperand(1)) { + Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1); + unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS); + bool isSigned = LHS->isSigned() || RHS->isSigned(); + return ReplaceInstUsesWith(I, + getICmpValue(isSigned, Code, Op0, Op1, Builder)); + } + } + + // fold (xor (cast A), (cast B)) -> (cast (xor A, B)) + if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) { + if (CastInst *Op1C = dyn_cast<CastInst>(Op1)) + if (Op0C->getOpcode() == Op1C->getOpcode()) { // same cast kind? + const Type *SrcTy = Op0C->getOperand(0)->getType(); + if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntegerTy() && + // Only do this if the casts both really cause code to be generated. + ShouldOptimizeCast(Op0C->getOpcode(), Op0C->getOperand(0), + I.getType()) && + ShouldOptimizeCast(Op1C->getOpcode(), Op1C->getOperand(0), + I.getType())) { + Value *NewOp = Builder->CreateXor(Op0C->getOperand(0), + Op1C->getOperand(0), I.getName()); + return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType()); + } + } + } + + return Changed ? &I : 0; +} diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp new file mode 100644 index 0000000..8449f7b --- /dev/null +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -0,0 +1,1249 @@ +//===- InstCombineCalls.cpp -----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the visitCall and visitInvoke functions. 
//
//===----------------------------------------------------------------------===//

#include "InstCombine.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

/// getPromotedType - Return the specified type promoted as it would be to pass
/// through a va_arg area: integer types narrower than 32 bits are widened to
/// i32; everything else is returned unchanged.
static const Type *getPromotedType(const Type *Ty) {
  if (const IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
    if (ITy->getBitWidth() < 32)
      return Type::getInt32Ty(Ty->getContext());
  }
  return Ty;
}


/// SimplifyMemTransfer - Improve a memcpy/memmove's declared alignment when a
/// larger one can be proven, and turn small constant-length transfers into a
/// single load+store pair.  Returns the (modified) instruction to requeue, or
/// 0 if no change was made.
Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
  unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), TD);
  unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), TD);
  unsigned MinAlign = std::min(DstAlign, SrcAlign);
  unsigned CopyAlign = MI->getAlignment();

  // If we can prove better alignment than the intrinsic declares, record it.
  if (CopyAlign < MinAlign) {
    MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
                                      MinAlign, false));
    return MI;
  }

  // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
  // load/store.  Non-constant lengths cannot be lowered here.
  ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2));
  if (MemOpLength == 0) return 0;

  // Source and destination pointer types are always "i8*" for intrinsic.  See
  // if the size is something we can handle with a single primitive load/store.
  // A single load+store correctly handles overlapping memory in the memmove
  // case.
  unsigned Size = MemOpLength->getZExtValue();
  if (Size == 0) return MI;  // Delete this mem transfer.

  // (Size & (Size-1)) != 0 rejects non-powers-of-two.
  if (Size > 8 || (Size&(Size-1)))
    return 0;  // If not 1/2/4/8 bytes, exit.

  // Use an integer load+store unless we can find something better.
+ unsigned SrcAddrSp = + cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace(); + unsigned DstAddrSp = + cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace(); + + const IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3); + Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp); + Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp); + + // Memcpy forces the use of i8* for the source and destination. That means + // that if you're using memcpy to move one double around, you'll get a cast + // from double* to i8*. We'd much rather use a double load+store rather than + // an i64 load+store, here because this improves the odds that the source or + // dest address will be promotable. See if we can find a better type than the + // integer datatype. + Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts(); + if (StrippedDest != MI->getArgOperand(0)) { + const Type *SrcETy = cast<PointerType>(StrippedDest->getType()) + ->getElementType(); + if (TD && SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) { + // The SrcETy might be something like {{{double}}} or [1 x double]. Rip + // down through these levels if so. + while (!SrcETy->isSingleValueType()) { + if (const StructType *STy = dyn_cast<StructType>(SrcETy)) { + if (STy->getNumElements() == 1) + SrcETy = STy->getElementType(0); + else + break; + } else if (const ArrayType *ATy = dyn_cast<ArrayType>(SrcETy)) { + if (ATy->getNumElements() == 1) + SrcETy = ATy->getElementType(); + else + break; + } else + break; + } + + if (SrcETy->isSingleValueType()) { + NewSrcPtrTy = PointerType::get(SrcETy, SrcAddrSp); + NewDstPtrTy = PointerType::get(SrcETy, DstAddrSp); + } + } + } + + + // If the memcpy/memmove provides better alignment info than we can + // infer, use it. 
  // The intrinsic's declared alignment may exceed what we could infer; keep
  // the larger value for the generated load/store.
  SrcAlign = std::max(SrcAlign, CopyAlign);
  DstAlign = std::max(DstAlign, CopyAlign);

  Value *Src = Builder->CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
  Value *Dest = Builder->CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
  Instruction *L = new LoadInst(Src, "tmp", MI->isVolatile(), SrcAlign);
  InsertNewInstBefore(L, *MI);
  InsertNewInstBefore(new StoreInst(L, Dest, MI->isVolatile(), DstAlign),
                      *MI);

  // Set the size of the copy to 0, it will be deleted on the next iteration.
  MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
  return MI;
}

/// SimplifyMemSet - Improve a memset's declared alignment when a larger one
/// can be proven, and turn small constant-length fills into a single store of
/// a replicated integer.  Returns the (modified) instruction to requeue, or 0
/// if no change was made.
Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
  unsigned Alignment = getKnownAlignment(MI->getDest(), TD);
  if (MI->getAlignment() < Alignment) {
    MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
                                      Alignment, false));
    return MI;
  }

  // Extract the length and alignment and fill if they are constant.
  // The fill value must be an i8 to be a well-formed memset.
  ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
  ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
  if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
    return 0;
  uint64_t Len = LenC->getZExtValue();
  Alignment = MI->getAlignment();

  // If the length is zero, this is a no-op
  if (Len == 0) return MI;  // memset(d,c,0,a) -> noop

  // memset(s,c,n) -> store s, c (for n=1,2,4,8)
  if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
    const Type *ITy = IntegerType::get(MI->getContext(), Len*8);  // n=1 -> i8.

    // Store through an integer pointer of the fill width, in the original
    // address space.
    Value *Dest = MI->getDest();
    unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
    Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
    Dest = Builder->CreateBitCast(Dest, NewDstPtrTy);

    // Alignment 0 is identity for alignment 1 for memset, but not store.
    if (Alignment == 0) Alignment = 1;

    // Extract the fill value and store.
+ uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL; + InsertNewInstBefore(new StoreInst(ConstantInt::get(ITy, Fill), + Dest, false, Alignment), *MI); + + // Set the size of the copy to 0, it will be deleted on the next iteration. + MI->setLength(Constant::getNullValue(LenC->getType())); + return MI; + } + + return 0; +} + +/// visitCallInst - CallInst simplification. This mostly only handles folding +/// of intrinsic instructions. For normal calls, it allows visitCallSite to do +/// the heavy lifting. +/// +Instruction *InstCombiner::visitCallInst(CallInst &CI) { + if (isFreeCall(&CI)) + return visitFree(CI); + if (isMalloc(&CI)) + return visitMalloc(CI); + + // If the caller function is nounwind, mark the call as nounwind, even if the + // callee isn't. + if (CI.getParent()->getParent()->doesNotThrow() && + !CI.doesNotThrow()) { + CI.setDoesNotThrow(); + return &CI; + } + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI); + if (!II) return visitCallSite(&CI); + + // Intrinsics cannot occur in an invoke, so handle them here instead of in + // visitCallSite. + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) { + bool Changed = false; + + // memmove/cpy/set of zero bytes is a noop. + if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) { + if (NumBytes->isNullValue()) + return EraseInstFromFunction(CI); + + if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) + if (CI->getZExtValue() == 1) { + // Replace the instruction with just byte operations. We would + // transform other cases to loads/stores, but we don't know if + // alignment is sufficient. + } + } + + // No other transformations apply to volatile transfers. + if (MI->isVolatile()) + return 0; + + // If we have a memmove and the source operation is a constant global, + // then the source and dest pointers can't alias, so we can change this + // into a call to memcpy. 
+ if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) { + if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource())) + if (GVSrc->isConstant()) { + Module *M = CI.getParent()->getParent()->getParent(); + Intrinsic::ID MemCpyID = Intrinsic::memcpy; + const Type *Tys[3] = { CI.getArgOperand(0)->getType(), + CI.getArgOperand(1)->getType(), + CI.getArgOperand(2)->getType() }; + CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys, 3)); + Changed = true; + } + } + + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { + // memmove(x,x,size) -> noop. + if (MTI->getSource() == MTI->getDest()) + return EraseInstFromFunction(CI); + } + + // If we can determine a pointer alignment that is bigger than currently + // set, update the alignment. + if (isa<MemTransferInst>(MI)) { + if (Instruction *I = SimplifyMemTransfer(MI)) + return I; + } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) { + if (Instruction *I = SimplifyMemSet(MSI)) + return I; + } + + if (Changed) return II; + } + + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::objectsize: { + // We need target data for just about everything so depend on it. + if (!TD) break; + + const Type *ReturnTy = CI.getType(); + uint64_t DontKnow = II->getArgOperand(1) == Builder->getTrue() ? 0 : -1ULL; + + // Get to the real allocated thing and offset as fast as possible. + Value *Op1 = II->getArgOperand(0)->stripPointerCasts(); + + uint64_t Offset = 0; + uint64_t Size = -1ULL; + + // Try to look through constant GEPs. + if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1)) { + if (!GEP->hasAllConstantIndices()) break; + + // Get the current byte offset into the thing. Use the original + // operand in case we're looking through a bitcast. 
+ SmallVector<Value*, 8> Ops(GEP->idx_begin(), GEP->idx_end()); + Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), + Ops.data(), Ops.size()); + + Op1 = GEP->getPointerOperand()->stripPointerCasts(); + + // Make sure we're not a constant offset from an external + // global. + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op1)) + if (!GV->hasDefinitiveInitializer()) break; + } + + // If we've stripped down to a single global variable that we + // can know the size of then just return that. + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op1)) { + if (GV->hasDefinitiveInitializer()) { + Constant *C = GV->getInitializer(); + Size = TD->getTypeAllocSize(C->getType()); + } else { + // Can't determine size of the GV. + Constant *RetVal = ConstantInt::get(ReturnTy, DontKnow); + return ReplaceInstUsesWith(CI, RetVal); + } + } else if (AllocaInst *AI = dyn_cast<AllocaInst>(Op1)) { + // Get alloca size. + if (AI->getAllocatedType()->isSized()) { + Size = TD->getTypeAllocSize(AI->getAllocatedType()); + if (AI->isArrayAllocation()) { + const ConstantInt *C = dyn_cast<ConstantInt>(AI->getArraySize()); + if (!C) break; + Size *= C->getZExtValue(); + } + } + } else if (CallInst *MI = extractMallocCall(Op1)) { + // Get allocation size. + const Type* MallocType = getMallocAllocatedType(MI); + if (MallocType && MallocType->isSized()) + if (Value *NElems = getMallocArraySize(MI, TD, true)) + if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems)) + Size = NElements->getZExtValue() * TD->getTypeAllocSize(MallocType); + } + + // Do not return "I don't know" here. Later optimization passes could + // make it possible to evaluate objectsize to a constant. + if (Size == -1ULL) + break; + + if (Size < Offset) { + // Out of bound reference? Negative index normalized to large + // index? Just return "I don't know". 
+ return ReplaceInstUsesWith(CI, ConstantInt::get(ReturnTy, DontKnow)); + } + return ReplaceInstUsesWith(CI, ConstantInt::get(ReturnTy, Size-Offset)); + } + case Intrinsic::bswap: + // bswap(bswap(x)) -> x + if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) + if (Operand->getIntrinsicID() == Intrinsic::bswap) + return ReplaceInstUsesWith(CI, Operand->getArgOperand(0)); + + // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) + if (TruncInst *TI = dyn_cast<TruncInst>(II->getArgOperand(0))) { + if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(TI->getOperand(0))) + if (Operand->getIntrinsicID() == Intrinsic::bswap) { + unsigned C = Operand->getType()->getPrimitiveSizeInBits() - + TI->getType()->getPrimitiveSizeInBits(); + Value *CV = ConstantInt::get(Operand->getType(), C); + Value *V = Builder->CreateLShr(Operand->getArgOperand(0), CV); + return new TruncInst(V, TI->getType()); + } + } + + break; + case Intrinsic::powi: + if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + // powi(x, 0) -> 1.0 + if (Power->isZero()) + return ReplaceInstUsesWith(CI, ConstantFP::get(CI.getType(), 1.0)); + // powi(x, 1) -> x + if (Power->isOne()) + return ReplaceInstUsesWith(CI, II->getArgOperand(0)); + // powi(x, -1) -> 1/x + if (Power->isAllOnesValue()) + return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0), + II->getArgOperand(0)); + } + break; + case Intrinsic::cttz: { + // If all bits below the first known one are known zero, + // this value is constant. 
+ const IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType()); + uint32_t BitWidth = IT->getBitWidth(); + APInt KnownZero(BitWidth, 0); + APInt KnownOne(BitWidth, 0); + ComputeMaskedBits(II->getArgOperand(0), APInt::getAllOnesValue(BitWidth), + KnownZero, KnownOne); + unsigned TrailingZeros = KnownOne.countTrailingZeros(); + APInt Mask(APInt::getLowBitsSet(BitWidth, TrailingZeros)); + if ((Mask & KnownZero) == Mask) + return ReplaceInstUsesWith(CI, ConstantInt::get(IT, + APInt(BitWidth, TrailingZeros))); + + } + break; + case Intrinsic::ctlz: { + // If all bits above the first known one are known zero, + // this value is constant. + const IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType()); + uint32_t BitWidth = IT->getBitWidth(); + APInt KnownZero(BitWidth, 0); + APInt KnownOne(BitWidth, 0); + ComputeMaskedBits(II->getArgOperand(0), APInt::getAllOnesValue(BitWidth), + KnownZero, KnownOne); + unsigned LeadingZeros = KnownOne.countLeadingZeros(); + APInt Mask(APInt::getHighBitsSet(BitWidth, LeadingZeros)); + if ((Mask & KnownZero) == Mask) + return ReplaceInstUsesWith(CI, ConstantInt::get(IT, + APInt(BitWidth, LeadingZeros))); + + } + break; + case Intrinsic::uadd_with_overflow: { + Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); + const IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType()); + uint32_t BitWidth = IT->getBitWidth(); + APInt Mask = APInt::getSignBit(BitWidth); + APInt LHSKnownZero(BitWidth, 0); + APInt LHSKnownOne(BitWidth, 0); + ComputeMaskedBits(LHS, Mask, LHSKnownZero, LHSKnownOne); + bool LHSKnownNegative = LHSKnownOne[BitWidth - 1]; + bool LHSKnownPositive = LHSKnownZero[BitWidth - 1]; + + if (LHSKnownNegative || LHSKnownPositive) { + APInt RHSKnownZero(BitWidth, 0); + APInt RHSKnownOne(BitWidth, 0); + ComputeMaskedBits(RHS, Mask, RHSKnownZero, RHSKnownOne); + bool RHSKnownNegative = RHSKnownOne[BitWidth - 1]; + bool RHSKnownPositive = RHSKnownZero[BitWidth - 1]; + if 
(LHSKnownNegative && RHSKnownNegative) { + // The sign bit is set in both cases: this MUST overflow. + // Create a simple add instruction, and insert it into the struct. + Instruction *Add = BinaryOperator::CreateAdd(LHS, RHS, "", &CI); + Worklist.Add(Add); + Constant *V[] = { + UndefValue::get(LHS->getType()),ConstantInt::getTrue(II->getContext()) + }; + Constant *Struct = ConstantStruct::get(II->getContext(), V, 2, false); + return InsertValueInst::Create(Struct, Add, 0); + } + + if (LHSKnownPositive && RHSKnownPositive) { + // The sign bit is clear in both cases: this CANNOT overflow. + // Create a simple add instruction, and insert it into the struct. + Instruction *Add = BinaryOperator::CreateNUWAdd(LHS, RHS, "", &CI); + Worklist.Add(Add); + Constant *V[] = { + UndefValue::get(LHS->getType()), + ConstantInt::getFalse(II->getContext()) + }; + Constant *Struct = ConstantStruct::get(II->getContext(), V, 2, false); + return InsertValueInst::Create(Struct, Add, 0); + } + } + } + // FALL THROUGH uadd into sadd + case Intrinsic::sadd_with_overflow: + // Canonicalize constants into the RHS. 
+ if (isa<Constant>(II->getArgOperand(0)) && + !isa<Constant>(II->getArgOperand(1))) { + Value *LHS = II->getArgOperand(0); + II->setArgOperand(0, II->getArgOperand(1)); + II->setArgOperand(1, LHS); + return II; + } + + // X + undef -> undef + if (isa<UndefValue>(II->getArgOperand(1))) + return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); + + if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + // X + 0 -> {X, false} + if (RHS->isZero()) { + Constant *V[] = { + UndefValue::get(II->getArgOperand(0)->getType()), + ConstantInt::getFalse(II->getContext()) + }; + Constant *Struct = ConstantStruct::get(II->getContext(), V, 2, false); + return InsertValueInst::Create(Struct, II->getArgOperand(0), 0); + } + } + break; + case Intrinsic::usub_with_overflow: + case Intrinsic::ssub_with_overflow: + // undef - X -> undef + // X - undef -> undef + if (isa<UndefValue>(II->getArgOperand(0)) || + isa<UndefValue>(II->getArgOperand(1))) + return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); + + if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + // X - 0 -> {X, false} + if (RHS->isZero()) { + Constant *V[] = { + UndefValue::get(II->getArgOperand(0)->getType()), + ConstantInt::getFalse(II->getContext()) + }; + Constant *Struct = ConstantStruct::get(II->getContext(), V, 2, false); + return InsertValueInst::Create(Struct, II->getArgOperand(0), 0); + } + } + break; + case Intrinsic::umul_with_overflow: + case Intrinsic::smul_with_overflow: + // Canonicalize constants into the RHS. 
+ if (isa<Constant>(II->getArgOperand(0)) && + !isa<Constant>(II->getArgOperand(1))) { + Value *LHS = II->getArgOperand(0); + II->setArgOperand(0, II->getArgOperand(1)); + II->setArgOperand(1, LHS); + return II; + } + + // X * undef -> undef + if (isa<UndefValue>(II->getArgOperand(1))) + return ReplaceInstUsesWith(CI, UndefValue::get(II->getType())); + + if (ConstantInt *RHSI = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + // X*0 -> {0, false} + if (RHSI->isZero()) + return ReplaceInstUsesWith(CI, Constant::getNullValue(II->getType())); + + // X * 1 -> {X, false} + if (RHSI->equalsInt(1)) { + Constant *V[] = { + UndefValue::get(II->getArgOperand(0)->getType()), + ConstantInt::getFalse(II->getContext()) + }; + Constant *Struct = ConstantStruct::get(II->getContext(), V, 2, false); + return InsertValueInst::Create(Struct, II->getArgOperand(0), 0); + } + } + break; + case Intrinsic::ppc_altivec_lvx: + case Intrinsic::ppc_altivec_lvxl: + case Intrinsic::x86_sse_loadu_ps: + case Intrinsic::x86_sse2_loadu_pd: + case Intrinsic::x86_sse2_loadu_dq: + // Turn PPC lvx -> load if the pointer is known aligned. + // Turn X86 loadups -> load if the pointer is known aligned. + if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, TD) >= 16) { + Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), + PointerType::getUnqual(II->getType())); + return new LoadInst(Ptr); + } + break; + case Intrinsic::ppc_altivec_stvx: + case Intrinsic::ppc_altivec_stvxl: + // Turn stvx -> store if the pointer is known aligned. + if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, TD) >= 16) { + const Type *OpPtrTy = + PointerType::getUnqual(II->getArgOperand(0)->getType()); + Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy); + return new StoreInst(II->getArgOperand(0), Ptr); + } + break; + case Intrinsic::x86_sse_storeu_ps: + case Intrinsic::x86_sse2_storeu_pd: + case Intrinsic::x86_sse2_storeu_dq: + // Turn X86 storeu -> store if the pointer is known aligned. 
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, TD) >= 16) { + const Type *OpPtrTy = + PointerType::getUnqual(II->getArgOperand(1)->getType()); + Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy); + return new StoreInst(II->getArgOperand(1), Ptr); + } + break; + + case Intrinsic::x86_sse_cvtss2si: + case Intrinsic::x86_sse_cvtss2si64: + case Intrinsic::x86_sse_cvttss2si: + case Intrinsic::x86_sse_cvttss2si64: + case Intrinsic::x86_sse2_cvtsd2si: + case Intrinsic::x86_sse2_cvtsd2si64: + case Intrinsic::x86_sse2_cvttsd2si: + case Intrinsic::x86_sse2_cvttsd2si64: { + // These intrinsics only demand the 0th element of their input vectors. If + // we can simplify the input based on that, do so now. + unsigned VWidth = + cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements(); + APInt DemandedElts(VWidth, 1); + APInt UndefElts(VWidth, 0); + if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0), + DemandedElts, UndefElts)) { + II->setArgOperand(0, V); + return II; + } + break; + } + + case Intrinsic::ppc_altivec_vperm: + // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant. + if (ConstantVector *Mask = dyn_cast<ConstantVector>(II->getArgOperand(2))) { + assert(Mask->getNumOperands() == 16 && "Bad type for intrinsic!"); + + // Check that all of the elements are integer constants or undefs. + bool AllEltsOk = true; + for (unsigned i = 0; i != 16; ++i) { + if (!isa<ConstantInt>(Mask->getOperand(i)) && + !isa<UndefValue>(Mask->getOperand(i))) { + AllEltsOk = false; + break; + } + } + + if (AllEltsOk) { + // Cast the input vectors to byte vectors. + Value *Op0 = Builder->CreateBitCast(II->getArgOperand(0), + Mask->getType()); + Value *Op1 = Builder->CreateBitCast(II->getArgOperand(1), + Mask->getType()); + Value *Result = UndefValue::get(Op0->getType()); + + // Only extract each element once. 
+ Value *ExtractedElts[32]; + memset(ExtractedElts, 0, sizeof(ExtractedElts)); + + for (unsigned i = 0; i != 16; ++i) { + if (isa<UndefValue>(Mask->getOperand(i))) + continue; + unsigned Idx=cast<ConstantInt>(Mask->getOperand(i))->getZExtValue(); + Idx &= 31; // Match the hardware behavior. + + if (ExtractedElts[Idx] == 0) { + ExtractedElts[Idx] = + Builder->CreateExtractElement(Idx < 16 ? Op0 : Op1, + ConstantInt::get(Type::getInt32Ty(II->getContext()), + Idx&15, false), "tmp"); + } + + // Insert this value into the result vector. + Result = Builder->CreateInsertElement(Result, ExtractedElts[Idx], + ConstantInt::get(Type::getInt32Ty(II->getContext()), + i, false), "tmp"); + } + return CastInst::Create(Instruction::BitCast, Result, CI.getType()); + } + } + break; + + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vld2lane: + case Intrinsic::arm_neon_vld3lane: + case Intrinsic::arm_neon_vld4lane: + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + case Intrinsic::arm_neon_vst2lane: + case Intrinsic::arm_neon_vst3lane: + case Intrinsic::arm_neon_vst4lane: { + unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), TD); + unsigned AlignArg = II->getNumArgOperands() - 1; + ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg)); + if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) { + II->setArgOperand(AlignArg, + ConstantInt::get(Type::getInt32Ty(II->getContext()), + MemAlign, false)); + return II; + } + break; + } + + case Intrinsic::stackrestore: { + // If the save is right next to the restore, remove the restore. This can + // happen when variable allocas are DCE'd. 
    // Fold stackrestore(stacksave) when the two are adjacent: nothing can
    // have changed the stack pointer in between, so the restore is a no-op.
    if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
      if (SS->getIntrinsicID() == Intrinsic::stacksave) {
        BasicBlock::iterator BI = SS;
        if (&*++BI == II)  // The restore immediately follows the save.
          return EraseInstFromFunction(CI);
      }
    }

    // Scan down this block to see if there is another stack restore in the
    // same block without an intervening call/alloca.
    BasicBlock::iterator BI = II;
    TerminatorInst *TI = II->getParent()->getTerminator();
    bool CannotRemove = false;
    for (++BI; &*BI != TI; ++BI) {
      if (isa<AllocaInst>(BI) || isMalloc(BI)) {
        CannotRemove = true;
        break;
      }
      if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
        if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(BCI)) {
          // If there is a stackrestore below this one, remove this one.
          if (II->getIntrinsicID() == Intrinsic::stackrestore)
            return EraseInstFromFunction(CI);
          // Otherwise, ignore the intrinsic.
        } else {
          // If we found a non-intrinsic call, we can't remove the stack
          // restore.
          CannotRemove = true;
          break;
        }
      }
    }

    // If the stack restore is in a return/unwind block and if there are no
    // allocas or calls between the restore and the return, nuke the restore.
    if (!CannotRemove && (isa<ReturnInst>(TI) || isa<UnwindInst>(TI)))
      return EraseInstFromFunction(CI);
    break;
  }
  }

  // No intrinsic-specific fold fired; fall back to the generic call-site
  // improvements shared by calls and invokes.
  return visitCallSite(II);
}

// InvokeInst simplification
//
Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
  // Invokes have no invoke-specific folds here; they share all call-site
  // transformations with plain calls.
  return visitCallSite(&II);
}

/// isSafeToEliminateVarargsCast - If this cast does not affect the value
/// passed through the varargs area, we can eliminate the use of the cast.
static bool isSafeToEliminateVarargsCast(const CallSite CS,
                                         const CastInst * const CI,
                                         const TargetData * const TD,
                                         const int ix) {
  if (!CI->isLosslessCast())
    return false;

  // The size of ByVal arguments is derived from the type, so we
  // can't change to a type with a different size.  If the size were
  // passed explicitly we could avoid this check.
  if (!CS.paramHasAttr(ix, Attribute::ByVal))
    return true;

  const Type* SrcTy =
            cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
  const Type* DstTy = cast<PointerType>(CI->getType())->getElementType();
  if (!SrcTy->isSized() || !DstTy->isSized())
    return false;
  // Without TargetData we cannot prove that the allocation sizes match.
  if (!TD || TD->getTypeAllocSize(SrcTy) != TD->getTypeAllocSize(DstTy))
    return false;
  return true;
}

namespace {
// Adapter that routes the shared fortified-libcall simplifier
// (SimplifyFortifiedLibCalls) through InstCombine's worklist-aware
// instruction replacement so folded __*_chk calls are cleaned up properly.
class InstCombineFortifiedLibCalls : public SimplifyFortifiedLibCalls {
  InstCombiner *IC;
protected:
  void replaceCall(Value *With) {
    NewInstruction = IC->ReplaceInstUsesWith(*CI, With);
  }
  // A __*_chk call may be folded to its unchecked variant when the known
  // object size (argument SizeCIOp) provably covers the access size
  // (argument SizeArgOp, or the string length when isString is set).
  bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp, bool isString) const {
    if (CI->getArgOperand(SizeCIOp) == CI->getArgOperand(SizeArgOp))
      return true;
    if (ConstantInt *SizeCI =
                           dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) {
      // An all-ones size means "object size unknown/unbounded" per the
      // __builtin_object_size convention: the check is vacuous.
      if (SizeCI->isAllOnesValue())
        return true;
      // NOTE(review): GetStringLength returns 0 when the length is unknown,
      // which makes this comparison vacuously true — verify callers only
      // reach here with computable string lengths.
      if (isString)
        return SizeCI->getZExtValue() >=
               GetStringLength(CI->getArgOperand(SizeArgOp));
      if (ConstantInt *Arg = dyn_cast<ConstantInt>(
                                                  CI->getArgOperand(SizeArgOp)))
        return SizeCI->getZExtValue() >= Arg->getZExtValue();
    }
    return false;
  }
public:
  InstCombineFortifiedLibCalls(InstCombiner *IC) : IC(IC), NewInstruction(0) { }
  // Replacement produced by fold(), or null if nothing was simplified.
  Instruction *NewInstruction;
};
} // end anonymous namespace

// Try to fold some different type of calls here.
// Currently we're only working with the checking functions, memcpy_chk,
// mempcpy_chk, memmove_chk, memset_chk, strcpy_chk, stpcpy_chk, strncpy_chk,
// strcat_chk and strncat_chk.
Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const TargetData *TD) {
  // Indirect calls have no known callee to simplify against.
  if (CI->getCalledFunction() == 0) return 0;

  InstCombineFortifiedLibCalls Simplifier(this);
  Simplifier.fold(CI, TD);
  return Simplifier.NewInstruction;
}

// visitCallSite - Improvements for call and invoke instructions.
//
Instruction *InstCombiner::visitCallSite(CallSite CS) {
  bool Changed = false;

  // If the callee is a pointer to a function, attempt to move any casts to the
  // arguments of the call/invoke.
  Value *Callee = CS.getCalledValue();
  if (!isa<Function>(Callee) && transformConstExprCastCall(CS))
    return 0;

  if (Function *CalleeF = dyn_cast<Function>(Callee))
    // If the call and callee calling conventions don't match, this call must
    // be unreachable, as the call is undefined.
    if (CalleeF->getCallingConv() != CS.getCallingConv() &&
        // Only do this for calls to a function with a body.  A prototype may
        // not actually end up matching the implementation's calling conv for a
        // variety of reasons (e.g. it may be written in assembly).
        !CalleeF->isDeclaration()) {
      Instruction *OldCall = CS.getInstruction();
      // Storing 'true' to undef is the idiom used here to mark the code as
      // unreachable without being allowed to modify the CFG.
      new StoreInst(ConstantInt::getTrue(Callee->getContext()),
                UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
                                  OldCall);
      // If OldCall does not return void then replaceAllUsesWith undef.
      // This allows ValueHandlers and custom metadata to adjust themselves.
      if (!OldCall->getType()->isVoidTy())
        OldCall->replaceAllUsesWith(UndefValue::get(OldCall->getType()));
      if (isa<CallInst>(OldCall))
        return EraseInstFromFunction(*OldCall);

      // We cannot remove an invoke, because it would change the CFG, just
      // change the callee to a null pointer.
      cast<InvokeInst>(OldCall)->setCalledFunction(
                                    Constant::getNullValue(CalleeF->getType()));
      return 0;
    }

  if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
    // This instruction is not reachable, just remove it.  We insert a store to
    // undef so that we know that this code is not reachable, despite the fact
    // that we can't modify the CFG here.
    new StoreInst(ConstantInt::getTrue(Callee->getContext()),
               UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
                  CS.getInstruction());

    // If CS does not return void then replaceAllUsesWith undef.
    // This allows ValueHandlers and custom metadata to adjust themselves.
    if (!CS.getInstruction()->getType()->isVoidTy())
      CS.getInstruction()->
        replaceAllUsesWith(UndefValue::get(CS.getInstruction()->getType()));

    if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
      // Don't break the CFG, insert a dummy cond branch.
      BranchInst::Create(II->getNormalDest(), II->getUnwindDest(),
                         ConstantInt::getTrue(Callee->getContext()), II);
    }
    return EraseInstFromFunction(*CS.getInstruction());
  }

  // Calls through a bitcast of an initialized trampoline can be redirected
  // to the underlying function.
  if (BitCastInst *BC = dyn_cast<BitCastInst>(Callee))
    if (IntrinsicInst *In = dyn_cast<IntrinsicInst>(BC->getOperand(0)))
      if (In->getIntrinsicID() == Intrinsic::init_trampoline)
        return transformCallThroughTrampoline(CS);

  const PointerType *PTy = cast<PointerType>(Callee->getType());
  const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
  if (FTy->isVarArg()) {
    // NOTE(review): isa<InvokeInst>(Callee) tests whether the *callee value*
    // is produced by an invoke, not whether this call site is an invoke;
    // presumably isa<InvokeInst>(CS.getInstruction()) was intended — verify
    // against the attribute-index convention used by paramHasAttr.
    int ix = FTy->getNumParams() + (isa<InvokeInst>(Callee) ? 3 : 1);
    // See if we can optimize any arguments passed through the varargs area of
    // the call.
    for (CallSite::arg_iterator I = CS.arg_begin()+FTy->getNumParams(),
           E = CS.arg_end(); I != E; ++I, ++ix) {
      CastInst *CI = dyn_cast<CastInst>(*I);
      if (CI && isSafeToEliminateVarargsCast(CS, CI, TD, ix)) {
        *I = CI->getOperand(0);
        Changed = true;
      }
    }
  }

  if (isa<InlineAsm>(Callee) && !CS.doesNotThrow()) {
    // Inline asm calls cannot throw - mark them 'nounwind'.
    CS.setDoesNotThrow();
    Changed = true;
  }

  // Try to optimize the call if possible, we require TargetData for most of
  // this.  None of these calls are seen as possibly dead so go ahead and
  // delete the instruction now.
  if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
    Instruction *I = tryOptimizeCall(CI, TD);
    // If we changed something return the result, etc. Otherwise let
    // the fallthrough check.
    if (I) return EraseInstFromFunction(*I);
  }

  return Changed ? CS.getInstruction() : 0;
}

// transformConstExprCastCall - If the callee is a constexpr cast of a function,
// attempt to move the cast to the arguments of the call/invoke.
//
bool InstCombiner::transformConstExprCastCall(CallSite CS) {
  Function *Callee =
    dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
  if (Callee == 0)
    return false;
  Instruction *Caller = CS.getInstruction();
  const AttrListPtr &CallerPAL = CS.getAttributes();

  // Okay, this is a cast from a function to a different type.  Unless doing so
  // would cause a type conversion of one of our arguments, change this call to
  // be a direct call with arguments casted to the appropriate types.
  //
  const FunctionType *FT = Callee->getFunctionType();
  const Type *OldRetTy = Caller->getType();
  const Type *NewRetTy = FT->getReturnType();

  if (NewRetTy->isStructTy())
    return false; // TODO: Handle multiple return values.

  // Check to see if we are changing the return type...
  if (OldRetTy != NewRetTy) {
    if (Callee->isDeclaration() &&
        // Conversion is ok if changing from one pointer type to another or from
        // a pointer to an integer of the same size.
        !((OldRetTy->isPointerTy() || !TD ||
           OldRetTy == TD->getIntPtrType(Caller->getContext())) &&
          (NewRetTy->isPointerTy() || !TD ||
           NewRetTy == TD->getIntPtrType(Caller->getContext()))))
      return false;   // Cannot transform this return value.

    if (!Caller->use_empty() &&
        // void -> non-void is handled specially
        !NewRetTy->isVoidTy() && !CastInst::isCastable(NewRetTy, OldRetTy))
      return false;   // Cannot transform this return value.

    if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
      Attributes RAttrs = CallerPAL.getRetAttributes();
      if (RAttrs & Attribute::typeIncompatible(NewRetTy))
        return false;   // Attribute not compatible with transformed value.
    }

    // If the callsite is an invoke instruction, and the return value is used by
    // a PHI node in a successor, we cannot change the return type of the call
    // because there is no place to put the cast instruction (without breaking
    // the critical edge).  Bail out in this case.
    if (!Caller->use_empty())
      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
        for (Value::use_iterator UI = II->use_begin(), E = II->use_end();
             UI != E; ++UI)
          if (PHINode *PN = dyn_cast<PHINode>(*UI))
            if (PN->getParent() == II->getNormalDest() ||
                PN->getParent() == II->getUnwindDest())
              return false;
  }

  unsigned NumActualArgs = unsigned(CS.arg_end()-CS.arg_begin());
  unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);

  // Verify every argument shared between the old and new signatures can be
  // converted without changing observable behavior.
  CallSite::arg_iterator AI = CS.arg_begin();
  for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
    const Type *ParamTy = FT->getParamType(i);
    const Type *ActTy = (*AI)->getType();

    if (!CastInst::isCastable(ActTy, ParamTy))
      return false;   // Cannot transform this parameter value.

    unsigned Attrs = CallerPAL.getParamAttributes(i + 1);
    if (Attrs & Attribute::typeIncompatible(ParamTy))
      return false;   // Attribute not compatible with transformed value.

    // If the parameter is passed as a byval argument, then we have to have a
    // sized type and the sized type has to have the same size as the old type.
    if (ParamTy != ActTy && (Attrs & Attribute::ByVal)) {
      const PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
      if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0)
        return false;

      const Type *CurElTy = cast<PointerType>(ActTy)->getElementType();
      if (TD->getTypeAllocSize(CurElTy) !=
          TD->getTypeAllocSize(ParamPTy->getElementType()))
        return false;
    }

    // Converting from one pointer type to another or between a pointer and an
    // integer of the same size is safe even if we do not have a body.
    bool isConvertible = ActTy == ParamTy ||
      (TD && ((ParamTy->isPointerTy() ||
               ParamTy == TD->getIntPtrType(Caller->getContext())) &&
              (ActTy->isPointerTy() ||
               ActTy == TD->getIntPtrType(Caller->getContext()))));
    if (Callee->isDeclaration() && !isConvertible) return false;
  }

  if (FT->getNumParams() < NumActualArgs && !FT->isVarArg() &&
      Callee->isDeclaration())
    return false;   // Do not delete arguments unless we have a function body.

  if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
      !CallerPAL.isEmpty())
    // In this case we have more arguments than the new function type, but we
    // won't be dropping them.  Check that these extra arguments have attributes
    // that are compatible with being a vararg call argument.
    for (unsigned i = CallerPAL.getNumSlots(); i; --i) {
      if (CallerPAL.getSlot(i - 1).Index <= FT->getNumParams())
        break;
      Attributes PAttrs = CallerPAL.getSlot(i - 1).Attrs;
      if (PAttrs & Attribute::VarArgsIncompatible)
        return false;
    }

  // Okay, we decided that this is a safe thing to do: go ahead and start
  // inserting cast instructions as necessary...
  std::vector<Value*> Args;
  Args.reserve(NumActualArgs);
  SmallVector<AttributeWithIndex, 8> attrVec;
  attrVec.reserve(NumCommonArgs);

  // Get any return attributes.
  Attributes RAttrs = CallerPAL.getRetAttributes();

  // If the return value is not being used, the type may not be compatible
  // with the existing attributes.  Wipe out any problematic attributes.
  RAttrs &= ~Attribute::typeIncompatible(NewRetTy);

  // Add the new return attributes.
  if (RAttrs)
    attrVec.push_back(AttributeWithIndex::get(0, RAttrs));

  // Cast each shared argument to the parameter type the callee expects.
  AI = CS.arg_begin();
  for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
    const Type *ParamTy = FT->getParamType(i);
    if ((*AI)->getType() == ParamTy) {
      Args.push_back(*AI);
    } else {
      Instruction::CastOps opcode = CastInst::getCastOpcode(*AI,
          false, ParamTy, false);
      Args.push_back(Builder->CreateCast(opcode, *AI, ParamTy, "tmp"));
    }

    // Add any parameter attributes.
    if (Attributes PAttrs = CallerPAL.getParamAttributes(i + 1))
      attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs));
  }

  // If the function takes more arguments than the call was taking, add them
  // now.
  for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i)
    Args.push_back(Constant::getNullValue(FT->getParamType(i)));

  // If we are removing arguments to the function, emit an obnoxious warning.
  if (FT->getNumParams() < NumActualArgs) {
    if (!FT->isVarArg()) {
      errs() << "WARNING: While resolving call to function '"
             << Callee->getName() << "' arguments were dropped!\n";
    } else {
      // Add all of the arguments in their promoted form to the arg list.
      for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
        const Type *PTy = getPromotedType((*AI)->getType());
        if (PTy != (*AI)->getType()) {
          // Must promote to pass through va_arg area!
          Instruction::CastOps opcode =
            CastInst::getCastOpcode(*AI, false, PTy, false);
          Args.push_back(Builder->CreateCast(opcode, *AI, PTy, "tmp"));
        } else {
          Args.push_back(*AI);
        }

        // Add any parameter attributes.
        if (Attributes PAttrs = CallerPAL.getParamAttributes(i + 1))
          attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs));
      }
    }
  }

  if (Attributes FnAttrs = CallerPAL.getFnAttributes())
    attrVec.push_back(AttributeWithIndex::get(~0, FnAttrs));

  if (NewRetTy->isVoidTy())
    Caller->setName("");   // Void type should not have a name.

  const AttrListPtr &NewCallerPAL = AttrListPtr::get(attrVec.begin(),
                                                     attrVec.end());

  // Build the replacement call/invoke with the adjusted arguments,
  // preserving calling convention, attributes, and tail-call flag.
  Instruction *NC;
  if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
    NC = InvokeInst::Create(Callee, II->getNormalDest(), II->getUnwindDest(),
                            Args.begin(), Args.end(),
                            Caller->getName(), Caller);
    cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv());
    cast<InvokeInst>(NC)->setAttributes(NewCallerPAL);
  } else {
    NC = CallInst::Create(Callee, Args.begin(), Args.end(),
                          Caller->getName(), Caller);
    CallInst *CI = cast<CallInst>(Caller);
    if (CI->isTailCall())
      cast<CallInst>(NC)->setTailCall();
    cast<CallInst>(NC)->setCallingConv(CI->getCallingConv());
    cast<CallInst>(NC)->setAttributes(NewCallerPAL);
  }

  // Insert a cast of the return type as necessary.
  Value *NV = NC;
  if (OldRetTy != NV->getType() && !Caller->use_empty()) {
    if (!NV->getType()->isVoidTy()) {
      Instruction::CastOps opcode =
        CastInst::getCastOpcode(NC, false, OldRetTy, false);
      NV = NC = CastInst::Create(opcode, NC, OldRetTy, "tmp");

      // If this is an invoke instruction, we should insert it after the first
      // non-phi, instruction in the normal successor block.
      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
        BasicBlock::iterator I = II->getNormalDest()->getFirstNonPHI();
        InsertNewInstBefore(NC, *I);
      } else {
        // Otherwise, it's a call, just insert cast right after the call.
        InsertNewInstBefore(NC, *Caller);
      }
      Worklist.AddUsersToWorkList(*Caller);
    } else {
      NV = UndefValue::get(Caller->getType());
    }
  }

  if (!Caller->use_empty())
    Caller->replaceAllUsesWith(NV);

  EraseInstFromFunction(*Caller);
  return true;
}

// transformCallThroughTrampoline - Turn a call to a function created by the
// init_trampoline intrinsic into a direct call to the underlying function.
//
Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) {
  Value *Callee = CS.getCalledValue();
  const PointerType *PTy = cast<PointerType>(Callee->getType());
  const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
  const AttrListPtr &Attrs = CS.getAttributes();

  // If the call already has the 'nest' attribute somewhere then give up -
  // otherwise 'nest' would occur twice after splicing in the chain.
  if (Attrs.hasAttrSomewhere(Attribute::Nest))
    return 0;

  // The caller (visitCallSite) established that Callee is a bitcast of an
  // llvm.init_trampoline call; operand 1 is the real function, operand 2 the
  // static chain value.
  IntrinsicInst *Tramp =
    cast<IntrinsicInst>(cast<BitCastInst>(Callee)->getOperand(0));

  Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts());
  const PointerType *NestFPTy = cast<PointerType>(NestF->getType());
  const FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType());

  const AttrListPtr &NestAttrs = NestF->getAttributes();
  if (!NestAttrs.isEmpty()) {
    unsigned NestIdx = 1;
    const Type *NestTy = 0;
    Attributes NestAttr = Attribute::None;

    // Look for a parameter marked with the 'nest' attribute.
    for (FunctionType::param_iterator I = NestFTy->param_begin(),
         E = NestFTy->param_end(); I != E; ++NestIdx, ++I)
      if (NestAttrs.paramHasAttr(NestIdx, Attribute::Nest)) {
        // Record the parameter type and any other attributes.
        NestTy = *I;
        NestAttr = NestAttrs.getParamAttributes(NestIdx);
        break;
      }

    if (NestTy) {
      Instruction *Caller = CS.getInstruction();
      std::vector<Value*> NewArgs;
      NewArgs.reserve(unsigned(CS.arg_end()-CS.arg_begin())+1);

      SmallVector<AttributeWithIndex, 8> NewAttrs;
      NewAttrs.reserve(Attrs.getNumSlots() + 1);

      // Insert the nest argument into the call argument list, which may
      // mean appending it.  Likewise for attributes.

      // Add any result attributes.
      if (Attributes Attr = Attrs.getRetAttributes())
        NewAttrs.push_back(AttributeWithIndex::get(0, Attr));

      {
        unsigned Idx = 1;
        CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
        do {
          if (Idx == NestIdx) {
            // Add the chain argument and attributes.
            Value *NestVal = Tramp->getArgOperand(2);
            if (NestVal->getType() != NestTy)
              NestVal = new BitCastInst(NestVal, NestTy, "nest", Caller);
            NewArgs.push_back(NestVal);
            NewAttrs.push_back(AttributeWithIndex::get(NestIdx, NestAttr));
          }

          if (I == E)
            break;

          // Add the original argument and attributes.  Attribute indices at
          // or after the inserted nest parameter shift up by one.
          NewArgs.push_back(*I);
          if (Attributes Attr = Attrs.getParamAttributes(Idx))
            NewAttrs.push_back
              (AttributeWithIndex::get(Idx + (Idx >= NestIdx), Attr));

          ++Idx, ++I;
        } while (1);
      }

      // Add any function attributes.
      if (Attributes Attr = Attrs.getFnAttributes())
        NewAttrs.push_back(AttributeWithIndex::get(~0, Attr));

      // The trampoline may have been bitcast to a bogus type (FTy).
      // Handle this by synthesizing a new function type, equal to FTy
      // with the chain parameter inserted.

      std::vector<const Type*> NewTypes;
      NewTypes.reserve(FTy->getNumParams()+1);

      // Insert the chain's type into the list of parameter types, which may
      // mean appending it.
      {
        unsigned Idx = 1;
        FunctionType::param_iterator I = FTy->param_begin(),
          E = FTy->param_end();

        do {
          if (Idx == NestIdx)
            // Add the chain's type.
            NewTypes.push_back(NestTy);

          if (I == E)
            break;

          // Add the original type.
          NewTypes.push_back(*I);

          ++Idx, ++I;
        } while (1);
      }

      // Replace the trampoline call with a direct call.  Let the generic
      // code sort out any function type mismatches.
      FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
                                               FTy->isVarArg());
      Constant *NewCallee =
        NestF->getType() == PointerType::getUnqual(NewFTy) ?
        NestF : ConstantExpr::getBitCast(NestF,
                                         PointerType::getUnqual(NewFTy));
      const AttrListPtr &NewPAL = AttrListPtr::get(NewAttrs.begin(),
                                                   NewAttrs.end());

      Instruction *NewCaller;
      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
        NewCaller = InvokeInst::Create(NewCallee,
                                       II->getNormalDest(), II->getUnwindDest(),
                                       NewArgs.begin(), NewArgs.end(),
                                       Caller->getName(), Caller);
        cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
        cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
      } else {
        NewCaller = CallInst::Create(NewCallee, NewArgs.begin(), NewArgs.end(),
                                     Caller->getName(), Caller);
        if (cast<CallInst>(Caller)->isTailCall())
          cast<CallInst>(NewCaller)->setTailCall();
        cast<CallInst>(NewCaller)->
          setCallingConv(cast<CallInst>(Caller)->getCallingConv());
        cast<CallInst>(NewCaller)->setAttributes(NewPAL);
      }
      if (!Caller->getType()->isVoidTy())
        Caller->replaceAllUsesWith(NewCaller);
      // NOTE(review): Caller is erased before being removed from the
      // worklist; Remove presumably only uses the pointer as a key, but the
      // ordering looks fragile — confirm it cannot touch freed memory.
      Caller->eraseFromParent();
      Worklist.Remove(Caller);
      return 0;
    }
  }

  // Replace the trampoline call with a direct call.  Since there is no 'nest'
  // parameter, there is no need to adjust the argument list.  Let the generic
  // code sort out any function type mismatches.
  Constant *NewCallee =
    NestF->getType() == PTy ? NestF :
                              ConstantExpr::getBitCast(NestF, PTy);
  CS.setCalledFunction(NewCallee);
  return CS.getInstruction();
}

diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
new file mode 100644
index 0000000..b432641
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -0,0 +1,1709 @@
//===- InstCombineCasts.cpp -----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License.  See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the visit functions for cast operations.
//
//===----------------------------------------------------------------------===//

#include "InstCombine.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Support/PatternMatch.h"
using namespace llvm;
using namespace PatternMatch;

/// DecomposeSimpleLinearExpr - Analyze 'Val', seeing if it is a simple linear
/// expression.  If so, decompose it, returning some value X, such that Val is
/// X*Scale+Offset.
///
static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
                                        uint64_t &Offset) {
  if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
    // A bare constant is 0*X + C: report Scale == 0 with a zero base value.
    Offset = CI->getZExtValue();
    Scale  = 0;
    return ConstantInt::get(Val->getType(), 0);
  }

  if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) {
    if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
      if (I->getOpcode() == Instruction::Shl) {
        // This is a value scaled by '1 << the shift amt'.
        // NOTE(review): Scale is only 'unsigned' while the shift is computed
        // in 64 bits; a shift amount >= 32 would truncate — presumably the
        // caller's modulus check keeps this benign, verify.
        Scale = UINT64_C(1) << RHS->getZExtValue();
        Offset = 0;
        return I->getOperand(0);
      }

      if (I->getOpcode() == Instruction::Mul) {
        // This value is scaled by 'RHS'.
        Scale = RHS->getZExtValue();
        Offset = 0;
        return I->getOperand(0);
      }

      if (I->getOpcode() == Instruction::Add) {
        // We have X+C.  Check to see if we really have (X*C2)+C1,
        // where C1 is divisible by C2.
        unsigned SubScale;
        Value *SubVal =
          DecomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset);
        Offset += RHS->getZExtValue();
        Scale = SubScale;
        return SubVal;
      }
    }
  }

  // Otherwise, we can't look past this.  Val itself is the base with
  // identity scale and no offset.
  Scale = 1;
  Offset = 0;
  return Val;
}

/// PromoteCastOfAllocation - If we find a cast of an allocation instruction,
/// try to eliminate the cast by moving the type information into the alloc.
Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
                                                   AllocaInst &AI) {
  // Rewrites "bitcast (alloca AllocElTy, N)" into "alloca CastElTy, M" when
  // the total byte size works out exactly, so the cast disappears.  Returns
  // the replacement for CI, or 0 if no transform applies.

  // This requires TargetData to get the alloca alignment and size information.
  if (!TD) return 0;

  const PointerType *PTy = cast<PointerType>(CI.getType());

  // Any instructions we create must go in front of the alloca, not in front
  // of the cast, so they dominate all of the alloca's (future) uses.
  BuilderTy AllocaBuilder(*Builder);
  AllocaBuilder.SetInsertPoint(AI.getParent(), &AI);

  // Get the type really allocated and the type casted to.
  const Type *AllocElTy = AI.getAllocatedType();
  const Type *CastElTy = PTy->getElementType();
  if (!AllocElTy->isSized() || !CastElTy->isSized()) return 0;

  // Never lower the alignment guarantee of the allocation.
  unsigned AllocElTyAlign = TD->getABITypeAlignment(AllocElTy);
  unsigned CastElTyAlign = TD->getABITypeAlignment(CastElTy);
  if (CastElTyAlign < AllocElTyAlign) return 0;

  // If the allocation has multiple uses, only promote it if we are strictly
  // increasing the alignment of the resultant allocation.  If we keep it the
  // same, we open the door to infinite loops of various kinds.  (A reference
  // from a dbg.declare doesn't count as a use for this purpose.)
  if (!AI.hasOneUse() && !hasOneUsePlusDeclare(&AI) &&
      CastElTyAlign == AllocElTyAlign) return 0;

  uint64_t AllocElTySize = TD->getTypeAllocSize(AllocElTy);
  uint64_t CastElTySize = TD->getTypeAllocSize(CastElTy);
  if (CastElTySize == 0 || AllocElTySize == 0) return 0;

  // See if we can satisfy the modulus by pulling a scale out of the array
  // size argument.
  unsigned ArraySizeScale;
  uint64_t ArrayOffset;
  Value *NumElements = // See if the array size is a decomposable linear expr.
    DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset);

  // If we can now satisfy the modulus, by using a non-1 scale, we really can
  // do the xform.  Both the scaled part and the constant part of the byte
  // count must divide evenly by the new element size.
  if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 ||
      (AllocElTySize*ArrayOffset   ) % CastElTySize != 0) return 0;

  unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize;
  Value *Amt = 0;
  if (Scale == 1) {
    Amt = NumElements;
  } else {
    Amt = ConstantInt::get(AI.getArraySize()->getType(), Scale);
    // Insert before the alloca, not before the cast.
    Amt = AllocaBuilder.CreateMul(Amt, NumElements, "tmp");
  }

  // Fold the constant part of the old byte count into the element count.
  if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
    Value *Off = ConstantInt::get(AI.getArraySize()->getType(),
                                  Offset, true);
    Amt = AllocaBuilder.CreateAdd(Amt, Off, "tmp");
  }

  AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt);
  New->setAlignment(AI.getAlignment());
  New->takeName(&AI);

  // If the allocation has one real use plus a dbg.declare, just remove the
  // declare.
  if (DbgDeclareInst *DI = hasOneUsePlusDeclare(&AI)) {
    EraseInstFromFunction(*(Instruction*)DI);
  }
  // If the allocation has multiple real uses, insert a cast and change all
  // things that used it to use the new cast.  This will also hack on CI, but it
  // will die soon.
  else if (!AI.hasOneUse()) {
    // New is the allocation instruction, pointer typed. AI is the original
    // allocation instruction, also pointer typed. Thus, cast to use is BitCast.
    Value *NewCast = AllocaBuilder.CreateBitCast(New, AI.getType(), "tmpcast");
    AI.replaceAllUsesWith(NewCast);
  }
  return ReplaceInstUsesWith(CI, New);
}



/// EvaluateInDifferentType - Given an expression that
/// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually
/// insert the code to evaluate the expression.
///
/// Precondition: one of the CanEvaluate* predicates has already approved V
/// for type Ty, so every opcode reached here is one of the handled cases —
/// the default case is genuinely unreachable.
Value *InstCombiner::EvaluateInDifferentType(Value *V, const Type *Ty,
                                             bool isSigned) {
  if (Constant *C = dyn_cast<Constant>(V)) {
    C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
    // If we got a constantexpr back, try to simplify it with TD info.
    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
      C = ConstantFoldConstantExpression(CE, TD);
    return C;
  }

  // Otherwise, it must be an instruction.
  Instruction *I = cast<Instruction>(V);
  Instruction *Res = 0;
  unsigned Opc = I->getOpcode();
  switch (Opc) {
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::AShr:
  case Instruction::LShr:
  case Instruction::Shl:
  case Instruction::UDiv:
  case Instruction::URem: {
    // Recursively convert both operands, then mirror the operation in Ty.
    Value *LHS = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned);
    Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
    Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
    break;
  }
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
    // If the source type of the cast is the type we're trying for then we can
    // just return the source.  There's no need to insert it because it is not
    // new.
    if (I->getOperand(0)->getType() == Ty)
      return I->getOperand(0);

    // Otherwise, must be the same type of cast, so just reinsert a new one.
    // This also handles the case of zext(trunc(x)) -> zext(x).
    Res = CastInst::CreateIntegerCast(I->getOperand(0), Ty,
                                      Opc == Instruction::SExt);
    break;
  case Instruction::Select: {
    // Only the arms change type; the i1 condition is reused as-is.
    Value *True = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
    Value *False = EvaluateInDifferentType(I->getOperand(2), Ty, isSigned);
    Res = SelectInst::Create(I->getOperand(0), True, False);
    break;
  }
  case Instruction::PHI: {
    PHINode *OPN = cast<PHINode>(I);
    PHINode *NPN = PHINode::Create(Ty);
    for (unsigned i = 0, e = OPN->getNumIncomingValues(); i != e; ++i) {
      Value *V =EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned);
      NPN->addIncoming(V, OPN->getIncomingBlock(i));
    }
    Res = NPN;
    break;
  }
  default:
    // TODO: Can handle more cases here.
    llvm_unreachable("Unreachable!");
    break;
  }

  // The replacement stands in for I everywhere, so it inherits I's name and
  // insertion point.
  Res->takeName(I);
  return InsertNewInstBefore(Res, *I);
}


/// This function is a wrapper around CastInst::isEliminableCastPair. It
/// simply extracts arguments and returns what that function returns.
/// A zero return value means the A->B->C cast pair cannot be collapsed;
/// otherwise the returned opcode casts A directly to C.
static Instruction::CastOps
isEliminableCastPair(
  const CastInst *CI, ///< The first cast instruction
  unsigned opcode,       ///< The opcode of the second cast instruction
  const Type *DstTy,     ///< The target type for the second cast instruction
  TargetData *TD         ///< The target data for pointer size
) {

  const Type *SrcTy = CI->getOperand(0)->getType();   // A from above
  const Type *MidTy = CI->getType();                  // B from above

  // Get the opcodes of the two Cast instructions
  Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode());
  Instruction::CastOps secondOp = Instruction::CastOps(opcode);

  unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy,
                                                DstTy,
                                  TD ? TD->getIntPtrType(CI->getContext()) : 0);

  // We don't want to form an inttoptr or ptrtoint that converts to an integer
  // type that differs from the pointer size.
  if ((Res == Instruction::IntToPtr &&
          (!TD || SrcTy != TD->getIntPtrType(CI->getContext()))) ||
      (Res == Instruction::PtrToInt &&
          (!TD || DstTy != TD->getIntPtrType(CI->getContext()))))
    Res = 0;

  return Instruction::CastOps(Res);
}

/// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually
/// results in any code being generated and is interesting to optimize out. If
/// the cast can be eliminated by some other simple transformation, we prefer
/// to do the simplification first.
bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V,
                                      const Type *Ty) {
  // Noop casts and casts of constants should be eliminated trivially.
  if (V->getType() == Ty || isa<Constant>(V)) return false;

  // If this is another cast that can be eliminated, we prefer to have it
  // eliminated.
  if (const CastInst *CI = dyn_cast<CastInst>(V))
    if (isEliminableCastPair(CI, opc, Ty, TD))
      return false;

  // If this is a vector sext from a compare, then we don't want to break the
  // idiom where each element of the extended vector is either zero or all ones.
  if (opc == Instruction::SExt && isa<CmpInst>(V) && Ty->isVectorTy())
    return false;

  return true;
}


/// @brief Implement the transforms common to all CastInst visitors.
/// Returns the replacement instruction, or 0 if no common transform applies.
Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
  Value *Src = CI.getOperand(0);

  // Many cases of "cast of a cast" are eliminable. If it's eliminable we just
  // eliminate it now.
  if (CastInst *CSrc = dyn_cast<CastInst>(Src)) {   // A->B->C cast
    if (Instruction::CastOps opc =
        isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), TD)) {
      // The first cast (CSrc) is eliminable so we need to fix up or replace
      // the second cast (CI). CSrc will then have a good chance of being dead.
      return CastInst::Create(opc, CSrc->getOperand(0), CI.getType());
    }
  }

  // If we are casting a select then fold the cast into the select
  if (SelectInst *SI = dyn_cast<SelectInst>(Src))
    if (Instruction *NV = FoldOpIntoSelect(CI, SI))
      return NV;

  // If we are casting a PHI then fold the cast into the PHI
  if (isa<PHINode>(Src)) {
    // We don't do this if this would create a PHI node with an illegal type if
    // it is currently legal.
    if (!Src->getType()->isIntegerTy() ||
        !CI.getType()->isIntegerTy() ||
        ShouldChangeType(CI.getType(), Src->getType()))
      if (Instruction *NV = FoldOpIntoPhi(CI))
        return NV;
  }

  return 0;
}

/// CanEvaluateTruncated - Return true if we can evaluate the specified
/// expression tree as type Ty instead of its larger type, and arrive with the
/// same value.  This is used by code that tries to eliminate truncates.
///
/// Ty will always be a type smaller than V.  We should return true if trunc(V)
/// can be computed by computing V in the smaller type.
/// If V is an instruction,
/// then trunc(inst(x,y)) can be computed as inst(trunc(x),trunc(y)), which only
/// makes sense if x and y can be efficiently truncated.
///
/// This function works on both vectors and scalars.
///
static bool CanEvaluateTruncated(Value *V, const Type *Ty) {
  // We can always evaluate constants in another type.
  if (isa<Constant>(V))
    return true;

  Instruction *I = dyn_cast<Instruction>(V);
  if (!I) return false;

  const Type *OrigTy = V->getType();

  // If this is an extension from the dest type, we can eliminate it, even if it
  // has multiple uses.
  if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
      I->getOperand(0)->getType() == Ty)
    return true;

  // We can't extend or shrink something that has multiple uses: doing so would
  // require duplicating the instruction in general, which isn't profitable.
  if (!I->hasOneUse()) return false;

  unsigned Opc = I->getOpcode();
  switch (Opc) {
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // These operators can all arbitrarily be extended or truncated.
    return CanEvaluateTruncated(I->getOperand(0), Ty) &&
           CanEvaluateTruncated(I->getOperand(1), Ty);

  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
    uint32_t BitWidth = Ty->getScalarSizeInBits();
    if (BitWidth < OrigBitWidth) {
      APInt Mask = APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth);
      if (MaskedValueIsZero(I->getOperand(0), Mask) &&
          MaskedValueIsZero(I->getOperand(1), Mask)) {
        return CanEvaluateTruncated(I->getOperand(0), Ty) &&
               CanEvaluateTruncated(I->getOperand(1), Ty);
      }
    }
    break;
  }
  case Instruction::Shl:
    // If we are truncating the result of this SHL, and if it's a shift of a
    // constant amount, we can always perform a SHL in a smaller type.
    // (getLimitedValue clamps oversized shift constants so the comparison is
    // safe even for huge APInt amounts.)
    if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
      uint32_t BitWidth = Ty->getScalarSizeInBits();
      if (CI->getLimitedValue(BitWidth) < BitWidth)
        return CanEvaluateTruncated(I->getOperand(0), Ty);
    }
    break;
  case Instruction::LShr:
    // If this is a truncate of a logical shr, we can truncate it to a smaller
    // lshr iff we know that the bits we would otherwise be shifting in are
    // already zeros.
    if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
      uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
      uint32_t BitWidth = Ty->getScalarSizeInBits();
      if (MaskedValueIsZero(I->getOperand(0),
            APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth)) &&
          CI->getLimitedValue(BitWidth) < BitWidth) {
        return CanEvaluateTruncated(I->getOperand(0), Ty);
      }
    }
    break;
  case Instruction::Trunc:
    // trunc(trunc(x)) -> trunc(x)
    return true;
  case Instruction::ZExt:
  case Instruction::SExt:
    // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
    // trunc(ext(x)) -> trunc(x) if the source type is larger than the new dest
    return true;
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    return CanEvaluateTruncated(SI->getTrueValue(), Ty) &&
           CanEvaluateTruncated(SI->getFalseValue(), Ty);
  }
  case Instruction::PHI: {
    // We can change a phi if we can change all operands.  Note that we never
    // get into trouble with cyclic PHIs here because we only consider
    // instructions with a single use.
    PHINode *PN = cast<PHINode>(I);
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
      if (!CanEvaluateTruncated(PN->getIncomingValue(i), Ty))
        return false;
    return true;
  }
  default:
    // TODO: Can handle more cases here.
    break;
  }

  return false;
}

Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
  if (Instruction *Result = commonCastTransforms(CI))
    return Result;

  // See if we can simplify any instructions used by the input whose sole
  // purpose is to compute bits we don't care about.
  if (SimplifyDemandedInstructionBits(CI))
    return &CI;

  Value *Src = CI.getOperand(0);
  const Type *DestTy = CI.getType(), *SrcTy = Src->getType();

  // Attempt to truncate the entire input expression tree to the destination
  // type.   Only do this if the dest type is a simple type, don't convert the
  // expression tree to something weird like i93 unless the source is also
  // strange.
  if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
      CanEvaluateTruncated(Src, DestTy)) {

    // If this cast is a truncate, evaluating in a different type always
    // eliminates the cast, so it is always a win.
    DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
          " to avoid cast: " << CI << '\n');
    Value *Res = EvaluateInDifferentType(Src, DestTy, false);
    assert(Res->getType() == DestTy);
    return ReplaceInstUsesWith(CI, Res);
  }

  // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
  if (DestTy->getScalarSizeInBits() == 1) {
    Constant *One = ConstantInt::get(Src->getType(), 1);
    Src = Builder->CreateAnd(Src, One, "tmp");
    Value *Zero = Constant::getNullValue(Src->getType());
    return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
  }

  // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
  Value *A = 0; ConstantInt *Cst = 0;
  if (Src->hasOneUse() &&
      match(Src, m_LShr(m_ZExt(m_Value(A)), m_ConstantInt(Cst)))) {
    // We have three types to worry about here, the type of A, the source of
    // the truncate (MidSize), and the destination of the truncate.  We know that
    // ASize < MidSize   and MidSize > ResultSize, but don't know the relation
    // between ASize and ResultSize.
    unsigned ASize = A->getType()->getPrimitiveSizeInBits();

    // If the shift amount is larger than the size of A, then the result is
    // known to be zero because all the input bits got shifted out.
    if (Cst->getZExtValue() >= ASize)
      return ReplaceInstUsesWith(CI, Constant::getNullValue(CI.getType()));

    // Since we're doing an lshr and a zero extend, and know that the shift
    // amount is smaller than ASize, it is always safe to do the shift in A's
    // type, then zero extend or truncate to the result.
    Value *Shift = Builder->CreateLShr(A, Cst->getZExtValue());
    Shift->takeName(Src);
    return CastInst::CreateIntegerCast(Shift, CI.getType(), false);
  }

  // Transform "trunc (and X, cst)" -> "and (trunc X), cst" so long as the dest
  // type isn't non-native.
  if (Src->hasOneUse() && isa<IntegerType>(Src->getType()) &&
      ShouldChangeType(Src->getType(), CI.getType()) &&
      match(Src, m_And(m_Value(A), m_ConstantInt(Cst)))) {
    Value *NewTrunc = Builder->CreateTrunc(A, CI.getType(), A->getName()+".tr");
    return BinaryOperator::CreateAnd(NewTrunc,
                                     ConstantExpr::getTrunc(Cst, CI.getType()));
  }

  return 0;
}

/// transformZExtICmp - Transform (zext icmp) to bitwise / integer operations
/// in order to eliminate the icmp.
///
/// When DoXform is false this runs in "query" mode: it returns non-null if a
/// transform WOULD apply, without creating any instructions (used by visitZExt
/// to test both operands of an 'or' before committing).
Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
                                             bool DoXform) {
  // If we are just checking for a icmp eq of a single bit and zext'ing it
  // to an integer, then shift the bit to the appropriate place and then
  // cast to integer to avoid the comparison.
  if (ConstantInt *Op1C = dyn_cast<ConstantInt>(ICI->getOperand(1))) {
    const APInt &Op1CV = Op1C->getValue();

    // zext (x <s  0) to i32 --> x>>u31      true if signbit set.
    // zext (x >s -1) to i32 --> (x>>u31)^1  true if signbit clear.
    if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) ||
        (ICI->getPredicate() == ICmpInst::ICMP_SGT &&Op1CV.isAllOnesValue())) {
      if (!DoXform) return ICI;

      Value *In = ICI->getOperand(0);
      // Logical shift right by (width-1) leaves just the sign bit in bit 0.
      Value *Sh = ConstantInt::get(In->getType(),
                                   In->getType()->getScalarSizeInBits()-1);
      In = Builder->CreateLShr(In, Sh, In->getName()+".lobit");
      if (In->getType() != CI.getType())
        In = Builder->CreateIntCast(In, CI.getType(), false/*ZExt*/, "tmp");

      if (ICI->getPredicate() == ICmpInst::ICMP_SGT) {
        Constant *One = ConstantInt::get(In->getType(), 1);
        In = Builder->CreateXor(In, One, In->getName()+".not");
      }

      return ReplaceInstUsesWith(CI, In);
    }



    // zext (X == 0) to i32 --> X^1      iff X has only the low bit set.
    // zext (X == 0) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
    // zext (X == 1) to i32 --> X        iff X has only the low bit set.
    // zext (X == 2) to i32 --> X>>1     iff X has only the 2nd bit set.
    // zext (X != 0) to i32 --> X        iff X has only the low bit set.
    // zext (X != 0) to i32 --> X>>1     iff X has only the 2nd bit set.
    // zext (X != 1) to i32 --> X^1      iff X has only the low bit set.
    // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
    if ((Op1CV == 0 || Op1CV.isPowerOf2()) &&
        // This only works for EQ and NE
        ICI->isEquality()) {
      // If Op1C some other power of two, convert:
      uint32_t BitWidth = Op1C->getType()->getBitWidth();
      APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
      APInt TypeMask(APInt::getAllOnesValue(BitWidth));
      ComputeMaskedBits(ICI->getOperand(0), TypeMask, KnownZero, KnownOne);

      // ~KnownZero is the set of bits that could possibly be one; if that is
      // a single bit, X takes only the values 0 and that bit.
      APInt KnownZeroMask(~KnownZero);
      if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
        if (!DoXform) return ICI;

        bool isNE = ICI->getPredicate() == ICmpInst::ICMP_NE;
        if (Op1CV != 0 && (Op1CV != KnownZeroMask)) {
          // (X&4) == 2 --> false
          // (X&4) != 2 --> true
          Constant *Res = ConstantInt::get(Type::getInt1Ty(CI.getContext()),
                                           isNE);
          Res = ConstantExpr::getZExt(Res, CI.getType());
          return ReplaceInstUsesWith(CI, Res);
        }

        uint32_t ShiftAmt = KnownZeroMask.logBase2();
        Value *In = ICI->getOperand(0);
        if (ShiftAmt) {
          // Perform a logical shr by shiftamt.
          // Insert the shift to put the result in the low bit.
          In = Builder->CreateLShr(In, ConstantInt::get(In->getType(),ShiftAmt),
                                   In->getName()+".lobit");
        }

        if ((Op1CV != 0) == isNE) { // Toggle the low bit.
          Constant *One = ConstantInt::get(In->getType(), 1);
          In = Builder->CreateXor(In, One, "tmp");
        }

        if (CI.getType() == In->getType())
          return ReplaceInstUsesWith(CI, In);
        return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/);
      }
    }
  }

  // icmp ne A, B is equal to xor A, B when A and B only really have one bit.
  // It is also profitable to transform icmp eq into not(xor(A, B)) because that
  // may lead to additional simplifications.
  if (ICI->isEquality() && CI.getType() == ICI->getOperand(0)->getType()) {
    if (const IntegerType *ITy = dyn_cast<IntegerType>(CI.getType())) {
      uint32_t BitWidth = ITy->getBitWidth();
      Value *LHS = ICI->getOperand(0);
      Value *RHS = ICI->getOperand(1);

      APInt KnownZeroLHS(BitWidth, 0), KnownOneLHS(BitWidth, 0);
      APInt KnownZeroRHS(BitWidth, 0), KnownOneRHS(BitWidth, 0);
      APInt TypeMask(APInt::getAllOnesValue(BitWidth));
      ComputeMaskedBits(LHS, TypeMask, KnownZeroLHS, KnownOneLHS);
      ComputeMaskedBits(RHS, TypeMask, KnownZeroRHS, KnownOneRHS);

      // If both sides have the same known bits, they can differ only in the
      // single unknown bit, so the compare reduces to testing that one bit of
      // A^B.
      if (KnownZeroLHS == KnownZeroRHS && KnownOneLHS == KnownOneRHS) {
        APInt KnownBits = KnownZeroLHS | KnownOneLHS;
        APInt UnknownBit = ~KnownBits;
        if (UnknownBit.countPopulation() == 1) {
          if (!DoXform) return ICI;

          Value *Result = Builder->CreateXor(LHS, RHS);

          // Mask off any bits that are set and won't be shifted away.
          if (KnownOneLHS.uge(UnknownBit))
            Result = Builder->CreateAnd(Result,
                                        ConstantInt::get(ITy, UnknownBit));

          // Shift the bit we're testing down to the lsb.
          Result = Builder->CreateLShr(
               Result, ConstantInt::get(ITy, UnknownBit.countTrailingZeros()));

          if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
            Result = Builder->CreateXor(Result, ConstantInt::get(ITy, 1));
          Result->takeName(ICI);
          return ReplaceInstUsesWith(CI, Result);
        }
      }
    }
  }

  return 0;
}

/// CanEvaluateZExtd - Determine if the specified value can be computed in the
/// specified wider type and produce the same low bits.  If not, return false.
///
/// If this function returns true, it can also return a non-zero number of bits
/// (in BitsToClear) which indicates that the value it computes is correct for
/// the zero extend, but that the additional BitsToClear bits need to be zero'd
/// out.
/// For example, to promote something like:
///
///   %B = trunc i64 %A to i32
///   %C = lshr i32 %B, 8
///   %E = zext i32 %C to i64
///
/// CanEvaluateZExtd for the 'lshr' will return true, and BitsToClear will be
/// set to 8 to indicate that the promoted value needs to have bits 24-31
/// cleared in addition to bits 32-63.  Since an 'and' will be generated to
/// clear the top bits anyway, doing this has no extra cost.
///
/// This function works on both vectors and scalars.
static bool CanEvaluateZExtd(Value *V, const Type *Ty, unsigned &BitsToClear) {
  BitsToClear = 0;
  if (isa<Constant>(V))
    return true;

  Instruction *I = dyn_cast<Instruction>(V);
  if (!I) return false;

  // If the input is a truncate from the destination type, we can trivially
  // eliminate it, even if it has multiple uses.
  // FIXME: This is currently disabled until codegen can handle this without
  // pessimizing code, PR5997.
  if (0 && isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
    return true;

  // We can't extend or shrink something that has multiple uses: doing so would
  // require duplicating the instruction in general, which isn't profitable.
  if (!I->hasOneUse()) return false;

  // Tmp receives the BitsToClear of a second operand so the two answers can
  // be reconciled below.
  unsigned Opc = I->getOpcode(), Tmp;
  switch (Opc) {
  case Instruction::ZExt:  // zext(zext(x)) -> zext(x).
  case Instruction::SExt:  // zext(sext(x)) -> sext(x).
  case Instruction::Trunc: // zext(trunc(x)) -> trunc(x) or zext(x)
    return true;
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
    if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear) ||
        !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp))
      return false;
    // These can all be promoted if neither operand has 'bits to clear'.
    if (BitsToClear == 0 && Tmp == 0)
      return true;

    // If the operation is an AND/OR/XOR and the bits to clear are zero in the
    // other side, BitsToClear is ok.
    if (Tmp == 0 &&
        (Opc == Instruction::And || Opc == Instruction::Or ||
         Opc == Instruction::Xor)) {
      // We use MaskedValueIsZero here for generality, but the case we care
      // about the most is constant RHS.
      unsigned VSize = V->getType()->getScalarSizeInBits();
      if (MaskedValueIsZero(I->getOperand(1),
                            APInt::getHighBitsSet(VSize, BitsToClear)))
        return true;
    }

    // Otherwise, we don't know how to analyze this BitsToClear case yet.
    return false;

  case Instruction::LShr:
    // We can promote lshr(x, cst) if we can promote x.  This requires the
    // ultimate 'and' to clear out the high zero bits we're clearing out though.
    if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) {
      if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear))
        return false;
      // The shift drags BitsToClear more unknown bits into the value; clamp
      // to the source width since at most that many bits can be garbage.
      BitsToClear += Amt->getZExtValue();
      if (BitsToClear > V->getType()->getScalarSizeInBits())
        BitsToClear = V->getType()->getScalarSizeInBits();
      return true;
    }
    // Cannot promote variable LSHR.
    return false;
  case Instruction::Select:
    if (!CanEvaluateZExtd(I->getOperand(1), Ty, Tmp) ||
        !CanEvaluateZExtd(I->getOperand(2), Ty, BitsToClear) ||
        // TODO: If important, we could handle the case when the BitsToClear are
        // known zero in the disagreeing side.
        Tmp != BitsToClear)
      return false;
    return true;

  case Instruction::PHI: {
    // We can change a phi if we can change all operands.  Note that we never
    // get into trouble with cyclic PHIs here because we only consider
    // instructions with a single use.
    PHINode *PN = cast<PHINode>(I);
    if (!CanEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear))
      return false;
    for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i)
      if (!CanEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp) ||
          // TODO: If important, we could handle the case when the BitsToClear
          // are known zero in the disagreeing input.
          Tmp != BitsToClear)
        return false;
    return true;
  }
  default:
    // TODO: Can handle more cases here.
    return false;
  }
}

Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
  // If this zero extend is only used by a truncate, let the truncate be
  // eliminated before we try to optimize this zext.
  if (CI.hasOneUse() && isa<TruncInst>(CI.use_back()))
    return 0;

  // If one of the common conversion will work, do it.
  if (Instruction *Result = commonCastTransforms(CI))
    return Result;

  // See if we can simplify any instructions used by the input whose sole
  // purpose is to compute bits we don't care about.
  if (SimplifyDemandedInstructionBits(CI))
    return &CI;

  Value *Src = CI.getOperand(0);
  const Type *SrcTy = Src->getType(), *DestTy = CI.getType();

  // Attempt to extend the entire input expression tree to the destination
  // type.   Only do this if the dest type is a simple type, don't convert the
  // expression tree to something weird like i93 unless the source is also
  // strange.
  unsigned BitsToClear;
  if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
      CanEvaluateZExtd(Src, DestTy, BitsToClear)) {
    assert(BitsToClear < SrcTy->getScalarSizeInBits() &&
           "Unreasonable BitsToClear");

    // Okay, we can transform this!  Insert the new expression now.
    DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
          " to avoid zero extend: " << CI);
    Value *Res = EvaluateInDifferentType(Src, DestTy, false);
    assert(Res->getType() == DestTy);

    uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear;
    uint32_t DestBitSize = DestTy->getScalarSizeInBits();

    // If the high bits are already filled with zeros, just replace this
    // cast with the result.
    if (MaskedValueIsZero(Res, APInt::getHighBitsSet(DestBitSize,
                                                     DestBitSize-SrcBitsKept)))
      return ReplaceInstUsesWith(CI, Res);

    // We need to emit an AND to clear the high bits.
    Constant *C = ConstantInt::get(Res->getType(),
                               APInt::getLowBitsSet(DestBitSize, SrcBitsKept));
    return BinaryOperator::CreateAnd(Res, C);
  }

  // If this is a TRUNC followed by a ZEXT then we are dealing with integral
  // types and if the sizes are just right we can convert this into a logical
  // 'and' which will be much cheaper than the pair of casts.
  if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) {   // A->B->C cast
    // TODO: Subsume this into EvaluateInDifferentType.

    // Get the sizes of the types involved.  We know that the intermediate type
    // will be smaller than A or C, but don't know the relation between A and C.
    Value *A = CSrc->getOperand(0);
    unsigned SrcSize = A->getType()->getScalarSizeInBits();
    unsigned MidSize = CSrc->getType()->getScalarSizeInBits();
    unsigned DstSize = CI.getType()->getScalarSizeInBits();
    // If we're actually extending zero bits, then if
    // SrcSize <  DstSize: zext(a & mask)
    // SrcSize == DstSize: a & mask
    // SrcSize  > DstSize: trunc(a) & mask
    if (SrcSize < DstSize) {
      APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
      Constant *AndConst = ConstantInt::get(A->getType(), AndValue);
      Value *And = Builder->CreateAnd(A, AndConst, CSrc->getName()+".mask");
      return new ZExtInst(And, CI.getType());
    }

    if (SrcSize == DstSize) {
      APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
      return BinaryOperator::CreateAnd(A, ConstantInt::get(A->getType(),
                                                           AndValue));
    }
    if (SrcSize > DstSize) {
      Value *Trunc = Builder->CreateTrunc(A, CI.getType(), "tmp");
      APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
      return BinaryOperator::CreateAnd(Trunc,
                                       ConstantInt::get(Trunc->getType(),
                                                        AndValue));
    }
  }

  if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
    return transformZExtICmp(ICI, CI);

  BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src);
  if (SrcI && SrcI->getOpcode() == Instruction::Or) {
    // zext (or icmp, icmp) --> or (zext icmp), (zext icmp) if at least one
    // of the (zext icmp) will be transformed.  The DoXform=false calls only
    // query whether a transform applies without creating instructions.
    ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0));
    ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1));
    if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() &&
        (transformZExtICmp(LHS, CI, false) ||
         transformZExtICmp(RHS, CI, false))) {
      Value *LCast = Builder->CreateZExt(LHS, CI.getType(), LHS->getName());
      Value *RCast = Builder->CreateZExt(RHS, CI.getType(), RHS->getName());
      return BinaryOperator::Create(Instruction::Or, LCast, RCast);
    }
  }

  // zext(trunc(t) & C) -> (t & zext(C)).
  if (SrcI && SrcI->getOpcode() == Instruction::And && SrcI->hasOneUse())
    if (ConstantInt *C = dyn_cast<ConstantInt>(SrcI->getOperand(1)))
      if (TruncInst *TI = dyn_cast<TruncInst>(SrcI->getOperand(0))) {
        Value *TI0 = TI->getOperand(0);
        if (TI0->getType() == CI.getType())
          return
            BinaryOperator::CreateAnd(TI0,
                                ConstantExpr::getZExt(C, CI.getType()));
      }

  // zext((trunc(t) & C) ^ C) -> ((t & zext(C)) ^ zext(C)).
  if (SrcI && SrcI->getOpcode() == Instruction::Xor && SrcI->hasOneUse())
    if (ConstantInt *C = dyn_cast<ConstantInt>(SrcI->getOperand(1)))
      if (BinaryOperator *And = dyn_cast<BinaryOperator>(SrcI->getOperand(0)))
        if (And->getOpcode() == Instruction::And && And->hasOneUse() &&
            And->getOperand(1) == C)
          if (TruncInst *TI = dyn_cast<TruncInst>(And->getOperand(0))) {
            Value *TI0 = TI->getOperand(0);
            if (TI0->getType() == CI.getType()) {
              Constant *ZC = ConstantExpr::getZExt(C, CI.getType());
              Value *NewAnd = Builder->CreateAnd(TI0, ZC, "tmp");
              return BinaryOperator::CreateXor(NewAnd, ZC);
            }
          }

  // zext (xor i1 X, true) to i32  --> xor (zext i1 X to i32), 1
  Value *X;
  if (SrcI && SrcI->hasOneUse() && SrcI->getType()->isIntegerTy(1) &&
      match(SrcI, m_Not(m_Value(X))) &&
      (!X->hasOneUse() || !isa<CmpInst>(X))) {
    Value *New = Builder->CreateZExt(X, CI.getType());
    return BinaryOperator::CreateXor(New, ConstantInt::get(CI.getType(), 1));
  }

  return 0;
}

/// CanEvaluateSExtd
- Return true if we can take the specified value
/// and return it as type Ty without inserting any new casts and without
/// changing the value of the common low bits.  This is used by code that tries
/// to promote integer operations to a wider types will allow us to eliminate
/// the extension.
///
/// This function works on both vectors and scalars.
///
/// Note: this is a conservative analysis; a 'false' answer only means the
/// transform is not known to be safe/profitable, not that it is impossible.
static bool CanEvaluateSExtd(Value *V, const Type *Ty) {
  assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&
         "Can't sign extend type to a smaller type");
  // If this is a constant, it can be trivially promoted.
  if (isa<Constant>(V))
    return true;

  Instruction *I = dyn_cast<Instruction>(V);
  if (!I) return false;

  // If this is a truncate from the dest type, we can trivially eliminate it,
  // even if it has multiple uses.
  // FIXME: This is currently disabled until codegen can handle this without
  // pessimizing code, PR5997.
  if (0 && isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
    return true;

  // We can't extend or shrink something that has multiple uses: doing so would
  // require duplicating the instruction in general, which isn't profitable.
  if (!I->hasOneUse()) return false;

  switch (I->getOpcode()) {
  case Instruction::SExt:  // sext(sext(x)) -> sext(x)
  case Instruction::ZExt:  // sext(zext(x)) -> zext(x)
  case Instruction::Trunc: // sext(trunc(x)) -> trunc(x) or sext(x)
    return true;
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
    // These operators can all arbitrarily be extended if their inputs can.
    return CanEvaluateSExtd(I->getOperand(0), Ty) &&
           CanEvaluateSExtd(I->getOperand(1), Ty);

  //case Instruction::Shl:   TODO
  //case Instruction::LShr:  TODO

  case Instruction::Select:
    // The condition (operand 0) keeps its type; only the two selected values
    // (operands 1 and 2) would be re-typed.
    return CanEvaluateSExtd(I->getOperand(1), Ty) &&
           CanEvaluateSExtd(I->getOperand(2), Ty);

  case Instruction::PHI: {
    // We can change a phi if we can change all operands.  Note that we never
    // get into trouble with cyclic PHIs here because we only consider
    // instructions with a single use.
    PHINode *PN = cast<PHINode>(I);
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
      if (!CanEvaluateSExtd(PN->getIncomingValue(i), Ty)) return false;
    return true;
  }
  default:
    // TODO: Can handle more cases here.
    break;
  }

  return false;
}

/// visitSExt - Simplify/canonicalize a sign-extension instruction, returning a
/// replacement instruction (or null if no change was made).
Instruction *InstCombiner::visitSExt(SExtInst &CI) {
  // If this sign extend is only used by a truncate, let the truncate by
  // eliminated before we try to optimize this zext.
  if (CI.hasOneUse() && isa<TruncInst>(CI.use_back()))
    return 0;

  if (Instruction *I = commonCastTransforms(CI))
    return I;

  // See if we can simplify any instructions used by the input whose sole
  // purpose is to compute bits we don't care about.
  if (SimplifyDemandedInstructionBits(CI))
    return &CI;

  Value *Src = CI.getOperand(0);
  const Type *SrcTy = Src->getType(), *DestTy = CI.getType();

  // Attempt to extend the entire input expression tree to the destination
  // type.   Only do this if the dest type is a simple type, don't convert the
  // expression tree to something weird like i93 unless the source is also
  // strange.
  if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
      CanEvaluateSExtd(Src, DestTy)) {
    // Okay, we can transform this!  Insert the new expression now.
    DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
          " to avoid sign extend: " << CI);
    Value *Res = EvaluateInDifferentType(Src, DestTy, true);
    assert(Res->getType() == DestTy);

    uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
    uint32_t DestBitSize = DestTy->getScalarSizeInBits();

    // If the high bits are already filled with sign bit, just replace this
    // cast with the result.
    if (ComputeNumSignBits(Res) > DestBitSize - SrcBitSize)
      return ReplaceInstUsesWith(CI, Res);

    // We need to emit a shl + ashr to do the sign extend.
    Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
    return BinaryOperator::CreateAShr(Builder->CreateShl(Res, ShAmt, "sext"),
                                      ShAmt);
  }

  // If this input is a trunc from our destination, then turn sext(trunc(x))
  // into shifts.
  if (TruncInst *TI = dyn_cast<TruncInst>(Src))
    if (TI->hasOneUse() && TI->getOperand(0)->getType() == DestTy) {
      uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
      uint32_t DestBitSize = DestTy->getScalarSizeInBits();

      // We need to emit a shl + ashr to do the sign extend.
      Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
      Value *Res = Builder->CreateShl(TI->getOperand(0), ShAmt, "sext");
      return BinaryOperator::CreateAShr(Res, ShAmt);
    }


  // (x <s 0) ? -1 : 0 -> ashr x, 31   -> all ones if signed
  // (x >s -1) ? -1 : 0 -> ashr x, 31  -> all ones if not signed
  {
    ICmpInst::Predicate Pred; Value *CmpLHS; ConstantInt *CmpRHS;
    if (match(Src, m_ICmp(Pred, m_Value(CmpLHS), m_ConstantInt(CmpRHS)))) {
      // sext (x <s  0) to i32 --> x>>s31       true if signbit set.
      // sext (x >s -1) to i32 --> (x>>s31)^-1  true if signbit clear.
      if ((Pred == ICmpInst::ICMP_SLT && CmpRHS->isZero()) ||
          (Pred == ICmpInst::ICMP_SGT && CmpRHS->isAllOnesValue())) {
        Value *Sh = ConstantInt::get(CmpLHS->getType(),
                                     CmpLHS->getType()->getScalarSizeInBits()-1);
        Value *In = Builder->CreateAShr(CmpLHS, Sh, CmpLHS->getName()+".lobit");
        // The ashr result may still need widening/narrowing to the sext's
        // destination type.
        if (In->getType() != CI.getType())
          In = Builder->CreateIntCast(In, CI.getType(), true/*SExt*/, "tmp");

        if (Pred == ICmpInst::ICMP_SGT)
          In = Builder->CreateNot(In, In->getName()+".not");
        return ReplaceInstUsesWith(CI, In);
      }
    }
  }

  // vector (x <s 0) ? -1 : 0 -> ashr x, 31 -> all ones if signed.
  if (const VectorType *VTy = dyn_cast<VectorType>(DestTy)) {
    ICmpInst::Predicate Pred; Value *CmpLHS;
    if (match(Src, m_ICmp(Pred, m_Value(CmpLHS), m_Zero()))) {
      if (Pred == ICmpInst::ICMP_SLT && CmpLHS->getType() == DestTy) {
        const Type *EltTy = VTy->getElementType();

        // splat the shift constant to a constant vector.
        Constant *VSh = ConstantInt::get(VTy, EltTy->getScalarSizeInBits()-1);
        Value *In = Builder->CreateAShr(CmpLHS, VSh,CmpLHS->getName()+".lobit");
        return ReplaceInstUsesWith(CI, In);
      }
    }
  }

  // If the input is a shl/ashr pair of a same constant, then this is a sign
  // extension from a smaller value.  If we could trust arbitrary bitwidth
  // integers, we could turn this into a truncate to the smaller bit and then
  // use a sext for the whole extension.  Since we don't, look deeper and check
  // for a truncate.  If the source and dest are the same type, eliminate the
  // trunc and extend and just do shifts.  For example, turn:
  //   %a = trunc i32 %i to i8
  //   %b = shl i8 %a, 6
  //   %c = ashr i8 %b, 6
  //   %d = sext i8 %c to i32
  // into:
  //   %a = shl i32 %i, 30
  //   %d = ashr i32 %a, 30
  Value *A = 0;
  // TODO: Eventually this could be subsumed by EvaluateInDifferentType.
  ConstantInt *BA = 0, *CA = 0;
  if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_ConstantInt(BA)),
                        m_ConstantInt(CA))) &&
      BA == CA && A->getType() == CI.getType()) {
    unsigned MidSize = Src->getType()->getScalarSizeInBits();
    unsigned SrcDstSize = CI.getType()->getScalarSizeInBits();
    // Re-derive the shift amount in the wide type: the wide shift keeps the
    // same number of significant low bits as the narrow shl/ashr pair did.
    unsigned ShAmt = CA->getZExtValue()+SrcDstSize-MidSize;
    Constant *ShAmtV = ConstantInt::get(CI.getType(), ShAmt);
    A = Builder->CreateShl(A, ShAmtV, CI.getName());
    return BinaryOperator::CreateAShr(A, ShAmtV);
  }

  return 0;
}


/// FitsInFPType - Return a Constant* for the specified FP constant if it fits
/// in the specified FP type without changing its value.  Returns null when the
/// conversion would lose information.
static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
  bool losesInfo;
  APFloat F = CFP->getValueAPF();
  (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo);
  if (!losesInfo)
    return ConstantFP::get(CFP->getContext(), F);
  return 0;
}

/// LookThroughFPExtensions - If this is an fp extension instruction, look
/// through it until we get the source value.
static Value *LookThroughFPExtensions(Value *V) {
  if (Instruction *I = dyn_cast<Instruction>(V))
    if (I->getOpcode() == Instruction::FPExt)
      return LookThroughFPExtensions(I->getOperand(0));

  // If this value is a constant, return the constant in the smallest FP type
  // that can accurately represent it.  This allows us to turn
  // (float)((double)X+2.0) into x+2.0f.
  if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
    if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext()))
      return V;  // No constant folding of this.
    // See if the value can be truncated to float and then reextended.
    if (Value *V = FitsInFPType(CFP, APFloat::IEEEsingle))
      return V;
    if (CFP->getType()->isDoubleTy())
      return V;  // Won't shrink.
    if (Value *V = FitsInFPType(CFP, APFloat::IEEEdouble))
      return V;
    // Don't try to shrink to various long double types.
  }

  return V;
}

/// visitFPTrunc - Simplify a floating-point truncation, e.g. by narrowing the
/// operation that feeds it when no precision is lost.
Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
  if (Instruction *I = commonCastTransforms(CI))
    return I;

  // If we have fptrunc(fadd (fpextend x), (fpextend y)), where x and y are
  // smaller than the destination type, we can eliminate the truncate by doing
  // the add as the smaller type.  This applies to fadd/fsub/fmul/fdiv as well
  // as many builtins (sqrt, etc).
  BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0));
  if (OpI && OpI->hasOneUse()) {
    switch (OpI->getOpcode()) {
    default: break;
    case Instruction::FAdd:
    case Instruction::FSub:
    case Instruction::FMul:
    case Instruction::FDiv:
    case Instruction::FRem:
      const Type *SrcTy = OpI->getType();
      Value *LHSTrunc = LookThroughFPExtensions(OpI->getOperand(0));
      Value *RHSTrunc = LookThroughFPExtensions(OpI->getOperand(1));
      if (LHSTrunc->getType() != SrcTy &&
          RHSTrunc->getType() != SrcTy) {
        unsigned DstSize = CI.getType()->getScalarSizeInBits();
        // If the source types were both smaller than the destination type of
        // the cast, do this xform.
        if (LHSTrunc->getType()->getScalarSizeInBits() <= DstSize &&
            RHSTrunc->getType()->getScalarSizeInBits() <= DstSize) {
          LHSTrunc = Builder->CreateFPExt(LHSTrunc, CI.getType());
          RHSTrunc = Builder->CreateFPExt(RHSTrunc, CI.getType());
          return BinaryOperator::Create(OpI->getOpcode(), LHSTrunc, RHSTrunc);
        }
      }
      break;
    }
  }

  // Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x)
  // NOTE: This should be disabled by -fno-builtin-sqrt if we ever support it.
  CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0));
  if (Call && Call->getCalledFunction() &&
      Call->getCalledFunction()->getName() == "sqrt" &&
      Call->getNumArgOperands() == 1) {
    CastInst *Arg = dyn_cast<CastInst>(Call->getArgOperand(0));
    // Only fire for the exact float->double->sqrt->float round trip.
    if (Arg && Arg->getOpcode() == Instruction::FPExt &&
        CI.getType()->isFloatTy() &&
        Call->getType()->isDoubleTy() &&
        Arg->getType()->isDoubleTy() &&
        Arg->getOperand(0)->getType()->isFloatTy()) {
      Function *Callee = Call->getCalledFunction();
      Module *M = CI.getParent()->getParent()->getParent();
      Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf",
                                                   Callee->getAttributes(),
                                                   Builder->getFloatTy(),
                                                   Builder->getFloatTy(),
                                                   NULL);
      CallInst *ret = CallInst::Create(SqrtfFunc, Arg->getOperand(0),
                                       "sqrtfcall");
      ret->setAttributes(Callee->getAttributes());


      // Remove the old Call.  With -fmath-errno, it won't get marked readnone.
      Call->replaceAllUsesWith(UndefValue::get(Call->getType()));
      EraseInstFromFunction(*Call);
      return ret;
    }
  }

  return 0;
}

Instruction *InstCombiner::visitFPExt(CastInst &CI) {
  // No fpext-specific folds; just the generic cast transforms.
  return commonCastTransforms(CI);
}

Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) {
  Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0));
  if (OpI == 0)
    return commonCastTransforms(FI);

  // fptoui(uitofp(X)) --> X
  // fptoui(sitofp(X)) --> X
  // This is safe if the intermediate type has enough bits in its mantissa to
  // accurately represent all values of X.  For example, do not do this with
  // i64->float->i64.  This is also safe for sitofp case, because any negative
  // 'X' value would cause an undefined result for the fptoui.
  if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) &&
      OpI->getOperand(0)->getType() == FI.getType() &&
      (int)FI.getType()->getScalarSizeInBits() < /*extra bit for sign */
                    OpI->getType()->getFPMantissaWidth())
    return ReplaceInstUsesWith(FI, OpI->getOperand(0));

  return commonCastTransforms(FI);
}

Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) {
  Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0));
  if (OpI == 0)
    return commonCastTransforms(FI);

  // fptosi(sitofp(X)) --> X
  // fptosi(uitofp(X)) --> X
  // This is safe if the intermediate type has enough bits in its mantissa to
  // accurately represent all values of X.  For example, do not do this with
  // i64->float->i64.  This is also safe for sitofp case, because any negative
  // 'X' value would cause an undefined result for the fptoui.
  if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) &&
      OpI->getOperand(0)->getType() == FI.getType() &&
      (int)FI.getType()->getScalarSizeInBits() <=
                    OpI->getType()->getFPMantissaWidth())
    return ReplaceInstUsesWith(FI, OpI->getOperand(0));

  return commonCastTransforms(FI);
}

Instruction *InstCombiner::visitUIToFP(CastInst &CI) {
  // No uitofp-specific folds; just the generic cast transforms.
  return commonCastTransforms(CI);
}

Instruction *InstCombiner::visitSIToFP(CastInst &CI) {
  // No sitofp-specific folds; just the generic cast transforms.
  return commonCastTransforms(CI);
}

Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
  // If the source integer type is not the intptr_t type for this target, do a
  // trunc or zext to the intptr_t type, then inttoptr of it.  This allows the
  // cast to be exposed to other transforms.
  // TD (TargetData) may be null when no target info is available; in that case
  // we cannot know intptr_t and skip the canonicalization.
  if (TD) {
    if (CI.getOperand(0)->getType()->getScalarSizeInBits() >
        TD->getPointerSizeInBits()) {
      Value *P = Builder->CreateTrunc(CI.getOperand(0),
                                      TD->getIntPtrType(CI.getContext()),
                                      "tmp");
      return new IntToPtrInst(P, CI.getType());
    }
    if (CI.getOperand(0)->getType()->getScalarSizeInBits() <
        TD->getPointerSizeInBits()) {
      Value *P = Builder->CreateZExt(CI.getOperand(0),
                                     TD->getIntPtrType(CI.getContext()),
                                     "tmp");
      return new IntToPtrInst(P, CI.getType());
    }
  }

  if (Instruction *I = commonCastTransforms(CI))
    return I;

  return 0;
}

/// @brief Implement the transforms for cast of pointer (bitcast/ptrtoint)
Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
  Value *Src = CI.getOperand(0);

  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
    // If casting the result of a getelementptr instruction with no offset, turn
    // this into a cast of the original pointer!
    if (GEP->hasAllZeroIndices()) {
      // Changing the cast operand is usually not a good idea but it is safe
      // here because the pointer operand is being replaced with another
      // pointer operand so the opcode doesn't need to change.
      Worklist.Add(GEP);
      CI.setOperand(0, GEP->getOperand(0));
      return &CI;
    }

    // If the GEP has a single use, and the base pointer is a bitcast, and the
    // GEP computes a constant offset, see if we can convert these three
    // instructions into fewer.  This typically happens with unions and other
    // non-type-safe code.
    if (TD && GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0)) &&
        GEP->hasAllConstantIndices()) {
      // We are guaranteed to get a constant from EmitGEPOffset.
      ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(GEP));
      int64_t Offset = OffsetV->getSExtValue();

      // Get the base pointer input of the bitcast, and the type it points to.
      Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0);
      const Type *GEPIdxTy =
        cast<PointerType>(OrigBase->getType())->getElementType();
      SmallVector<Value*, 8> NewIndices;
      if (FindElementAtOffset(GEPIdxTy, Offset, NewIndices)) {
        // If we were able to index down into an element, create the GEP
        // and bitcast the result.  This eliminates one bitcast, potentially
        // two.
        Value *NGEP = cast<GEPOperator>(GEP)->isInBounds() ?
          Builder->CreateInBoundsGEP(OrigBase,
                                     NewIndices.begin(), NewIndices.end()) :
          Builder->CreateGEP(OrigBase, NewIndices.begin(), NewIndices.end());
        NGEP->takeName(GEP);

        if (isa<BitCastInst>(CI))
          return new BitCastInst(NGEP, CI.getType());
        assert(isa<PtrToIntInst>(CI));
        return new PtrToIntInst(NGEP, CI.getType());
      }
    }
  }

  return commonCastTransforms(CI);
}

Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
  // If the destination integer type is not the intptr_t type for this target,
  // do a ptrtoint to intptr_t then do a trunc or zext.  This allows the cast
  // to be exposed to other transforms.
  if (TD) {
    if (CI.getType()->getScalarSizeInBits() < TD->getPointerSizeInBits()) {
      Value *P = Builder->CreatePtrToInt(CI.getOperand(0),
                                         TD->getIntPtrType(CI.getContext()),
                                         "tmp");
      return new TruncInst(P, CI.getType());
    }
    if (CI.getType()->getScalarSizeInBits() > TD->getPointerSizeInBits()) {
      Value *P = Builder->CreatePtrToInt(CI.getOperand(0),
                                         TD->getIntPtrType(CI.getContext()),
                                         "tmp");
      return new ZExtInst(P, CI.getType());
    }
  }

  return commonPointerCastTransforms(CI);
}

/// OptimizeVectorResize - This input value (which is known to have vector type)
/// is being zero extended or truncated to the specified vector type.  Try to
/// replace it with a shuffle (and vector/vector bitcast) if possible.
///
/// The source and destination vector types may have different element types.
static Instruction *OptimizeVectorResize(Value *InVal, const VectorType *DestTy,
                                         InstCombiner &IC) {
  // We can only do this optimization if the output is a multiple of the input
  // element size, or the input is a multiple of the output element size.
  // Convert the input type to have the same element type as the output.
  const VectorType *SrcTy = cast<VectorType>(InVal->getType());

  if (SrcTy->getElementType() != DestTy->getElementType()) {
    // The input types don't need to be identical, but for now they must be the
    // same size.  There is no specific reason we couldn't handle things like
    // <4 x i16> -> <4 x i32> by bitcasting to <2 x i32> but haven't gotten
    // there yet.
    if (SrcTy->getElementType()->getPrimitiveSizeInBits() !=
        DestTy->getElementType()->getPrimitiveSizeInBits())
      return 0;

    SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements());
    InVal = IC.Builder->CreateBitCast(InVal, SrcTy);
  }

  // Now that the element types match, get the shuffle mask and RHS of the
  // shuffle to use, which depends on whether we're increasing or decreasing the
  // size of the input.
  SmallVector<Constant*, 16> ShuffleMask;
  Value *V2;
  const IntegerType *Int32Ty = Type::getInt32Ty(SrcTy->getContext());

  if (SrcTy->getNumElements() > DestTy->getNumElements()) {
    // If we're shrinking the number of elements, just shuffle in the low
    // elements from the input and use undef as the second shuffle input.
    V2 = UndefValue::get(SrcTy);
    for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i)
      ShuffleMask.push_back(ConstantInt::get(Int32Ty, i));

  } else {
    // If we're increasing the number of elements, shuffle in all of the
    // elements from InVal and fill the rest of the result elements with zeros
    // from a constant zero.
    V2 = Constant::getNullValue(SrcTy);
    unsigned SrcElts = SrcTy->getNumElements();
    for (unsigned i = 0, e = SrcElts; i != e; ++i)
      ShuffleMask.push_back(ConstantInt::get(Int32Ty, i));

    // The excess elements reference the first element of the zero input.
    ShuffleMask.append(DestTy->getNumElements()-SrcElts,
                       ConstantInt::get(Int32Ty, SrcElts));
  }

  return new ShuffleVectorInst(InVal, V2, ConstantVector::get(ShuffleMask));
}

/// isMultipleOfTypeSize - Return true if Value (a bit count) is an exact
/// multiple of Ty's primitive size in bits.
static bool isMultipleOfTypeSize(unsigned Value, const Type *Ty) {
  return Value % Ty->getPrimitiveSizeInBits() == 0;
}

/// getTypeSizeIndex - Convert a bit offset/count into a count of Ty-sized
/// elements.
static unsigned getTypeSizeIndex(unsigned Value, const Type *Ty) {
  return Value / Ty->getPrimitiveSizeInBits();
}

/// CollectInsertionElements - V is a value which is inserted into a vector of
/// VecEltTy.  Look through the value to see if we can decompose it into
/// insertions into the vector.  See the example in the comment for
/// OptimizeIntegerToVectorInsertions for the pattern this handles.
/// The type of V is always a non-zero multiple of VecEltTy's size.
///
/// This returns false if the pattern can't be matched or true if it can,
/// filling in Elements with the elements found here.
static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
                                     SmallVectorImpl<Value*> &Elements,
                                     const Type *VecEltTy) {
  // Undef values never contribute useful bits to the result.
  if (isa<UndefValue>(V)) return true;

  // If we got down to a value of the right type, we win, try inserting into the
  // right element.
  if (V->getType() == VecEltTy) {
    // Inserting null doesn't actually insert any elements.
    if (Constant *C = dyn_cast<Constant>(V))
      if (C->isNullValue())
        return true;

    // Fail if multiple elements are inserted into this slot.
    if (ElementIndex >= Elements.size() || Elements[ElementIndex] != 0)
      return false;

    Elements[ElementIndex] = V;
    return true;
  }

  if (Constant *C = dyn_cast<Constant>(V)) {
    // Figure out the # elements this provides, and bitcast it or slice it up
    // as required.
    unsigned NumElts = getTypeSizeIndex(C->getType()->getPrimitiveSizeInBits(),
                                        VecEltTy);
    // If the constant is the size of a vector element, we just need to bitcast
    // it to the right type so it gets properly inserted.
    if (NumElts == 1)
      return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy),
                                      ElementIndex, Elements, VecEltTy);

    // Okay, this is a constant that covers multiple elements.  Slice it up into
    // pieces and insert each element-sized piece into the vector.
    if (!isa<IntegerType>(C->getType()))
      C = ConstantExpr::getBitCast(C, IntegerType::get(V->getContext(),
                                   C->getType()->getPrimitiveSizeInBits()));
    unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits();
    const Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);

    for (unsigned i = 0; i != NumElts; ++i) {
      // Extract piece i by shifting it down to the low bits and truncating.
      Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),
                                                                  i*ElementSize));
      Piece = ConstantExpr::getTrunc(Piece, ElementIntTy);
      if (!CollectInsertionElements(Piece, ElementIndex+i, Elements, VecEltTy))
        return false;
    }
    return true;
  }

  if (!V->hasOneUse()) return false;

  Instruction *I = dyn_cast<Instruction>(V);
  if (I == 0) return false;
  switch (I->getOpcode()) {
  default: return false; // Unhandled case.
  case Instruction::BitCast:
    return CollectInsertionElements(I->getOperand(0), ElementIndex,
                                    Elements, VecEltTy);
  case Instruction::ZExt:
    // Zero-extension only adds zero bits; recurse on the narrower input, but
    // only if it still spans a whole number of vector elements.
    if (!isMultipleOfTypeSize(
                          I->getOperand(0)->getType()->getPrimitiveSizeInBits(),
                              VecEltTy))
      return false;
    return CollectInsertionElements(I->getOperand(0), ElementIndex,
                                    Elements, VecEltTy);
  case Instruction::Or:
    // 'or' merges two partial element sets; both sides must decompose.
    return CollectInsertionElements(I->getOperand(0), ElementIndex,
                                    Elements, VecEltTy) &&
           CollectInsertionElements(I->getOperand(1), ElementIndex,
                                    Elements, VecEltTy);
  case Instruction::Shl: {
    // Must be shifting by a constant that is a multiple of the element size.
    ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
    if (CI == 0) return false;
    if (!isMultipleOfTypeSize(CI->getZExtValue(), VecEltTy)) return false;
    unsigned IndexShift = getTypeSizeIndex(CI->getZExtValue(), VecEltTy);

    return CollectInsertionElements(I->getOperand(0), ElementIndex+IndexShift,
                                    Elements, VecEltTy);
  }

  }
}


/// OptimizeIntegerToVectorInsertions - If the input is an 'or' instruction, we
/// may be doing shifts and ors to assemble the elements of the vector manually.
/// Try to rip the code out and replace it with insertelements.  This is to
/// optimize code like this:
///
///    %tmp37 = bitcast float %inc to i32
///    %tmp38 = zext i32 %tmp37 to i64
///    %tmp31 = bitcast float %inc5 to i32
///    %tmp32 = zext i32 %tmp31 to i64
///    %tmp33 = shl i64 %tmp32, 32
///    %ins35 = or i64 %tmp33, %tmp38
///    %tmp43 = bitcast i64 %ins35 to <2 x float>
///
/// Into two insertelements that do "buildvector{%inc, %inc5}".
static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
                                                InstCombiner &IC) {
  const VectorType *DestVecTy = cast<VectorType>(CI.getType());
  Value *IntInput = CI.getOperand(0);

  SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
  if (!CollectInsertionElements(IntInput, 0, Elements,
                                DestVecTy->getElementType()))
    return 0;

  // If we succeeded, we know that all of the element are specified by Elements
  // or are zero if Elements has a null entry.  Recast this as a set of
  // insertions.
  Value *Result = Constant::getNullValue(CI.getType());
  for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
    if (Elements[i] == 0) continue;  // Unset element.

    Result = IC.Builder->CreateInsertElement(Result, Elements[i],
                                             IC.Builder->getInt32(i));
  }

  return Result;
}


/// OptimizeIntToFloatBitCast - See if we can optimize an integer->float/double
/// bitcast.  The various long double bitcasts can't get in here.
static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
  Value *Src = CI.getOperand(0);
  const Type *DestTy = CI.getType();

  // If this is a bitcast from int to float, check to see if the int is an
  // extraction from a vector.
  Value *VecInput = 0;
  // bitcast(trunc(bitcast(somevector)))
  if (match(Src, m_Trunc(m_BitCast(m_Value(VecInput)))) &&
      isa<VectorType>(VecInput->getType())) {
    const VectorType *VecTy = cast<VectorType>(VecInput->getType());
    unsigned DestWidth = DestTy->getPrimitiveSizeInBits();

    if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0) {
      // If the element type of the vector doesn't match the result type,
      // bitcast it to be a vector type we can extract from.
      if (VecTy->getElementType() != DestTy) {
        VecTy = VectorType::get(DestTy,
                                VecTy->getPrimitiveSizeInBits() / DestWidth);
        VecInput = IC.Builder->CreateBitCast(VecInput, VecTy);
      }

      // trunc keeps the low bits, which is element 0 of the recast vector.
      return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(0));
    }
  }

  // bitcast(trunc(lshr(bitcast(somevector), cst))
  ConstantInt *ShAmt = 0;
  if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)),
                                m_ConstantInt(ShAmt)))) &&
      isa<VectorType>(VecInput->getType())) {
    const VectorType *VecTy = cast<VectorType>(VecInput->getType());
    unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
    if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0 &&
        ShAmt->getZExtValue() % DestWidth == 0) {
      // If the element type of the vector doesn't match the result type,
      // bitcast it to be a vector type we can extract from.
      if (VecTy->getElementType() != DestTy) {
        VecTy = VectorType::get(DestTy,
                                VecTy->getPrimitiveSizeInBits() / DestWidth);
        VecInput = IC.Builder->CreateBitCast(VecInput, VecTy);
      }

      // The shift amount, in units of elements, selects which element the
      // lshr+trunc was extracting.
      unsigned Elt = ShAmt->getZExtValue() / DestWidth;
      return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
    }
  }
  return 0;
}

Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
  // If the operands are integer typed then apply the integer transforms,
  // otherwise just apply the common ones.
  Value *Src = CI.getOperand(0);
  const Type *SrcTy = Src->getType();
  const Type *DestTy = CI.getType();

  // Get rid of casts from one type to the same type. These are useless and can
  // be replaced by the operand.
  if (DestTy == Src->getType())
    return ReplaceInstUsesWith(CI, Src);

  if (const PointerType *DstPTy = dyn_cast<PointerType>(DestTy)) {
    const PointerType *SrcPTy = cast<PointerType>(SrcTy);
    const Type *DstElTy = DstPTy->getElementType();
    const Type *SrcElTy = SrcPTy->getElementType();

    // If the address spaces don't match, don't eliminate the bitcast, which is
    // required for changing types.
    if (SrcPTy->getAddressSpace() != DstPTy->getAddressSpace())
      return 0;

    // If we are casting a alloca to a pointer to a type of the same
    // size, rewrite the allocation instruction to allocate the "right" type.
    // There is no need to modify malloc calls because it is their bitcast that
    // needs to be cleaned up.
    if (AllocaInst *AI = dyn_cast<AllocaInst>(Src))
      if (Instruction *V = PromoteCastOfAllocation(CI, *AI))
        return V;

    // If the source and destination are pointers, and this cast is equivalent
    // to a getelementptr X, 0, 0, 0...  turn it into the appropriate gep.
    // This can enhance SROA and other transforms that want type-safe pointers.
    Constant *ZeroUInt =
      Constant::getNullValue(Type::getInt32Ty(CI.getContext()));
    unsigned NumZeros = 0;
    // Walk down through leading composite members (field 0 / element 0) until
    // we either reach the destination element type or can descend no further.
    while (SrcElTy != DstElTy &&
           isa<CompositeType>(SrcElTy) && !SrcElTy->isPointerTy() &&
           SrcElTy->getNumContainedTypes() /* not "{}" */) {
      SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(ZeroUInt);
      ++NumZeros;
    }

    // If we found a path from the src to dest, create the getelementptr now.
    if (SrcElTy == DstElTy) {
      SmallVector<Value*, 8> Idxs(NumZeros+1, ZeroUInt);
      return GetElementPtrInst::CreateInBounds(Src, Idxs.begin(), Idxs.end(),"",
                                               ((Instruction*)NULL));
    }
  }

  // Try to optimize int -> float bitcasts.
  if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy))
    if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this))
      return I;

  if (const VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) {
    if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) {
      Value *Elem = Builder->CreateBitCast(Src, DestVTy->getElementType());
      return InsertElementInst::Create(UndefValue::get(DestTy), Elem,
                     Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
      // FIXME: Canonicalize bitcast(insertelement) -> insertelement(bitcast)
    }

    if (isa<IntegerType>(SrcTy)) {
      // If this is a cast from an integer to vector, check to see if the input
      // is a trunc or zext of a bitcast from vector.  If so, we can replace all
      // the casts with a shuffle and (potentially) a bitcast.
      if (isa<TruncInst>(Src) || isa<ZExtInst>(Src)) {
        CastInst *SrcCast = cast<CastInst>(Src);
        if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0)))
          if (isa<VectorType>(BCIn->getOperand(0)->getType()))
            if (Instruction *I = OptimizeVectorResize(BCIn->getOperand(0),
                                               cast<VectorType>(DestTy), *this))
              return I;
      }

      // If the input is an 'or' instruction, we may be doing shifts and ors to
      // assemble the elements of the vector manually.  Try to rip the code out
      // and replace it with insertelements.
      if (Value *V = OptimizeIntegerToVectorInsertions(CI, *this))
        return ReplaceInstUsesWith(CI, V);
    }
  }

  if (const VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) {
    if (SrcVTy->getNumElements() == 1 && !DestTy->isVectorTy()) {
      Value *Elem =
        Builder->CreateExtractElement(Src,
                   Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
      return CastInst::Create(Instruction::BitCast, Elem, DestTy);
    }
  }

  if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Src)) {
    // Okay, we have (bitcast (shuffle ..)).  Check to see if this is
    // a bitcast to a vector with the same # elts.
    if (SVI->hasOneUse() && DestTy->isVectorTy() &&
        cast<VectorType>(DestTy)->getNumElements() ==
              SVI->getType()->getNumElements() &&
        SVI->getType()->getNumElements() ==
          cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements()) {
      BitCastInst *Tmp;
      // If either of the operands is a cast from CI.getType(), then
      // evaluating the shuffle in the casted destination's type will allow
      // us to eliminate at least one cast.
      if (((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(0))) &&
           Tmp->getOperand(0)->getType() == DestTy) ||
          ((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(1))) &&
           Tmp->getOperand(0)->getType() == DestTy)) {
        Value *LHS = Builder->CreateBitCast(SVI->getOperand(0), DestTy);
        Value *RHS = Builder->CreateBitCast(SVI->getOperand(1), DestTy);
        // Return a new shuffle vector.  Use the same element ID's, as we
        // know the vector types match #elts.
        return new ShuffleVectorInst(LHS, RHS, SVI->getOperand(2));
      }
    }
  }

  if (SrcTy->isPointerTy())
    return commonPointerCastTransforms(CI);
  return commonCastTransforms(CI);
}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
new file mode 100644
index 0000000..999de34
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -0,0 +1,2759 @@
//===- InstCombineCompares.cpp --------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the visitICmp and visitFCmp functions.
+// +//===----------------------------------------------------------------------===// + +#include "InstCombine.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Support/ConstantRange.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/PatternMatch.h" +using namespace llvm; +using namespace PatternMatch; + +static ConstantInt *getOne(Constant *C) { + return ConstantInt::get(cast<IntegerType>(C->getType()), 1); +} + +/// AddOne - Add one to a ConstantInt +static Constant *AddOne(Constant *C) { + return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); +} +/// SubOne - Subtract one from a ConstantInt +static Constant *SubOne(Constant *C) { + return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1)); +} + +static ConstantInt *ExtractElement(Constant *V, Constant *Idx) { + return cast<ConstantInt>(ConstantExpr::getExtractElement(V, Idx)); +} + +static bool HasAddOverflow(ConstantInt *Result, + ConstantInt *In1, ConstantInt *In2, + bool IsSigned) { + if (IsSigned) + if (In2->getValue().isNegative()) + return Result->getValue().sgt(In1->getValue()); + else + return Result->getValue().slt(In1->getValue()); + else + return Result->getValue().ult(In1->getValue()); +} + +/// AddWithOverflow - Compute Result = In1+In2, returning true if the result +/// overflowed for this type. 
+static bool AddWithOverflow(Constant *&Result, Constant *In1,
+                            Constant *In2, bool IsSigned = false) {
+  Result = ConstantExpr::getAdd(In1, In2);
+
+  // Vector constants are checked element-by-element; any overflowing lane
+  // makes the whole add "overflowed".
+  if (const VectorType *VTy = dyn_cast<VectorType>(In1->getType())) {
+    for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
+      Constant *Idx = ConstantInt::get(Type::getInt32Ty(In1->getContext()), i);
+      if (HasAddOverflow(ExtractElement(Result, Idx),
+                         ExtractElement(In1, Idx),
+                         ExtractElement(In2, Idx),
+                         IsSigned))
+        return true;
+    }
+    return false;
+  }
+
+  return HasAddOverflow(cast<ConstantInt>(Result),
+                        cast<ConstantInt>(In1), cast<ConstantInt>(In2),
+                        IsSigned);
+}
+
+/// HasSubOverflow - Given Result = In1-In2 (already computed), return true if
+/// the subtraction overflowed.  Unsigned: overflow iff the result is greater
+/// than In1 (a borrow occurred).  Signed: subtracting a negative must increase
+/// the value and subtracting a non-negative must not increase it; the opposite
+/// ordering means wraparound.
+static bool HasSubOverflow(ConstantInt *Result,
+                           ConstantInt *In1, ConstantInt *In2,
+                           bool IsSigned) {
+  if (IsSigned)
+    if (In2->getValue().isNegative())
+      return Result->getValue().slt(In1->getValue());
+    else
+      return Result->getValue().sgt(In1->getValue());
+  else
+    return Result->getValue().ugt(In1->getValue());
+}
+
+/// SubWithOverflow - Compute Result = In1-In2, returning true if the result
+/// overflowed for this type.
+static bool SubWithOverflow(Constant *&Result, Constant *In1,
+                            Constant *In2, bool IsSigned = false) {
+  Result = ConstantExpr::getSub(In1, In2);
+
+  // Vector constants are checked element-by-element, as in AddWithOverflow.
+  if (const VectorType *VTy = dyn_cast<VectorType>(In1->getType())) {
+    for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
+      Constant *Idx = ConstantInt::get(Type::getInt32Ty(In1->getContext()), i);
+      if (HasSubOverflow(ExtractElement(Result, Idx),
+                         ExtractElement(In1, Idx),
+                         ExtractElement(In2, Idx),
+                         IsSigned))
+        return true;
+    }
+    return false;
+  }
+
+  return HasSubOverflow(cast<ConstantInt>(Result),
+                        cast<ConstantInt>(In1), cast<ConstantInt>(In2),
+                        IsSigned);
+}
+
+/// isSignBitCheck - Given an exploded icmp instruction, return true if the
+/// comparison only checks the sign bit.  If it only checks the sign bit, set
+/// TrueIfSigned if the result of the comparison is true when the input value is
+/// signed.
+static bool isSignBitCheck(ICmpInst::Predicate pred, ConstantInt *RHS,
+                           bool &TrueIfSigned) {
+  switch (pred) {
+  case ICmpInst::ICMP_SLT:   // True if LHS s< 0
+    TrueIfSigned = true;
+    return RHS->isZero();
+  case ICmpInst::ICMP_SLE:   // True if LHS s<= RHS and RHS == -1
+    TrueIfSigned = true;
+    return RHS->isAllOnesValue();
+  case ICmpInst::ICMP_SGT:   // True if LHS s> -1
+    TrueIfSigned = false;
+    return RHS->isAllOnesValue();
+  case ICmpInst::ICMP_UGT:
+    // True if LHS u> RHS and RHS == high-bit-mask - 1
+    TrueIfSigned = true;
+    return RHS->getValue() ==
+      APInt::getSignedMaxValue(RHS->getType()->getPrimitiveSizeInBits());
+  case ICmpInst::ICMP_UGE:
+    // True if LHS u>= RHS and RHS == high-bit-mask (2^7, 2^15, 2^31, etc)
+    TrueIfSigned = true;
+    return RHS->getValue().isSignBit();
+  default:
+    return false;
+  }
+}
+
+// isHighOnes - Return true if the constant is of the form 1+0+.
+// This is the same as lowones(~X).
+static bool isHighOnes(const ConstantInt *CI) {
+  return (~CI->getValue() + 1).isPowerOf2();
+}
+
+/// ComputeSignedMinMaxValuesFromKnownBits - Given a signed integer type and a
+/// set of known zero and one bits, compute the maximum and minimum values that
+/// could have the specified known zero and known one bits, returning them in
+/// min/max.
+static void ComputeSignedMinMaxValuesFromKnownBits(const APInt& KnownZero,
+                                                   const APInt& KnownOne,
+                                                   APInt& Min, APInt& Max) {
+  assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() &&
+         KnownZero.getBitWidth() == Min.getBitWidth() &&
+         KnownZero.getBitWidth() == Max.getBitWidth() &&
+         "KnownZero, KnownOne and Min, Max must have equal bitwidth.");
+  APInt UnknownBits = ~(KnownZero|KnownOne);
+
+  // The minimum value is when all unknown bits are zeros, EXCEPT for the sign
+  // bit if it is unknown.
+  Min = KnownOne;
+  Max = KnownOne|UnknownBits;
+
+  if (UnknownBits.isNegative()) { // Sign bit is unknown
+    Min.setBit(Min.getBitWidth()-1);
+    Max.clearBit(Max.getBitWidth()-1);
+  }
+}
+
+// ComputeUnsignedMinMaxValuesFromKnownBits - Given an unsigned integer type and
+// a set of known zero and one bits, compute the maximum and minimum values that
+// could have the specified known zero and known one bits, returning them in
+// min/max.
+static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
+                                                     const APInt &KnownOne,
+                                                     APInt &Min, APInt &Max) {
+  assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() &&
+         KnownZero.getBitWidth() == Min.getBitWidth() &&
+         KnownZero.getBitWidth() == Max.getBitWidth() &&
+         "Ty, KnownZero, KnownOne and Min, Max must have equal bitwidth.");
+  APInt UnknownBits = ~(KnownZero|KnownOne);
+
+  // The minimum value is when the unknown bits are all zeros.
+  Min = KnownOne;
+  // The maximum value is when the unknown bits are all ones.
+  Max = KnownOne|UnknownBits;
+}
+
+
+
+/// FoldCmpLoadFromIndexedGlobal - Called when we see this pattern:
+///   cmp pred (load (gep GV, ...)), cmpcst
+/// where GV is a global variable with a constant initializer.  Try to simplify
+/// this into some simple computation that does not need the load.  For example
+/// we can optimize "icmp eq (load (gep "foo", 0, i)), 0" into "icmp eq i, 3".
+///
+/// If AndCst is non-null, then the loaded value is masked with that constant
+/// before doing the comparison.  This handles cases like "A[i]&4 == 0".
+Instruction *InstCombiner::
+FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
+                             CmpInst &ICI, ConstantInt *AndCst) {
+  // We need TD information to know the pointer size unless this is inbounds.
+  if (!GEP->isInBounds() && TD == 0) return 0;
+
+  // Bound the initializer size so scanning it stays cheap at compile time.
+  ConstantArray *Init = dyn_cast<ConstantArray>(GV->getInitializer());
+  if (Init == 0 || Init->getNumOperands() > 1024) return 0;
+
+  // There are many forms of this optimization we can handle, for now, just do
+  // the simple index into a single-dimensional array.
+  //
+  // Require: GEP GV, 0, i {{, constant indices}}
+  // (operand 2 must be the variable index; a constant there is rejected)
+  if (GEP->getNumOperands() < 3 ||
+      !isa<ConstantInt>(GEP->getOperand(1)) ||
+      !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
+      isa<Constant>(GEP->getOperand(2)))
+    return 0;
+
+  // Check that indices after the variable are constants and in-range for the
+  // type they index.  Collect the indices.  This is typically for arrays of
+  // structs.
+  SmallVector<unsigned, 4> LaterIndices;
+
+  const Type *EltTy = cast<ArrayType>(Init->getType())->getElementType();
+  for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) {
+    ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i));
+    if (Idx == 0) return 0;  // Variable index.
+
+    uint64_t IdxVal = Idx->getZExtValue();
+    if ((unsigned)IdxVal != IdxVal) return 0;  // Too large array index.
+
+    if (const StructType *STy = dyn_cast<StructType>(EltTy))
+      EltTy = STy->getElementType(IdxVal);
+    else if (const ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
+      if (IdxVal >= ATy->getNumElements()) return 0;
+      EltTy = ATy->getElementType();
+    } else {
+      return 0; // Unknown type.
+    }
+
+    LaterIndices.push_back(IdxVal);
+  }
+
+  enum { Overdefined = -3, Undefined = -2 };
+
+  // Variables for our state machines.
+
+  // FirstTrueElement/SecondTrueElement - Used to emit a comparison of the form
+  // "i == 47 | i == 87", where 47 is the first index the condition is true for,
+  // and 87 is the second (and last) index.  FirstTrueElement is -2 when
+  // undefined, otherwise set to the first true element.  SecondTrueElement is
+  // -2 when undefined, -3 when overdefined and >= 0 when that index is true.
+  int FirstTrueElement = Undefined, SecondTrueElement = Undefined;
+
+  // FirstFalseElement/SecondFalseElement - Used to emit a comparison of the
+  // form "i != 47 & i != 87".  Same state transitions as for true elements.
+  int FirstFalseElement = Undefined, SecondFalseElement = Undefined;
+
+  /// TrueRangeEnd/FalseRangeEnd - In conjunction with First*Element, these
+  /// define a state machine that triggers for ranges of values that the index
+  /// is true or false for.  This triggers on things like "abbbbc"[i] == 'b'.
+  /// This is -2 when undefined, -3 when overdefined, and otherwise the last
+  /// index in the range (inclusive).  We use -2 for undefined here because we
+  /// use relative comparisons and don't want 0-1 to match -1.
+  int TrueRangeEnd = Undefined, FalseRangeEnd = Undefined;
+
+  // MagicBitvector - This is a magic bitvector where we set a bit if the
+  // comparison is true for element 'i'.  If there are 64 elements or less in
+  // the array, this will fully represent all the comparison results.
+  uint64_t MagicBitvector = 0;
+
+
+  // Scan the array and see if one of our patterns matches.
+  Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
+  for (unsigned i = 0, e = Init->getNumOperands(); i != e; ++i) {
+    Constant *Elt = Init->getOperand(i);
+
+    // If this is indexing an array of structures, get the structure element.
+    if (!LaterIndices.empty())
+      Elt = ConstantExpr::getExtractValue(Elt, LaterIndices.data(),
+                                          LaterIndices.size());
+
+    // If the element is masked, handle it.
+    if (AndCst) Elt = ConstantExpr::getAnd(Elt, AndCst);
+
+    // Find out if the comparison would be true or false for the i'th element.
+    Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt,
+                                                  CompareRHS, TD);
+    // If the result is undef for this element, ignore it.
+    if (isa<UndefValue>(C)) {
+      // Extend range state machines to cover this element in case there is an
+      // undef in the middle of the range.
+      if (TrueRangeEnd == (int)i-1)
+        TrueRangeEnd = i;
+      if (FalseRangeEnd == (int)i-1)
+        FalseRangeEnd = i;
+      continue;
+    }
+
+    // If we can't compute the result for any of the elements, we have to give
+    // up evaluating the entire conditional.
+    if (!isa<ConstantInt>(C)) return 0;
+
+    // Otherwise, we know if the comparison is true or false for this element,
+    // update our state machines.
+    bool IsTrueForElt = !cast<ConstantInt>(C)->isZero();
+
+    // State machine for single/double/range index comparison.
+    if (IsTrueForElt) {
+      // Update the TrueElement state machine.
+      if (FirstTrueElement == Undefined)
+        FirstTrueElement = TrueRangeEnd = i;  // First true element.
+      else {
+        // Update double-compare state machine.
+        if (SecondTrueElement == Undefined)
+          SecondTrueElement = i;
+        else
+          SecondTrueElement = Overdefined;
+
+        // Update range state machine.
+        if (TrueRangeEnd == (int)i-1)
+          TrueRangeEnd = i;
+        else
+          TrueRangeEnd = Overdefined;
+      }
+    } else {
+      // Update the FalseElement state machine.
+      if (FirstFalseElement == Undefined)
+        FirstFalseElement = FalseRangeEnd = i; // First false element.
+      else {
+        // Update double-compare state machine.
+        if (SecondFalseElement == Undefined)
+          SecondFalseElement = i;
+        else
+          SecondFalseElement = Overdefined;
+
+        // Update range state machine.
+        if (FalseRangeEnd == (int)i-1)
+          FalseRangeEnd = i;
+        else
+          FalseRangeEnd = Overdefined;
+      }
+    }
+
+
+    // If this element is in range, update our magic bitvector.
+    if (i < 64 && IsTrueForElt)
+      MagicBitvector |= 1ULL << i;
+
+    // If all of our states become overdefined, bail out early.  Since the
+    // predicate is expensive, only check it every 8 elements.  This is only
+    // really useful for really huge arrays.
+    // NOTE(review): (i & 8) == 0 is true for 8-element runs (half the
+    // indices), not strictly every 8th element; the comment above overstates
+    // the skip.  Matches upstream behavior, so the heuristic is kept as-is.
+    if ((i & 8) == 0 && i >= 64 && SecondTrueElement == Overdefined &&
+        SecondFalseElement == Overdefined && TrueRangeEnd == Overdefined &&
+        FalseRangeEnd == Overdefined)
+      return 0;
+  }
+
+  // Now that we've scanned the entire array, emit our new comparison(s).  We
+  // order the state machines in complexity of the generated code.
+  Value *Idx = GEP->getOperand(2);
+
+  // If the index is larger than the pointer size of the target, truncate the
+  // index down like the GEP would do implicitly.  We don't have to do this for
+  // an inbounds GEP because the index can't be out of range.
+  if (!GEP->isInBounds() &&
+      Idx->getType()->getPrimitiveSizeInBits() > TD->getPointerSizeInBits())
+    Idx = Builder->CreateTrunc(Idx, TD->getIntPtrType(Idx->getContext()));
+
+  // If the comparison is only true for one or two elements, emit direct
+  // comparisons.
+  if (SecondTrueElement != Overdefined) {
+    // None true -> false.
+    if (FirstTrueElement == Undefined)
+      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(GEP->getContext()));
+
+    Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement);
+
+    // True for one element -> 'i == 47'.
+    if (SecondTrueElement == Undefined)
+      return new ICmpInst(ICmpInst::ICMP_EQ, Idx, FirstTrueIdx);
+
+    // True for two elements -> 'i == 47 | i == 72'.
+    Value *C1 = Builder->CreateICmpEQ(Idx, FirstTrueIdx);
+    Value *SecondTrueIdx = ConstantInt::get(Idx->getType(), SecondTrueElement);
+    Value *C2 = Builder->CreateICmpEQ(Idx, SecondTrueIdx);
+    return BinaryOperator::CreateOr(C1, C2);
+  }
+
+  // If the comparison is only false for one or two elements, emit direct
+  // comparisons.
+  if (SecondFalseElement != Overdefined) {
+    // None false -> true.
+    if (FirstFalseElement == Undefined)
+      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(GEP->getContext()));
+
+    Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement);
+
+    // False for one element -> 'i != 47'.
+    if (SecondFalseElement == Undefined)
+      return new ICmpInst(ICmpInst::ICMP_NE, Idx, FirstFalseIdx);
+
+    // False for two elements -> 'i != 47 & i != 72'.
+    Value *C1 = Builder->CreateICmpNE(Idx, FirstFalseIdx);
+    Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement);
+    Value *C2 = Builder->CreateICmpNE(Idx, SecondFalseIdx);
+    return BinaryOperator::CreateAnd(C1, C2);
+  }
+
+  // If the comparison can be replaced with a range comparison for the elements
+  // where it is true, emit the range check.
+  if (TrueRangeEnd != Overdefined) {
+    assert(TrueRangeEnd != FirstTrueElement && "Should emit single compare");
+
+    // Generate (i-FirstTrue) <u (TrueRangeEnd-FirstTrue+1).
+    if (FirstTrueElement) {
+      Value *Offs = ConstantInt::get(Idx->getType(), -FirstTrueElement);
+      Idx = Builder->CreateAdd(Idx, Offs);
+    }
+
+    Value *End = ConstantInt::get(Idx->getType(),
+                                  TrueRangeEnd-FirstTrueElement+1);
+    return new ICmpInst(ICmpInst::ICMP_ULT, Idx, End);
+  }
+
+  // False range check.
+  if (FalseRangeEnd != Overdefined) {
+    assert(FalseRangeEnd != FirstFalseElement && "Should emit single compare");
+    // Generate (i-FirstFalse) >u (FalseRangeEnd-FirstFalse).
+    if (FirstFalseElement) {
+      Value *Offs = ConstantInt::get(Idx->getType(), -FirstFalseElement);
+      Idx = Builder->CreateAdd(Idx, Offs);
+    }
+
+    Value *End = ConstantInt::get(Idx->getType(),
+                                  FalseRangeEnd-FirstFalseElement);
+    return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End);
+  }
+
+
+  // If a 32-bit or 64-bit magic bitvector captures the entire comparison state
+  // of this load, replace it with computation that does:
+  //   ((magic_cst >> i) & 1) != 0
+  // The <= 32 / <= 64 guards ensure every element's result fits in the
+  // bitvector built during the scan above (bits were only set for i < 64).
+  if (Init->getNumOperands() <= 32 ||
+      (TD && Init->getNumOperands() <= 64 && TD->isLegalInteger(64))) {
+    const Type *Ty;
+    if (Init->getNumOperands() <= 32)
+      Ty = Type::getInt32Ty(Init->getContext());
+    else
+      Ty = Type::getInt64Ty(Init->getContext());
+    Value *V = Builder->CreateIntCast(Idx, Ty, false);
+    V = Builder->CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
+    V = Builder->CreateAnd(ConstantInt::get(Ty, 1), V);
+    return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
+  }
+
+  return 0;
+}
+
+
+/// EvaluateGEPOffsetExpression - Return a value that can be used to compare
+/// the *offset* implied by a GEP to zero.  For example, if we have &A[i], we
+/// want to return 'i' for "icmp ne i, 0".  Note that, in general, indices can
+/// be complex, and scales are involved.  The above expression would also be
+/// legal to codegen as "icmp ne (i*4), 0" (assuming A is a pointer to i32).
+/// This later form is less amenable to optimization though, and we are allowed
+/// to generate the first by knowing that pointer arithmetic doesn't overflow.
+///
+/// If we can't emit an optimized form for this expression, this returns null.
+///
+static Value *EvaluateGEPOffsetExpression(User *GEP, Instruction &I,
+                                          InstCombiner &IC) {
+  TargetData &TD = *IC.getTargetData();
+  gep_type_iterator GTI = gep_type_begin(GEP);
+
+  // Check to see if this gep only has a single variable index.  If so, and if
+  // any constant indices are a multiple of its scale, then we can compute this
+  // in terms of the scale of the variable index.  For example, if the GEP
+  // implies an offset of "12 + i*4", then we can codegen this as "3 + i",
+  // because the expression will cross zero at the same point.
+  unsigned i, e = GEP->getNumOperands();
+  int64_t Offset = 0;
+  for (i = 1; i != e; ++i, ++GTI) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
+      // Compute the aggregate offset of constant indices.
+      if (CI->isZero()) continue;
+
+      // Handle a struct index, which adds its field offset to the pointer.
+      if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+        Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+      } else {
+        uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
+        Offset += Size*CI->getSExtValue();
+      }
+    } else {
+      // Found our variable index.
+      break;
+    }
+  }
+
+  // If there are no variable indices, we must have a constant offset, just
+  // evaluate it the general way.
+  if (i == e) return 0;
+
+  Value *VariableIdx = GEP->getOperand(i);
+  // Determine the scale factor of the variable element.  For example, this is
+  // 4 if the variable index is into an array of i32.
+  uint64_t VariableScale = TD.getTypeAllocSize(GTI.getIndexedType());
+
+  // Verify that there are no other variable indices.  If so, emit the hard way.
+  for (++i, ++GTI; i != e; ++i, ++GTI) {
+    ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
+    if (!CI) return 0;
+
+    // Compute the aggregate offset of constant indices.
+    if (CI->isZero()) continue;
+
+    // Handle a struct index, which adds its field offset to the pointer.
+    if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+      Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+    } else {
+      uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
+      Offset += Size*CI->getSExtValue();
+    }
+  }
+
+  // Okay, we know we have a single variable index, which must be a
+  // pointer/array/vector index.  If there is no offset, life is simple, return
+  // the index.
+  unsigned IntPtrWidth = TD.getPointerSizeInBits();
+  if (Offset == 0) {
+    // Cast to intptrty in case a truncation occurs.  If an extension is needed,
+    // we don't need to bother extending: the extension won't affect where the
+    // computation crosses zero.
+    if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth)
+      VariableIdx = new TruncInst(VariableIdx,
+                                  TD.getIntPtrType(VariableIdx->getContext()),
+                                  VariableIdx->getName(), &I);
+    return VariableIdx;
+  }
+
+  // Otherwise, there is an index.  The computation we will do will be modulo
+  // the pointer size, so get it.
+  uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);
+
+  Offset &= PtrSizeMask;
+  VariableScale &= PtrSizeMask;
+
+  // To do this transformation, any constant index must be a multiple of the
+  // variable scale factor.  For example, we can evaluate "12 + 4*i" as "3 + i",
+  // but we can't evaluate "10 + 3*i" in terms of i.  Check that the offset is a
+  // multiple of the variable scale.
+  int64_t NewOffs = Offset / (int64_t)VariableScale;
+  if (Offset != NewOffs*(int64_t)VariableScale)
+    return 0;
+
+  // Okay, we can do this evaluation.  Start by converting the index to intptr.
+  const Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
+  if (VariableIdx->getType() != IntPtrTy)
+    VariableIdx = CastInst::CreateIntegerCast(VariableIdx, IntPtrTy,
+                                              true /*SExt*/,
+                                              VariableIdx->getName(), &I);
+  Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
+  return BinaryOperator::CreateAdd(VariableIdx, OffsetVal, "offset", &I);
+}
+
+/// FoldGEPICmp - Fold comparisons between a GEP instruction and something
+/// else.  At this point we know that the GEP is on the LHS of the comparison.
+Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
+                                       ICmpInst::Predicate Cond,
+                                       Instruction &I) {
+  // Look through bitcasts.
+  if (BitCastInst *BCI = dyn_cast<BitCastInst>(RHS))
+    RHS = BCI->getOperand(0);
+
+  Value *PtrBase = GEPLHS->getOperand(0);
+  if (TD && PtrBase == RHS && GEPLHS->isInBounds()) {
+    // ((gep Ptr, OFFSET) cmp Ptr)   ---> (OFFSET cmp 0).
+    // This transformation (ignoring the base and scales) is valid because we
+    // know pointers can't overflow since the gep is inbounds.  See if we can
+    // output an optimized form.
+    Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, I, *this);
+
+    // If not, synthesize the offset the hard way.
+    if (Offset == 0)
+      Offset = EmitGEPOffset(GEPLHS);
+    return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
+                        Constant::getNullValue(Offset->getType()));
+  } else if (GEPOperator *GEPRHS = dyn_cast<GEPOperator>(RHS)) {
+    // If the base pointers are different, but the indices are the same, just
+    // compare the base pointer.
+    if (PtrBase != GEPRHS->getOperand(0)) {
+      bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands();
+      IndicesTheSame &= GEPLHS->getOperand(0)->getType() ==
+                        GEPRHS->getOperand(0)->getType();
+      if (IndicesTheSame)
+        for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
+          if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+            IndicesTheSame = false;
+            break;
+          }
+
+      // If all indices are the same, just compare the base pointers.
+      if (IndicesTheSame)
+        return new ICmpInst(ICmpInst::getSignedPredicate(Cond),
+                            GEPLHS->getOperand(0), GEPRHS->getOperand(0));
+
+      // Otherwise, the base pointers are different and the indices are
+      // different, bail out.
+      return 0;
+    }
+
+    // If one of the GEPs has all zero indices, recurse.
+    bool AllZeros = true;
+    for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
+      if (!isa<Constant>(GEPLHS->getOperand(i)) ||
+          !cast<Constant>(GEPLHS->getOperand(i))->isNullValue()) {
+        AllZeros = false;
+        break;
+      }
+    if (AllZeros)
+      return FoldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
+                         ICmpInst::getSwappedPredicate(Cond), I);
+
+    // If the other GEP has all zero indices, recurse.
+    AllZeros = true;
+    for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
+      if (!isa<Constant>(GEPRHS->getOperand(i)) ||
+          !cast<Constant>(GEPRHS->getOperand(i))->isNullValue()) {
+        AllZeros = false;
+        break;
+      }
+    if (AllZeros)
+      return FoldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
+
+    if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
+      // If the GEPs only differ by one index, compare it.
+      unsigned NumDifferences = 0;  // Keep track of # differences.
+      unsigned DiffOperand = 0;     // The operand that differs.
+      for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
+        if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+          if (GEPLHS->getOperand(i)->getType()->getPrimitiveSizeInBits() !=
+              GEPRHS->getOperand(i)->getType()->getPrimitiveSizeInBits()) {
+            // Irreconcilable differences.
+            NumDifferences = 2;
+            break;
+          } else {
+            if (NumDifferences++) break;
+            DiffOperand = i;
+          }
+        }
+
+      if (NumDifferences == 0)   // SAME GEP?
+        return ReplaceInstUsesWith(I, // No comparison is needed here.
+                               ConstantInt::get(Type::getInt1Ty(I.getContext()),
+                                             ICmpInst::isTrueWhenEqual(Cond)));
+
+      else if (NumDifferences == 1) {
+        Value *LHSV = GEPLHS->getOperand(DiffOperand);
+        Value *RHSV = GEPRHS->getOperand(DiffOperand);
+        // Make sure we do a signed comparison here.
+        return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV);
+      }
+    }
+
+    // Only lower this if the icmp is the only user of the GEP or if we expect
+    // the result to fold to a constant!
+    if (TD &&
+        (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
+        (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) {
+      // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)  --->  (OFFSET1 cmp OFFSET2)
+      Value *L = EmitGEPOffset(GEPLHS);
+      Value *R = EmitGEPOffset(GEPRHS);
+      return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
+    }
+  }
+  return 0;
+}
+
+/// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X".
+/// NOTE(review): TheAdd is not referenced in this body; presumably kept so
+/// callers can pass the add instruction uniformly -- confirm against callers.
+Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI,
+                                            Value *X, ConstantInt *CI,
+                                            ICmpInst::Predicate Pred,
+                                            Value *TheAdd) {
+  // If we have X+0, exit early (simplifying logic below) and let it get folded
+  // elsewhere.   icmp X+0, X  -> icmp X, X
+  if (CI->isZero()) {
+    bool isTrue = ICmpInst::isTrueWhenEqual(Pred);
+    return ReplaceInstUsesWith(ICI, ConstantInt::get(ICI.getType(), isTrue));
+  }
+
+  // (X+4) == X -> false.
+  if (Pred == ICmpInst::ICMP_EQ)
+    return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(X->getContext()));
+
+  // (X+4) != X -> true.
+  if (Pred == ICmpInst::ICMP_NE)
+    return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(X->getContext()));
+
+  // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0,
+  // so the values can never be equal.  Similarly for all other "or equals"
+  // operators.
+
+  // (X+1) <u X        --> X >u (MAXUINT-1)        --> X == 255
+  // (X+2) <u X        --> X >u (MAXUINT-2)        --> X > 253
+  // (X+MAXUINT) <u X  --> X >u (MAXUINT-MAXUINT)  --> X != 0
+  if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) {
+    Value *R =
+      ConstantExpr::getSub(ConstantInt::getAllOnesValue(CI->getType()), CI);
+    return new ICmpInst(ICmpInst::ICMP_UGT, X, R);
+  }
+
+  // (X+1) >u X        --> X <u (0-1)        --> X != 255
+  // (X+2) >u X        --> X <u (0-2)        --> X <u 254
+  // (X+MAXUINT) >u X  --> X <u (0-MAXUINT)  --> X <u 1  --> X == 0
+  if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE)
+    return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantExpr::getNeg(CI));
+
+  unsigned BitWidth = CI->getType()->getPrimitiveSizeInBits();
+  ConstantInt *SMax = ConstantInt::get(X->getContext(),
+                                       APInt::getSignedMaxValue(BitWidth));
+
+  // (X+ 1) <s X       --> X >s (MAXSINT-1)          --> X == 127
+  // (X+ 2) <s X       --> X >s (MAXSINT-2)          --> X >s 125
+  // (X+MAXSINT) <s X  --> X >s (MAXSINT-MAXSINT)    --> X >s 0
+  // (X+MINSINT) <s X  --> X >s (MAXSINT-MINSINT)    --> X >s -1
+  // (X+ -2) <s X      --> X >s (MAXSINT- -2)        --> X >s 126
+  // (X+ -1) <s X      --> X >s (MAXSINT- -1)        --> X != 127
+  if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
+    return new ICmpInst(ICmpInst::ICMP_SGT, X, ConstantExpr::getSub(SMax, CI));
+
+  // (X+ 1) >s X       --> X <s (MAXSINT-(1-1))       --> X != 127
+  // (X+ 2) >s X       --> X <s (MAXSINT-(2-1))       --> X <s 126
+  // (X+MAXSINT) >s X  --> X <s (MAXSINT-(MAXSINT-1)) --> X <s 1
+  // (X+MINSINT) >s X  --> X <s (MAXSINT-(MINSINT-1)) --> X <s -2
+  // (X+ -2) >s X      --> X <s (MAXSINT-(-2-1))      --> X <s -126
+  // (X+ -1) >s X      --> X <s (MAXSINT-(-1-1))      --> X == -128
+
+  assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE);
+  Constant *C = ConstantInt::get(X->getContext(), CI->getValue()-1);
+  return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantExpr::getSub(SMax, C));
+}
+
+/// FoldICmpDivCst - Fold "icmp pred, ([su]div X, DivRHS), CmpRHS" where DivRHS
+/// and CmpRHS are both known to be integer constants.
+Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
+                                          ConstantInt *DivRHS) {
+  ConstantInt *CmpRHS = cast<ConstantInt>(ICI.getOperand(1));
+  const APInt &CmpRHSV = CmpRHS->getValue();
+
+  // FIXME: If the operand types don't match the type of the divide
+  // then don't attempt this transform. The code below doesn't have the
+  // logic to deal with a signed divide and an unsigned compare (and
+  // vice versa). This is because (x /s C1) <s C2  produces different
+  // results than (x /s C1) <u C2 or (x /u C1) <s C2 or even
+  // (x /u C1) <u C2.  Simply casting the operands and result won't
+  // work. :(  The if statement below tests that condition and bails
+  // if it finds it.
+  bool DivIsSigned = DivI->getOpcode() == Instruction::SDiv;
+  if (!ICI.isEquality() && DivIsSigned != ICI.isSigned())
+    return 0;
+  if (DivRHS->isZero())
+    return 0; // The ProdOV computation fails on divide by zero.
+  if (DivIsSigned && DivRHS->isAllOnesValue())
+    return 0; // The overflow computation also screws up here
+  if (DivRHS->isOne()) {
+    // This eliminates some funny cases with INT_MIN.
+    ICI.setOperand(0, DivI->getOperand(0));   // X/1 == X.
+    return &ICI;
+  }
+
+  // Compute Prod = CI * DivRHS. We are essentially solving an equation
+  // of form X/C1=C2. We solve for X by multiplying C1 (DivRHS) and
+  // C2 (CI). By solving for X we can turn this into a range check
+  // instead of computing a divide.
+  Constant *Prod = ConstantExpr::getMul(CmpRHS, DivRHS);
+
+  // Determine if the product overflows by seeing if the product is
+  // not equal to the divide. Make sure we do the same kind of divide
+  // as in the LHS instruction that we're folding.
+  bool ProdOV = (DivIsSigned ? ConstantExpr::getSDiv(Prod, DivRHS) :
+                 ConstantExpr::getUDiv(Prod, DivRHS)) != CmpRHS;
+
+  // Get the ICmp opcode
+  ICmpInst::Predicate Pred = ICI.getPredicate();
+
+  // If the division is known to be exact, then there is no remainder from the
+  // divide, so the covered range size is unit, otherwise it is the divisor.
+  ConstantInt *RangeSize = DivI->isExact() ? getOne(Prod) : DivRHS;
+
+  // Figure out the interval that is being checked.  For example, a comparison
+  // like "X /u 5 == 0" is really checking that X is in the interval [0, 5).
+  // Compute this interval based on the constants involved and the signedness of
+  // the compare/divide.  This computes a half-open interval, keeping track of
+  // whether either value in the interval overflows.  After analysis each
+  // overflow variable is set to 0 if its corresponding bound variable is valid
+  // -1 if overflowed off the bottom end, or +1 if overflowed off the top end.
+  int LoOverflow = 0, HiOverflow = 0;
+  Constant *LoBound = 0, *HiBound = 0;
+
+  if (!DivIsSigned) {  // udiv
+    // e.g. X/5 op 3  --> [15, 20)
+    LoBound = Prod;
+    HiOverflow = LoOverflow = ProdOV;
+    if (!HiOverflow) {
+      // If this is not an exact divide, then many values in the range collapse
+      // to the same result value.
+      HiOverflow = AddWithOverflow(HiBound, LoBound, RangeSize, false);
+    }
+
+  } else if (DivRHS->getValue().isStrictlyPositive()) { // Divisor is > 0.
+    if (CmpRHSV == 0) {       // (X / pos) op 0
+      // Can't overflow.  e.g.  X/2 op 0 --> [-1, 2)
+      LoBound = ConstantExpr::getNeg(SubOne(RangeSize));
+      HiBound = RangeSize;
+    } else if (CmpRHSV.isStrictlyPositive()) {   // (X / pos) op pos
+      LoBound = Prod;     // e.g.   X/5 op 3 --> [15, 20)
+      HiOverflow = LoOverflow = ProdOV;
+      if (!HiOverflow)
+        HiOverflow = AddWithOverflow(HiBound, Prod, RangeSize, true);
+    } else {                       // (X / pos) op neg
+      // e.g. X/5 op -3  --> [-15-4, -15+1) --> [-19, -14)
+      HiBound = AddOne(Prod);
+      LoOverflow = HiOverflow = ProdOV ? -1 : 0;
+      if (!LoOverflow) {
+        ConstantInt *DivNeg =cast<ConstantInt>(ConstantExpr::getNeg(RangeSize));
+        LoOverflow = AddWithOverflow(LoBound, HiBound, DivNeg, true) ? -1 : 0;
+      }
+    }
+  } else if (DivRHS->getValue().isNegative()) { // Divisor is < 0.
+    if (DivI->isExact())
+      RangeSize = cast<ConstantInt>(ConstantExpr::getNeg(RangeSize));
+    if (CmpRHSV == 0) {       // (X / neg) op 0
+      // e.g. X/-5 op 0  --> [-4, 5)
+      LoBound = AddOne(RangeSize);
+      HiBound = cast<ConstantInt>(ConstantExpr::getNeg(RangeSize));
+      if (HiBound == DivRHS) {     // -INTMIN = INTMIN
+        HiOverflow = 1;            // [INTMIN+1, overflow)
+        HiBound = 0;               // e.g. X/INTMIN = 0 --> X > INTMIN
+      }
+    } else if (CmpRHSV.isStrictlyPositive()) {   // (X / neg) op pos
+      // e.g. X/-5 op 3  --> [-19, -14)
+      HiBound = AddOne(Prod);
+      HiOverflow = LoOverflow = ProdOV ? -1 : 0;
+      if (!LoOverflow)
+        LoOverflow = AddWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0;
+    } else {                       // (X / neg) op neg
+      LoBound = Prod;       // e.g. X/-5 op -3  --> [15, 20)
+      LoOverflow = HiOverflow = ProdOV;
+      if (!HiOverflow)
+        HiOverflow = SubWithOverflow(HiBound, Prod, RangeSize, true);
+    }
+
+    // Dividing by a negative swaps the condition.  LT <-> GT
+    Pred = ICmpInst::getSwappedPredicate(Pred);
+  }
+
+  // Emit the comparison against the computed [LoBound, HiBound) interval,
+  // folding to a constant when an overflowed bound makes the answer known.
+  Value *X = DivI->getOperand(0);
+  switch (Pred) {
+  default: llvm_unreachable("Unhandled icmp opcode!");
+  case ICmpInst::ICMP_EQ:
+    if (LoOverflow && HiOverflow)
+      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext()));
+    if (HiOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
+                          ICmpInst::ICMP_UGE, X, LoBound);
+    if (LoOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
+                          ICmpInst::ICMP_ULT, X, HiBound);
+    return ReplaceInstUsesWith(ICI, InsertRangeTest(X, LoBound, HiBound,
+                                                    DivIsSigned, true));
+  case ICmpInst::ICMP_NE:
+    if (LoOverflow && HiOverflow)
+      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext()));
+    if (HiOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
+                          ICmpInst::ICMP_ULT, X, LoBound);
+    if (LoOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
+                          ICmpInst::ICMP_UGE, X, HiBound);
+    return ReplaceInstUsesWith(ICI, InsertRangeTest(X, LoBound, HiBound,
+                                                    DivIsSigned, false));
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_SLT:
+    if (LoOverflow == +1)   // Low bound is greater than input range.
+      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext()));
+    if (LoOverflow == -1)   // Low bound is less than input range.
+      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext()));
+    return new ICmpInst(Pred, X, LoBound);
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_SGT:
+    if (HiOverflow == +1)       // High bound greater than input range.
+      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext()));
+    if (HiOverflow == -1)       // High bound less than input range.
+      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext()));
+    if (Pred == ICmpInst::ICMP_UGT)
+      return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound);
+    return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound);
+  }
+}
+
+/// FoldICmpShrCst - Handle "icmp(([al]shr X, cst1), cst2)".
+Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,
+                                          ConstantInt *ShAmt) {
+  const APInt &CmpRHSV = cast<ConstantInt>(ICI.getOperand(1))->getValue();
+
+  // Check that the shift amount is in range.  If not, don't perform
+  // undefined shifts.  When the shift is visited it will be
+  // simplified.
+  uint32_t TypeBits = CmpRHSV.getBitWidth();
+  uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
+  if (ShAmtVal >= TypeBits || ShAmtVal == 0)
+    return 0;
+
+  if (!ICI.isEquality()) {
+    // If we have an unsigned comparison and an ashr, we can't simplify this.
+    // Similarly for signed comparisons with lshr.
+ if (ICI.isSigned() != (Shr->getOpcode() == Instruction::AShr)) + return 0; + + // Otherwise, all lshr and all exact ashr's are equivalent to a udiv/sdiv by + // a power of 2. Since we already have logic to simplify these, transform + // to div and then simplify the resultant comparison. + if (Shr->getOpcode() == Instruction::AShr && + !Shr->isExact()) + return 0; + + // Revisit the shift (to delete it). + Worklist.Add(Shr); + + Constant *DivCst = + ConstantInt::get(Shr->getType(), APInt::getOneBitSet(TypeBits, ShAmtVal)); + + Value *Tmp = + Shr->getOpcode() == Instruction::AShr ? + Builder->CreateSDiv(Shr->getOperand(0), DivCst, "", Shr->isExact()) : + Builder->CreateUDiv(Shr->getOperand(0), DivCst, "", Shr->isExact()); + + ICI.setOperand(0, Tmp); + + // If the builder folded the binop, just return it. + BinaryOperator *TheDiv = dyn_cast<BinaryOperator>(Tmp); + if (TheDiv == 0) + return &ICI; + + // Otherwise, fold this div/compare. + assert(TheDiv->getOpcode() == Instruction::SDiv || + TheDiv->getOpcode() == Instruction::UDiv); + + Instruction *Res = FoldICmpDivCst(ICI, TheDiv, cast<ConstantInt>(DivCst)); + assert(Res && "This div/cst should have folded!"); + return Res; + } + + + // If we are comparing against bits always shifted out, the + // comparison cannot succeed. + APInt Comp = CmpRHSV << ShAmtVal; + ConstantInt *ShiftedCmpRHS = ConstantInt::get(ICI.getContext(), Comp); + if (Shr->getOpcode() == Instruction::LShr) + Comp = Comp.lshr(ShAmtVal); + else + Comp = Comp.ashr(ShAmtVal); + + if (Comp != CmpRHSV) { // Comparing against a bit that we know is zero. + bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE; + Constant *Cst = ConstantInt::get(Type::getInt1Ty(ICI.getContext()), + IsICMP_NE); + return ReplaceInstUsesWith(ICI, Cst); + } + + // Otherwise, check to see if the bits shifted out are known to be zero. + // If so, we can compare against the unshifted value: + // (X & 4) >> 1 == 2 --> (X & 4) == 4. 
/// visitICmpInstWithInstAndIntCst - Handle "icmp (instr, intcst)".
///
/// LHSI is the instruction forming the left-hand side of ICI and RHS is the
/// integer constant it is compared against (RHSV below is its value).
/// Returns a replacement instruction, returns &ICI after mutating it in
/// place, or returns null when no fold applies.
Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
                                                          Instruction *LHSI,
                                                          ConstantInt *RHS) {
  const APInt &RHSV = RHS->getValue();

  switch (LHSI->getOpcode()) {
  case Instruction::Trunc:
    if (ICI.isEquality() && LHSI->hasOneUse()) {
      // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
      // of the high bits truncated out of x are known.
      unsigned DstBits = LHSI->getType()->getPrimitiveSizeInBits(),
             SrcBits = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits();
      APInt Mask(APInt::getHighBitsSet(SrcBits, SrcBits-DstBits));
      APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0);
      ComputeMaskedBits(LHSI->getOperand(0), Mask, KnownZero, KnownOne);

      // If all the high bits are known, we can do this xform.
      if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) {
        // Pull in the high bits from known-ones set.
        APInt NewRHS = RHS->getValue().zext(SrcBits);
        NewRHS |= KnownOne;
        return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
                            ConstantInt::get(ICI.getContext(), NewRHS));
      }
    }
    break;

  case Instruction::Xor:         // (icmp pred (xor X, XorCST), CI)
    if (ConstantInt *XorCST = dyn_cast<ConstantInt>(LHSI->getOperand(1))) {
      // If this is a comparison that tests the signbit (X < 0) or (x > -1),
      // fold the xor.
      if ((ICI.getPredicate() == ICmpInst::ICMP_SLT && RHSV == 0) ||
          (ICI.getPredicate() == ICmpInst::ICMP_SGT && RHSV.isAllOnesValue())) {
        Value *CompareVal = LHSI->getOperand(0);

        // If the sign bit of the XorCST is not set, there is no change to
        // the operation, just stop using the Xor.
        if (!XorCST->getValue().isNegative()) {
          ICI.setOperand(0, CompareVal);
          Worklist.Add(LHSI);
          return &ICI;
        }

        // Was the old condition true if the operand is positive?
        bool isTrueIfPositive = ICI.getPredicate() == ICmpInst::ICMP_SGT;

        // If so, the new one isn't.
        isTrueIfPositive ^= true;

        // A sign-bit xor flips the sense of the sign test, so the folded
        // compare uses an off-by-one constant in the opposite direction.
        if (isTrueIfPositive)
          return new ICmpInst(ICmpInst::ICMP_SGT, CompareVal,
                              SubOne(RHS));
        else
          return new ICmpInst(ICmpInst::ICMP_SLT, CompareVal,
                              AddOne(RHS));
      }

      if (LHSI->hasOneUse()) {
        // (icmp u/s (xor A SignBit), C) -> (icmp s/u A, (xor C SignBit))
        if (!ICI.isEquality() && XorCST->getValue().isSignBit()) {
          const APInt &SignBit = XorCST->getValue();
          ICmpInst::Predicate Pred = ICI.isSigned()
                                         ? ICI.getUnsignedPredicate()
                                         : ICI.getSignedPredicate();
          return new ICmpInst(Pred, LHSI->getOperand(0),
                              ConstantInt::get(ICI.getContext(),
                                               RHSV ^ SignBit));
        }

        // (icmp u/s (xor A ~SignBit), C) -> (icmp s/u (xor C ~SignBit), A)
        if (!ICI.isEquality() && XorCST->getValue().isMaxSignedValue()) {
          const APInt &NotSignBit = XorCST->getValue();
          ICmpInst::Predicate Pred = ICI.isSigned()
                                         ? ICI.getUnsignedPredicate()
                                         : ICI.getSignedPredicate();
          Pred = ICI.getSwappedPredicate(Pred);
          return new ICmpInst(Pred, LHSI->getOperand(0),
                              ConstantInt::get(ICI.getContext(),
                                               RHSV ^ NotSignBit));
        }
      }
    }
    break;
  case Instruction::And:         // (icmp pred (and X, AndCST), RHS)
    if (LHSI->hasOneUse() && isa<ConstantInt>(LHSI->getOperand(1)) &&
        LHSI->getOperand(0)->hasOneUse()) {
      ConstantInt *AndCST = cast<ConstantInt>(LHSI->getOperand(1));

      // If the LHS is an AND of a truncating cast, we can widen the
      // and/compare to be the input width without changing the value
      // produced, eliminating a cast.
      if (TruncInst *Cast = dyn_cast<TruncInst>(LHSI->getOperand(0))) {
        // We can do this transformation if either the AND constant does not
        // have its sign bit set or if it is an equality comparison.
        // Extending a relational comparison when we're checking the sign
        // bit would not work.
        if (Cast->hasOneUse() &&
            (ICI.isEquality() ||
             (AndCST->getValue().isNonNegative() && RHSV.isNonNegative()))) {
          uint32_t BitWidth =
            cast<IntegerType>(Cast->getOperand(0)->getType())->getBitWidth();
          APInt NewCST = AndCST->getValue().zext(BitWidth);
          APInt NewCI = RHSV.zext(BitWidth);
          Value *NewAnd =
            Builder->CreateAnd(Cast->getOperand(0),
                               ConstantInt::get(ICI.getContext(), NewCST),
                               LHSI->getName());
          return new ICmpInst(ICI.getPredicate(), NewAnd,
                              ConstantInt::get(ICI.getContext(), NewCI));
        }
      }

      // If this is: (X >> C1) & C2 != C3 (where any shift and any compare
      // could exist), turn it into (X & (C2 << C1)) != (C3 << C1).  This
      // happens a LOT in code produced by the C front-end, for bitfield
      // access.
      BinaryOperator *Shift = dyn_cast<BinaryOperator>(LHSI->getOperand(0));
      if (Shift && !Shift->isShift())
        Shift = 0;

      ConstantInt *ShAmt;
      ShAmt = Shift ? dyn_cast<ConstantInt>(Shift->getOperand(1)) : 0;
      const Type *Ty = Shift ? Shift->getType() : 0;  // Type of the shift.
      const Type *AndTy = AndCST->getType();          // Type of the and.

      // We can fold this as long as we can't shift unknown bits
      // into the mask.  This can only happen with signed shift
      // rights, as they sign-extend.
      if (ShAmt) {
        bool CanFold = Shift->isLogicalShift();
        if (!CanFold) {
          // To test for the bad case of the signed shr, see if any
          // of the bits shifted in could be tested after the mask.
          uint32_t TyBits = Ty->getPrimitiveSizeInBits();
          int ShAmtVal = TyBits - ShAmt->getLimitedValue(TyBits);

          uint32_t BitWidth = AndTy->getPrimitiveSizeInBits();
          if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) &
               AndCST->getValue()) == 0)
            CanFold = true;
        }

        if (CanFold) {
          Constant *NewCst;
          if (Shift->getOpcode() == Instruction::Shl)
            NewCst = ConstantExpr::getLShr(RHS, ShAmt);
          else
            NewCst = ConstantExpr::getShl(RHS, ShAmt);

          // Check to see if we are shifting out any of the bits being
          // compared.
          if (ConstantExpr::get(Shift->getOpcode(),
                                NewCst, ShAmt) != RHS) {
            // If we shifted bits out, the fold is not going to work out.
            // As a special case, check to see if this means that the
            // result is always true or false now.
            if (ICI.getPredicate() == ICmpInst::ICMP_EQ)
              return ReplaceInstUsesWith(ICI,
                                       ConstantInt::getFalse(ICI.getContext()));
            if (ICI.getPredicate() == ICmpInst::ICMP_NE)
              return ReplaceInstUsesWith(ICI,
                                        ConstantInt::getTrue(ICI.getContext()));
          } else {
            ICI.setOperand(1, NewCst);
            Constant *NewAndCST;
            if (Shift->getOpcode() == Instruction::Shl)
              NewAndCST = ConstantExpr::getLShr(AndCST, ShAmt);
            else
              NewAndCST = ConstantExpr::getShl(AndCST, ShAmt);
            LHSI->setOperand(1, NewAndCST);
            LHSI->setOperand(0, Shift->getOperand(0));
            Worklist.Add(Shift); // Shift is dead.
            return &ICI;
          }
        }
      }

      // Turn ((X >> Y) & C) == 0  into  (X & (C << Y)) == 0.  The later is
      // preferable because it allows the C<<Y expression to be hoisted out
      // of a loop if Y is invariant and X is not.
      if (Shift && Shift->hasOneUse() && RHSV == 0 &&
          ICI.isEquality() && !Shift->isArithmeticShift() &&
          !isa<Constant>(Shift->getOperand(0))) {
        // Compute C << Y.
        Value *NS;
        if (Shift->getOpcode() == Instruction::LShr) {
          NS = Builder->CreateShl(AndCST, Shift->getOperand(1), "tmp");
        } else {
          // Insert a logical shift.
          NS = Builder->CreateLShr(AndCST, Shift->getOperand(1), "tmp");
        }

        // Compute X & (C << Y).
        Value *NewAnd =
          Builder->CreateAnd(Shift->getOperand(0), NS, LHSI->getName());

        ICI.setOperand(0, NewAnd);
        return &ICI;
      }
    }

    // Try to optimize things like "A[i]&42 == 0" to index computations.
    if (LoadInst *LI = dyn_cast<LoadInst>(LHSI->getOperand(0))) {
      if (GetElementPtrInst *GEP =
            dyn_cast<GetElementPtrInst>(LI->getOperand(0)))
        if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
          if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
              !LI->isVolatile() && isa<ConstantInt>(LHSI->getOperand(1))) {
            ConstantInt *C = cast<ConstantInt>(LHSI->getOperand(1));
            if (Instruction *Res = FoldCmpLoadFromIndexedGlobal(GEP, GV,ICI, C))
              return Res;
          }
    }
    break;

  case Instruction::Or: {
    // Only icmp eq/ne (or ...), 0 with a one-use or is handled here.
    if (!ICI.isEquality() || !RHS->isNullValue() || !LHSI->hasOneUse())
      break;
    Value *P, *Q;
    if (match(LHSI, m_Or(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Value(Q))))) {
      // Simplify icmp eq (or (ptrtoint P), (ptrtoint Q)), 0
      // -> and (icmp eq P, null), (icmp eq Q, null).
      // (The ne form symmetrically becomes an or of null-inequality tests.)
      Value *ICIP = Builder->CreateICmp(ICI.getPredicate(), P,
                                        Constant::getNullValue(P->getType()));
      Value *ICIQ = Builder->CreateICmp(ICI.getPredicate(), Q,
                                        Constant::getNullValue(Q->getType()));
      Instruction *Op;
      if (ICI.getPredicate() == ICmpInst::ICMP_EQ)
        Op = BinaryOperator::CreateAnd(ICIP, ICIQ);
      else
        Op = BinaryOperator::CreateOr(ICIP, ICIQ);
      return Op;
    }
    break;
  }

  case Instruction::Shl: {       // (icmp pred (shl X, ShAmt), CI)
    ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1));
    if (!ShAmt) break;

    uint32_t TypeBits = RHSV.getBitWidth();

    // Check that the shift amount is in range.  If not, don't perform
    // undefined shifts.  When the shift is visited it will be
    // simplified.
    if (ShAmt->uge(TypeBits))
      break;

    if (ICI.isEquality()) {
      // If we are comparing against bits always shifted out, the
      // comparison cannot succeed.
      Constant *Comp =
        ConstantExpr::getShl(ConstantExpr::getLShr(RHS, ShAmt),
                             ShAmt);
      if (Comp != RHS) {// Comparing against a bit that we know is zero.
        bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
        Constant *Cst =
          ConstantInt::get(Type::getInt1Ty(ICI.getContext()), IsICMP_NE);
        return ReplaceInstUsesWith(ICI, Cst);
      }

      // If the shift is NUW, then it is just shifting out zeros, no need for
      // an AND.
      if (cast<BinaryOperator>(LHSI)->hasNoUnsignedWrap())
        return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
                            ConstantExpr::getLShr(RHS, ShAmt));

      if (LHSI->hasOneUse()) {
        // Otherwise strength reduce the shift into an and.
        uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
        Constant *Mask =
          ConstantInt::get(ICI.getContext(), APInt::getLowBitsSet(TypeBits,
                                                       TypeBits-ShAmtVal));

        Value *And =
          Builder->CreateAnd(LHSI->getOperand(0),Mask, LHSI->getName()+".mask");
        return new ICmpInst(ICI.getPredicate(), And,
                            ConstantExpr::getLShr(RHS, ShAmt));
      }
    }

    // Otherwise, if this is a comparison of the sign bit, simplify to and/test.
    bool TrueIfSigned = false;
    if (LHSI->hasOneUse() &&
        isSignBitCheck(ICI.getPredicate(), RHS, TrueIfSigned)) {
      // (X << 31) <s 0  --> (X&1) != 0
      Constant *Mask = ConstantInt::get(LHSI->getOperand(0)->getType(),
                                        APInt::getOneBitSet(TypeBits,
                                            TypeBits-ShAmt->getZExtValue()-1));
      Value *And =
        Builder->CreateAnd(LHSI->getOperand(0), Mask, LHSI->getName()+".mask");
      return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
                          And, Constant::getNullValue(And->getType()));
    }
    break;
  }

  case Instruction::LShr:        // (icmp pred (shr X, ShAmt), CI)
  case Instruction::AShr:
    // Only handle equality comparisons of shift-by-constant.
    if (ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1)))
      if (Instruction *Res = FoldICmpShrCst(ICI, cast<BinaryOperator>(LHSI),
                                            ShAmt))
        return Res;
    break;

  case Instruction::SDiv:
  case Instruction::UDiv:
    // Fold: icmp pred ([us]div X, C1), C2 -> range test
    // Fold this div into the comparison, producing a range check.
    // Determine, based on the divide type, what the range is being
    // checked.  If there is an overflow on the low or high side, remember
    // it, otherwise compute the range [low, hi) bounding the new value.
    // See: InsertRangeTest above for the kinds of replacements possible.
    if (ConstantInt *DivRHS = dyn_cast<ConstantInt>(LHSI->getOperand(1)))
      if (Instruction *R = FoldICmpDivCst(ICI, cast<BinaryOperator>(LHSI),
                                          DivRHS))
        return R;
    break;

  case Instruction::Add:
    // Fold: icmp pred (add X, C1), C2
    if (!ICI.isEquality()) {
      ConstantInt *LHSC = dyn_cast<ConstantInt>(LHSI->getOperand(1));
      if (!LHSC) break;
      const APInt &LHSV = LHSC->getValue();

      // Shift the constant range implied by the predicate down by the added
      // constant, then pick the predicate whose boundary matches.
      ConstantRange CR = ICI.makeConstantRange(ICI.getPredicate(), RHSV)
                            .subtract(LHSV);

      if (ICI.isSigned()) {
        if (CR.getLower().isSignBit()) {
          return new ICmpInst(ICmpInst::ICMP_SLT, LHSI->getOperand(0),
                              ConstantInt::get(ICI.getContext(),CR.getUpper()));
        } else if (CR.getUpper().isSignBit()) {
          return new ICmpInst(ICmpInst::ICMP_SGE, LHSI->getOperand(0),
                              ConstantInt::get(ICI.getContext(),CR.getLower()));
        }
      } else {
        if (CR.getLower().isMinValue()) {
          return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0),
                              ConstantInt::get(ICI.getContext(),CR.getUpper()));
        } else if (CR.getUpper().isMinValue()) {
          return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0),
                              ConstantInt::get(ICI.getContext(),CR.getLower()));
        }
      }
    }
    break;
  }

  // Simplify icmp_eq and icmp_ne instructions with integer constant RHS.
  if (ICI.isEquality()) {
    bool isICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;

    // If the first operand is (add|sub|and|or|xor|rem) with a constant, and
    // the second operand is a constant, simplify a bit.
    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(LHSI)) {
      switch (BO->getOpcode()) {
      case Instruction::SRem:
        // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
        if (RHSV == 0 && isa<ConstantInt>(BO->getOperand(1)) &&BO->hasOneUse()){
          const APInt &V = cast<ConstantInt>(BO->getOperand(1))->getValue();
          if (V.sgt(1) && V.isPowerOf2()) {
            Value *NewRem =
              Builder->CreateURem(BO->getOperand(0), BO->getOperand(1),
                                  BO->getName());
            return new ICmpInst(ICI.getPredicate(), NewRem,
                                Constant::getNullValue(BO->getType()));
          }
        }
        break;
      case Instruction::Add:
        // Replace ((add A, B) != C) with (A != C-B) if B & C are constants.
        if (ConstantInt *BOp1C = dyn_cast<ConstantInt>(BO->getOperand(1))) {
          if (BO->hasOneUse())
            return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
                                ConstantExpr::getSub(RHS, BOp1C));
        } else if (RHSV == 0) {
          // Replace ((add A, B) != 0) with (A != -B) if A or B is
          // efficiently invertible, or if the add has just this one use.
          Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1);

          if (Value *NegVal = dyn_castNegVal(BOp1))
            return new ICmpInst(ICI.getPredicate(), BOp0, NegVal);
          else if (Value *NegVal = dyn_castNegVal(BOp0))
            return new ICmpInst(ICI.getPredicate(), NegVal, BOp1);
          else if (BO->hasOneUse()) {
            Value *Neg = Builder->CreateNeg(BOp1);
            Neg->takeName(BO);
            return new ICmpInst(ICI.getPredicate(), BOp0, Neg);
          }
        }
        break;
      case Instruction::Xor:
        // For the xor case, we can xor two constants together, eliminating
        // the explicit xor.
        if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1)))
          return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
                              ConstantExpr::getXor(RHS, BOC));

        // FALLTHROUGH
      case Instruction::Sub:
        // Replace (([sub|xor] A, B) != 0) with (A != B)
        if (RHSV == 0)
          return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
                              BO->getOperand(1));
        break;

      case Instruction::Or:
        // If bits are being or'd in that are not present in the constant we
        // are comparing against, then the comparison could never succeed!
        if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
          Constant *NotCI = ConstantExpr::getNot(RHS);
          if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue())
            return ReplaceInstUsesWith(ICI,
                             ConstantInt::get(Type::getInt1Ty(ICI.getContext()),
                                       isICMP_NE));
        }
        break;

      case Instruction::And:
        if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
          // If bits are being compared against that are and'd out, then the
          // comparison can never succeed!
          if ((RHSV & ~BOC->getValue()) != 0)
            return ReplaceInstUsesWith(ICI,
                             ConstantInt::get(Type::getInt1Ty(ICI.getContext()),
                                       isICMP_NE));

          // If we have ((X & C) == C), turn it into ((X & C) != 0).
          if (RHS == BOC && RHSV.isPowerOf2())
            return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ :
                                ICmpInst::ICMP_NE, LHSI,
                                Constant::getNullValue(RHS->getType()));

          // Replace (and X, (1 << size(X)-1) != 0) with x s< 0
          if (BOC->getValue().isSignBit()) {
            Value *X = BO->getOperand(0);
            Constant *Zero = Constant::getNullValue(X->getType());
            ICmpInst::Predicate pred = isICMP_NE ?
              ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
            return new ICmpInst(pred, X, Zero);
          }

          // ((X & ~7) == 0) --> X < 8
          if (RHSV == 0 && isHighOnes(BOC)) {
            Value *X = BO->getOperand(0);
            Constant *NegX = ConstantExpr::getNeg(BOC);
            ICmpInst::Predicate pred = isICMP_NE ?
              ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
            return new ICmpInst(pred, X, NegX);
          }
        }
        // FALL THROUGH (no fold applied; default just breaks).
      default: break;
      }
    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(LHSI)) {
      // Handle icmp {eq|ne} <intrinsic>, intcst.
      switch (II->getIntrinsicID()) {
      case Intrinsic::bswap:
        // Byte-swap both sides: icmp eq (bswap x), C --> icmp eq x, bswap(C).
        Worklist.Add(II);
        ICI.setOperand(0, II->getArgOperand(0));
        ICI.setOperand(1, ConstantInt::get(II->getContext(), RHSV.byteSwap()));
        return &ICI;
      case Intrinsic::ctlz:
      case Intrinsic::cttz:
        // ctz(A) == bitwidth(a)  ->  A == 0 and likewise for !=
        if (RHSV == RHS->getType()->getBitWidth()) {
          Worklist.Add(II);
          ICI.setOperand(0, II->getArgOperand(0));
          ICI.setOperand(1, ConstantInt::get(RHS->getType(), 0));
          return &ICI;
        }
        break;
      case Intrinsic::ctpop:
        // popcount(A) == 0  ->  A == 0 and likewise for !=
        if (RHS->isZero()) {
          Worklist.Add(II);
          ICI.setOperand(0, II->getArgOperand(0));
          ICI.setOperand(1, RHS);
          return &ICI;
        }
        break;
      default:
        break;
      }
    }
  }
  return 0;
}
+ switch (II->getIntrinsicID()) { + case Intrinsic::bswap: + Worklist.Add(II); + ICI.setOperand(0, II->getArgOperand(0)); + ICI.setOperand(1, ConstantInt::get(II->getContext(), RHSV.byteSwap())); + return &ICI; + case Intrinsic::ctlz: + case Intrinsic::cttz: + // ctz(A) == bitwidth(a) -> A == 0 and likewise for != + if (RHSV == RHS->getType()->getBitWidth()) { + Worklist.Add(II); + ICI.setOperand(0, II->getArgOperand(0)); + ICI.setOperand(1, ConstantInt::get(RHS->getType(), 0)); + return &ICI; + } + break; + case Intrinsic::ctpop: + // popcount(A) == 0 -> A == 0 and likewise for != + if (RHS->isZero()) { + Worklist.Add(II); + ICI.setOperand(0, II->getArgOperand(0)); + ICI.setOperand(1, RHS); + return &ICI; + } + break; + default: + break; + } + } + } + return 0; +} + +/// visitICmpInstWithCastAndCast - Handle icmp (cast x to y), (cast/cst). +/// We only handle extending casts so far. +/// +Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { + const CastInst *LHSCI = cast<CastInst>(ICI.getOperand(0)); + Value *LHSCIOp = LHSCI->getOperand(0); + const Type *SrcTy = LHSCIOp->getType(); + const Type *DestTy = LHSCI->getType(); + Value *RHSCIOp; + + // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the + // integer type is the same size as the pointer type. + if (TD && LHSCI->getOpcode() == Instruction::PtrToInt && + TD->getPointerSizeInBits() == + cast<IntegerType>(DestTy)->getBitWidth()) { + Value *RHSOp = 0; + if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) { + RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy); + } else if (PtrToIntInst *RHSC = dyn_cast<PtrToIntInst>(ICI.getOperand(1))) { + RHSOp = RHSC->getOperand(0); + // If the pointer types don't match, insert a bitcast. 
+ if (LHSCIOp->getType() != RHSOp->getType()) + RHSOp = Builder->CreateBitCast(RHSOp, LHSCIOp->getType()); + } + + if (RHSOp) + return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSOp); + } + + // The code below only handles extension cast instructions, so far. + // Enforce this. + if (LHSCI->getOpcode() != Instruction::ZExt && + LHSCI->getOpcode() != Instruction::SExt) + return 0; + + bool isSignedExt = LHSCI->getOpcode() == Instruction::SExt; + bool isSignedCmp = ICI.isSigned(); + + if (CastInst *CI = dyn_cast<CastInst>(ICI.getOperand(1))) { + // Not an extension from the same type? + RHSCIOp = CI->getOperand(0); + if (RHSCIOp->getType() != LHSCIOp->getType()) + return 0; + + // If the signedness of the two casts doesn't agree (i.e. one is a sext + // and the other is a zext), then we can't handle this. + if (CI->getOpcode() != LHSCI->getOpcode()) + return 0; + + // Deal with equality cases early. + if (ICI.isEquality()) + return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSCIOp); + + // A signed comparison of sign extended values simplifies into a + // signed comparison. + if (isSignedCmp && isSignedExt) + return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSCIOp); + + // The other three cases all fold into an unsigned comparison. + return new ICmpInst(ICI.getUnsignedPredicate(), LHSCIOp, RHSCIOp); + } + + // If we aren't dealing with a constant on the RHS, exit early + ConstantInt *CI = dyn_cast<ConstantInt>(ICI.getOperand(1)); + if (!CI) + return 0; + + // Compute the constant that would happen if we truncated to SrcTy then + // reextended to DestTy. + Constant *Res1 = ConstantExpr::getTrunc(CI, SrcTy); + Constant *Res2 = ConstantExpr::getCast(LHSCI->getOpcode(), + Res1, DestTy); + + // If the re-extended constant didn't change... + if (Res2 == CI) { + // Deal with equality cases early. + if (ICI.isEquality()) + return new ICmpInst(ICI.getPredicate(), LHSCIOp, Res1); + + // A signed comparison of sign extended values simplifies into a + // signed comparison. 
/// ProcessUGT_ADDCST_ADD - The caller has matched a pattern of the form:
///   I = icmp ugt (add (add A, B), CI2), CI1
/// If this is of the form:
///   sum = a + b
///   if (sum+128 >u 255)
/// Then replace it with llvm.sadd.with.overflow.i8.
///
/// Returns the replacement for the compare (the overflow bit extraction),
/// or null if the pattern can't be narrowed profitably.
static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
                                          ConstantInt *CI2, ConstantInt *CI1,
                                          InstCombiner &IC) {
  // The transformation we're trying to do here is to transform this into an
  // llvm.sadd.with.overflow.  To do this, we have to replace the original add
  // with a narrower add, and discard the add-with-constant that is part of the
  // range check (if we can't eliminate it, this isn't profitable).

  // In order to eliminate the add-with-constant, the compare can be its only
  // use.
  Instruction *AddWithCst = cast<Instruction>(I.getOperand(0));
  if (!AddWithCst->hasOneUse()) return 0;

  // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow.
  if (!CI2->getValue().isPowerOf2()) return 0;
  unsigned NewWidth = CI2->getValue().countTrailingZeros();
  if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31) return 0;

  // The width of the new add formed is 1 more than the bias.
  ++NewWidth;

  // Check to see that CI1 is an all-ones value with NewWidth bits.  (Also
  // reject the case where the original type is already NewWidth wide, since
  // then there is nothing to narrow.)
  if (CI1->getBitWidth() == NewWidth ||
      CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth))
    return 0;

  // In order to replace the original add with a narrower
  // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant
  // and truncates that discard the high bits of the add.  Verify that this is
  // the case.
  Instruction *OrigAdd = cast<Instruction>(AddWithCst->getOperand(0));
  for (Value::use_iterator UI = OrigAdd->use_begin(), E = OrigAdd->use_end();
       UI != E; ++UI) {
    if (*UI == AddWithCst) continue;

    // Only accept truncates for now.  We would really like a nice recursive
    // predicate like SimplifyDemandedBits, but which goes downwards the use-def
    // chain to see which bits of a value are actually demanded.  If the
    // original add had another add which was then immediately truncated, we
    // could still do the transformation.
    TruncInst *TI = dyn_cast<TruncInst>(*UI);
    if (TI == 0 ||
        TI->getType()->getPrimitiveSizeInBits() > NewWidth) return 0;
  }

  // If the pattern matches, truncate the inputs to the narrower type and
  // use the sadd_with_overflow intrinsic to efficiently compute both the
  // result and the overflow bit.
  Module *M = I.getParent()->getParent()->getParent();

  const Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth);
  Value *F = Intrinsic::getDeclaration(M, Intrinsic::sadd_with_overflow,
                                       &NewType, 1);

  InstCombiner::BuilderTy *Builder = IC.Builder;

  // Put the new code above the original add, in case there are any uses of the
  // add between the add and the compare.
  Builder->SetInsertPoint(OrigAdd);

  Value *TruncA = Builder->CreateTrunc(A, NewType, A->getName()+".trunc");
  Value *TruncB = Builder->CreateTrunc(B, NewType, B->getName()+".trunc");
  CallInst *Call = Builder->CreateCall2(F, TruncA, TruncB, "sadd");
  Value *Add = Builder->CreateExtractValue(Call, 0, "sadd.result");
  Value *ZExt = Builder->CreateZExt(Add, OrigAdd->getType());

  // The inner add was the result of the narrow add, zero extended to the
  // wider type.  Replace it with the result computed by the intrinsic.
  IC.ReplaceInstUsesWith(*OrigAdd, ZExt);

  // The original icmp gets replaced with the overflow value.
  return ExtractValueInst::Create(Call, 1, "sadd.overflow");
}
+ Module *M = I.getParent()->getParent()->getParent(); + + const Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth); + Value *F = Intrinsic::getDeclaration(M, Intrinsic::sadd_with_overflow, + &NewType, 1); + + InstCombiner::BuilderTy *Builder = IC.Builder; + + // Put the new code above the original add, in case there are any uses of the + // add between the add and the compare. + Builder->SetInsertPoint(OrigAdd); + + Value *TruncA = Builder->CreateTrunc(A, NewType, A->getName()+".trunc"); + Value *TruncB = Builder->CreateTrunc(B, NewType, B->getName()+".trunc"); + CallInst *Call = Builder->CreateCall2(F, TruncA, TruncB, "sadd"); + Value *Add = Builder->CreateExtractValue(Call, 0, "sadd.result"); + Value *ZExt = Builder->CreateZExt(Add, OrigAdd->getType()); + + // The inner add was the result of the narrow add, zero extended to the + // wider type. Replace it with the result computed by the intrinsic. + IC.ReplaceInstUsesWith(*OrigAdd, ZExt); + + // The original icmp gets replaced with the overflow value. + return ExtractValueInst::Create(Call, 1, "sadd.overflow"); +} + +static Instruction *ProcessUAddIdiom(Instruction &I, Value *OrigAddV, + InstCombiner &IC) { + // Don't bother doing this transformation for pointers, don't do it for + // vectors. + if (!isa<IntegerType>(OrigAddV->getType())) return 0; + + // If the add is a constant expr, then we don't bother transforming it. + Instruction *OrigAdd = dyn_cast<Instruction>(OrigAddV); + if (OrigAdd == 0) return 0; + + Value *LHS = OrigAdd->getOperand(0), *RHS = OrigAdd->getOperand(1); + + // Put the new code above the original add, in case there are any uses of the + // add between the add and the compare. 
  // Build the uadd.with.overflow call right before the original add so the
  // extracted sum can take its place.
  InstCombiner::BuilderTy *Builder = IC.Builder;
  Builder->SetInsertPoint(OrigAdd);

  Module *M = I.getParent()->getParent()->getParent();
  const Type *Ty = LHS->getType();
  Value *F = Intrinsic::getDeclaration(M, Intrinsic::uadd_with_overflow, &Ty,1);
  CallInst *Call = Builder->CreateCall2(F, LHS, RHS, "uadd");
  Value *Add = Builder->CreateExtractValue(Call, 0);

  IC.ReplaceInstUsesWith(*OrigAdd, Add);

  // The original icmp gets replaced with the overflow value.
  return ExtractValueInst::Create(Call, 1, "uadd.overflow");
}

// DemandedBitsLHSMask - When performing a comparison against a constant,
// it is possible that not all the bits in the LHS are demanded. This helper
// method computes the mask that IS demanded.
static APInt DemandedBitsLHSMask(ICmpInst &I,
                                 unsigned BitWidth, bool isSignCheck) {
  // A sign-bit test only ever inspects the top bit.
  if (isSignCheck)
    return APInt::getSignBit(BitWidth);

  // Without a constant RHS we cannot narrow the demanded set at all.
  ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(1));
  if (!CI) return APInt::getAllOnesValue(BitWidth);
  const APInt &RHS = CI->getValue();

  switch (I.getPredicate()) {
  // For a UGT comparison, we don't care about any bits that
  // correspond to the trailing ones of the comparand.  The value of these
  // bits doesn't impact the outcome of the comparison, because any value
  // greater than the RHS must differ in a bit higher than these due to carry.
  case ICmpInst::ICMP_UGT: {
    unsigned trailingOnes = RHS.countTrailingOnes();
    APInt lowBitsSet = APInt::getLowBitsSet(BitWidth, trailingOnes);
    return ~lowBitsSet;
  }

  // Similarly, for a ULT comparison, we don't care about the trailing zeros.
  // Any value less than the RHS must differ in a higher bit because of carries.
  case ICmpInst::ICMP_ULT: {
    unsigned trailingZeros = RHS.countTrailingZeros();
    APInt lowBitsSet = APInt::getLowBitsSet(BitWidth, trailingZeros);
    return ~lowBitsSet;
  }

  // Any other predicate demands every bit of the LHS.
  default:
    return APInt::getAllOnesValue(BitWidth);
  }

}

// visitICmpInst - Main entry point for canonicalizing and folding integer and
// pointer compares.  Returns a replacement instruction, the (mutated) original
// instruction, or null when no transformation applies.
Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
  bool Changed = false;
  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);

  /// Orders the operands of the compare so that they are listed from most
  /// complex to least complex.  This puts constants before unary operators,
  /// before binary operators.
  if (getComplexity(Op0) < getComplexity(Op1)) {
    I.swapOperands();
    std::swap(Op0, Op1);
    Changed = true;
  }

  if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, TD))
    return ReplaceInstUsesWith(I, V);

  const Type *Ty = Op0->getType();

  // icmp's with boolean values can always be turned into bitwise operations
  if (Ty->isIntegerTy(1)) {
    switch (I.getPredicate()) {
    default: llvm_unreachable("Invalid icmp instruction!");
    case ICmpInst::ICMP_EQ: {               // icmp eq i1 A, B -> ~(A^B)
      Value *Xor = Builder->CreateXor(Op0, Op1, I.getName()+"tmp");
      return BinaryOperator::CreateNot(Xor);
    }
    case ICmpInst::ICMP_NE:                 // icmp eq i1 A, B -> A^B
      return BinaryOperator::CreateXor(Op0, Op1);

    case ICmpInst::ICMP_UGT:
      std::swap(Op0, Op1);                  // Change icmp ugt -> icmp ult
      // FALL THROUGH
    case ICmpInst::ICMP_ULT:{               // icmp ult i1 A, B -> ~A & B
      Value *Not = Builder->CreateNot(Op0, I.getName()+"tmp");
      return BinaryOperator::CreateAnd(Not, Op1);
    }
    case ICmpInst::ICMP_SGT:
      std::swap(Op0, Op1);                  // Change icmp sgt -> icmp slt
      // FALL THROUGH
    case ICmpInst::ICMP_SLT: {              // icmp slt i1 A, B -> A & ~B
      Value *Not = Builder->CreateNot(Op1, I.getName()+"tmp");
      return BinaryOperator::CreateAnd(Not, Op0);
    }
    case ICmpInst::ICMP_UGE:
      std::swap(Op0, Op1);                  // Change icmp uge -> icmp ule
      // FALL THROUGH
    case ICmpInst::ICMP_ULE: {              // icmp ule i1 A, B -> ~A | B
      Value *Not = Builder->CreateNot(Op0, I.getName()+"tmp");
      return BinaryOperator::CreateOr(Not, Op1);
    }
    case ICmpInst::ICMP_SGE:
      std::swap(Op0, Op1);                  // Change icmp sge -> icmp sle
      // FALL THROUGH
    case ICmpInst::ICMP_SLE: {              // icmp sle i1 A, B -> A | ~B
      Value *Not = Builder->CreateNot(Op1, I.getName()+"tmp");
      return BinaryOperator::CreateOr(Not, Op0);
    }
    }
  }

  // Determine the bit width for the known-bits analysis below.  Vectors use
  // the element width; pointer widths are only available with target data.
  unsigned BitWidth = 0;
  if (Ty->isIntOrIntVectorTy())
    BitWidth = Ty->getScalarSizeInBits();
  else if (TD)  // Pointers require TD info to get their size.
    BitWidth = TD->getTypeSizeInBits(Ty->getScalarType());

  bool isSignBit = false;

  // See if we are doing a comparison with a constant.
  if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
    Value *A = 0, *B = 0;

    // Match the following pattern, which is a common idiom when writing
    // overflow-safe integer arithmetic function.  The source performs an
    // addition in wider type, and explicitly checks for overflow using
    // comparisons against INT_MIN and INT_MAX.  Simplify this by using the
    // sadd_with_overflow intrinsic.
    //
    // TODO: This could probably be generalized to handle other overflow-safe
    // operations if we worked out the formulas to compute the appropriate
    // magic constants.
    //
    // sum = a + b
    // if (sum+128 >u 255)  ...  -> llvm.sadd.with.overflow.i8
    {
    ConstantInt *CI2;    // I = icmp ugt (add (add A, B), CI2), CI
    if (I.getPredicate() == ICmpInst::ICMP_UGT &&
        match(Op0, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2))))
      if (Instruction *Res = ProcessUGT_ADDCST_ADD(I, A, B, CI2, CI, *this))
        return Res;
    }

    // (icmp ne/eq (sub A B) 0) -> (icmp ne/eq A, B)
    if (I.isEquality() && CI->isZero() &&
        match(Op0, m_Sub(m_Value(A), m_Value(B)))) {
      // (icmp cond A B) if cond is equality
      return new ICmpInst(I.getPredicate(), A, B);
    }

    // If we have an icmp le or icmp ge instruction, turn it into the
    // appropriate icmp lt or icmp gt instruction.  This allows us to rely on
    // them being folded in the code below.  The SimplifyICmpInst code has
    // already handled the edge cases for us, so we just assert on them.
    switch (I.getPredicate()) {
    default: break;
    case ICmpInst::ICMP_ULE:
      assert(!CI->isMaxValue(false));                 // A <=u MAX -> TRUE
      return new ICmpInst(ICmpInst::ICMP_ULT, Op0,
                          ConstantInt::get(CI->getContext(), CI->getValue()+1));
    case ICmpInst::ICMP_SLE:
      assert(!CI->isMaxValue(true));                  // A <=s MAX -> TRUE
      return new ICmpInst(ICmpInst::ICMP_SLT, Op0,
                          ConstantInt::get(CI->getContext(), CI->getValue()+1));
    case ICmpInst::ICMP_UGE:
      assert(!CI->isMinValue(false));                 // A >=u MIN -> TRUE
      return new ICmpInst(ICmpInst::ICMP_UGT, Op0,
                          ConstantInt::get(CI->getContext(), CI->getValue()-1));
    case ICmpInst::ICMP_SGE:
      assert(!CI->isMinValue(true));                  // A >=s MIN -> TRUE
      return new ICmpInst(ICmpInst::ICMP_SGT, Op0,
                          ConstantInt::get(CI->getContext(), CI->getValue()-1));
    }

    // If this comparison is a normal comparison, it demands all
    // bits, if it is a sign bit comparison, it only demands the sign bit.
    bool UnusedBit;
    isSignBit = isSignBitCheck(I.getPredicate(), CI, UnusedBit);
  }

  // See if we can fold the comparison based on range information we can get
  // by checking whether bits are known to be zero or one in the input.
  if (BitWidth != 0) {
    APInt Op0KnownZero(BitWidth, 0), Op0KnownOne(BitWidth, 0);
    APInt Op1KnownZero(BitWidth, 0), Op1KnownOne(BitWidth, 0);

    if (SimplifyDemandedBits(I.getOperandUse(0),
                             DemandedBitsLHSMask(I, BitWidth, isSignBit),
                             Op0KnownZero, Op0KnownOne, 0))
      return &I;
    if (SimplifyDemandedBits(I.getOperandUse(1),
                             APInt::getAllOnesValue(BitWidth),
                             Op1KnownZero, Op1KnownOne, 0))
      return &I;

    // Given the known and unknown bits, compute a range that the LHS could be
    // in.  Compute the Min, Max and RHS values based on the known bits. For the
    // EQ and NE we use unsigned values.
    APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0);
    APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0);
    if (I.isSigned()) {
      ComputeSignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne,
                                             Op0Min, Op0Max);
      ComputeSignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne,
                                             Op1Min, Op1Max);
    } else {
      ComputeUnsignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne,
                                               Op0Min, Op0Max);
      ComputeUnsignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne,
                                               Op1Min, Op1Max);
    }

    // If Min and Max are known to be the same, then SimplifyDemandedBits
    // figured out that the LHS is a constant.  Just constant fold this now so
    // that code below can assume that Min != Max.
    if (!isa<Constant>(Op0) && Op0Min == Op0Max)
      return new ICmpInst(I.getPredicate(),
                          ConstantInt::get(I.getContext(), Op0Min), Op1);
    if (!isa<Constant>(Op1) && Op1Min == Op1Max)
      return new ICmpInst(I.getPredicate(), Op0,
                          ConstantInt::get(I.getContext(), Op1Min));

    // Based on the range information we know about the LHS, see if we can
    // simplify this comparison.  For example, (x&4) < 8 is always true.
    switch (I.getPredicate()) {
    default: llvm_unreachable("Unknown icmp opcode!");
    case ICmpInst::ICMP_EQ: {
      // Disjoint ranges can never be equal.
      if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));

      // If all bits are known zero except for one, then we know at most one
      // bit is set.   If the comparison is against zero, then this is a check
      // to see if *that* bit is set.
      APInt Op0KnownZeroInverted = ~Op0KnownZero;
      if (~Op1KnownZero == 0 && Op0KnownZeroInverted.isPowerOf2()) {
        // If the LHS is an AND with the same constant, look through it.
        Value *LHS = 0;
        ConstantInt *LHSC = 0;
        if (!match(Op0, m_And(m_Value(LHS), m_ConstantInt(LHSC))) ||
            LHSC->getValue() != Op0KnownZeroInverted)
          LHS = Op0;

        // If the LHS is 1 << x, and we know the result is a power of 2 like 8,
        // then turn "((1 << x)&8) == 0" into "x != 3".
        Value *X = 0;
        if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
          unsigned CmpVal = Op0KnownZeroInverted.countTrailingZeros();
          return new ICmpInst(ICmpInst::ICMP_NE, X,
                              ConstantInt::get(X->getType(), CmpVal));
        }

        // If the LHS is 8 >>u x, and we know the result is a power of 2 like 1,
        // then turn "((8 >>u x)&1) == 0" into "x != 3".
        const APInt *CI;
        if (Op0KnownZeroInverted == 1 &&
            match(LHS, m_LShr(m_Power2(CI), m_Value(X))))
          return new ICmpInst(ICmpInst::ICMP_NE, X,
                              ConstantInt::get(X->getType(),
                                               CI->countTrailingZeros()));
      }

      break;
    }
    case ICmpInst::ICMP_NE: {
      // Disjoint ranges are always not-equal.
      if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));

      // If all bits are known zero except for one, then we know at most one
      // bit is set.   If the comparison is against zero, then this is a check
      // to see if *that* bit is set.
      APInt Op0KnownZeroInverted = ~Op0KnownZero;
      if (~Op1KnownZero == 0 && Op0KnownZeroInverted.isPowerOf2()) {
        // If the LHS is an AND with the same constant, look through it.
        Value *LHS = 0;
        ConstantInt *LHSC = 0;
        if (!match(Op0, m_And(m_Value(LHS), m_ConstantInt(LHSC))) ||
            LHSC->getValue() != Op0KnownZeroInverted)
          LHS = Op0;

        // If the LHS is 1 << x, and we know the result is a power of 2 like 8,
        // then turn "((1 << x)&8) != 0" into "x == 3".
        Value *X = 0;
        if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
          unsigned CmpVal = Op0KnownZeroInverted.countTrailingZeros();
          return new ICmpInst(ICmpInst::ICMP_EQ, X,
                              ConstantInt::get(X->getType(), CmpVal));
        }

        // If the LHS is 8 >>u x, and we know the result is a power of 2 like 1,
        // then turn "((8 >>u x)&1) != 0" into "x == 3".
        const APInt *CI;
        if (Op0KnownZeroInverted == 1 &&
            match(LHS, m_LShr(m_Power2(CI), m_Value(X))))
          return new ICmpInst(ICmpInst::ICMP_EQ, X,
                              ConstantInt::get(X->getType(),
                                               CI->countTrailingZeros()));
      }

      break;
    }
    case ICmpInst::ICMP_ULT:
      if (Op0Max.ult(Op1Min))          // A <u B -> true if max(A) < min(B)
        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
      if (Op0Min.uge(Op1Max))          // A <u B -> false if min(A) >= max(B)
        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
      if (Op1Min == Op0Max)            // A <u B -> A != B if max(A) == min(B)
        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
      if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
        if (Op1Max == Op0Min+1)        // A <u C -> A == C-1 if min(A)+1 == C
          return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
                          ConstantInt::get(CI->getContext(), CI->getValue()-1));

        // (x <u 2147483648) -> (x >s -1)  -> true if sign bit clear
        if (CI->isMinValue(true))
          return new ICmpInst(ICmpInst::ICMP_SGT, Op0,
                              Constant::getAllOnesValue(Op0->getType()));
      }
      break;
    case ICmpInst::ICMP_UGT:
      if (Op0Min.ugt(Op1Max))          // A >u B -> true if min(A) > max(B)
        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
      if (Op0Max.ule(Op1Min))          // A >u B -> false if max(A) <= max(B)
        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));

      if (Op1Max == Op0Min)            // A >u B -> A != B if min(A) == max(B)
        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
      if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
        if (Op1Min == Op0Max-1)        // A >u C -> A == C+1 if max(a)-1 == C
          return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
                          ConstantInt::get(CI->getContext(), CI->getValue()+1));

        // (x >u 2147483647) -> (x <s 0)  -> true if sign bit set
        if (CI->isMaxValue(true))
          return new ICmpInst(ICmpInst::ICMP_SLT, Op0,
                              Constant::getNullValue(Op0->getType()));
      }
      break;
    case ICmpInst::ICMP_SLT:
      if (Op0Max.slt(Op1Min))          // A <s B -> true if max(A) < min(C)
        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
      if (Op0Min.sge(Op1Max))          // A <s B -> false if min(A) >= max(C)
        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
      if (Op1Min == Op0Max)            // A <s B -> A != B if max(A) == min(B)
        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
      if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
        if (Op1Max == Op0Min+1)        // A <s C -> A == C-1 if min(A)+1 == C
          return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
                          ConstantInt::get(CI->getContext(), CI->getValue()-1));
      }
      break;
    case ICmpInst::ICMP_SGT:
      if (Op0Min.sgt(Op1Max))          // A >s B -> true if min(A) > max(B)
        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
      if (Op0Max.sle(Op1Min))          // A >s B -> false if max(A) <= min(B)
        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));

      if (Op1Max == Op0Min)            // A >s B -> A != B if min(A) == max(B)
        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
      if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
        if (Op1Min == Op0Max-1)        // A >s C -> A == C+1 if max(A)-1 == C
          return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
                          ConstantInt::get(CI->getContext(), CI->getValue()+1));
      }
      break;
    case ICmpInst::ICMP_SGE:
      // Constant-RHS sge/sle/uge/ule were canonicalized to strict forms above.
      assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!");
      if (Op0Min.sge(Op1Max))          // A >=s B -> true if min(A) >= max(B)
        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
      if (Op0Max.slt(Op1Min))          // A >=s B -> false if max(A) < min(B)
        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
      break;
    case ICmpInst::ICMP_SLE:
      assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!");
      if (Op0Max.sle(Op1Min))          // A <=s B -> true if max(A) <= min(B)
        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
      if (Op0Min.sgt(Op1Max))          // A <=s B -> false if min(A) > max(B)
        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
      break;
    case ICmpInst::ICMP_UGE:
      assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!");
      if (Op0Min.uge(Op1Max))          // A >=u B -> true if min(A) >= max(B)
        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
      if (Op0Max.ult(Op1Min))          // A >=u B -> false if max(A) < min(B)
        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
      break;
    case ICmpInst::ICMP_ULE:
      assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!");
      if (Op0Max.ule(Op1Min))          // A <=u B -> true if max(A) <= min(B)
        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
      if (Op0Min.ugt(Op1Max))          // A <=u B -> false if min(A) > max(B)
        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
      break;
    }

    // Turn a signed comparison into an unsigned one if both operands
    // are known to have the same sign.
    if (I.isSigned() &&
        ((Op0KnownZero.isNegative() && Op1KnownZero.isNegative()) ||
         (Op0KnownOne.isNegative() && Op1KnownOne.isNegative())))
      return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
  }

  // Test if the ICmpInst instruction is used exclusively by a select as
  // part of a minimum or maximum operation. If so, refrain from doing
  // any other folding. This helps out other analyses which understand
  // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
  // and CodeGen. And in this case, at least one of the comparison
  // operands has at least one user besides the compare (the select),
  // which would often largely negate the benefit of folding anyway.
  if (I.hasOneUse())
    if (SelectInst *SI = dyn_cast<SelectInst>(*I.use_begin()))
      if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) ||
          (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1))
        return 0;

  // See if we are doing a comparison between a constant and an instruction that
  // can be folded into the comparison.
  if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
    // Since the RHS is a ConstantInt (CI), if the left hand side is an
    // instruction, see if that instruction also has constants so that the
    // instruction can be folded into the icmp
    if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
      if (Instruction *Res = visitICmpInstWithInstAndIntCst(I, LHSI, CI))
        return Res;
  }

  // Handle icmp with constant (but not simple integer constant) RHS
  if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
    if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
      switch (LHSI->getOpcode()) {
      case Instruction::GetElementPtr:
        // icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null
        if (RHSC->isNullValue() &&
            cast<GetElementPtrInst>(LHSI)->hasAllZeroIndices())
          return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
                  Constant::getNullValue(LHSI->getOperand(0)->getType()));
        break;
      case Instruction::PHI:
        // Only fold icmp into the PHI if the phi and icmp are in the same
        // block.  If in the same block, we're encouraging jump threading.  If
        // not, we are just pessimizing the code by making an i1 phi.
        if (LHSI->getParent() == I.getParent())
          if (Instruction *NV = FoldOpIntoPhi(I))
            return NV;
        break;
      case Instruction::Select: {
        // If either operand of the select is a constant, we can fold the
        // comparison into the select arms, which will cause one to be
        // constant folded and the select turned into a bitwise or.
        Value *Op1 = 0, *Op2 = 0;
        if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1)))
          Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
        if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2)))
          Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);

        // We only want to perform this transformation if it will not lead to
        // additional code. This is true if either both sides of the select
        // fold to a constant (in which case the icmp is replaced with a select
        // which will usually simplify) or this is the only user of the
        // select (in which case we are trading a select+icmp for a simpler
        // select+icmp).
        if ((Op1 && Op2) || (LHSI->hasOneUse() && (Op1 || Op2))) {
          if (!Op1)
            Op1 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(1),
                                      RHSC, I.getName());
          if (!Op2)
            Op2 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(2),
                                      RHSC, I.getName());
          return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
        }
        break;
      }
      case Instruction::IntToPtr:
        // icmp pred inttoptr(X), null -> icmp pred X, 0
        // (only valid when X already has the target's pointer-sized int type)
        if (RHSC->isNullValue() && TD &&
            TD->getIntPtrType(RHSC->getContext()) ==
               LHSI->getOperand(0)->getType())
          return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
                        Constant::getNullValue(LHSI->getOperand(0)->getType()));
        break;

      case Instruction::Load:
        // Try to optimize things like "A[i] > 4" to index computations.
        if (GetElementPtrInst *GEP =
              dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
          if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
            if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
                !cast<LoadInst>(LHSI)->isVolatile())
              if (Instruction *Res = FoldCmpLoadFromIndexedGlobal(GEP, GV, I))
                return Res;
        }
        break;
      }
  }

  // If we can optimize a 'icmp GEP, P' or 'icmp P, GEP', do so now.
  if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op0))
    if (Instruction *NI = FoldGEPICmp(GEP, Op1, I.getPredicate(), I))
      return NI;
  if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1))
    if (Instruction *NI = FoldGEPICmp(GEP, Op0,
                           ICmpInst::getSwappedPredicate(I.getPredicate()), I))
      return NI;

  // Test to see if the operands of the icmp are casted versions of other
  // values.  If the ptr->ptr cast can be stripped off both arguments, we do so
  // now.
  if (BitCastInst *CI = dyn_cast<BitCastInst>(Op0)) {
    if (Op0->getType()->isPointerTy() &&
        (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) {
      // We keep moving the cast from the left operand over to the right
      // operand, where it can often be eliminated completely.
      Op0 = CI->getOperand(0);

      // If operand #1 is a bitcast instruction, it must also be a ptr->ptr cast
      // so eliminate it as well.
      if (BitCastInst *CI2 = dyn_cast<BitCastInst>(Op1))
        Op1 = CI2->getOperand(0);

      // If Op1 is a constant, we can fold the cast into the constant.
      if (Op0->getType() != Op1->getType()) {
        if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
          Op1 = ConstantExpr::getBitCast(Op1C, Op0->getType());
        } else {
          // Otherwise, cast the RHS right before the icmp
          Op1 = Builder->CreateBitCast(Op1, Op0->getType());
        }
      }
      return new ICmpInst(I.getPredicate(), Op0, Op1);
    }
  }

  if (isa<CastInst>(Op0)) {
    // Handle the special case of: icmp (cast bool to X), <cst>
    // This comes up when you have code like
    //   int X = A < B;
    //   if (X) ...
    // For generality, we handle any zero-extension of any operand comparison
    // with a constant or another cast from the same type.
    if (isa<Constant>(Op1) || isa<CastInst>(Op1))
      if (Instruction *R = visitICmpInstWithCastAndCast(I))
        return R;
  }

  // Special logic for binary operators.
  BinaryOperator *BO0 = dyn_cast<BinaryOperator>(Op0);
  BinaryOperator *BO1 = dyn_cast<BinaryOperator>(Op1);
  if (BO0 || BO1) {
    CmpInst::Predicate Pred = I.getPredicate();
    // A "wrap problem" exists unless the compare is an equality, or the
    // operation carries the matching (nuw/nsw) no-wrap flag.
    bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
    if (BO0 && isa<OverflowingBinaryOperator>(BO0))
      NoOp0WrapProblem = ICmpInst::isEquality(Pred) ||
        (CmpInst::isUnsigned(Pred) && BO0->hasNoUnsignedWrap()) ||
        (CmpInst::isSigned(Pred) && BO0->hasNoSignedWrap());
    if (BO1 && isa<OverflowingBinaryOperator>(BO1))
      NoOp1WrapProblem = ICmpInst::isEquality(Pred) ||
        (CmpInst::isUnsigned(Pred) && BO1->hasNoUnsignedWrap()) ||
        (CmpInst::isSigned(Pred) && BO1->hasNoSignedWrap());

    // Analyze the case when either Op0 or Op1 is an add instruction.
    // Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null).
    Value *A = 0, *B = 0, *C = 0, *D = 0;
    if (BO0 && BO0->getOpcode() == Instruction::Add)
      A = BO0->getOperand(0), B = BO0->getOperand(1);
    if (BO1 && BO1->getOpcode() == Instruction::Add)
      C = BO1->getOperand(0), D = BO1->getOperand(1);

    // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow.
    if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
      return new ICmpInst(Pred, A == Op1 ? B : A,
                          Constant::getNullValue(Op1->getType()));

    // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow.
    if ((C == Op0 || D == Op0) && NoOp1WrapProblem)
      return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()),
                          C == Op0 ? D : C);

    // icmp (X+Y), (X+Z) -> icmp Y, Z for equalities or if there is no overflow.
    if (A && C && (A == C || A == D || B == C || B == D) &&
        NoOp0WrapProblem && NoOp1WrapProblem &&
        // Try not to increase register pressure.
        BO0->hasOneUse() && BO1->hasOneUse()) {
      // Determine Y and Z in the form icmp (X+Y), (X+Z).
      Value *Y = (A == C || A == D) ? B : A;
      Value *Z = (C == A || C == B) ? D : C;
      return new ICmpInst(Pred, Y, Z);
    }

    // Analyze the case when either Op0 or Op1 is a sub instruction.
    // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
    A = 0; B = 0; C = 0; D = 0;
    if (BO0 && BO0->getOpcode() == Instruction::Sub)
      A = BO0->getOperand(0), B = BO0->getOperand(1);
    if (BO1 && BO1->getOpcode() == Instruction::Sub)
      C = BO1->getOperand(0), D = BO1->getOperand(1);

    // icmp (X-Y), X -> icmp 0, Y for equalities or if there is no overflow.
    if (A == Op1 && NoOp0WrapProblem)
      return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B);

    // icmp X, (X-Y) -> icmp Y, 0 for equalities or if there is no overflow.
    if (C == Op0 && NoOp1WrapProblem)
      return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType()));

    // icmp (Y-X), (Z-X) -> icmp Y, Z for equalities or if there is no overflow.
    if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem &&
        // Try not to increase register pressure.
        BO0->hasOneUse() && BO1->hasOneUse())
      return new ICmpInst(Pred, A, C);

    // icmp (X-Y), (X-Z) -> icmp Z, Y for equalities or if there is no overflow.
    if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem &&
        // Try not to increase register pressure.
        BO0->hasOneUse() && BO1->hasOneUse())
      return new ICmpInst(Pred, D, B);

    // Both sides are the same binary op with a common second operand.
    if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() &&
        BO0->hasOneUse() && BO1->hasOneUse() &&
        BO0->getOperand(1) == BO1->getOperand(1)) {
      switch (BO0->getOpcode()) {
      default: break;
      case Instruction::Add:
      case Instruction::Sub:
      case Instruction::Xor:
        if (I.isEquality())    // a+x icmp eq/ne b+x --> a icmp b
          return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
                              BO1->getOperand(0));
        // icmp u/s (a ^ signbit), (b ^ signbit) --> icmp s/u a, b
        if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
          if (CI->getValue().isSignBit()) {
            ICmpInst::Predicate Pred = I.isSigned()
                                         ? I.getUnsignedPredicate()
                                         : I.getSignedPredicate();
            return new ICmpInst(Pred, BO0->getOperand(0),
                                BO1->getOperand(0));
          }

          if (CI->getValue().isMaxSignedValue()) {
            ICmpInst::Predicate Pred = I.isSigned()
                                         ? I.getUnsignedPredicate()
                                         : I.getSignedPredicate();
            Pred = I.getSwappedPredicate(Pred);
            return new ICmpInst(Pred, BO0->getOperand(0),
                                BO1->getOperand(0));
          }
        }
        break;
      case Instruction::Mul:
        if (!I.isEquality())
          break;

        if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
          // a * Cst icmp eq/ne b * Cst --> a & Mask icmp b & Mask
          // Mask = -1 >> count-trailing-zeros(Cst).
          if (!CI->isZero() && !CI->isOne()) {
            const APInt &AP = CI->getValue();
            ConstantInt *Mask = ConstantInt::get(I.getContext(),
                                    APInt::getLowBitsSet(AP.getBitWidth(),
                                                         AP.getBitWidth() -
                                                    AP.countTrailingZeros()));
            Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask);
            Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask);
            return new ICmpInst(I.getPredicate(), And1, And2);
          }
        }
        break;
      }
    }
  }

  { Value *A, *B;
    // ~x < ~y    --> y < x
    // ~x < cst   --> ~cst < x
    if (match(Op0, m_Not(m_Value(A)))) {
      if (match(Op1, m_Not(m_Value(B))))
        return new ICmpInst(I.getPredicate(), B, A);
      if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1))
        return new ICmpInst(I.getPredicate(), ConstantExpr::getNot(RHSC), A);
    }

    // (a+b) <u a  --> llvm.uadd.with.overflow.
    // (a+b) <u b  --> llvm.uadd.with.overflow.
    if (I.getPredicate() == ICmpInst::ICMP_ULT &&
        match(Op0, m_Add(m_Value(A), m_Value(B))) &&
        (Op1 == A || Op1 == B))
      if (Instruction *R = ProcessUAddIdiom(I, Op0, *this))
        return R;

    // a >u (a+b)  --> llvm.uadd.with.overflow.
    // b >u (a+b)  --> llvm.uadd.with.overflow.
    if (I.getPredicate() == ICmpInst::ICMP_UGT &&
        match(Op1, m_Add(m_Value(A), m_Value(B))) &&
        (Op0 == A || Op0 == B))
      if (Instruction *R = ProcessUAddIdiom(I, Op1, *this))
        return R;
  }

  if (I.isEquality()) {
    Value *A, *B, *C, *D;

    if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
      if (A == Op1 || B == Op1) {    // (A^B) == A  ->  B == 0
        Value *OtherVal = A == Op1 ? B : A;
        return new ICmpInst(I.getPredicate(), OtherVal,
                            Constant::getNullValue(A->getType()));
      }

      if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
        // A^c1 == C^c2 --> A == C^(c1^c2)
        ConstantInt *C1, *C2;
        if (match(B, m_ConstantInt(C1)) &&
            match(D, m_ConstantInt(C2)) && Op1->hasOneUse()) {
          Constant *NC = ConstantInt::get(I.getContext(),
                                          C1->getValue() ^ C2->getValue());
          Value *Xor = Builder->CreateXor(C, NC, "tmp");
          return new ICmpInst(I.getPredicate(), A, Xor);
        }

        // A^B == A^D -> B == D
        if (A == C) return new ICmpInst(I.getPredicate(), B, D);
        if (A == D) return new ICmpInst(I.getPredicate(), B, C);
        if (B == C) return new ICmpInst(I.getPredicate(), A, D);
        if (B == D) return new ICmpInst(I.getPredicate(), A, C);
      }
    }

    if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
        (A == Op0 || B == Op0)) {
      // A == (A^B)  ->  B == 0
      Value *OtherVal = A == Op0 ? B : A;
      return new ICmpInst(I.getPredicate(), OtherVal,
                          Constant::getNullValue(A->getType()));
    }

    // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
    if (Op0->hasOneUse() && Op1->hasOneUse() &&
        match(Op0, m_And(m_Value(A), m_Value(B))) &&
        match(Op1, m_And(m_Value(C), m_Value(D)))) {
      Value *X = 0, *Y = 0, *Z = 0;

      // Find the shared operand Z; the remaining operands are X and Y.
      if (A == C) {
        X = B; Y = D; Z = A;
      } else if (A == D) {
        X = B; Y = C; Z = A;
      } else if (B == C) {
        X = A; Y = D; Z = B;
      } else if (B == D) {
        X = A; Y = C; Z = B;
      }

      if (X) {   // Build (X^Y) & Z
        Op1 = Builder->CreateXor(X, Y, "tmp");
        Op1 = Builder->CreateAnd(Op1, Z, "tmp");
        I.setOperand(0, Op1);
        I.setOperand(1, Constant::getNullValue(Op1->getType()));
        return &I;
      }
    }
  }

  {
    Value *X; ConstantInt *Cst;
    // icmp X+Cst, X
    if (match(Op0, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op1 == X)
      return FoldICmpAddOpCst(I, X, Cst, I.getPredicate(), Op0);

    // icmp X, X+Cst
    if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X)
      return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate(), Op1);
  }
  return Changed ? &I : 0;
}



/// FoldFCmp_IntToFP_Cst - Fold fcmp ([us]itofp x, cst) if possible.
///
Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
                                                Instruction *LHSI,
                                                Constant *RHSC) {
  if (!isa<ConstantFP>(RHSC)) return 0;
  const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF();

  // Get the width of the mantissa.  We don't want to hack on conversions that
  // might lose information from the integer, e.g. "i64 -> float"
  int MantissaWidth = LHSI->getType()->getFPMantissaWidth();
  if (MantissaWidth == -1) return 0;  // Unknown.

  // Check to see that the input is converted from an integer type that is small
  // enough that preserves all bits.  TODO: check here for "known" sign bits.
  // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e.
  unsigned InputSize = LHSI->getOperand(0)->getType()->getScalarSizeInBits();

  // If this is a uitofp instruction, we need an extra bit to hold the sign.
  bool LHSUnsigned = isa<UIToFPInst>(LHSI);
  if (LHSUnsigned)
    ++InputSize;

  // If the conversion would lose info, don't hack on this.
  if ((int)InputSize > MantissaWidth)
    return 0;

  // Otherwise, we can potentially simplify the comparison.  We know that it
  // will always come through as an integer value and we know the constant is
  // not a NAN (it would have been previously simplified).
  assert(!RHS.isNaN() && "NaN comparison not already folded!");

  // Map the FP predicate to the integer predicate that will replace it.
  // Ordered/unordered pairs collapse since the RHS is known not to be NaN.
  ICmpInst::Predicate Pred;
  switch (I.getPredicate()) {
  default: llvm_unreachable("Unexpected predicate!");
  case FCmpInst::FCMP_UEQ:
  case FCmpInst::FCMP_OEQ:
    Pred = ICmpInst::ICMP_EQ;
    break;
  case FCmpInst::FCMP_UGT:
  case FCmpInst::FCMP_OGT:
    Pred = LHSUnsigned ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_SGT;
    break;
  case FCmpInst::FCMP_UGE:
  case FCmpInst::FCMP_OGE:
    Pred = LHSUnsigned ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_SGE;
    break;
  case FCmpInst::FCMP_ULT:
  case FCmpInst::FCMP_OLT:
    Pred = LHSUnsigned ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_SLT;
    break;
  case FCmpInst::FCMP_ULE:
  case FCmpInst::FCMP_OLE:
    Pred = LHSUnsigned ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_SLE;
    break;
  case FCmpInst::FCMP_UNE:
  case FCmpInst::FCMP_ONE:
    Pred = ICmpInst::ICMP_NE;
    break;
  case FCmpInst::FCMP_ORD:
    return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
  case FCmpInst::FCMP_UNO:
    return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
  }

  const IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());

  // Now we know that the APFloat is a normal number, zero or inf.

  // See if the FP constant is too large for the integer.  For example,
  // comparing an i8 to 300.0.
  unsigned IntWidth = IntTy->getScalarSizeInBits();

  if (!LHSUnsigned) {
    // If the RHS value is > SignedMax, fold the comparison.  This handles +INF
    // and large values.
    APFloat SMax(RHS.getSemantics(), APFloat::fcZero, false);
    SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true,
                          APFloat::rmNearestTiesToEven);
    if (SMax.compare(RHS) == APFloat::cmpLessThan) {  // smax < 13123.0
      if (Pred == ICmpInst::ICMP_NE  || Pred == ICmpInst::ICMP_SLT ||
          Pred == ICmpInst::ICMP_SLE)
        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
      return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
    }
  } else {
    // If the RHS value is > UnsignedMax, fold the comparison. This handles
    // +INF and large values.
    APFloat UMax(RHS.getSemantics(), APFloat::fcZero, false);
    UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false,
                          APFloat::rmNearestTiesToEven);
    if (UMax.compare(RHS) == APFloat::cmpLessThan) {  // umax < 13123.0
      if (Pred == ICmpInst::ICMP_NE  || Pred == ICmpInst::ICMP_ULT ||
          Pred == ICmpInst::ICMP_ULE)
        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
      return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
    }
  }

  if (!LHSUnsigned) {
    // See if the RHS value is < SignedMin.
    APFloat SMin(RHS.getSemantics(), APFloat::fcZero, false);
    SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true,
                          APFloat::rmNearestTiesToEven);
    if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0
      if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
          Pred == ICmpInst::ICMP_SGE)
        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
      return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
    }
  }

  // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] or
  // [0, UMAX], but it may still be fractional.  See if it is fractional by
  // casting the FP value to the integer value and back, checking for equality.
  // Don't do this for zero, because -0.0 is not fractional.
  Constant *RHSInt = LHSUnsigned
    ? ConstantExpr::getFPToUI(RHSC, IntTy)
    : ConstantExpr::getFPToSI(RHSC, IntTy);
  if (!RHS.isZero()) {
    bool Equal = LHSUnsigned
      ? ConstantExpr::getUIToFP(RHSInt, RHSC->getType()) == RHSC
      : ConstantExpr::getSIToFP(RHSInt, RHSC->getType()) == RHSC;
    if (!Equal) {
      // If we had a comparison against a fractional value, we have to adjust
      // the compare predicate and sometimes the value.  RHSC is rounded towards
      // zero at this point.
      switch (Pred) {
      default: llvm_unreachable("Unexpected integer comparison!");
      case ICmpInst::ICMP_NE:  // (float)int != 4.4   --> true
        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
      case ICmpInst::ICMP_EQ:  // (float)int == 4.4   --> false
        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
      case ICmpInst::ICMP_ULE:
        // (float)int <= 4.4   --> int <= 4
        // (float)int <= -4.4  --> false
        if (RHS.isNegative())
          return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
        break;
      case ICmpInst::ICMP_SLE:
        // (float)int <= 4.4   --> int <= 4
        // (float)int <= -4.4  --> int < -4
        if (RHS.isNegative())
          Pred = ICmpInst::ICMP_SLT;
        break;
      case ICmpInst::ICMP_ULT:
        // (float)int < -4.4   --> false
        // (float)int < 4.4    --> int <= 4
        if (RHS.isNegative())
          return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
        Pred = ICmpInst::ICMP_ULE;
        break;
      case ICmpInst::ICMP_SLT:
        // (float)int < -4.4   --> int < -4
        // (float)int < 4.4    --> int <= 4
        if (!RHS.isNegative())
          Pred = ICmpInst::ICMP_SLE;
        break;
      case ICmpInst::ICMP_UGT:
        // (float)int > 4.4    --> int > 4
        // (float)int > -4.4   --> true
        if (RHS.isNegative())
          return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
        break;
      case ICmpInst::ICMP_SGT:
        // (float)int > 4.4    --> int > 4
// (float)int > -4.4   --> int >= -4
+      if (RHS.isNegative())
+        Pred = ICmpInst::ICMP_SGE;
+      break;
+    case ICmpInst::ICMP_UGE:
+      // (float)int >= -4.4   --> true
+      // (float)int >= 4.4    --> int > 4
+      if (!RHS.isNegative())
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
+      Pred = ICmpInst::ICMP_UGT;
+      break;
+    case ICmpInst::ICMP_SGE:
+      // (float)int >= -4.4   --> int >= -4
+      // (float)int >= 4.4    --> int > 4
+      if (!RHS.isNegative())
+        Pred = ICmpInst::ICMP_SGT;
+      break;
+    }
+  }
+  }
+
+  // Lower this FP comparison into an appropriate integer version of the
+  // comparison.
+  return new ICmpInst(Pred, LHSI->getOperand(0), RHSInt);
+}
+
+/// visitFCmpInst - Canonicalize and fold floating-point compares: order the
+/// operands by complexity, collapse 'fcmp pred X, X' to an ordered/unordered
+/// test against 0.0, and fold compares whose RHS is a constant through PHI
+/// nodes, int-to-fp casts, selects, and loads from constant globals.
+Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
+  bool Changed = false;
+
+  /// Orders the operands of the compare so that they are listed from most
+  /// complex to least complex.  This puts constants before unary operators,
+  /// before binary operators.
+  if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1))) {
+    I.swapOperands();
+    Changed = true;
+  }
+
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // Simplify 'fcmp pred X, X'
+  if (Op0 == Op1) {
+    switch (I.getPredicate()) {
+    default: llvm_unreachable("Unknown predicate!");
+    case FCmpInst::FCMP_UNO:    // True if unordered: isnan(X) | isnan(Y)
+    case FCmpInst::FCMP_ULT:    // True if unordered or less than
+    case FCmpInst::FCMP_UGT:    // True if unordered or greater than
+    case FCmpInst::FCMP_UNE:    // True if unordered or not equal
+      // Canonicalize these to be 'fcmp uno %X, 0.0'.
+      // X cmp X can only differ from "ordered" when X is NaN, so all four
+      // unordered forms collapse to a single isnan test.
+      I.setPredicate(FCmpInst::FCMP_UNO);
+      I.setOperand(1, Constant::getNullValue(Op0->getType()));
+      return &I;
+
+    case FCmpInst::FCMP_ORD:    // True if ordered (no nans)
+    case FCmpInst::FCMP_OEQ:    // True if ordered and equal
+    case FCmpInst::FCMP_OGE:    // True if ordered and greater than or equal
+    case FCmpInst::FCMP_OLE:    // True if ordered and less than or equal
+      // Canonicalize these to be 'fcmp ord %X, 0.0'.
+      I.setPredicate(FCmpInst::FCMP_ORD);
+      I.setOperand(1, Constant::getNullValue(Op0->getType()));
+      return &I;
+    }
+  }
+
+  // Handle fcmp with constant RHS
+  if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
+    if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
+      switch (LHSI->getOpcode()) {
+      case Instruction::PHI:
+        // Only fold fcmp into the PHI if the phi and fcmp are in the same
+        // block.  If in the same block, we're encouraging jump threading.  If
+        // not, we are just pessimizing the code by making an i1 phi.
+        if (LHSI->getParent() == I.getParent())
+          if (Instruction *NV = FoldOpIntoPhi(I))
+            return NV;
+        break;
+      case Instruction::SIToFP:
+      case Instruction::UIToFP:
+        if (Instruction *NV = FoldFCmp_IntToFP_Cst(I, LHSI, RHSC))
+          return NV;
+        break;
+      case Instruction::Select: {
+        // If either operand of the select is a constant, we can fold the
+        // comparison into the select arms, which will cause one to be
+        // constant folded and the select turned into a bitwise or.
+        // NOTE: these locals intentionally shadow the outer Op1; they hold
+        // the compare results for the two select arms.
+        Value *Op1 = 0, *Op2 = 0;
+        if (LHSI->hasOneUse()) {
+          if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
+            // Fold the known value into the constant operand.
+            Op1 = ConstantExpr::getCompare(I.getPredicate(), C, RHSC);
+            // Insert a new FCmp of the other select operand.
+            Op2 = Builder->CreateFCmp(I.getPredicate(),
+                                      LHSI->getOperand(2), RHSC, I.getName());
+          } else if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
+            // Fold the known value into the constant operand.
+            Op2 = ConstantExpr::getCompare(I.getPredicate(), C, RHSC);
+            // Insert a new FCmp of the other select operand.
+            Op1 = Builder->CreateFCmp(I.getPredicate(), LHSI->getOperand(1),
+                                      RHSC, I.getName());
+          }
+        }
+
+        if (Op1)
+          return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
+        break;
+      }
+      case Instruction::Load:
+        if (GetElementPtrInst *GEP =
+            dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
+          if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+            if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+                !cast<LoadInst>(LHSI)->isVolatile())
+              if (Instruction *Res = FoldCmpLoadFromIndexedGlobal(GEP, GV, I))
+                return Res;
+        }
+        break;
+      }
+  }
+
+  return Changed ? &I : 0;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
new file mode 100644
index 0000000..78ff734
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -0,0 +1,642 @@
+//===- InstCombineLoadStoreAlloca.cpp -------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for load, store and alloca.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombine.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumDeadStore, "Number of dead stores eliminated");
+
+/// visitAllocaInst - Canonicalize allocas: give the array-size operand
+/// intptr_t type, rewrite "alloca Ty, C" (constant C) as "alloca [C x Ty]"
+/// plus a GEP to element zero, and fold zero-size or undef-count allocas
+/// to null.
+Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
+  // Ensure that the alloca array size argument has type intptr_t, so that
+  // any casting is exposed early.
+  if (TD) {
+    const Type *IntPtrTy = TD->getIntPtrType(AI.getContext());
+    if (AI.getArraySize()->getType() != IntPtrTy) {
+      Value *V = Builder->CreateIntCast(AI.getArraySize(),
+                                        IntPtrTy, false);
+      AI.setOperand(0, V);
+      return &AI;
+    }
+  }
+
+  // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
+  if (AI.isArrayAllocation()) {  // Check C != 1
+    if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
+      const Type *NewTy =
+        ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
+      assert(isa<AllocaInst>(AI) && "Unknown type of allocation inst!");
+      AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName());
+      New->setAlignment(AI.getAlignment());
+
+      // Scan to the end of the allocation instructions, to skip over a block of
+      // allocas if possible...also skip interleaved debug info
+      //
+      BasicBlock::iterator It = New;
+      while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It)) ++It;
+
+      // Now that I is pointing to the first non-allocation-inst in the block,
+      // insert our getelementptr instruction...
+      //
+      Value *NullIdx =Constant::getNullValue(Type::getInt32Ty(AI.getContext()));
+      Value *Idx[2];
+      Idx[0] = NullIdx;
+      Idx[1] = NullIdx;
+      Value *V = GetElementPtrInst::CreateInBounds(New, Idx, Idx + 2,
+                                                   New->getName()+".sub", It);
+
+      // Now make everything use the getelementptr instead of the original
+      // allocation.
+      return ReplaceInstUsesWith(AI, V);
+    } else if (isa<UndefValue>(AI.getArraySize())) {
+      // An undef element count makes the alloca itself meaningless.
+      return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
+    }
+  }
+
+  if (TD && isa<AllocaInst>(AI) && AI.getAllocatedType()->isSized()) {
+    // If alloca'ing a zero byte object, replace the alloca with a null pointer.
+    // Note that we only do this for alloca's, because malloc should allocate
+    // and return a unique pointer, even for a zero byte allocation.
+    if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
+      return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
+
+    // If the alignment is 0 (unspecified), assign it the preferred alignment.
+    if (AI.getAlignment() == 0)
+      AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType()));
+  }
+
+  return 0;
+}
+
+
+/// InstCombineLoadCast - Fold 'load (cast P)' -> cast (load P)' when possible.
+/// Returns the replacement instruction, or null if no transform applies.
+static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
+                                        const TargetData *TD) {
+  User *CI = cast<User>(LI.getOperand(0));
+  Value *CastOp = CI->getOperand(0);
+
+  const PointerType *DestTy = cast<PointerType>(CI->getType());
+  const Type *DestPTy = DestTy->getElementType();
+  if (const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
+
+    // If the address spaces don't match, don't eliminate the cast.
+    if (DestTy->getAddressSpace() != SrcTy->getAddressSpace())
+      return 0;
+
+    const Type *SrcPTy = SrcTy->getElementType();
+
+    if (DestPTy->isIntegerTy() || DestPTy->isPointerTy() ||
+        DestPTy->isVectorTy()) {
+      // If the source is an array, the code below will not succeed.  Check to
+      // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
+      // constants.
+      if (const ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
+        if (Constant *CSrc = dyn_cast<Constant>(CastOp))
+          if (ASrcTy->getNumElements() != 0) {
+            Value *Idxs[2];
+            Idxs[0] = Constant::getNullValue(Type::getInt32Ty(LI.getContext()));
+            Idxs[1] = Idxs[0];
+            CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs, 2);
+            SrcTy = cast<PointerType>(CastOp->getType());
+            SrcPTy = SrcTy->getElementType();
+          }
+
+      if (IC.getTargetData() &&
+          (SrcPTy->isIntegerTy() || SrcPTy->isPointerTy() ||
+           SrcPTy->isVectorTy()) &&
+          // Do not allow turning this into a load of an integer, which is then
+          // casted to a pointer, this pessimizes pointer analysis a lot.
+          (SrcPTy->isPointerTy() == LI.getType()->isPointerTy()) &&
+          IC.getTargetData()->getTypeSizeInBits(SrcPTy) ==
+               IC.getTargetData()->getTypeSizeInBits(DestPTy)) {
+
+        // Okay, we are casting from one integer or pointer type to another of
+        // the same size.  Instead of casting the pointer before the load, cast
+        // the result of the loaded value.
+        LoadInst *NewLoad =
+          IC.Builder->CreateLoad(CastOp, LI.isVolatile(), CI->getName());
+        NewLoad->setAlignment(LI.getAlignment());
+        // Now cast the result of the load.
+        return new BitCastInst(NewLoad, LI.getType());
+      }
+    }
+  }
+  return 0;
+}
+
+Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
+  Value *Op = LI.getOperand(0);
+
+  // Attempt to improve the alignment.
+  if (TD) {
+    unsigned KnownAlign =
+      getOrEnforceKnownAlignment(Op, TD->getPrefTypeAlignment(LI.getType()),TD);
+    unsigned LoadAlign = LI.getAlignment();
+    unsigned EffectiveLoadAlign = LoadAlign != 0 ? LoadAlign :
+      TD->getABITypeAlignment(LI.getType());
+
+    if (KnownAlign > EffectiveLoadAlign)
+      LI.setAlignment(KnownAlign);
+    else if (LoadAlign == 0)
+      LI.setAlignment(EffectiveLoadAlign);
+  }
+
+  // load (cast X) --> cast (load X) iff safe.
+  if (isa<CastInst>(Op))
+    if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
+      return Res;
+
+  // None of the following transforms are legal for volatile loads.
+  if (LI.isVolatile()) return 0;
+
+  // Do really simple store-to-load forwarding and load CSE, to catch cases
+  // where there are several consecutive memory accesses to the same location,
+  // separated by a few arithmetic operations.  The scan is limited to the
+  // 6 preceding instructions (last argument).
+  BasicBlock::iterator BBI = &LI;
+  if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,6))
+    return ReplaceInstUsesWith(LI, AvailableVal);
+
+  // load(gep null, ...) -> unreachable
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
+    const Value *GEPI0 = GEPI->getOperand(0);
+    // TODO: Consider a target hook for valid address spaces for this xform.
+    if (isa<ConstantPointerNull>(GEPI0) && GEPI->getPointerAddressSpace() == 0){
+      // Insert a new store to null instruction before the load to indicate
+      // that this code is not reachable.  We do this instead of inserting
+      // an unreachable instruction directly because we cannot modify the
+      // CFG.
+      new StoreInst(UndefValue::get(LI.getType()),
+                    Constant::getNullValue(Op->getType()), &LI);
+      return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+    }
+  }
+
+  // load null/undef -> unreachable
+  // TODO: Consider a target hook for valid address spaces for this xform.
+  if (isa<UndefValue>(Op) ||
+      (isa<ConstantPointerNull>(Op) && LI.getPointerAddressSpace() == 0)) {
+    // Insert a new store to null instruction before the load to indicate that
+    // this code is not reachable.  We do this instead of inserting an
+    // unreachable instruction directly because we cannot modify the CFG.
+    new StoreInst(UndefValue::get(LI.getType()),
+                  Constant::getNullValue(Op->getType()), &LI);
+    return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+  }
+
+  // Instcombine load (constantexpr_cast global) -> cast (load global)
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op))
+    if (CE->isCast())
+      if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
+        return Res;
+
+  if (Op->hasOneUse()) {
+    // Change select and PHI nodes to select values instead of addresses: this
+    // helps alias analysis out a lot, allows many others simplifications, and
+    // exposes redundancy in the code.
+    //
+    // Note that we cannot do the transformation unless we know that the
+    // introduced loads cannot trap!  Something like this is valid as long as
+    // the condition is always false: load (select bool %C, int* null, int* %G),
+    // but it would not be valid if we transformed it to load from null
+    // unconditionally.
+    //
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
+      // load (select (Cond, &V1, &V2))  --> select(Cond, load &V1, load &V2).
+      // Only legal when both arms are unconditionally safe to dereference.
+      unsigned Align = LI.getAlignment();
+      if (isSafeToLoadUnconditionally(SI->getOperand(1), SI, Align, TD) &&
+          isSafeToLoadUnconditionally(SI->getOperand(2), SI, Align, TD)) {
+        LoadInst *V1 = Builder->CreateLoad(SI->getOperand(1),
+                                           SI->getOperand(1)->getName()+".val");
+        LoadInst *V2 = Builder->CreateLoad(SI->getOperand(2),
+                                           SI->getOperand(2)->getName()+".val");
+        V1->setAlignment(Align);
+        V2->setAlignment(Align);
+        return SelectInst::Create(SI->getCondition(), V1, V2);
+      }
+
+      // load (select (cond, null, P)) -> load P
+      if (Constant *C = dyn_cast<Constant>(SI->getOperand(1)))
+        if (C->isNullValue()) {
+          LI.setOperand(0, SI->getOperand(2));
+          return &LI;
+        }
+
+      // load (select (cond, P, null)) -> load P
+      if (Constant *C = dyn_cast<Constant>(SI->getOperand(2)))
+        if (C->isNullValue()) {
+          LI.setOperand(0, SI->getOperand(1));
+          return &LI;
+        }
+    }
+  }
+  return 0;
+}
+
+/// InstCombineStoreToCast - Fold store V, (cast P) -> store (cast V), P
+/// when possible.  This makes it generally easy to do alias analysis and/or
+/// SROA/mem2reg of the memory object.
+static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
+  User *CI = cast<User>(SI.getOperand(1));
+  Value *CastOp = CI->getOperand(0);
+
+  const Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
+  const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType());
+  if (SrcTy == 0) return 0;
+
+  const Type *SrcPTy = SrcTy->getElementType();
+
+  if (!DestPTy->isIntegerTy() && !DestPTy->isPointerTy())
+    return 0;
+
+  /// NewGEPIndices - If SrcPTy is an aggregate type, we can emit a "noop gep"
+  /// to its first element.  This allows us to handle things like:
+  ///   store i32 xxx, (bitcast {foo*, float}* %P to i32*)
+  /// on 32-bit hosts.
+  SmallVector<Value*, 4> NewGEPIndices;
+
+  // If the source is an array, the code below will not succeed.  Check to
+  // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
+  // constants.
+  if (SrcPTy->isArrayTy() || SrcPTy->isStructTy()) {
+    // Index through pointer.
+    Constant *Zero = Constant::getNullValue(Type::getInt32Ty(SI.getContext()));
+    NewGEPIndices.push_back(Zero);
+
+    // Descend through nested structs/arrays, collecting a zero index per
+    // level, until we reach a non-aggregate element type.
+    while (1) {
+      if (const StructType *STy = dyn_cast<StructType>(SrcPTy)) {
+        if (!STy->getNumElements()) /* Struct can be empty {} */
+          break;
+        NewGEPIndices.push_back(Zero);
+        SrcPTy = STy->getElementType(0);
+      } else if (const ArrayType *ATy = dyn_cast<ArrayType>(SrcPTy)) {
+        NewGEPIndices.push_back(Zero);
+        SrcPTy = ATy->getElementType();
+      } else {
+        break;
+      }
+    }
+
+    SrcTy = PointerType::get(SrcPTy, SrcTy->getAddressSpace());
+  }
+
+  if (!SrcPTy->isIntegerTy() && !SrcPTy->isPointerTy())
+    return 0;
+
+  // If the pointers point into different address spaces or if they point to
+  // values with different sizes, we can't do the transformation.
+  if (!IC.getTargetData() ||
+      SrcTy->getAddressSpace() !=
+        cast<PointerType>(CI->getType())->getAddressSpace() ||
+      IC.getTargetData()->getTypeSizeInBits(SrcPTy) !=
+      IC.getTargetData()->getTypeSizeInBits(DestPTy))
+    return 0;
+
+  // Okay, we are casting from one integer or pointer type to another of
+  // the same size.  Instead of casting the pointer before
+  // the store, cast the value to be stored.
+  Value *NewCast;
+  Value *SIOp0 = SI.getOperand(0);
+  Instruction::CastOps opcode = Instruction::BitCast;
+  const Type* CastSrcTy = SIOp0->getType();
+  const Type* CastDstTy = SrcPTy;
+  if (CastDstTy->isPointerTy()) {
+    if (CastSrcTy->isIntegerTy())
+      opcode = Instruction::IntToPtr;
+  } else if (CastDstTy->isIntegerTy()) {
+    if (SIOp0->getType()->isPointerTy())
+      opcode = Instruction::PtrToInt;
+  }
+
+  // SIOp0 is a pointer to aggregate and this is a store to the first field,
+  // emit a GEP to index into its first field.
+  if (!NewGEPIndices.empty())
+    CastOp = IC.Builder->CreateInBoundsGEP(CastOp, NewGEPIndices.begin(),
+                                           NewGEPIndices.end());
+
+  NewCast = IC.Builder->CreateCast(opcode, SIOp0, CastDstTy,
+                                   SIOp0->getName()+".c");
+  SI.setOperand(0, NewCast);
+  SI.setOperand(1, CastOp);
+  return &SI;
+}
+
+/// equivalentAddressValues - Test if A and B will obviously have the same
+/// value. This includes recognizing that %t0 and %t1 will have the same
+/// value in code like this:
+///   %t0 = getelementptr \@a, 0, 3
+///   store i32 0, i32* %t0
+///   %t1 = getelementptr \@a, 0, 3
+///   %t2 = load i32* %t1
+///
+static bool equivalentAddressValues(Value *A, Value *B) {
+  // Test if the values are trivially equivalent.
+  if (A == B) return true;
+
+  // Test if the values come form identical arithmetic instructions.
+  // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
+  // its only used to compare two uses within the same basic block, which
+  // means that they'll always either have the same value or one of them
+  // will have an undefined value.
+  if (isa<BinaryOperator>(A) ||
+      isa<CastInst>(A) ||
+      isa<PHINode>(A) ||
+      isa<GetElementPtrInst>(A))
+    if (Instruction *BI = dyn_cast<Instruction>(B))
+      if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
+        return true;
+
+  // Otherwise they may not be equivalent.
+  return false;
+}
+
+// If this instruction has two uses, one of which is a llvm.dbg.declare,
+// return the llvm.dbg.declare.
+DbgDeclareInst *InstCombiner::hasOneUsePlusDeclare(Value *V) {
+  // Exactly two uses are required: one "real" use plus the declare
+  // (possibly reached through a single-use bitcast).
+  if (!V->hasNUses(2))
+    return 0;
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+       UI != E; ++UI) {
+    User *U = *UI;
+    if (DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(U))
+      return DI;
+    if (isa<BitCastInst>(U) && U->hasOneUse()) {
+      if (DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(*U->use_begin()))
+        return DI;
+      }
+  }
+  return 0;
+}
+
+Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
+  Value *Val = SI.getOperand(0);
+  Value *Ptr = SI.getOperand(1);
+
+  // If the RHS is an alloca with a single use, zapify the store, making the
+  // alloca dead.
+  // If the RHS is an alloca with a two uses, the other one being a
+  // llvm.dbg.declare, zapify the store and the declare, making the
+  // alloca dead.  We must do this to prevent declares from affecting
+  // codegen.
+  if (!SI.isVolatile()) {
+    if (Ptr->hasOneUse()) {
+      if (isa<AllocaInst>(Ptr))
+        return EraseInstFromFunction(SI);
+      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+        if (isa<AllocaInst>(GEP->getOperand(0))) {
+          if (GEP->getOperand(0)->hasOneUse())
+            return EraseInstFromFunction(SI);
+          if (DbgDeclareInst *DI = hasOneUsePlusDeclare(GEP->getOperand(0))) {
+            EraseInstFromFunction(*DI);
+            return EraseInstFromFunction(SI);
+          }
+        }
+      }
+    }
+    if (DbgDeclareInst *DI = hasOneUsePlusDeclare(Ptr)) {
+      EraseInstFromFunction(*DI);
+      return EraseInstFromFunction(SI);
+    }
+  }
+
+  // Attempt to improve the alignment.
+  if (TD) {
+    unsigned KnownAlign =
+      getOrEnforceKnownAlignment(Ptr, TD->getPrefTypeAlignment(Val->getType()),
+                                 TD);
+    unsigned StoreAlign = SI.getAlignment();
+    unsigned EffectiveStoreAlign = StoreAlign != 0 ? StoreAlign :
+      TD->getABITypeAlignment(Val->getType());
+
+    if (KnownAlign > EffectiveStoreAlign)
+      SI.setAlignment(KnownAlign);
+    else if (StoreAlign == 0)
+      SI.setAlignment(EffectiveStoreAlign);
+  }
+
+  // Do really simple DSE, to catch cases where there are several consecutive
+  // stores to the same location, separated by a few arithmetic operations. This
+  // situation often occurs with bitfield accesses.
+  BasicBlock::iterator BBI = &SI;
+  for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
+       --ScanInsts) {
+    --BBI;
+    // Don't count debug info directives, lest they affect codegen,
+    // and we skip pointer-to-pointer bitcasts, which are NOPs.
+    // (ScanInsts++ cancels the loop's --ScanInsts for these.)
+    if (isa<DbgInfoIntrinsic>(BBI) ||
+        (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
+      ScanInsts++;
+      continue;
+    }
+
+    if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
+      // Prev store isn't volatile, and stores to the same location?
+      if (!PrevSI->isVolatile() &&equivalentAddressValues(PrevSI->getOperand(1),
+                                                          SI.getOperand(1))) {
+        ++NumDeadStore;
+        ++BBI;
+        EraseInstFromFunction(*PrevSI);
+        continue;
+      }
+      break;
+    }
+
+    // If this is a load, we have to stop.  However, if the loaded value is from
+    // the pointer we're loading and is producing the pointer we're storing,
+    // then *this* store is dead (X = load P; store X -> P).
+    if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+      if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) &&
+          !SI.isVolatile())
+        return EraseInstFromFunction(SI);
+
+      // Otherwise, this is a load from some other location.  Stores before it
+      // may not be dead.
+      break;
+    }
+
+    // Don't skip over loads or things that can modify memory.
+    if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory())
+      break;
+  }
+
+
+  if (SI.isVolatile()) return 0;  // Don't hack volatile stores.
+
+  // store X, null    -> turns into 'unreachable' in SimplifyCFG
+  if (isa<ConstantPointerNull>(Ptr) && SI.getPointerAddressSpace() == 0) {
+    if (!isa<UndefValue>(Val)) {
+      SI.setOperand(0, UndefValue::get(Val->getType()));
+      if (Instruction *U = dyn_cast<Instruction>(Val))
+        Worklist.Add(U);  // Dropped a use.
+    }
+    return 0;  // Do not modify these!
+  }
+
+  // store undef, Ptr -> noop
+  if (isa<UndefValue>(Val))
+    return EraseInstFromFunction(SI);
+
+  // If the pointer destination is a cast, see if we can fold the cast into the
+  // source instead.
+  if (isa<CastInst>(Ptr))
+    if (Instruction *Res = InstCombineStoreToCast(*this, SI))
+      return Res;
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+    if (CE->isCast())
+      if (Instruction *Res = InstCombineStoreToCast(*this, SI))
+        return Res;
+
+
+  // If this store is the last instruction in the basic block (possibly
+  // excepting debug info instructions), and if the block ends with an
+  // unconditional branch, try to move it to the successor block.
+  BBI = &SI;
+  do {
+    ++BBI;
+  } while (isa<DbgInfoIntrinsic>(BBI) ||
+           (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy()));
+  if (BranchInst *BI = dyn_cast<BranchInst>(BBI))
+    if (BI->isUnconditional())
+      if (SimplifyStoreAtEndOfBlock(SI))
+        return 0;  // xform done!
+
+  return 0;
+}
+
+/// SimplifyStoreAtEndOfBlock - Turn things like:
+///   if () { *P = v1; } else { *P = v2 }
+/// into a phi node with a store in the successor.
+///
+/// Simplify things like:
+///   *P = v1; if () { *P = v2; }
+/// into a phi node with a store in the successor.
+///
+/// Returns true if the transformation was performed (SI has been erased).
+bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
+  BasicBlock *StoreBB = SI.getParent();
+
+  // Check to see if the successor block has exactly two incoming edges.  If
+  // so, see if the other predecessor contains a store to the same location.
+  // if so, insert a PHI node (if needed) and move the stores down.
+  BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
+
+  // Determine whether Dest has exactly two predecessors and, if so, compute
+  // the other predecessor.
+  pred_iterator PI = pred_begin(DestBB);
+  BasicBlock *P = *PI;
+  BasicBlock *OtherBB = 0;
+
+  if (P != StoreBB)
+    OtherBB = P;
+
+  if (++PI == pred_end(DestBB))
+    return false;
+
+  P = *PI;
+  if (P != StoreBB) {
+    if (OtherBB)
+      return false;
+    OtherBB = P;
+  }
+  if (++PI != pred_end(DestBB))
+    return false;
+
+  // Bail out if all the relevant blocks aren't distinct (this can happen,
+  // for example, if SI is in an infinite loop)
+  if (StoreBB == DestBB || OtherBB == DestBB)
+    return false;
+
+  // Verify that the other block ends in a branch and is not otherwise empty.
+  BasicBlock::iterator BBI = OtherBB->getTerminator();
+  BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
+  if (!OtherBr || BBI == OtherBB->begin())
+    return false;
+
+  // If the other block ends in an unconditional branch, check for the 'if then
+  // else' case.  there is an instruction before the branch.
+  StoreInst *OtherStore = 0;
+  if (OtherBr->isUnconditional()) {
+    --BBI;
+    // Skip over debugging info.
+    while (isa<DbgInfoIntrinsic>(BBI) ||
+           (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
+      if (BBI==OtherBB->begin())
+        return false;
+      --BBI;
+    }
+    // If this isn't a store, isn't a store to the same location, or if the
+    // alignments differ, bail out.
+    OtherStore = dyn_cast<StoreInst>(BBI);
+    if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
+        OtherStore->getAlignment() != SI.getAlignment())
+      return false;
+  } else {
+    // Otherwise, the other block ended with a conditional branch. If one of the
+    // destinations is StoreBB, then we have the if/then case.
+    if (OtherBr->getSuccessor(0) != StoreBB &&
+        OtherBr->getSuccessor(1) != StoreBB)
+      return false;
+
+    // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
+    // if/then triangle.  See if there is a store to the same ptr as SI that
+    // lives in OtherBB.
+    for (;; --BBI) {
+      // Check to see if we find the matching store.
+      if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
+        if (OtherStore->getOperand(1) != SI.getOperand(1) ||
+            OtherStore->getAlignment() != SI.getAlignment())
+          return false;
+        break;
+      }
+      // If we find something that may be using or overwriting the stored
+      // value, or if we run out of instructions, we can't do the xform.
+      if (BBI->mayReadFromMemory() || BBI->mayWriteToMemory() ||
+          BBI == OtherBB->begin())
+        return false;
+    }
+
+    // In order to eliminate the store in OtherBr, we have to
+    // make sure nothing reads or overwrites the stored value in
+    // StoreBB.
+    for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
+      // FIXME: This should really be AA driven.
+      if (I->mayReadFromMemory() || I->mayWriteToMemory())
+        return false;
+    }
+  }
+
+  // Insert a PHI node now if we need it.
+  Value *MergedVal = OtherStore->getOperand(0);
+  if (MergedVal != SI.getOperand(0)) {
+    PHINode *PN = PHINode::Create(MergedVal->getType(), "storemerge");
+    PN->reserveOperandSpace(2);
+    PN->addIncoming(SI.getOperand(0), SI.getParent());
+    PN->addIncoming(OtherStore->getOperand(0), OtherBB);
+    MergedVal = InsertNewInstBefore(PN, DestBB->front());
+  }
+
+  // Advance to a place where it is safe to insert the new store and
+  // insert it.
+  BBI = DestBB->getFirstNonPHI();
+  InsertNewInstBefore(new StoreInst(MergedVal, SI.getOperand(1),
+                                    OtherStore->isVolatile(),
+                                    SI.getAlignment()), *BBI);
+
+  // Nuke the old stores.
+  EraseInstFromFunction(SI);
+  EraseInstFromFunction(*OtherStore);
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
new file mode 100644
index 0000000..d1a1fd6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -0,0 +1,622 @@
+//===- InstCombineMulDivRem.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for mul, fmul, sdiv, udiv, fdiv,
+// srem, urem, frem.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombine.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Support/PatternMatch.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+/// MultiplyOverflows - True if the multiply can not be expressed in an int
+/// this size.  Computes the product at double width, then checks whether it
+/// fits back in W bits (unsigned) or in [signed min, signed max] (signed).
+static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) {
+  uint32_t W = C1->getBitWidth();
+  APInt LHSExt = C1->getValue(), RHSExt = C2->getValue();
+  if (sign) {
+    LHSExt = LHSExt.sext(W * 2);
+    RHSExt = RHSExt.sext(W * 2);
+  } else {
+    LHSExt = LHSExt.zext(W * 2);
+    RHSExt = RHSExt.zext(W * 2);
+  }
+
+  // At 2*W bits the product cannot wrap, so overflow checks are exact.
+  APInt MulExt = LHSExt * RHSExt;
+
+  if (!sign)
+    return MulExt.ugt(APInt::getLowBitsSet(W * 2, W));
+
+  APInt Min = APInt::getSignedMinValue(W).sext(W * 2);
+  APInt Max = APInt::getSignedMaxValue(W).sext(W * 2);
+  return MulExt.slt(Min) || MulExt.sgt(Max);
+}
+
+Instruction *InstCombiner::visitMul(BinaryOperator &I) {
+  bool Changed = SimplifyAssociativeOrCommutative(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifyMulInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  if (Value *V = SimplifyUsingDistributiveLaws(I))
+    return ReplaceInstUsesWith(I, V);
+
+  if (match(Op1, m_AllOnes()))  // X * -1 == 0 - X
+    return BinaryOperator::CreateNeg(Op0, I.getName());
+
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+
+    // ((X << C1)*C2) == (X * (C2 << C1))
+    if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op0))
+      if (SI->getOpcode() == Instruction::Shl)
+        if (Constant *ShOp = dyn_cast<Constant>(SI->getOperand(1)))
+          return BinaryOperator::CreateMul(SI->getOperand(0),
+                                           ConstantExpr::getShl(CI, ShOp));
+
+    const APInt &Val = CI->getValue();
+    if (Val.isPowerOf2()) {          // Replace X*(2^C) with X << C
+      Constant *NewCst = ConstantInt::get(Op0->getType(), Val.logBase2());
+      BinaryOperator *Shl = BinaryOperator::CreateShl(Op0, NewCst);
+      // Preserve the wrap flags on the replacement shift.
+      if (I.hasNoSignedWrap()) Shl->setHasNoSignedWrap();
+      if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap();
+      return Shl;
+    }
+
+    // Canonicalize (X+C1)*CI -> X*CI+C1*CI.
+    { Value *X; ConstantInt *C1;
+      if (Op0->hasOneUse() &&
+          match(Op0, m_Add(m_Value(X), m_ConstantInt(C1)))) {
+        Value *Add = Builder->CreateMul(X, CI, "tmp");
+        return BinaryOperator::CreateAdd(Add, Builder->CreateMul(C1, CI));
+      }
+    }
+  }
+
+  // Simplify mul instructions with a constant RHS.
+  if (isa<Constant>(Op1)) {
+    // Try to fold constant mul into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI))
+        return R;
+
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+  if (Value *Op0v = dyn_castNegVal(Op0))     // -X * -Y = X*Y
+    if (Value *Op1v = dyn_castNegVal(Op1))
+      return BinaryOperator::CreateMul(Op0v, Op1v);
+
+  // (X / Y) *  Y = X - (X % Y)
+  // (X / Y) * -Y = (X % Y) - X
+  {
+    Value *Op1C = Op1;
+    BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0);
+    if (!BO ||
+        (BO->getOpcode() != Instruction::UDiv &&
+         BO->getOpcode() != Instruction::SDiv)) {
+      // The div wasn't on the LHS; try the commuted form.
+      Op1C = Op0;
+      BO = dyn_cast<BinaryOperator>(Op1);
+    }
+    Value *Neg = dyn_castNegVal(Op1C);
+    if (BO && BO->hasOneUse() &&
+        (BO->getOperand(1) == Op1C || BO->getOperand(1) == Neg) &&
+        (BO->getOpcode() == Instruction::UDiv ||
+         BO->getOpcode() == Instruction::SDiv)) {
+      Value *Op0BO = BO->getOperand(0), *Op1BO = BO->getOperand(1);
+
+      // If the division is exact, X % Y is zero, so we end up with X or -X.
+      if (PossiblyExactOperator *SDiv = dyn_cast<PossiblyExactOperator>(BO))
+        if (SDiv->isExact()) {
+          if (Op1BO == Op1C)
+            return ReplaceInstUsesWith(I, Op0BO);
+          return BinaryOperator::CreateNeg(Op0BO);
+        }
+
+      Value *Rem;
+      if (BO->getOpcode() == Instruction::UDiv)
+        Rem = Builder->CreateURem(Op0BO, Op1BO);
+      else
+        Rem = Builder->CreateSRem(Op0BO, Op1BO);
+      Rem->takeName(BO);
+
+      if (Op1BO == Op1C)
+        return BinaryOperator::CreateSub(Op0BO, Rem);
+      return BinaryOperator::CreateSub(Rem, Op0BO);
+    }
+  }
+
+  /// i1 mul -> i1 and.
+  if (I.getType()->isIntegerTy(1))
+    return BinaryOperator::CreateAnd(Op0, Op1);
+
+  // X*(1 << Y) --> X << Y
+  // (1 << Y)*X --> X << Y
+  {
+    Value *Y;
+    if (match(Op0, m_Shl(m_One(), m_Value(Y))))
+      return BinaryOperator::CreateShl(Op1, Y);
+    if (match(Op1, m_Shl(m_One(), m_Value(Y))))
+      return BinaryOperator::CreateShl(Op0, Y);
+  }
+
+  // If one of the operands of the multiply is a cast from a boolean value, then
+  // we know the bool is either zero or one, so this is a 'masking' multiply.
+  //   X * Y (where Y is 0 or 1) -> X & (0-Y)
+  if (!I.getType()->isVectorTy()) {
+    // -2 is "-1 << 1" so it is all bits set except the low one.
+    APInt Negative2(I.getType()->getPrimitiveSizeInBits(), (uint64_t)-2, true);
+
+    Value *BoolCast = 0, *OtherOp = 0;
+    if (MaskedValueIsZero(Op0, Negative2))
+      BoolCast = Op0, OtherOp = Op1;
+    else if (MaskedValueIsZero(Op1, Negative2))
+      BoolCast = Op1, OtherOp = Op0;
+
+    if (BoolCast) {
+      Value *V = Builder->CreateSub(Constant::getNullValue(I.getType()),
+                                    BoolCast, "tmp");
+      return BinaryOperator::CreateAnd(V, OtherOp);
+    }
+  }
+
+  return Changed ? &I : 0;
+}
+
+/// visitFMul - Fold floating-point multiplies: fmul X, 1.0 -> X (scalar or
+/// splat vector) and -X * -Y -> X * Y, plus constant folding through selects
+/// and PHI nodes.
+Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
+  bool Changed = SimplifyAssociativeOrCommutative(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // Simplify mul instructions with a constant RHS...
+  if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
+    if (ConstantFP *Op1F = dyn_cast<ConstantFP>(Op1C)) {
+      // "In IEEE floating point, x*1 is not equivalent to x for nans.  However,
+      // ANSI says we can drop signals, so we can do this anyway." (from GCC)
+      if (Op1F->isExactlyValue(1.0))
+        return ReplaceInstUsesWith(I, Op0);  // Eliminate 'fmul double %X, 1.0'
+    } else if (Op1C->getType()->isVectorTy()) {
+      if (ConstantVector *Op1V = dyn_cast<ConstantVector>(Op1C)) {
+        // As above, vector X*splat(1.0) -> X in all defined cases.
+        if (Constant *Splat = Op1V->getSplatValue()) {
+          if (ConstantFP *F = dyn_cast<ConstantFP>(Splat))
+            if (F->isExactlyValue(1.0))
+              return ReplaceInstUsesWith(I, Op0);
+        }
+      }
+    }
+
+    // Try to fold constant mul into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI))
+        return R;
+
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+  if (Value *Op0v = dyn_castFNegVal(Op0))     // -X * -Y = X*Y
+    if (Value *Op1v = dyn_castFNegVal(Op1))
+      return BinaryOperator::CreateFMul(Op0v, Op1v);
+
+  return Changed ? &I : 0;
+}
+
+/// SimplifyDivRemOfSelect - Try to fold a divide or remainder of a select
+/// instruction.
+bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {
+  SelectInst *SI = cast<SelectInst>(I.getOperand(1));
+
+  // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y
+  int NonNullOperand = -1;
+  if (Constant *ST = dyn_cast<Constant>(SI->getOperand(1)))
+    if (ST->isNullValue())
+      NonNullOperand = 2;
+  // div/rem X, (Cond ? Y : 0) -> div/rem X, Y
+  if (Constant *ST = dyn_cast<Constant>(SI->getOperand(2)))
+    if (ST->isNullValue())
+      NonNullOperand = 1;
+
+  if (NonNullOperand == -1)
+    return false;
+
+  Value *SelectCond = SI->getOperand(0);
+
+  // Change the div/rem to use 'Y' instead of the select.
+  I.setOperand(1, SI->getOperand(NonNullOperand));
+
+  // Okay, we know we replace the operand of the div/rem with 'Y' with no
+  // problem.  However, the select, or the condition of the select may have
+  // multiple uses.  Based on our knowledge that the operand must be non-zero,
+  // propagate the known value for the select into other uses of it, and
+  // propagate a known value of the condition into its other users.
+
+  // If the select and condition only have a single use, don't bother with this,
+  // early exit.
+  if (SI->use_empty() && SelectCond->hasOneUse())
+    return true;
+
+  // Scan the current block backward, looking for other uses of SI.
+ BasicBlock::iterator BBI = &I, BBFront = I.getParent()->begin();
+
+ while (BBI != BBFront) {
+ --BBI;
+ // If we found a call to a function, we can't assume it will return, so
+ // information from below it cannot be propagated above it.
+ if (isa<CallInst>(BBI) && !isa<IntrinsicInst>(BBI))
+ break;
+
+ // Replace uses of the select or its condition with the known values.
+ for (Instruction::op_iterator I = BBI->op_begin(), E = BBI->op_end();
+ I != E; ++I) {
+ if (*I == SI) {
+ *I = SI->getOperand(NonNullOperand);
+ Worklist.Add(BBI);
+ } else if (*I == SelectCond) {
+ *I = NonNullOperand == 1 ? ConstantInt::getTrue(BBI->getContext()) :
+ ConstantInt::getFalse(BBI->getContext());
+ Worklist.Add(BBI);
+ }
+ }
+
+ // If we passed the instruction, quit looking for it.
+ if (&*BBI == SI)
+ SI = 0;
+ if (&*BBI == SelectCond)
+ SelectCond = 0;
+
+ // If we ran out of things to eliminate, break out of the loop.
+ if (SelectCond == 0 && SI == 0)
+ break;
+
+ }
+ return true;
+}
+
+
+/// This function implements the transforms common to both integer division
+/// instructions (udiv and sdiv). It is called by the visitors to those integer
+/// division instructions.
+/// @brief Common integer divide transforms
+Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // Handle cases involving: [su]div X, (select Cond, Y, Z)
+ // This does not apply for fdiv. 
+ if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I)) + return &I; + + if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { + // (X / C1) / C2 -> X / (C1*C2) + if (Instruction *LHS = dyn_cast<Instruction>(Op0)) + if (Instruction::BinaryOps(LHS->getOpcode()) == I.getOpcode()) + if (ConstantInt *LHSRHS = dyn_cast<ConstantInt>(LHS->getOperand(1))) { + if (MultiplyOverflows(RHS, LHSRHS, + I.getOpcode()==Instruction::SDiv)) + return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); + return BinaryOperator::Create(I.getOpcode(), LHS->getOperand(0), + ConstantExpr::getMul(RHS, LHSRHS)); + } + + if (!RHS->isZero()) { // avoid X udiv 0 + if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) + if (Instruction *R = FoldOpIntoSelect(I, SI)) + return R; + if (isa<PHINode>(Op0)) + if (Instruction *NV = FoldOpIntoPhi(I)) + return NV; + } + } + + // (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y + Value *X = 0, *Z = 0; + if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) { // (X - Z) / Y; Y = Op1 + bool isSigned = I.getOpcode() == Instruction::SDiv; + if ((isSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) || + (!isSigned && match(Z, m_URem(m_Specific(X), m_Specific(Op1))))) + return BinaryOperator::Create(I.getOpcode(), X, Op1); + } + + return 0; +} + +Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + if (Value *V = SimplifyUDivInst(Op0, Op1, TD)) + return ReplaceInstUsesWith(I, V); + + // Handle the integer div common cases + if (Instruction *Common = commonIDivTransforms(I)) + return Common; + + if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) { + // X udiv 2^C -> X >> C + // Check to see if this is an unsigned division with an exact power of 2, + // if so, convert to a right shift. 
+ if (C->getValue().isPowerOf2()) { // 0 not included in isPowerOf2 + BinaryOperator *LShr = + BinaryOperator::CreateLShr(Op0, + ConstantInt::get(Op0->getType(), C->getValue().logBase2())); + if (I.isExact()) LShr->setIsExact(); + return LShr; + } + + // X udiv C, where C >= signbit + if (C->getValue().isNegative()) { + Value *IC = Builder->CreateICmpULT(Op0, C); + return SelectInst::Create(IC, Constant::getNullValue(I.getType()), + ConstantInt::get(I.getType(), 1)); + } + } + + // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2) + { const APInt *CI; Value *N; + if (match(Op1, m_Shl(m_Power2(CI), m_Value(N)))) { + if (*CI != 1) + N = Builder->CreateAdd(N, ConstantInt::get(I.getType(), CI->logBase2()), + "tmp"); + if (I.isExact()) + return BinaryOperator::CreateExactLShr(Op0, N); + return BinaryOperator::CreateLShr(Op0, N); + } + } + + // udiv X, (Select Cond, C1, C2) --> Select Cond, (shr X, C1), (shr X, C2) + // where C1&C2 are powers of two. + { Value *Cond; const APInt *C1, *C2; + if (match(Op1, m_Select(m_Value(Cond), m_Power2(C1), m_Power2(C2)))) { + // Construct the "on true" case of the select + Value *TSI = Builder->CreateLShr(Op0, C1->logBase2(), Op1->getName()+".t", + I.isExact()); + + // Construct the "on false" case of the select + Value *FSI = Builder->CreateLShr(Op0, C2->logBase2(), Op1->getName()+".f", + I.isExact()); + + // construct the select instruction and return it. 
+ return SelectInst::Create(Cond, TSI, FSI); + } + } + return 0; +} + +Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + if (Value *V = SimplifySDivInst(Op0, Op1, TD)) + return ReplaceInstUsesWith(I, V); + + // Handle the integer div common cases + if (Instruction *Common = commonIDivTransforms(I)) + return Common; + + if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { + // sdiv X, -1 == -X + if (RHS->isAllOnesValue()) + return BinaryOperator::CreateNeg(Op0); + + // sdiv X, C --> ashr exact X, log2(C) + if (I.isExact() && RHS->getValue().isNonNegative() && + RHS->getValue().isPowerOf2()) { + Value *ShAmt = llvm::ConstantInt::get(RHS->getType(), + RHS->getValue().exactLogBase2()); + return BinaryOperator::CreateExactAShr(Op0, ShAmt, I.getName()); + } + + // -X/C --> X/-C provided the negation doesn't overflow. + if (SubOperator *Sub = dyn_cast<SubOperator>(Op0)) + if (match(Sub->getOperand(0), m_Zero()) && Sub->hasNoSignedWrap()) + return BinaryOperator::CreateSDiv(Sub->getOperand(1), + ConstantExpr::getNeg(RHS)); + } + + // If the sign bits of both operands are zero (i.e. we can prove they are + // unsigned inputs), turn this into a udiv. + if (I.getType()->isIntegerTy()) { + APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits())); + if (MaskedValueIsZero(Op0, Mask)) { + if (MaskedValueIsZero(Op1, Mask)) { + // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set + return BinaryOperator::CreateUDiv(Op0, Op1, I.getName()); + } + + if (match(Op1, m_Shl(m_Power2(), m_Value()))) { + // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y) + // Safe because the only negative value (1 << Y) can take on is + // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have + // the sign bit set. 
+ return BinaryOperator::CreateUDiv(Op0, Op1, I.getName()); + } + } + } + + return 0; +} + +Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + if (Value *V = SimplifyFDivInst(Op0, Op1, TD)) + return ReplaceInstUsesWith(I, V); + + return 0; +} + +/// This function implements the transforms on rem instructions that work +/// regardless of the kind of rem instruction it is (urem, srem, or frem). It +/// is used by the visitors to those instructions. +/// @brief Transforms common to all three rem instructions +Instruction *InstCombiner::commonRemTransforms(BinaryOperator &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + if (isa<UndefValue>(Op0)) { // undef % X -> 0 + if (I.getType()->isFPOrFPVectorTy()) + return ReplaceInstUsesWith(I, Op0); // X % undef -> undef (could be SNaN) + return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); + } + if (isa<UndefValue>(Op1)) + return ReplaceInstUsesWith(I, Op1); // X % undef -> undef + + // Handle cases involving: rem X, (select Cond, Y, Z) + if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I)) + return &I; + + return 0; +} + +/// This function implements the transforms common to both integer remainder +/// instructions (urem and srem). It is called by the visitors to those integer +/// remainder instructions. +/// @brief Common integer remainder transforms +Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + if (Instruction *common = commonRemTransforms(I)) + return common; + + // X % X == 0 + if (Op0 == Op1) + return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); + + // 0 % X == 0 for integer, we don't need to preserve faults! 
+ if (Constant *LHS = dyn_cast<Constant>(Op0)) + if (LHS->isNullValue()) + return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); + + if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) { + // X % 0 == undef, we don't need to preserve faults! + if (RHS->equalsInt(0)) + return ReplaceInstUsesWith(I, UndefValue::get(I.getType())); + + if (RHS->equalsInt(1)) // X % 1 == 0 + return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); + + if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) { + if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) { + if (Instruction *R = FoldOpIntoSelect(I, SI)) + return R; + } else if (isa<PHINode>(Op0I)) { + if (Instruction *NV = FoldOpIntoPhi(I)) + return NV; + } + + // See if we can fold away this rem instruction. + if (SimplifyDemandedInstructionBits(I)) + return &I; + } + } + + return 0; +} + +Instruction *InstCombiner::visitURem(BinaryOperator &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + if (Instruction *common = commonIRemTransforms(I)) + return common; + + // X urem C^2 -> X and C-1 + { const APInt *C; + if (match(Op1, m_Power2(C))) + return BinaryOperator::CreateAnd(Op0, + ConstantInt::get(I.getType(), *C-1)); + } + + // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1) + if (match(Op1, m_Shl(m_Power2(), m_Value()))) { + Constant *N1 = Constant::getAllOnesValue(I.getType()); + Value *Add = Builder->CreateAdd(Op1, N1, "tmp"); + return BinaryOperator::CreateAnd(Op0, Add); + } + + // urem X, (select Cond, 2^C1, 2^C2) --> + // select Cond, (and X, C1-1), (and X, C2-1) + // when C1&C2 are powers of two. 
+ { Value *Cond; const APInt *C1, *C2; + if (match(Op1, m_Select(m_Value(Cond), m_Power2(C1), m_Power2(C2)))) { + Value *TrueAnd = Builder->CreateAnd(Op0, *C1-1, Op1->getName()+".t"); + Value *FalseAnd = Builder->CreateAnd(Op0, *C2-1, Op1->getName()+".f"); + return SelectInst::Create(Cond, TrueAnd, FalseAnd); + } + } + + return 0; +} + +Instruction *InstCombiner::visitSRem(BinaryOperator &I) { + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + // Handle the integer rem common cases + if (Instruction *Common = commonIRemTransforms(I)) + return Common; + + if (Value *RHSNeg = dyn_castNegVal(Op1)) + if (!isa<Constant>(RHSNeg) || + (isa<ConstantInt>(RHSNeg) && + cast<ConstantInt>(RHSNeg)->getValue().isStrictlyPositive())) { + // X % -Y -> X % Y + Worklist.AddValue(I.getOperand(1)); + I.setOperand(1, RHSNeg); + return &I; + } + + // If the sign bits of both operands are zero (i.e. we can prove they are + // unsigned inputs), turn this into a urem. + if (I.getType()->isIntegerTy()) { + APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits())); + if (MaskedValueIsZero(Op1, Mask) && MaskedValueIsZero(Op0, Mask)) { + // X srem Y -> X urem Y, iff X and Y don't have sign bit set + return BinaryOperator::CreateURem(Op0, Op1, I.getName()); + } + } + + // If it's a constant vector, flip any negative values positive. 
+ if (ConstantVector *RHSV = dyn_cast<ConstantVector>(Op1)) { + unsigned VWidth = RHSV->getNumOperands(); + + bool hasNegative = false; + for (unsigned i = 0; !hasNegative && i != VWidth; ++i) + if (ConstantInt *RHS = dyn_cast<ConstantInt>(RHSV->getOperand(i))) + if (RHS->getValue().isNegative()) + hasNegative = true; + + if (hasNegative) { + std::vector<Constant *> Elts(VWidth); + for (unsigned i = 0; i != VWidth; ++i) { + if (ConstantInt *RHS = dyn_cast<ConstantInt>(RHSV->getOperand(i))) { + if (RHS->getValue().isNegative()) + Elts[i] = cast<ConstantInt>(ConstantExpr::getNeg(RHS)); + else + Elts[i] = RHS; + } + } + + Constant *NewRHSV = ConstantVector::get(Elts); + if (NewRHSV != RHSV) { + Worklist.AddValue(I.getOperand(1)); + I.setOperand(1, NewRHSV); + return &I; + } + } + } + + return 0; +} + +Instruction *InstCombiner::visitFRem(BinaryOperator &I) { + return commonRemTransforms(I); +} + diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp new file mode 100644 index 0000000..297a18c --- /dev/null +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -0,0 +1,891 @@ +//===- InstCombinePHI.cpp -------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the visitPHINode function. 
+// +//===----------------------------------------------------------------------===// + +#include "InstCombine.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Target/TargetData.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/STLExtras.h" +using namespace llvm; + +/// FoldPHIArgBinOpIntoPHI - If we have something like phi [add (a,b), add(a,c)] +/// and if a/b/c and the add's all have a single use, turn this into a phi +/// and a single binop. +Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { + Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0)); + assert(isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)); + unsigned Opc = FirstInst->getOpcode(); + Value *LHSVal = FirstInst->getOperand(0); + Value *RHSVal = FirstInst->getOperand(1); + + const Type *LHSType = LHSVal->getType(); + const Type *RHSType = RHSVal->getType(); + + bool isNUW = false, isNSW = false, isExact = false; + if (OverflowingBinaryOperator *BO = + dyn_cast<OverflowingBinaryOperator>(FirstInst)) { + isNUW = BO->hasNoUnsignedWrap(); + isNSW = BO->hasNoSignedWrap(); + } else if (PossiblyExactOperator *PEO = + dyn_cast<PossiblyExactOperator>(FirstInst)) + isExact = PEO->isExact(); + + // Scan to see if all operands are the same opcode, and all have one use. + for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) { + Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i)); + if (!I || I->getOpcode() != Opc || !I->hasOneUse() || + // Verify type of the LHS matches so we don't fold cmp's of different + // types. 
+ I->getOperand(0)->getType() != LHSType || + I->getOperand(1)->getType() != RHSType) + return 0; + + // If they are CmpInst instructions, check their predicates + if (CmpInst *CI = dyn_cast<CmpInst>(I)) + if (CI->getPredicate() != cast<CmpInst>(FirstInst)->getPredicate()) + return 0; + + if (isNUW) + isNUW = cast<OverflowingBinaryOperator>(I)->hasNoUnsignedWrap(); + if (isNSW) + isNSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); + if (isExact) + isExact = cast<PossiblyExactOperator>(I)->isExact(); + + // Keep track of which operand needs a phi node. + if (I->getOperand(0) != LHSVal) LHSVal = 0; + if (I->getOperand(1) != RHSVal) RHSVal = 0; + } + + // If both LHS and RHS would need a PHI, don't do this transformation, + // because it would increase the number of PHIs entering the block, + // which leads to higher register pressure. This is especially + // bad when the PHIs are in the header of a loop. + if (!LHSVal && !RHSVal) + return 0; + + // Otherwise, this is safe to transform! + + Value *InLHS = FirstInst->getOperand(0); + Value *InRHS = FirstInst->getOperand(1); + PHINode *NewLHS = 0, *NewRHS = 0; + if (LHSVal == 0) { + NewLHS = PHINode::Create(LHSType, + FirstInst->getOperand(0)->getName() + ".pn"); + NewLHS->reserveOperandSpace(PN.getNumOperands()/2); + NewLHS->addIncoming(InLHS, PN.getIncomingBlock(0)); + InsertNewInstBefore(NewLHS, PN); + LHSVal = NewLHS; + } + + if (RHSVal == 0) { + NewRHS = PHINode::Create(RHSType, + FirstInst->getOperand(1)->getName() + ".pn"); + NewRHS->reserveOperandSpace(PN.getNumOperands()/2); + NewRHS->addIncoming(InRHS, PN.getIncomingBlock(0)); + InsertNewInstBefore(NewRHS, PN); + RHSVal = NewRHS; + } + + // Add all operands to the new PHIs. 
+ if (NewLHS || NewRHS) { + for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { + Instruction *InInst = cast<Instruction>(PN.getIncomingValue(i)); + if (NewLHS) { + Value *NewInLHS = InInst->getOperand(0); + NewLHS->addIncoming(NewInLHS, PN.getIncomingBlock(i)); + } + if (NewRHS) { + Value *NewInRHS = InInst->getOperand(1); + NewRHS->addIncoming(NewInRHS, PN.getIncomingBlock(i)); + } + } + } + + if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) + return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), + LHSVal, RHSVal); + + BinaryOperator *BinOp = cast<BinaryOperator>(FirstInst); + BinaryOperator *NewBinOp = + BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal); + if (isNUW) NewBinOp->setHasNoUnsignedWrap(); + if (isNSW) NewBinOp->setHasNoSignedWrap(); + if (isExact) NewBinOp->setIsExact(); + return NewBinOp; +} + +Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { + GetElementPtrInst *FirstInst =cast<GetElementPtrInst>(PN.getIncomingValue(0)); + + SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(), + FirstInst->op_end()); + // This is true if all GEP bases are allocas and if all indices into them are + // constants. + bool AllBasePointersAreAllocas = true; + + // We don't want to replace this phi if the replacement would require + // more than one phi, which leads to higher register pressure. This is + // especially bad when the PHIs are in the header of a loop. + bool NeededPhi = false; + + bool AllInBounds = true; + + // Scan to see if all operands are the same opcode, and all have one use. + for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) { + GetElementPtrInst *GEP= dyn_cast<GetElementPtrInst>(PN.getIncomingValue(i)); + if (!GEP || !GEP->hasOneUse() || GEP->getType() != FirstInst->getType() || + GEP->getNumOperands() != FirstInst->getNumOperands()) + return 0; + + AllInBounds &= GEP->isInBounds(); + + // Keep track of whether or not all GEPs are of alloca pointers. 
+ if (AllBasePointersAreAllocas && + (!isa<AllocaInst>(GEP->getOperand(0)) || + !GEP->hasAllConstantIndices())) + AllBasePointersAreAllocas = false; + + // Compare the operand lists. + for (unsigned op = 0, e = FirstInst->getNumOperands(); op != e; ++op) { + if (FirstInst->getOperand(op) == GEP->getOperand(op)) + continue; + + // Don't merge two GEPs when two operands differ (introducing phi nodes) + // if one of the PHIs has a constant for the index. The index may be + // substantially cheaper to compute for the constants, so making it a + // variable index could pessimize the path. This also handles the case + // for struct indices, which must always be constant. + if (isa<ConstantInt>(FirstInst->getOperand(op)) || + isa<ConstantInt>(GEP->getOperand(op))) + return 0; + + if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType()) + return 0; + + // If we already needed a PHI for an earlier operand, and another operand + // also requires a PHI, we'd be introducing more PHIs than we're + // eliminating, which increases register pressure on entry to the PHI's + // block. + if (NeededPhi) + return 0; + + FixedOperands[op] = 0; // Needs a PHI. + NeededPhi = true; + } + } + + // If all of the base pointers of the PHI'd GEPs are from allocas, don't + // bother doing this transformation. At best, this will just save a bit of + // offset calculation, but all the predecessors will have to materialize the + // stack address into a register anyway. We'd actually rather *clone* the + // load up into the predecessors so that we have a load of a gep of an alloca, + // which can usually all be folded into the load. + if (AllBasePointersAreAllocas) + return 0; + + // Otherwise, this is safe to transform. Insert PHI nodes for each operand + // that is variable. 
+ SmallVector<PHINode*, 16> OperandPhis(FixedOperands.size());
+
+ bool HasAnyPHIs = false;
+ for (unsigned i = 0, e = FixedOperands.size(); i != e; ++i) {
+ if (FixedOperands[i]) continue; // operand doesn't need a phi.
+ Value *FirstOp = FirstInst->getOperand(i);
+ PHINode *NewPN = PHINode::Create(FirstOp->getType(),
+ FirstOp->getName()+".pn");
+ InsertNewInstBefore(NewPN, PN);
+
+ NewPN->reserveOperandSpace(e);
+ NewPN->addIncoming(FirstOp, PN.getIncomingBlock(0));
+ OperandPhis[i] = NewPN;
+ FixedOperands[i] = NewPN;
+ HasAnyPHIs = true;
+ }
+
+
+ // Add all operands to the new PHIs.
+ if (HasAnyPHIs) {
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ GetElementPtrInst *InGEP =cast<GetElementPtrInst>(PN.getIncomingValue(i));
+ BasicBlock *InBB = PN.getIncomingBlock(i);
+
+ for (unsigned op = 0, e = OperandPhis.size(); op != e; ++op)
+ if (PHINode *OpPhi = OperandPhis[op])
+ OpPhi->addIncoming(InGEP->getOperand(op), InBB);
+ }
+ }
+
+ Value *Base = FixedOperands[0];
+ GetElementPtrInst *NewGEP =
+ GetElementPtrInst::Create(Base, FixedOperands.begin()+1,
+ FixedOperands.end());
+ if (AllInBounds) NewGEP->setIsInBounds();
+ return NewGEP;
+}
+
+
+/// isSafeAndProfitableToSinkLoad - Return true if we know that it is safe to
+/// sink the load out of the block that defines it. This means that it must be
+/// obvious the value of the load is not changed from the point of the load to
+/// the end of the block it is in.
+///
+/// Finally, it is safe, but not profitable, to sink a load targeting a
+/// non-address-taken alloca. Doing so will cause us to not promote the alloca
+/// to a register.
+static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
+ BasicBlock::iterator BBI = L, E = L->getParent()->end();
+
+ for (++BBI; BBI != E; ++BBI)
+ if (BBI->mayWriteToMemory())
+ return false;
+
+ // Check for non-address taken alloca. If not address-taken already, it isn't
+ // profitable to do this xform. 
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(L->getOperand(0))) { + bool isAddressTaken = false; + for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); + UI != E; ++UI) { + User *U = *UI; + if (isa<LoadInst>(U)) continue; + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + // If storing TO the alloca, then the address isn't taken. + if (SI->getOperand(1) == AI) continue; + } + isAddressTaken = true; + break; + } + + if (!isAddressTaken && AI->isStaticAlloca()) + return false; + } + + // If this load is a load from a GEP with a constant offset from an alloca, + // then we don't want to sink it. In its present form, it will be + // load [constant stack offset]. Sinking it will cause us to have to + // materialize the stack addresses in each predecessor in a register only to + // do a shared load from register in the successor. + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(L->getOperand(0))) + if (AllocaInst *AI = dyn_cast<AllocaInst>(GEP->getOperand(0))) + if (AI->isStaticAlloca() && GEP->hasAllConstantIndices()) + return false; + + return true; +} + +Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { + LoadInst *FirstLI = cast<LoadInst>(PN.getIncomingValue(0)); + + // When processing loads, we need to propagate two bits of information to the + // sunk load: whether it is volatile, and what its alignment is. We currently + // don't sink loads when some have their alignment specified and some don't. + // visitLoadInst will propagate an alignment onto the load when TD is around, + // and if TD isn't around, we can't handle the mixed case. + bool isVolatile = FirstLI->isVolatile(); + unsigned LoadAlignment = FirstLI->getAlignment(); + unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace(); + + // We can't sink the load if the loaded value could be modified between the + // load and the PHI. 
+ if (FirstLI->getParent() != PN.getIncomingBlock(0) || + !isSafeAndProfitableToSinkLoad(FirstLI)) + return 0; + + // If the PHI is of volatile loads and the load block has multiple + // successors, sinking it would remove a load of the volatile value from + // the path through the other successor. + if (isVolatile && + FirstLI->getParent()->getTerminator()->getNumSuccessors() != 1) + return 0; + + // Check to see if all arguments are the same operation. + for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { + LoadInst *LI = dyn_cast<LoadInst>(PN.getIncomingValue(i)); + if (!LI || !LI->hasOneUse()) + return 0; + + // We can't sink the load if the loaded value could be modified between + // the load and the PHI. + if (LI->isVolatile() != isVolatile || + LI->getParent() != PN.getIncomingBlock(i) || + LI->getPointerAddressSpace() != LoadAddrSpace || + !isSafeAndProfitableToSinkLoad(LI)) + return 0; + + // If some of the loads have an alignment specified but not all of them, + // we can't do the transformation. + if ((LoadAlignment != 0) != (LI->getAlignment() != 0)) + return 0; + + LoadAlignment = std::min(LoadAlignment, LI->getAlignment()); + + // If the PHI is of volatile loads and the load block has multiple + // successors, sinking it would remove a load of the volatile value from + // the path through the other successor. + if (isVolatile && + LI->getParent()->getTerminator()->getNumSuccessors() != 1) + return 0; + } + + // Okay, they are all the same operation. Create a new PHI node of the + // correct type, and PHI together all of the LHS's of the instructions. + PHINode *NewPN = PHINode::Create(FirstLI->getOperand(0)->getType(), + PN.getName()+".in"); + NewPN->reserveOperandSpace(PN.getNumOperands()/2); + + Value *InVal = FirstLI->getOperand(0); + NewPN->addIncoming(InVal, PN.getIncomingBlock(0)); + + // Add all operands to the new PHI. 
  // (Continuation of FoldPHIArgLoadIntoPHI.)  Union the remaining incoming
  // loads' pointer operands into the new PHI.  InVal tracks whether every
  // incoming pointer is the same value; it is cleared to null on the first
  // mismatch.
  for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
    Value *NewInVal = cast<LoadInst>(PN.getIncomingValue(i))->getOperand(0);
    if (NewInVal != InVal)
      InVal = 0;
    NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
  }

  Value *PhiVal;
  if (InVal) {
    // The new PHI unions all of the same values together.  This is really
    // common, so we handle it intelligently here for compile-time speed.
    PhiVal = InVal;
    // NewPN was never inserted into a basic block, so plain delete is safe.
    delete NewPN;
  } else {
    InsertNewInstBefore(NewPN, PN);
    PhiVal = NewPN;
  }

  // If this was a volatile load that we are merging, make sure to loop through
  // and mark all the input loads as non-volatile.  If we don't do this, we will
  // insert a new volatile load and the old ones will not be deletable.
  if (isVolatile)
    for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
      cast<LoadInst>(PN.getIncomingValue(i))->setVolatile(false);

  return new LoadInst(PhiVal, "", isVolatile, LoadAlignment);
}



/// FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary"
/// operator and they all are only used by the PHI, PHI together their
/// inputs, and do the operation once, to the result of the PHI.
Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
  Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));

  // GEPs and loads have dedicated folders with extra profitability checks.
  if (isa<GetElementPtrInst>(FirstInst))
    return FoldPHIArgGEPIntoPHI(PN);
  if (isa<LoadInst>(FirstInst))
    return FoldPHIArgLoadIntoPHI(PN);

  // Scan the instruction, looking for input operations that can be folded away.
  // If all input operands to the phi are the same instruction (e.g. a cast from
  // the same type or "+42") we can pull the operation through the PHI, reducing
  // code size and simplifying code.
  Constant *ConstantOp = 0;
  const Type *CastSrcTy = 0;
  // nuw/nsw/exact survive the fold only if EVERY incoming op carries them;
  // they are intersected in the verification loop below.
  bool isNUW = false, isNSW = false, isExact = false;

  if (isa<CastInst>(FirstInst)) {
    CastSrcTy = FirstInst->getOperand(0)->getType();

    // Be careful about transforming integer PHIs.  We don't want to pessimize
    // the code by turning an i32 into an i1293.
    if (PN.getType()->isIntegerTy() && CastSrcTy->isIntegerTy()) {
      if (!ShouldChangeType(PN.getType(), CastSrcTy))
        return 0;
    }
  } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
    // Can fold binop, compare or shift here if the RHS is a constant,
    // otherwise call FoldPHIArgBinOpIntoPHI.
    ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1));
    if (ConstantOp == 0)
      return FoldPHIArgBinOpIntoPHI(PN);

    if (OverflowingBinaryOperator *BO =
          dyn_cast<OverflowingBinaryOperator>(FirstInst)) {
      isNUW = BO->hasNoUnsignedWrap();
      isNSW = BO->hasNoSignedWrap();
    } else if (PossiblyExactOperator *PEO =
                 dyn_cast<PossiblyExactOperator>(FirstInst))
      isExact = PEO->isExact();
  } else {
    return 0;  // Cannot fold this operation.
  }

  // Check to see if all arguments are the same operation.
  for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
    Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
    if (I == 0 || !I->hasOneUse() || !I->isSameOperationAs(FirstInst))
      return 0;
    if (CastSrcTy) {
      if (I->getOperand(0)->getType() != CastSrcTy)
        return 0;  // Cast operation must match.
    } else if (I->getOperand(1) != ConstantOp) {
      return 0;
    }

    // Intersect the flag sets: one incoming op lacking a flag drops it.
    if (isNUW)
      isNUW = cast<OverflowingBinaryOperator>(I)->hasNoUnsignedWrap();
    if (isNSW)
      isNSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
    if (isExact)
      isExact = cast<PossiblyExactOperator>(I)->isExact();
  }

  // Okay, they are all the same operation.  Create a new PHI node of the
  // correct type, and PHI together all of the LHS's of the instructions.
  PHINode *NewPN = PHINode::Create(FirstInst->getOperand(0)->getType(),
                                   PN.getName()+".in");
  NewPN->reserveOperandSpace(PN.getNumOperands()/2);

  Value *InVal = FirstInst->getOperand(0);
  NewPN->addIncoming(InVal, PN.getIncomingBlock(0));

  // Add all operands to the new PHI.  InVal stays non-null only while every
  // incoming LHS is the identical value.
  for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
    Value *NewInVal = cast<Instruction>(PN.getIncomingValue(i))->getOperand(0);
    if (NewInVal != InVal)
      InVal = 0;
    NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
  }

  Value *PhiVal;
  if (InVal) {
    // The new PHI unions all of the same values together.  This is really
    // common, so we handle it intelligently here for compile-time speed.
    PhiVal = InVal;
    delete NewPN;
  } else {
    InsertNewInstBefore(NewPN, PN);
    PhiVal = NewPN;
  }

  // Insert and return the new operation.
  if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst))
    return CastInst::Create(FirstCI->getOpcode(), PhiVal, PN.getType());

  if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst)) {
    BinOp = BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp);
    if (isNUW) BinOp->setHasNoUnsignedWrap();
    if (isNSW) BinOp->setHasNoSignedWrap();
    if (isExact) BinOp->setIsExact();
    return BinOp;
  }

  CmpInst *CIOp = cast<CmpInst>(FirstInst);
  return CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
                         PhiVal, ConstantOp);
}

/// DeadPHICycle - Return true if this PHI node is only used by a PHI node cycle
/// that is dead.
static bool DeadPHICycle(PHINode *PN,
                         SmallPtrSet<PHINode*, 16> &PotentiallyDeadPHIs) {
  if (PN->use_empty()) return true;
  if (!PN->hasOneUse()) return false;

  // Remember this node, and if we find the cycle, return.
  // (insert() returns false when PN was already in the set, i.e. we closed
  // the cycle.)
  if (!PotentiallyDeadPHIs.insert(PN))
    return true;

  // Don't scan crazily complex things.
  if (PotentiallyDeadPHIs.size() == 16)
    return false;

  if (PHINode *PU = dyn_cast<PHINode>(PN->use_back()))
    return DeadPHICycle(PU, PotentiallyDeadPHIs);

  return false;
}

/// PHIsEqualValue - Return true if this phi node is always equal to
/// NonPhiInVal.  This happens with mutually cyclic phi nodes like:
///   z = some value; x = phi (y, z); y = phi (x, z)
static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
                           SmallPtrSet<PHINode*, 16> &ValueEqualPHIs) {
  // See if we already saw this PHI node.
  if (!ValueEqualPHIs.insert(PN))
    return true;

  // Don't scan crazily complex things.
  if (ValueEqualPHIs.size() == 16)
    return false;

  // Scan the operands to see if they are either phi nodes or are equal to
  // the value.
  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
    Value *Op = PN->getIncomingValue(i);
    if (PHINode *OpPN = dyn_cast<PHINode>(Op)) {
      if (!PHIsEqualValue(OpPN, NonPhiInVal, ValueEqualPHIs))
        return false;
    } else if (Op != NonPhiInVal)
      return false;
  }

  return true;
}


namespace {
// PHIUsageRecord - One (trunc or lshr+trunc) use of a PHI being sliced up by
// SliceUpIllegalIntegerPHI, identified by PHI id, shift amount and the trunc.
struct PHIUsageRecord {
  unsigned PHIId;     // The ID # of the PHI (something deterministic to sort on)
  unsigned Shift;     // The amount shifted.
  Instruction *Inst;  // The trunc instruction.

  PHIUsageRecord(unsigned pn, unsigned Sh, Instruction *User)
    : PHIId(pn), Shift(Sh), Inst(User) {}

  // Sort by PHI id, then shift amount, then extracted width, giving a
  // deterministic processing order.
  bool operator<(const PHIUsageRecord &RHS) const {
    if (PHIId < RHS.PHIId) return true;
    if (PHIId > RHS.PHIId) return false;
    if (Shift < RHS.Shift) return true;
    if (Shift > RHS.Shift) return false;
    return Inst->getType()->getPrimitiveSizeInBits() <
           RHS.Inst->getType()->getPrimitiveSizeInBits();
  }
};

// LoweredPHIRecord - DenseMap key describing a piece (shift + width) already
// extracted from a sliced PHI, so equivalent pieces are only materialized once.
struct LoweredPHIRecord {
  PHINode *PN;        // The PHI that was lowered.
  unsigned Shift;     // The amount shifted.
  unsigned Width;     // The width extracted.
  LoweredPHIRecord(PHINode *pn, unsigned Sh, const Type *Ty)
    : PN(pn), Shift(Sh), Width(Ty->getPrimitiveSizeInBits()) {}

  // Ctor form used by DenseMap (empty/tombstone keys carry no width).
  LoweredPHIRecord(PHINode *pn, unsigned Sh)
    : PN(pn), Shift(Sh), Width(0) {}
};
}

namespace llvm {
  template<>
  struct DenseMapInfo<LoweredPHIRecord> {
    static inline LoweredPHIRecord getEmptyKey() {
      return LoweredPHIRecord(0, 0);
    }
    static inline LoweredPHIRecord getTombstoneKey() {
      return LoweredPHIRecord(0, 1);
    }
    static unsigned getHashValue(const LoweredPHIRecord &Val) {
      return DenseMapInfo<PHINode*>::getHashValue(Val.PN) ^ (Val.Shift>>3) ^
             (Val.Width>>3);
    }
    static bool isEqual(const LoweredPHIRecord &LHS,
                        const LoweredPHIRecord &RHS) {
      return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift &&
             LHS.Width == RHS.Width;
    }
  };
  template <>
  struct isPodLike<LoweredPHIRecord> { static const bool value = true; };
}


/// SliceUpIllegalIntegerPHI - This is an integer PHI and we know that it has an
/// illegal type: see if it is only used by trunc or trunc(lshr) operations.  If
/// so, we split the PHI into the various pieces being extracted.  This sort of
/// thing is introduced when SROA promotes an aggregate to large integer values.
///
/// TODO: The user of the trunc may be an bitcast to float/double/vector or an
/// inttoptr.  We should produce new PHIs in the right type.
///
Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
  // PHIUsers - Keep track of all of the truncated values extracted from a set
  // of PHIs, along with their offset.  These are the things we want to rewrite.
  SmallVector<PHIUsageRecord, 16> PHIUsers;

  // PHIs are often mutually cyclic, so we keep track of a whole set of PHI
  // nodes which are extracted from.  PHIsToSlice is an ordered list of PHIs
  // that we need to check the uses of (to ensure they are all extracts);
  // PHIsInspected is the set we use to avoid revisiting PHIs.
  SmallVector<PHINode*, 8> PHIsToSlice;
  SmallPtrSet<PHINode*, 8> PHIsInspected;

  PHIsToSlice.push_back(&FirstPhi);
  PHIsInspected.insert(&FirstPhi);

  // Worklist scan: PHIsToSlice may grow while we iterate, so compare against
  // size() each trip.
  for (unsigned PHIId = 0; PHIId != PHIsToSlice.size(); ++PHIId) {
    PHINode *PN = PHIsToSlice[PHIId];

    // Scan the input list of the PHI.  If any input is an invoke, and if the
    // input is defined in the predecessor, then we won't be able to split the
    // critical edge which is required to insert a truncate.  Because of this,
    // we have to bail out.
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
      InvokeInst *II = dyn_cast<InvokeInst>(PN->getIncomingValue(i));
      if (II == 0) continue;
      if (II->getParent() != PN->getIncomingBlock(i))
        continue;

      // If we have a phi, and if it's directly in the predecessor, then we have
      // a critical edge where we need to put the truncate.  Since we can't
      // split the edge in instcombine, we have to bail out.
      return 0;
    }


    for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end();
         UI != E; ++UI) {
      Instruction *User = cast<Instruction>(*UI);

      // If the user is a PHI, inspect its uses recursively.
      if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
        if (PHIsInspected.insert(UserPN))
          PHIsToSlice.push_back(UserPN);
        continue;
      }

      // Truncates are always ok.
      if (isa<TruncInst>(User)) {
        PHIUsers.push_back(PHIUsageRecord(PHIId, 0, User));
        continue;
      }

      // Otherwise it must be a lshr which can only be used by one trunc.
      if (User->getOpcode() != Instruction::LShr ||
          !User->hasOneUse() || !isa<TruncInst>(User->use_back()) ||
          !isa<ConstantInt>(User->getOperand(1)))
        return 0;

      unsigned Shift = cast<ConstantInt>(User->getOperand(1))->getZExtValue();
      PHIUsers.push_back(PHIUsageRecord(PHIId, Shift, User->use_back()));
    }
  }

  // If we have no users, they must be all self uses, just nuke the PHI.
  if (PHIUsers.empty())
    return ReplaceInstUsesWith(FirstPhi, UndefValue::get(FirstPhi.getType()));

  // If this phi node is transformable, create new PHIs for all the pieces
  // extracted out of it.  First, sort the users by their offset and size.
  array_pod_sort(PHIUsers.begin(), PHIUsers.end());

  DEBUG(errs() << "SLICING UP PHI: " << FirstPhi << '\n';
        for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
          errs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] <<'\n';
        );

  // PredValues - This is a temporary used when rewriting PHI nodes.  It is
  // hoisted out here to avoid construction/destruction thrashing.
  DenseMap<BasicBlock*, Value*> PredValues;

  // ExtractedVals - Each new PHI we introduce is saved here so we don't
  // introduce redundant PHIs.
  DenseMap<LoweredPHIRecord, PHINode*> ExtractedVals;

  for (unsigned UserI = 0, UserE = PHIUsers.size(); UserI != UserE; ++UserI) {
    unsigned PHIId = PHIUsers[UserI].PHIId;
    PHINode *PN = PHIsToSlice[PHIId];
    unsigned Offset = PHIUsers[UserI].Shift;
    const Type *Ty = PHIUsers[UserI].Inst->getType();

    PHINode *EltPHI;

    // If we've already lowered a user like this, reuse the previously lowered
    // value.  (operator[] default-inserts a null entry on a miss, which is
    // overwritten once the new PHI is built below.)
    if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == 0) {

      // Otherwise, Create the new PHI node for this user.
      EltPHI = PHINode::Create(Ty, PN->getName()+".off"+Twine(Offset), PN);
      assert(EltPHI->getType() != PN->getType() &&
             "Truncate didn't shrink phi?");

      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
        BasicBlock *Pred = PN->getIncomingBlock(i);
        Value *&PredVal = PredValues[Pred];

        // If we already have a value for this predecessor, reuse it.
        if (PredVal) {
          EltPHI->addIncoming(PredVal, Pred);
          continue;
        }

        // Handle the PHI self-reuse case.
        Value *InVal = PN->getIncomingValue(i);
        if (InVal == PN) {
          PredVal = EltPHI;
          EltPHI->addIncoming(PredVal, Pred);
          continue;
        }

        // NOTE(review): this casts PN -- which is always a PHINode -- rather
        // than InVal, so the lookup below queries the record currently being
        // built (still null) and the branch appears to be dead code.
        // Presumably dyn_cast<PHINode>(InVal) was intended; confirm against
        // upstream LLVM before changing vendored code.
        if (PHINode *InPHI = dyn_cast<PHINode>(PN)) {
          // If the incoming value was a PHI, and if it was one of the PHIs we
          // already rewrote it, just use the lowered value.
          if (Value *Res = ExtractedVals[LoweredPHIRecord(InPHI, Offset, Ty)]) {
            PredVal = Res;
            EltPHI->addIncoming(PredVal, Pred);
            continue;
          }
        }

        // Otherwise, do an extract in the predecessor.
        Builder->SetInsertPoint(Pred, Pred->getTerminator());
        Value *Res = InVal;
        if (Offset)
          Res = Builder->CreateLShr(Res, ConstantInt::get(InVal->getType(),
                                                          Offset), "extract");
        Res = Builder->CreateTrunc(Res, Ty, "extract.t");
        PredVal = Res;
        EltPHI->addIncoming(Res, Pred);

        // If the incoming value was a PHI, and if it was one of the PHIs we are
        // rewriting, we will ultimately delete the code we inserted.  This
        // means we need to revisit that PHI to make sure we extract out the
        // needed piece.  (Appending to PHIUsers and bumping UserE extends the
        // worklist in-flight; indexed access keeps this safe across
        // reallocation.)
        if (PHINode *OldInVal = dyn_cast<PHINode>(PN->getIncomingValue(i)))
          if (PHIsInspected.count(OldInVal)) {
            unsigned RefPHIId = std::find(PHIsToSlice.begin(),PHIsToSlice.end(),
                                          OldInVal)-PHIsToSlice.begin();
            PHIUsers.push_back(PHIUsageRecord(RefPHIId, Offset,
                                              cast<Instruction>(Res)));
            ++UserE;
          }
      }
      PredValues.clear();

      DEBUG(errs() << "  Made element PHI for offset " << Offset << ": "
                   << *EltPHI << '\n');
      ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
    }

    // Replace the use of this piece with the PHI node.
    ReplaceInstUsesWith(*PHIUsers[UserI].Inst, EltPHI);
  }

  // Replace all the remaining uses of the PHI nodes (self uses and the lshrs)
  // with undefs.
  Value *Undef = UndefValue::get(FirstPhi.getType());
  for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
    ReplaceInstUsesWith(*PHIsToSlice[i], Undef);
  return ReplaceInstUsesWith(FirstPhi, Undef);
}

// PHINode simplification
//
Instruction *InstCombiner::visitPHINode(PHINode &PN) {
  // If LCSSA is around, don't mess with Phi nodes
  if (MustPreserveLCSSA) return 0;

  if (Value *V = SimplifyInstruction(&PN, TD))
    return ReplaceInstUsesWith(PN, V);

  // If all PHI operands are the same operation, pull them through the PHI,
  // reducing code size.
  if (isa<Instruction>(PN.getIncomingValue(0)) &&
      isa<Instruction>(PN.getIncomingValue(1)) &&
      cast<Instruction>(PN.getIncomingValue(0))->getOpcode() ==
      cast<Instruction>(PN.getIncomingValue(1))->getOpcode() &&
      // FIXME: The hasOneUse check will fail for PHIs that use the value more
      // than themselves more than once.
      PN.getIncomingValue(0)->hasOneUse())
    if (Instruction *Result = FoldPHIArgOpIntoPHI(PN))
      return Result;

  // If this is a trivial cycle in the PHI node graph, remove it.  Basically, if
  // this PHI only has a single use (a PHI), and if that PHI only has one use (a
  // PHI)... break the cycle.
  if (PN.hasOneUse()) {
    Instruction *PHIUser = cast<Instruction>(PN.use_back());
    if (PHINode *PU = dyn_cast<PHINode>(PHIUser)) {
      SmallPtrSet<PHINode*, 16> PotentiallyDeadPHIs;
      PotentiallyDeadPHIs.insert(&PN);
      if (DeadPHICycle(PU, PotentiallyDeadPHIs))
        return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType()));
    }

    // If this phi has a single use, and if that use just computes a value for
    // the next iteration of a loop, delete the phi.  This occurs with unused
    // induction variables, e.g. "for (int j = 0; ; ++j);".  Detecting this
    // common case here is good because the only other things that catch this
    // are induction variable analysis (sometimes) and ADCE, which is only run
    // late.
    if (PHIUser->hasOneUse() &&
        (isa<BinaryOperator>(PHIUser) || isa<GetElementPtrInst>(PHIUser)) &&
        PHIUser->use_back() == &PN) {
      return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType()));
    }
  }

  // We sometimes end up with phi cycles that non-obviously end up being the
  // same value, for example:
  //   z = some value; x = phi (y, z); y = phi (x, z)
  // where the phi nodes don't necessarily need to be in the same block.  Do a
  // quick check to see if the PHI node only contains a single non-phi value, if
  // so, scan to see if the phi cycle is actually equal to that value.
  {
    unsigned InValNo = 0, NumOperandVals = PN.getNumIncomingValues();
    // Scan for the first non-phi operand.
    while (InValNo != NumOperandVals &&
           isa<PHINode>(PN.getIncomingValue(InValNo)))
      ++InValNo;

    if (InValNo != NumOperandVals) {
      Value *NonPhiInVal = PN.getOperand(InValNo);

      // Scan the rest of the operands to see if there are any conflicts, if so
      // there is no need to recursively scan other phis.
      for (++InValNo; InValNo != NumOperandVals; ++InValNo) {
        Value *OpVal = PN.getIncomingValue(InValNo);
        if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal))
          break;
      }

      // If we scanned over all operands, then we have one unique value plus
      // phi values.  Scan PHI nodes to see if they all merge in each other or
      // the value.
      if (InValNo == NumOperandVals) {
        SmallPtrSet<PHINode*, 16> ValueEqualPHIs;
        if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs))
          return ReplaceInstUsesWith(PN, NonPhiInVal);
      }
    }
  }

  // If there are multiple PHIs, sort their operands so that they all list
  // the blocks in the same order.  This will help identical PHIs be eliminated
  // by other passes.  Other passes shouldn't depend on this for correctness
  // however.  (All PHIs in a block have the same predecessor set, so the index
  // lookup below always succeeds.)
  PHINode *FirstPN = cast<PHINode>(PN.getParent()->begin());
  if (&PN != FirstPN)
    for (unsigned i = 0, e = FirstPN->getNumIncomingValues(); i != e; ++i) {
      BasicBlock *BBA = PN.getIncomingBlock(i);
      BasicBlock *BBB = FirstPN->getIncomingBlock(i);
      if (BBA != BBB) {
        Value *VA = PN.getIncomingValue(i);
        unsigned j = PN.getBasicBlockIndex(BBB);
        Value *VB = PN.getIncomingValue(j);
        PN.setIncomingBlock(i, BBB);
        PN.setIncomingValue(i, VB);
        PN.setIncomingBlock(j, BBA);
        PN.setIncomingValue(j, VA);
        // NOTE: Instcombine normally would want us to "return &PN" if we
        // modified any of the operands of an instruction.  However, since we
        // aren't adding or removing uses (just rearranging them) we don't do
        // this in this case.
      }
    }

  // If this is an integer PHI and we know that it has an illegal type, see if
  // it is only used by trunc or trunc(lshr) operations.  If so, we split the
  // PHI into the various pieces being extracted.  This sort of thing is
  // introduced when SROA promotes an aggregate to a single large integer type.
  if (PN.getType()->isIntegerTy() && TD &&
      !TD->isLegalInteger(PN.getType()->getPrimitiveSizeInBits()))
    if (Instruction *Res = SliceUpIllegalIntegerPHI(PN))
      return Res;

  return 0;
}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
new file mode 100644
index 0000000..97abc76
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -0,0 +1,816 @@
//===- InstCombineSelect.cpp ----------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the visitSelect function.
//
//===----------------------------------------------------------------------===//

#include "InstCombine.h"
#include "llvm/Support/PatternMatch.h"
#include "llvm/Analysis/InstructionSimplify.h"
using namespace llvm;
using namespace PatternMatch;

/// MatchSelectPattern - Pattern match integer [SU]MIN, [SU]MAX, and ABS idioms,
/// returning the kind and providing the out parameter results if we
/// successfully match.
static SelectPatternFlavor
MatchSelectPattern(Value *V, Value *&LHS, Value *&RHS) {
  SelectInst *SI = dyn_cast<SelectInst>(V);
  if (SI == 0) return SPF_UNKNOWN;

  ICmpInst *ICI = dyn_cast<ICmpInst>(SI->getCondition());
  if (ICI == 0) return SPF_UNKNOWN;

  LHS = ICI->getOperand(0);
  RHS = ICI->getOperand(1);

  // (icmp X, Y) ? X : Y
  if (SI->getTrueValue() == ICI->getOperand(0) &&
      SI->getFalseValue() == ICI->getOperand(1)) {
    switch (ICI->getPredicate()) {
    default: return SPF_UNKNOWN; // Equality.
    case ICmpInst::ICMP_UGT:
    case ICmpInst::ICMP_UGE: return SPF_UMAX;
    case ICmpInst::ICMP_SGT:
    case ICmpInst::ICMP_SGE: return SPF_SMAX;
    case ICmpInst::ICMP_ULT:
    case ICmpInst::ICMP_ULE: return SPF_UMIN;
    case ICmpInst::ICMP_SLT:
    case ICmpInst::ICMP_SLE: return SPF_SMIN;
    }
  }

  // (icmp X, Y) ? Y : X  -- the arms are swapped, so the flavor inverts.
  if (SI->getTrueValue() == ICI->getOperand(1) &&
      SI->getFalseValue() == ICI->getOperand(0)) {
    switch (ICI->getPredicate()) {
    default: return SPF_UNKNOWN; // Equality.
    case ICmpInst::ICMP_UGT:
    case ICmpInst::ICMP_UGE: return SPF_UMIN;
    case ICmpInst::ICMP_SGT:
    case ICmpInst::ICMP_SGE: return SPF_SMIN;
    case ICmpInst::ICMP_ULT:
    case ICmpInst::ICMP_ULE: return SPF_UMAX;
    case ICmpInst::ICMP_SLT:
    case ICmpInst::ICMP_SLE: return SPF_SMAX;
    }
  }

  // TODO: (X > 4) ? X : 5  -->  (X >= 5) ? X : 5  -->  MAX(X, 5)

  return SPF_UNKNOWN;
}


/// GetSelectFoldableOperands - We want to turn code that looks like this:
///   %C = or %A, %B
///   %D = select %cond, %C, %A
/// into:
///   %C = select %cond, %B, 0
///   %D = or %A, %C
///
/// Assuming that the specified instruction is an operand to the select, return
/// a bitmask indicating which operands of this instruction are foldable if they
/// equal the other incoming value of the select.
///
static unsigned GetSelectFoldableOperands(Instruction *I) {
  switch (I->getOpcode()) {
  case Instruction::Add:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    return 3;              // Can fold through either operand.
  case Instruction::Sub:   // Can only fold on the amount subtracted.
  case Instruction::Shl:   // Can only fold on the shift amount.
  case Instruction::LShr:
  case Instruction::AShr:
    return 1;
  default:
    return 0;              // Cannot fold
  }
}

/// GetSelectFoldableConstant - For the same transformation as the previous
/// function, return the identity constant that goes into the select.
static Constant *GetSelectFoldableConstant(Instruction *I) {
  switch (I->getOpcode()) {
  default: llvm_unreachable("This cannot happen!");
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    return Constant::getNullValue(I->getType());
  case Instruction::And:
    return Constant::getAllOnesValue(I->getType());
  case Instruction::Mul:
    return ConstantInt::get(I->getType(), 1);
  }
}

/// FoldSelectOpOp - Here we have (select c, TI, FI), and we know that TI and FI
/// have the same opcode and only one use each.  Try to simplify this.
Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
                                          Instruction *FI) {
  if (TI->getNumOperands() == 1) {
    // If this is a non-volatile load or a cast from the same type,
    // merge.
    if (TI->isCast()) {
      if (TI->getOperand(0)->getType() != FI->getOperand(0)->getType())
        return 0;
    } else {
      return 0;  // unknown unary op.
    }

    // Fold this by inserting a select from the input values.
    SelectInst *NewSI = SelectInst::Create(SI.getCondition(), TI->getOperand(0),
                                           FI->getOperand(0), SI.getName()+".v");
    InsertNewInstBefore(NewSI, SI);
    return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI,
                            TI->getType());
  }

  // Only handle binary operators here.
  if (!isa<BinaryOperator>(TI))
    return 0;

  // Figure out if the operations have any operands in common.
  // MatchIsOpZero records whether the shared operand sits in the LHS slot of
  // the rebuilt binop below.
  Value *MatchOp, *OtherOpT, *OtherOpF;
  bool MatchIsOpZero;
  if (TI->getOperand(0) == FI->getOperand(0)) {
    MatchOp  = TI->getOperand(0);
    OtherOpT = TI->getOperand(1);
    OtherOpF = FI->getOperand(1);
    MatchIsOpZero = true;
  } else if (TI->getOperand(1) == FI->getOperand(1)) {
    MatchOp  = TI->getOperand(1);
    OtherOpT = TI->getOperand(0);
    OtherOpF = FI->getOperand(0);
    MatchIsOpZero = false;
  } else if (!TI->isCommutative()) {
    return 0;
  } else if (TI->getOperand(0) == FI->getOperand(1)) {
    MatchOp  = TI->getOperand(0);
    OtherOpT = TI->getOperand(1);
    OtherOpF = FI->getOperand(0);
    MatchIsOpZero = true;
  } else if (TI->getOperand(1) == FI->getOperand(0)) {
    MatchOp  = TI->getOperand(1);
    OtherOpT = TI->getOperand(0);
    OtherOpF = FI->getOperand(1);
    MatchIsOpZero = true;
  } else {
    return 0;
  }

  // If we reach here, they do have operations in common.
  SelectInst *NewSI = SelectInst::Create(SI.getCondition(), OtherOpT,
                                         OtherOpF, SI.getName()+".v");
  InsertNewInstBefore(NewSI, SI);

  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TI)) {
    if (MatchIsOpZero)
      return BinaryOperator::Create(BO->getOpcode(), MatchOp, NewSI);
    else
      return BinaryOperator::Create(BO->getOpcode(), NewSI, MatchOp);
  }
  llvm_unreachable("Shouldn't get here");
  return 0;
}

// isSelect01 - Return true if one of C1/C2 is zero and the other is 1 or -1;
// selects between such constants fold to cheap arithmetic later.
static bool isSelect01(Constant *C1, Constant *C2) {
  ConstantInt *C1I = dyn_cast<ConstantInt>(C1);
  if (!C1I)
    return false;
  ConstantInt *C2I = dyn_cast<ConstantInt>(C2);
  if (!C2I)
    return false;
  if (!C1I->isZero() && !C2I->isZero()) // One side must be zero.
    return false;
  return C1I->isOne() || C1I->isAllOnesValue() ||
         C2I->isOne() || C2I->isAllOnesValue();
}

/// FoldSelectIntoOp - Try fold the select into one of the operands to
/// facilitate further optimization.
Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
                                            Value *FalseVal) {
  // See the comment above GetSelectFoldableOperands for a description of the
  // transformation we are doing here.
  if (Instruction *TVI = dyn_cast<Instruction>(TrueVal)) {
    if (TVI->hasOneUse() && TVI->getNumOperands() == 2 &&
        !isa<Constant>(FalseVal)) {
      if (unsigned SFO = GetSelectFoldableOperands(TVI)) {
        unsigned OpToFold = 0;
        if ((SFO & 1) && FalseVal == TVI->getOperand(0)) {
          OpToFold = 1;
        } else if ((SFO & 2) && FalseVal == TVI->getOperand(1)) {
          OpToFold = 2;
        }

        if (OpToFold) {
          Constant *C = GetSelectFoldableConstant(TVI);
          Value *OOp = TVI->getOperand(2-OpToFold);
          // Avoid creating select between 2 constants unless it's selecting
          // between 0, 1 and -1.
          if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
            Instruction *NewSel = SelectInst::Create(SI.getCondition(), OOp, C);
            InsertNewInstBefore(NewSel, SI);
            NewSel->takeName(TVI);
            if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TVI))
              return BinaryOperator::Create(BO->getOpcode(), FalseVal, NewSel);
            llvm_unreachable("Unknown instruction!!");
          }
        }
      }
    }
  }

  // Mirror image of the TrueVal case above: fold into the false arm.
  if (Instruction *FVI = dyn_cast<Instruction>(FalseVal)) {
    if (FVI->hasOneUse() && FVI->getNumOperands() == 2 &&
        !isa<Constant>(TrueVal)) {
      if (unsigned SFO = GetSelectFoldableOperands(FVI)) {
        unsigned OpToFold = 0;
        if ((SFO & 1) && TrueVal == FVI->getOperand(0)) {
          OpToFold = 1;
        } else if ((SFO & 2) && TrueVal == FVI->getOperand(1)) {
          OpToFold = 2;
        }

        if (OpToFold) {
          Constant *C = GetSelectFoldableConstant(FVI);
          Value *OOp = FVI->getOperand(2-OpToFold);
          // Avoid creating select between 2 constants unless it's selecting
          // between 0, 1 and -1.
          if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
            Instruction *NewSel = SelectInst::Create(SI.getCondition(), C, OOp);
            InsertNewInstBefore(NewSel, SI);
            NewSel->takeName(FVI);
            if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FVI))
              return BinaryOperator::Create(BO->getOpcode(), TrueVal, NewSel);
            llvm_unreachable("Unknown instruction!!");
          }
        }
      }
    }
  }

  return 0;
}

/// visitSelectInstWithICmp - Visit a SelectInst that has an
/// ICmpInst as its first operand.
///
Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
                                                   ICmpInst *ICI) {
  bool Changed = false;
  ICmpInst::Predicate Pred = ICI->getPredicate();
  Value *CmpLHS = ICI->getOperand(0);
  Value *CmpRHS = ICI->getOperand(1);
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  // Check cases where the comparison is with a constant that
  // can be adjusted to fit the min/max idiom.  We may move or edit ICI
  // here, so make sure the select is the only user.
  if (ICI->hasOneUse())
    if (ConstantInt *CI = dyn_cast<ConstantInt>(CmpRHS)) {
      // X < MIN ? T : F  -->  F
      if ((Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT)
          && CI->isMinValue(Pred == ICmpInst::ICMP_SLT))
        return ReplaceInstUsesWith(SI, FalseVal);
      // X > MAX ? T : F  -->  F
      else if ((Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT)
               && CI->isMaxValue(Pred == ICmpInst::ICMP_SGT))
        return ReplaceInstUsesWith(SI, FalseVal);
      switch (Pred) {
      default: break;
      case ICmpInst::ICMP_ULT:
      case ICmpInst::ICMP_SLT:
      case ICmpInst::ICMP_UGT:
      case ICmpInst::ICMP_SGT: {
        // These transformations only work for selects over integers.
        const IntegerType *SelectTy = dyn_cast<IntegerType>(SI.getType());
        if (!SelectTy)
          break;

        // The MIN/MAX checks above guarantee CI+1 / CI-1 cannot wrap here.
        Constant *AdjustedRHS;
        if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_SGT)
          AdjustedRHS = ConstantInt::get(CI->getContext(), CI->getValue() + 1);
        else // (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_SLT)
          AdjustedRHS = ConstantInt::get(CI->getContext(), CI->getValue() - 1);

        // X > C ? X : C+1  -->  X < C+1 ? C+1 : X
        // X < C ? X : C-1  -->  X > C-1 ? C-1 : X
        if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) ||
            (CmpLHS == FalseVal && AdjustedRHS == TrueVal))
          ; // Nothing to do here. Values match without any sign/zero extension.

        // Types do not match. Instead of calculating this with mixed types
        // promote all to the larger type. This enables scalar evolution to
        // analyze this expression.
        else if (CmpRHS->getType()->getScalarSizeInBits()
                 < SelectTy->getBitWidth()) {
          Constant *sextRHS = ConstantExpr::getSExt(AdjustedRHS, SelectTy);

          // X = sext x; x >s c ? X : C+1 --> X = sext x; X <s C+1 ? C+1 : X
          // X = sext x; x <s c ? X : C-1 --> X = sext x; X >s C-1 ? C-1 : X
          // X = sext x; x >u c ? X : C+1 --> X = sext x; X <u C+1 ? C+1 : X
          // X = sext x; x <u c ? X : C-1 --> X = sext x; X >u C-1 ? C-1 : X
          if (match(TrueVal, m_SExt(m_Specific(CmpLHS))) &&
              sextRHS == FalseVal) {
            CmpLHS = TrueVal;
            AdjustedRHS = sextRHS;
          } else if (match(FalseVal, m_SExt(m_Specific(CmpLHS))) &&
                     sextRHS == TrueVal) {
            CmpLHS = FalseVal;
            AdjustedRHS = sextRHS;
          } else if (ICI->isUnsigned()) {
            Constant *zextRHS = ConstantExpr::getZExt(AdjustedRHS, SelectTy);
            // X = zext x; x >u c ? X : C+1 --> X = zext x; X <u C+1 ? C+1 : X
            // X = zext x; x <u c ? X : C-1 --> X = zext x; X >u C-1 ? C-1 : X
            // zext + signed compare cannot be changed:
            //    0xff <s 0x00, but 0x00ff >s 0x0000
            if (match(TrueVal, m_ZExt(m_Specific(CmpLHS))) &&
                zextRHS == FalseVal) {
              CmpLHS = TrueVal;
              AdjustedRHS = zextRHS;
            } else if (match(FalseVal, m_ZExt(m_Specific(CmpLHS))) &&
                       zextRHS == TrueVal) {
              CmpLHS = FalseVal;
              AdjustedRHS = zextRHS;
            } else
              break;
          } else
            break;
        } else
          break;

        // Canonicalize: swap predicate and arms so the compare reads
        // "X <pred'> AdjustedRHS ? AdjustedRHS : X".
        Pred = ICmpInst::getSwappedPredicate(Pred);
        CmpRHS = AdjustedRHS;
        std::swap(FalseVal, TrueVal);
        ICI->setPredicate(Pred);
        ICI->setOperand(0, CmpLHS);
        ICI->setOperand(1, CmpRHS);
        SI.setOperand(1, TrueVal);
        SI.setOperand(2, FalseVal);

        // Move ICI instruction right before the select instruction. Otherwise
        // the sext/zext value may be defined after the ICI instruction uses it.
        ICI->moveBefore(&SI);

        Changed = true;
        break;
      }
      }
    }

  // Transform (X >s -1) ? C1 : C2 --> ((X >>s 31) & (C2 - C1)) + C1
  // and       (X <s  0) ? C2 : C1 --> ((X >>s 31) & (C2 - C1)) + C1
  // FIXME: Type and constness constraints could be lifted, but we have to
  //        watch code size carefully. We should consider xor instead of
  //        sub/add when we decide to do that.
  if (const IntegerType *Ty = dyn_cast<IntegerType>(CmpLHS->getType())) {
    if (TrueVal->getType() == Ty) {
      if (ConstantInt *Cmp = dyn_cast<ConstantInt>(CmpRHS)) {
        ConstantInt *C1 = NULL, *C2 = NULL;
        if (Pred == ICmpInst::ICMP_SGT && Cmp->isAllOnesValue()) {
          C1 = dyn_cast<ConstantInt>(TrueVal);
          C2 = dyn_cast<ConstantInt>(FalseVal);
        } else if (Pred == ICmpInst::ICMP_SLT && Cmp->isNullValue()) {
          C1 = dyn_cast<ConstantInt>(FalseVal);
          C2 = dyn_cast<ConstantInt>(TrueVal);
        }
        if (C1 && C2) {
          // This shift results in either -1 or 0.
          Value *AShr = Builder->CreateAShr(CmpLHS, Ty->getBitWidth()-1);

          // Check if we can express the operation with a single or.
          if (C2->isAllOnesValue())
            return ReplaceInstUsesWith(SI, Builder->CreateOr(AShr, C1));

          Value *And = Builder->CreateAnd(AShr, C2->getValue()-C1->getValue());
          return ReplaceInstUsesWith(SI, Builder->CreateAdd(And, C1));
        }
      }
    }
  }

  if (CmpLHS == TrueVal && CmpRHS == FalseVal) {
    // Transform (X == Y) ? X : Y  -> Y
    if (Pred == ICmpInst::ICMP_EQ)
      return ReplaceInstUsesWith(SI, FalseVal);
    // Transform (X != Y) ? X : Y  -> X
    if (Pred == ICmpInst::ICMP_NE)
      return ReplaceInstUsesWith(SI, TrueVal);
    /// NOTE: if we wanted to, this is where to detect integer MIN/MAX

  } else if (CmpLHS == FalseVal && CmpRHS == TrueVal) {
    // Transform (X == Y) ? Y : X  -> X
    if (Pred == ICmpInst::ICMP_EQ)
      return ReplaceInstUsesWith(SI, FalseVal);
    // Transform (X != Y) ? Y : X  -> Y
    if (Pred == ICmpInst::ICMP_NE)
      return ReplaceInstUsesWith(SI, TrueVal);
    /// NOTE: if we wanted to, this is where to detect integer MIN/MAX
  }
  return Changed ? &SI : 0;
}


/// CanSelectOperandBeMappingIntoPredBlock - SI is a select whose condition is a
/// PHI node (but the two may be in different blocks).  See if the true/false
/// values (V) are live in all of the predecessor blocks of the PHI.
For +/// example, cases like this cannot be mapped: +/// +/// X = phi [ C1, BB1], [C2, BB2] +/// Y = add +/// Z = select X, Y, 0 +/// +/// because Y is not live in BB1/BB2. +/// +static bool CanSelectOperandBeMappingIntoPredBlock(const Value *V, + const SelectInst &SI) { + // If the value is a non-instruction value like a constant or argument, it + // can always be mapped. + const Instruction *I = dyn_cast<Instruction>(V); + if (I == 0) return true; + + // If V is a PHI node defined in the same block as the condition PHI, we can + // map the arguments. + const PHINode *CondPHI = cast<PHINode>(SI.getCondition()); + + if (const PHINode *VP = dyn_cast<PHINode>(I)) + if (VP->getParent() == CondPHI->getParent()) + return true; + + // Otherwise, if the PHI and select are defined in the same block and if V is + // defined in a different block, then we can transform it. + if (SI.getParent() == CondPHI->getParent() && + I->getParent() != CondPHI->getParent()) + return true; + + // Otherwise we have a 'hard' case and we can't tell without doing more + // detailed dominator based analysis, punt. + return false; +} + +/// FoldSPFofSPF - We have an SPF (e.g. 
a min or max) of an SPF of the form:
+///   SPF2(SPF1(A, B), C)
+Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner,
+                                        SelectPatternFlavor SPF1,
+                                        Value *A, Value *B,
+                                        Instruction &Outer,
+                                        SelectPatternFlavor SPF2, Value *C) {
+  if (C == A || C == B) {
+    // MAX(MAX(A, B), B) -> MAX(A, B)
+    // MIN(MIN(a, b), a) -> MIN(a, b)
+    if (SPF1 == SPF2)
+      return ReplaceInstUsesWith(Outer, Inner);
+
+    // MAX(MIN(a, b), a) -> a
+    // MIN(MAX(a, b), a) -> a
+    if ((SPF1 == SPF_SMIN && SPF2 == SPF_SMAX) ||
+        (SPF1 == SPF_SMAX && SPF2 == SPF_SMIN) ||
+        (SPF1 == SPF_UMIN && SPF2 == SPF_UMAX) ||
+        (SPF1 == SPF_UMAX && SPF2 == SPF_UMIN))
+      return ReplaceInstUsesWith(Outer, C);
+  }
+
+  // TODO: MIN(MIN(A, 23), 97)
+  return 0;
+}
+
+
+/// foldSelectICmpAnd - If one of the constants is zero (we know they can't
+/// both be) and we have an icmp instruction with zero, and we have an 'and'
+/// with the non-constant value and a power of two we can turn the select
+/// into a shift on the result of the 'and'.
+static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal,
+                                ConstantInt *FalseVal,
+                                InstCombiner::BuilderTy *Builder) {
+  const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition());
+  if (!IC || !IC->isEquality())
+    return 0;
+
+  if (ConstantInt *C = dyn_cast<ConstantInt>(IC->getOperand(1)))
+    if (!C->isZero())
+      return 0;
+
+  ConstantInt *AndRHS;
+  Value *LHS = IC->getOperand(0);
+  if (LHS->getType() != SI.getType() ||
+      !match(LHS, m_And(m_Value(), m_ConstantInt(AndRHS))))
+    return 0;
+
+  // If both select arms are non-zero see if we have a select of the form
+  // 'x ? 2^n + C : C'. Then we can offset both arms by C, use the logic
+  // for 'x ? 2^n : 0' and fix the thing up at the end.
+  ConstantInt *Offset = 0;
+  if (!TrueVal->isZero() && !FalseVal->isZero()) {
+    if ((TrueVal->getValue() - FalseVal->getValue()).isPowerOf2())
+      Offset = FalseVal;
+    else if ((FalseVal->getValue() - TrueVal->getValue()).isPowerOf2())
+      Offset = TrueVal;
+    else
+      return 0;
+
+    // Subtract the offset from both arms so one arm becomes zero.
+    TrueVal = ConstantInt::get(Builder->getContext(),
+                               TrueVal->getValue() - Offset->getValue());
+    FalseVal = ConstantInt::get(Builder->getContext(),
+                                FalseVal->getValue() - Offset->getValue());
+  }
+
+  // Make sure the mask in the 'and' and one of the select arms is a power of 2.
+  if (!AndRHS->getValue().isPowerOf2() ||
+      (!TrueVal->getValue().isPowerOf2() &&
+       !FalseVal->getValue().isPowerOf2()))
+    return 0;
+
+  // Determine which shift is needed to transform result of the 'and' into the
+  // desired result.
+  ConstantInt *ValC = !TrueVal->isZero() ? TrueVal : FalseVal;
+  unsigned ValZeros = ValC->getValue().logBase2();
+  unsigned AndZeros = AndRHS->getValue().logBase2();
+
+  Value *V = LHS;
+  if (ValZeros > AndZeros)
+    V = Builder->CreateShl(V, ValZeros - AndZeros);
+  else if (ValZeros < AndZeros)
+    V = Builder->CreateLShr(V, AndZeros - ValZeros);
+
+  // Okay, now we know that everything is set up, we just don't know whether we
+  // have an icmp_ne or icmp_eq and whether the true or false val is the zero.
+  bool ShouldNotVal = !TrueVal->isZero();
+  ShouldNotVal ^= IC->getPredicate() == ICmpInst::ICMP_NE;
+  if (ShouldNotVal)
+    V = Builder->CreateXor(V, ValC);
+
+  // Apply the offset (if any) that was subtracted from the arms above.
+  if (Offset)
+    V = Builder->CreateAdd(V, Offset);
+  return V;
+}
+
+Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
+  Value *CondVal = SI.getCondition();
+  Value *TrueVal = SI.getTrueValue();
+  Value *FalseVal = SI.getFalseValue();
+
+  if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal, TD))
+    return ReplaceInstUsesWith(SI, V);
+
+  if (SI.getType()->isIntegerTy(1)) {
+    if (ConstantInt *C = dyn_cast<ConstantInt>(TrueVal)) {
+      if (C->getZExtValue()) {
+        // Change: A = select B, true, C --> A = or B, C
+        return BinaryOperator::CreateOr(CondVal, FalseVal);
+      }
+      // Change: A = select B, false, C --> A = and !B, C
+      Value *NotCond =
+        InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
+                                             "not."+CondVal->getName()), SI);
+      return BinaryOperator::CreateAnd(NotCond, FalseVal);
+    } else if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) {
+      if (C->getZExtValue() == false) {
+        // Change: A = select B, C, false --> A = and B, C
+        return BinaryOperator::CreateAnd(CondVal, TrueVal);
+      }
+      // Change: A = select B, C, true --> A = or !B, C
+      Value *NotCond =
+        InsertNewInstBefore(BinaryOperator::CreateNot(CondVal,
+                                           "not."+CondVal->getName()), SI);
+      return BinaryOperator::CreateOr(NotCond, TrueVal);
+    }
+
+    // select a, b, a  -> a&b
+    // select a, a, b  -> a|b
+    if (CondVal == TrueVal)
+      return BinaryOperator::CreateOr(CondVal, FalseVal);
+    else if (CondVal == FalseVal)
+      return BinaryOperator::CreateAnd(CondVal, TrueVal);
+  }
+
+  // Selecting between two integer constants?
+  if (ConstantInt *TrueValC = dyn_cast<ConstantInt>(TrueVal))
+    if (ConstantInt *FalseValC = dyn_cast<ConstantInt>(FalseVal)) {
+      // select C, 1, 0 -> zext C to int
+      if (FalseValC->isZero() && TrueValC->getValue() == 1)
+        return new ZExtInst(CondVal, SI.getType());
+
+      // select C, -1, 0 -> sext C to int
+      if (FalseValC->isZero() && TrueValC->isAllOnesValue())
+        return new SExtInst(CondVal, SI.getType());
+
+      // select C, 0, 1 -> zext !C to int
+      if (TrueValC->isZero() && FalseValC->getValue() == 1) {
+        Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
+        return new ZExtInst(NotCond, SI.getType());
+      }
+
+      // select C, 0, -1 -> sext !C to int
+      if (TrueValC->isZero() && FalseValC->isAllOnesValue()) {
+        Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
+        return new SExtInst(NotCond, SI.getType());
+      }
+
+      if (Value *V = foldSelectICmpAnd(SI, TrueValC, FalseValC, Builder))
+        return ReplaceInstUsesWith(SI, V);
+    }
+
+  // See if we are selecting two values based on a comparison of the two values.
+  if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) {
+    if (FCI->getOperand(0) == TrueVal && FCI->getOperand(1) == FalseVal) {
+      // Transform (X == Y) ? X : Y  -> Y
+      if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
+        // This is not safe in general for floating point:
+        // consider X== -0, Y== +0.
+        // It becomes safe if either operand is a nonzero constant.
+        ConstantFP *CFPt, *CFPf;
+        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+             !CFPt->getValueAPF().isZero()) ||
+            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+             !CFPf->getValueAPF().isZero()))
+          return ReplaceInstUsesWith(SI, FalseVal);
+      }
+      // Transform (X une Y) ? X : Y  -> X
+      if (FCI->getPredicate() == FCmpInst::FCMP_UNE) {
+        // This is not safe in general for floating point:
+        // consider X== -0, Y== +0.
+        // It becomes safe if either operand is a nonzero constant.
+        ConstantFP *CFPt, *CFPf;
+        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+             !CFPt->getValueAPF().isZero()) ||
+            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+             !CFPf->getValueAPF().isZero()))
+          return ReplaceInstUsesWith(SI, TrueVal);
+      }
+      // NOTE: if we wanted to, this is where to detect MIN/MAX
+
+    } else if (FCI->getOperand(0) == FalseVal && FCI->getOperand(1) == TrueVal){
+      // Transform (X == Y) ? Y : X  -> X
+      if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
+        // This is not safe in general for floating point:
+        // consider X== -0, Y== +0.
+        // It becomes safe if either operand is a nonzero constant.
+        ConstantFP *CFPt, *CFPf;
+        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+             !CFPt->getValueAPF().isZero()) ||
+            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+             !CFPf->getValueAPF().isZero()))
+          return ReplaceInstUsesWith(SI, FalseVal);
+      }
+      // Transform (X une Y) ? Y : X  -> Y
+      if (FCI->getPredicate() == FCmpInst::FCMP_UNE) {
+        // This is not safe in general for floating point:
+        // consider X== -0, Y== +0.
+        // It becomes safe if either operand is a nonzero constant.
+        ConstantFP *CFPt, *CFPf;
+        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+             !CFPt->getValueAPF().isZero()) ||
+            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+             !CFPf->getValueAPF().isZero()))
+          return ReplaceInstUsesWith(SI, TrueVal);
+      }
+      // NOTE: if we wanted to, this is where to detect MIN/MAX
+    }
+    // NOTE: if we wanted to, this is where to detect ABS
+  }
+
+  // See if we are selecting two values based on a comparison of the two values.
+  if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal))
+    if (Instruction *Result = visitSelectInstWithICmp(SI, ICI))
+      return Result;
+
+  if (Instruction *TI = dyn_cast<Instruction>(TrueVal))
+    if (Instruction *FI = dyn_cast<Instruction>(FalseVal))
+      if (TI->hasOneUse() && FI->hasOneUse()) {
+        Instruction *AddOp = 0, *SubOp = 0;
+
+        // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
+        if (TI->getOpcode() == FI->getOpcode())
+          if (Instruction *IV = FoldSelectOpOp(SI, TI, FI))
+            return IV;
+
+        // Turn select C, (X+Y), (X-Y) --> (X+(select C, Y, (-Y))).  This is
+        // even legal for FP.
+        if ((TI->getOpcode() == Instruction::Sub &&
+             FI->getOpcode() == Instruction::Add) ||
+            (TI->getOpcode() == Instruction::FSub &&
+             FI->getOpcode() == Instruction::FAdd)) {
+          AddOp = FI; SubOp = TI;
+        } else if ((FI->getOpcode() == Instruction::Sub &&
+                    TI->getOpcode() == Instruction::Add) ||
+                   (FI->getOpcode() == Instruction::FSub &&
+                    TI->getOpcode() == Instruction::FAdd)) {
+          AddOp = TI; SubOp = FI;
+        }
+
+        if (AddOp) {
+          Value *OtherAddOp = 0;
+          if (SubOp->getOperand(0) == AddOp->getOperand(0)) {
+            OtherAddOp = AddOp->getOperand(1);
+          } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) {
+            OtherAddOp = AddOp->getOperand(0);
+          }
+
+          if (OtherAddOp) {
+            // So at this point we know we have (with Y == OtherAddOp):
+            //        select C, (add X, Y), (sub X, Z)
+            Value *NegVal;  // Compute -Z
+            if (Constant *C = dyn_cast<Constant>(SubOp->getOperand(1))) {
+              NegVal = ConstantExpr::getNeg(C);
+            } else if (SI.getType()->isFloatingPointTy()) {
+              NegVal = InsertNewInstBefore(
+                    BinaryOperator::CreateFNeg(SubOp->getOperand(1),
+                                               "tmp"), SI);
+            } else {
+              NegVal = InsertNewInstBefore(
+                    BinaryOperator::CreateNeg(SubOp->getOperand(1),
+                                              "tmp"), SI);
+            }
+
+            Value *NewTrueOp = OtherAddOp;
+            Value *NewFalseOp = NegVal;
+            if (AddOp != TI)
+              std::swap(NewTrueOp, NewFalseOp);
+            Instruction *NewSel =
+              SelectInst::Create(CondVal, NewTrueOp,
+                                 NewFalseOp, SI.getName() + ".p");
+
+            NewSel = InsertNewInstBefore(NewSel, SI);
+            if (SI.getType()->isFloatingPointTy())
+              return BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel);
+            else
+              return BinaryOperator::CreateAdd(SubOp->getOperand(0), NewSel);
+          }
+        }
+      }
+
+  // See if we can fold the select into one of our operands.
+  if (SI.getType()->isIntegerTy()) {
+    if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal))
+      return FoldI;
+
+    // MAX(MAX(a, b), a) -> MAX(a, b)
+    // MIN(MIN(a, b), a) -> MIN(a, b)
+    // MAX(MIN(a, b), a) -> a
+    // MIN(MAX(a, b), a) -> a
+    Value *LHS, *RHS, *LHS2, *RHS2;
+    if (SelectPatternFlavor SPF = MatchSelectPattern(&SI, LHS, RHS)) {
+      if (SelectPatternFlavor SPF2 = MatchSelectPattern(LHS, LHS2, RHS2))
+        if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2,
+                                          SI, SPF, RHS))
+          return R;
+      if (SelectPatternFlavor SPF2 = MatchSelectPattern(RHS, LHS2, RHS2))
+        if (Instruction *R = FoldSPFofSPF(cast<Instruction>(RHS),SPF2,LHS2,RHS2,
+                                          SI, SPF, LHS))
+          return R;
+    }
+
+    // TODO.
+    // ABS(-X) -> ABS(X)
+    // ABS(ABS(X)) -> ABS(X)
+  }
+
+  // See if we can fold the select into a phi node if the condition is a select.
+  if (isa<PHINode>(SI.getCondition()))
+    // The true/false values have to be live in the PHI predecessor's blocks.
+    if (CanSelectOperandBeMappingIntoPredBlock(TrueVal, SI) &&
+        CanSelectOperandBeMappingIntoPredBlock(FalseVal, SI))
+      if (Instruction *NV = FoldOpIntoPhi(SI))
+        return NV;
+
+  if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) {
+    if (TrueSI->getCondition() == CondVal) {
+      SI.setOperand(1, TrueSI->getTrueValue());
+      return &SI;
+    }
+  }
+  if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) {
+    if (FalseSI->getCondition() == CondVal) {
+      SI.setOperand(2, FalseSI->getFalseValue());
+      return &SI;
+    }
+  }
+
+  if (BinaryOperator::isNot(CondVal)) {
+    SI.setOperand(0, BinaryOperator::getNotArgument(CondVal));
+    SI.setOperand(1, FalseVal);
+    SI.setOperand(2, TrueVal);
+    return &SI;
+  }
+
+  return 0;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
new file mode 100644
index 0000000..a7f8005
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -0,0 +1,746 @@
+//===- InstCombineShifts.cpp ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitShl, visitLShr, and visitAShr functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombine.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Support/PatternMatch.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
+  assert(I.getOperand(1)->getType() == I.getOperand(0)->getType());
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // See if we can fold away this shift.
+  if (SimplifyDemandedInstructionBits(I))
+    return &I;
+
+  // Try to fold constant and into select arguments.
+  if (isa<Constant>(Op0))
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+      if (Instruction *R = FoldOpIntoSelect(I, SI))
+        return R;
+
+  if (ConstantInt *CUI = dyn_cast<ConstantInt>(Op1))
+    if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I))
+      return Res;
+
+  // X shift (A srem B) -> X shift (A and B-1) iff B is a power of 2.
+  // Because shifts by negative values (which could occur if A were negative)
+  // are undefined.
+  Value *A; const APInt *B;
+  if (Op1->hasOneUse() && match(Op1, m_SRem(m_Value(A), m_Power2(B)))) {
+    // FIXME: Should this get moved into SimplifyDemandedBits by saying we don't
+    // demand the sign bit (and many others) here??
+    Value *Rem = Builder->CreateAnd(A, ConstantInt::get(I.getType(), *B-1),
+                                    Op1->getName());
+    I.setOperand(1, Rem);
+    return &I;
+  }
+
+  return 0;
+}
+
+/// CanEvaluateShifted - See if we can compute the specified value, but shifted
+/// logically to the left or right by some number of bits.  This should return
+/// true if the expression can be computed for the same cost as the current
+/// expression tree.  This is used to eliminate extraneous shifting from things
+/// like:
+///      %C = shl i128 %A, 64
+///      %D = shl i128 %B, 96
+///      %E = or i128 %C, %D
+///      %F = lshr i128 %E, 64
+/// where the client will ask if E can be computed shifted right by 64-bits.  If
+/// this succeeds, the GetShiftedValue function will be called to produce the
+/// value.
+static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
+                               InstCombiner &IC) {
+  // We can always evaluate constants shifted.
+  if (isa<Constant>(V))
+    return true;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+
+  // If this is the opposite shift, we can directly reuse the input of the shift
+  // if the needed bits are already zero in the input.  This allows us to reuse
+  // the value which means that we don't care if the shift has multiple uses.
+  // TODO:  Handle opposite shift by exact value.
+  ConstantInt *CI = 0;
+  if ((isLeftShift && match(I, m_LShr(m_Value(), m_ConstantInt(CI)))) ||
+      (!isLeftShift && match(I, m_Shl(m_Value(), m_ConstantInt(CI))))) {
+    if (CI->getZExtValue() == NumBits) {
+      // TODO: Check that the input bits are already zero with MaskedValueIsZero
+#if 0
+      // If this is a truncate of a logical shr, we can truncate it to a smaller
+      // lshr iff we know that the bits we would otherwise be shifting in are
+      // already zeros.
+      uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+      uint32_t BitWidth = Ty->getScalarSizeInBits();
+      if (MaskedValueIsZero(I->getOperand(0),
+            APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth)) &&
+          CI->getLimitedValue(BitWidth) < BitWidth) {
+        return CanEvaluateTruncated(I->getOperand(0), Ty);
+      }
+#endif
+
+    }
+  }
+
+  // We can't mutate something that has multiple uses: doing so would
+  // require duplicating the instruction in general, which isn't profitable.
+  if (!I->hasOneUse()) return false;
+
+  switch (I->getOpcode()) {
+  default: return false;
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    // Bitwise operators can always be evaluated shifted, arbitrarily.
+    return CanEvaluateShifted(I->getOperand(0), NumBits, isLeftShift, IC) &&
+           CanEvaluateShifted(I->getOperand(1), NumBits, isLeftShift, IC);
+
+  case Instruction::Shl: {
+    // We can often fold the shift into shifts-by-a-constant.
+    CI = dyn_cast<ConstantInt>(I->getOperand(1));
+    if (CI == 0) return false;
+
+    // We can always fold shl(c1)+shl(c2) -> shl(c1+c2).
+    if (isLeftShift) return true;
+
+    // We can always turn shl(c)+shr(c) -> and(c2).
+    if (CI->getValue() == NumBits) return true;
+
+    unsigned TypeWidth = I->getType()->getScalarSizeInBits();
+
+    // We can turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but it isn't
+    // profitable unless we know the and'd out bits are already zero.
+    if (CI->getZExtValue() > NumBits) {
+      unsigned LowBits = TypeWidth - CI->getZExtValue();
+      if (MaskedValueIsZero(I->getOperand(0),
+                       APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits))
+        return true;
+    }
+
+    return false;
+  }
+  case Instruction::LShr: {
+    // We can often fold the shift into shifts-by-a-constant.
+    CI = dyn_cast<ConstantInt>(I->getOperand(1));
+    if (CI == 0) return false;
+
+    // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2).
+    if (!isLeftShift) return true;
+
+    // We can always turn lshr(c)+shl(c) -> and(c2).
+    if (CI->getValue() == NumBits) return true;
+
+    unsigned TypeWidth = I->getType()->getScalarSizeInBits();
+
+    // We can always turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but it isn't
+    // profitable unless we know the and'd out bits are already zero.
+    if (CI->getZExtValue() > NumBits) {
+      unsigned LowBits = CI->getZExtValue() - NumBits;
+      if (MaskedValueIsZero(I->getOperand(0),
+                          APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits))
+        return true;
+    }
+
+    return false;
+  }
+  case Instruction::Select: {
+    SelectInst *SI = cast<SelectInst>(I);
+    return CanEvaluateShifted(SI->getTrueValue(), NumBits, isLeftShift, IC) &&
+           CanEvaluateShifted(SI->getFalseValue(), NumBits, isLeftShift, IC);
+  }
+  case Instruction::PHI: {
+    // We can change a phi if we can change all operands.  Note that we never
+    // get into trouble with cyclic PHIs here because we only consider
+    // instructions with a single use.
+    PHINode *PN = cast<PHINode>(I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (!CanEvaluateShifted(PN->getIncomingValue(i), NumBits, isLeftShift,IC))
+        return false;
+    return true;
+  }
+  }
+}
+
+/// GetShiftedValue - When CanEvaluateShifted returned true for an expression,
+/// this value inserts the new computation that produces the shifted value.
+static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
+                              InstCombiner &IC) {
+  // We can always evaluate constants shifted.
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    if (isLeftShift)
+      V = IC.Builder->CreateShl(C, NumBits);
+    else
+      V = IC.Builder->CreateLShr(C, NumBits);
+    // If we got a constantexpr back, try to simplify it with TD info.
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+      V = ConstantFoldConstantExpression(CE, IC.getTargetData());
+    return V;
+  }
+
+  Instruction *I = cast<Instruction>(V);
+  IC.Worklist.Add(I);
+
+  switch (I->getOpcode()) {
+  default: assert(0 && "Inconsistency with CanEvaluateShifted");
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    // Bitwise operators can always be evaluated shifted, arbitrarily.
+    I->setOperand(0, GetShiftedValue(I->getOperand(0), NumBits,isLeftShift,IC));
+    I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC));
+    return I;
+
+  case Instruction::Shl: {
+    unsigned TypeWidth = I->getType()->getScalarSizeInBits();
+
+    // We only accept shifts-by-a-constant in CanEvaluateShifted.
+    ConstantInt *CI = cast<ConstantInt>(I->getOperand(1));
+
+    // We can always fold shl(c1)+shl(c2) -> shl(c1+c2).
+    if (isLeftShift) {
+      // If this is oversized composite shift, then unsigned shifts get 0.
+      unsigned NewShAmt = NumBits+CI->getZExtValue();
+      if (NewShAmt >= TypeWidth)
+        return Constant::getNullValue(I->getType());
+
+      I->setOperand(1, ConstantInt::get(I->getType(), NewShAmt));
+      return I;
+    }
+
+    // We turn shl(c)+lshr(c) -> and(c2) if the input doesn't already have
+    // zeros.
+    if (CI->getValue() == NumBits) {
+      APInt Mask(APInt::getLowBitsSet(TypeWidth, TypeWidth - NumBits));
+      V = IC.Builder->CreateAnd(I->getOperand(0),
+                                ConstantInt::get(I->getContext(), Mask));
+      if (Instruction *VI = dyn_cast<Instruction>(V)) {
+        VI->moveBefore(I);
+        VI->takeName(I);
+      }
+      return V;
+    }
+
+    // We turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but only when we know that
+    // the and won't be needed.
+    assert(CI->getZExtValue() > NumBits);
+    I->setOperand(1, ConstantInt::get(I->getType(),
+                                      CI->getZExtValue() - NumBits));
+    return I;
+  }
+  case Instruction::LShr: {
+    unsigned TypeWidth = I->getType()->getScalarSizeInBits();
+    // We only accept shifts-by-a-constant in CanEvaluateShifted.
+    ConstantInt *CI = cast<ConstantInt>(I->getOperand(1));
+
+    // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2).
+    if (!isLeftShift) {
+      // If this is oversized composite shift, then unsigned shifts get 0.
+      unsigned NewShAmt = NumBits+CI->getZExtValue();
+      if (NewShAmt >= TypeWidth)
+        return Constant::getNullValue(I->getType());
+
+      I->setOperand(1, ConstantInt::get(I->getType(), NewShAmt));
+      return I;
+    }
+
+    // We turn lshr(c)+shl(c) -> and(c2) if the input doesn't already have
+    // zeros.
+    if (CI->getValue() == NumBits) {
+      APInt Mask(APInt::getHighBitsSet(TypeWidth, TypeWidth - NumBits));
+      V = IC.Builder->CreateAnd(I->getOperand(0),
+                                ConstantInt::get(I->getContext(), Mask));
+      if (Instruction *VI = dyn_cast<Instruction>(V)) {
+        VI->moveBefore(I);
+        VI->takeName(I);
+      }
+      return V;
+    }
+
+    // We turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but only when we know that
+    // the and won't be needed.
+    assert(CI->getZExtValue() > NumBits);
+    I->setOperand(1, ConstantInt::get(I->getType(),
+                                      CI->getZExtValue() - NumBits));
+    return I;
+  }
+
+  case Instruction::Select:
+    I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC));
+    I->setOperand(2, GetShiftedValue(I->getOperand(2), NumBits,isLeftShift,IC));
+    return I;
+  case Instruction::PHI: {
+    // We can change a phi if we can change all operands.  Note that we never
+    // get into trouble with cyclic PHIs here because we only consider
+    // instructions with a single use.
+    PHINode *PN = cast<PHINode>(I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      PN->setIncomingValue(i, GetShiftedValue(PN->getIncomingValue(i),
+                                              NumBits, isLeftShift, IC));
+    return PN;
+  }
+  }
+}
+
+
+
+Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
+                                               BinaryOperator &I) {
+  bool isLeftShift = I.getOpcode() == Instruction::Shl;
+
+
+  // See if we can propagate this shift into the input, this covers the trivial
+  // cast of lshr(shl(x,c1),c2) as well as other more complex cases.
+  if (I.getOpcode() != Instruction::AShr &&
+      CanEvaluateShifted(Op0, Op1->getZExtValue(), isLeftShift, *this)) {
+    DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression"
+              " to eliminate shift:\n  IN: " << *Op0 << "\n  SH: " << I <<"\n");
+
+    return ReplaceInstUsesWith(I,
+                 GetShiftedValue(Op0, Op1->getZExtValue(), isLeftShift, *this));
+  }
+
+
+  // See if we can simplify any instructions used by the instruction whose sole
+  // purpose is to compute bits we don't care about.
+  uint32_t TypeBits = Op0->getType()->getScalarSizeInBits();
+
+  // shl i32 X, 32 = 0 and srl i8 Y, 9 = 0, ... just don't eliminate
+  // a signed shift.
+ // + if (Op1->uge(TypeBits)) { + if (I.getOpcode() != Instruction::AShr) + return ReplaceInstUsesWith(I, Constant::getNullValue(Op0->getType())); + // ashr i32 X, 32 --> ashr i32 X, 31 + I.setOperand(1, ConstantInt::get(I.getType(), TypeBits-1)); + return &I; + } + + // ((X*C1) << C2) == (X * (C1 << C2)) + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0)) + if (BO->getOpcode() == Instruction::Mul && isLeftShift) + if (Constant *BOOp = dyn_cast<Constant>(BO->getOperand(1))) + return BinaryOperator::CreateMul(BO->getOperand(0), + ConstantExpr::getShl(BOOp, Op1)); + + // Try to fold constant and into select arguments. + if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) + if (Instruction *R = FoldOpIntoSelect(I, SI)) + return R; + if (isa<PHINode>(Op0)) + if (Instruction *NV = FoldOpIntoPhi(I)) + return NV; + + // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2)) + if (TruncInst *TI = dyn_cast<TruncInst>(Op0)) { + Instruction *TrOp = dyn_cast<Instruction>(TI->getOperand(0)); + // If 'shift2' is an ashr, we would have to get the sign bit into a funny + // place. Don't try to do this transformation in this case. Also, we + // require that the input operand is a shift-by-constant so that we have + // confidence that the shifts will get folded together. We could do this + // xform in more cases, but it is unlikely to be profitable. + if (TrOp && I.isLogicalShift() && TrOp->isShift() && + isa<ConstantInt>(TrOp->getOperand(1))) { + // Okay, we'll do this xform. Make the shift of shift. + Constant *ShAmt = ConstantExpr::getZExt(Op1, TrOp->getType()); + // (shift2 (shift1 & 0x00FF), c2) + Value *NSh = Builder->CreateBinOp(I.getOpcode(), TrOp, ShAmt,I.getName()); + + // For logical shifts, the truncation has the effect of making the high + // part of the register be zeros. Emulate this by inserting an AND to + // clear the top bits as needed. This 'and' will usually be zapped by + // other xforms later if dead. 
+ unsigned SrcSize = TrOp->getType()->getScalarSizeInBits(); + unsigned DstSize = TI->getType()->getScalarSizeInBits(); + APInt MaskV(APInt::getLowBitsSet(SrcSize, DstSize)); + + // The mask we constructed says what the trunc would do if occurring + // between the shifts. We want to know the effect *after* the second + // shift. We know that it is a logical shift by a constant, so adjust the + // mask as appropriate. + if (I.getOpcode() == Instruction::Shl) + MaskV <<= Op1->getZExtValue(); + else { + assert(I.getOpcode() == Instruction::LShr && "Unknown logical shift"); + MaskV = MaskV.lshr(Op1->getZExtValue()); + } + + // shift1 & 0x00FF + Value *And = Builder->CreateAnd(NSh, + ConstantInt::get(I.getContext(), MaskV), + TI->getName()); + + // Return the value truncated to the interesting size. + return new TruncInst(And, I.getType()); + } + } + + if (Op0->hasOneUse()) { + if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) { + // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C) + Value *V1, *V2; + ConstantInt *CC; + switch (Op0BO->getOpcode()) { + default: break; + case Instruction::Add: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // These operators commute. 
+ // Turn (Y + (X >> C)) << C -> (X + (Y << C)) & (~0 << C) + if (isLeftShift && Op0BO->getOperand(1)->hasOneUse() && + match(Op0BO->getOperand(1), m_Shr(m_Value(V1), + m_Specific(Op1)))) { + Value *YS = // (Y << C) + Builder->CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName()); + // (X + (Y << C)) + Value *X = Builder->CreateBinOp(Op0BO->getOpcode(), YS, V1, + Op0BO->getOperand(1)->getName()); + uint32_t Op1Val = Op1->getLimitedValue(TypeBits); + return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(), + APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val))); + } + + // Turn (Y + ((X >> C) & CC)) << C -> ((X & (CC << C)) + (Y << C)) + Value *Op0BOOp1 = Op0BO->getOperand(1); + if (isLeftShift && Op0BOOp1->hasOneUse() && + match(Op0BOOp1, + m_And(m_Shr(m_Value(V1), m_Specific(Op1)), + m_ConstantInt(CC))) && + cast<BinaryOperator>(Op0BOOp1)->getOperand(0)->hasOneUse()) { + Value *YS = // (Y << C) + Builder->CreateShl(Op0BO->getOperand(0), Op1, + Op0BO->getName()); + // X & (CC << C) + Value *XM = Builder->CreateAnd(V1, ConstantExpr::getShl(CC, Op1), + V1->getName()+".mask"); + return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM); + } + } + + // FALL THROUGH. 
+ case Instruction::Sub: { + // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C) + if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() && + match(Op0BO->getOperand(0), m_Shr(m_Value(V1), + m_Specific(Op1)))) { + Value *YS = // (Y << C) + Builder->CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName()); + // (X + (Y << C)) + Value *X = Builder->CreateBinOp(Op0BO->getOpcode(), V1, YS, + Op0BO->getOperand(0)->getName()); + uint32_t Op1Val = Op1->getLimitedValue(TypeBits); + return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(), + APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val))); + } + + // Turn (((X >> C)&CC) + Y) << C -> (X + (Y << C)) & (CC << C) + if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() && + match(Op0BO->getOperand(0), + m_And(m_Shr(m_Value(V1), m_Value(V2)), + m_ConstantInt(CC))) && V2 == Op1 && + cast<BinaryOperator>(Op0BO->getOperand(0)) + ->getOperand(0)->hasOneUse()) { + Value *YS = // (Y << C) + Builder->CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName()); + // X & (CC << C) + Value *XM = Builder->CreateAnd(V1, ConstantExpr::getShl(CC, Op1), + V1->getName()+".mask"); + + return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS); + } + + break; + } + } + + + // If the operand is an bitwise operator with a constant RHS, and the + // shift is the only use, we can pull it out of the shift. + if (ConstantInt *Op0C = dyn_cast<ConstantInt>(Op0BO->getOperand(1))) { + bool isValid = true; // Valid only for And, Or, Xor + bool highBitSet = false; // Transform if high bit of constant set? + + switch (Op0BO->getOpcode()) { + default: isValid = false; break; // Do not perform transform! + case Instruction::Add: + isValid = isLeftShift; + break; + case Instruction::Or: + case Instruction::Xor: + highBitSet = false; + break; + case Instruction::And: + highBitSet = true; + break; + } + + // If this is a signed shift right, and the high bit is modified + // by the logical operation, do not perform the transformation. 
+ // The highBitSet boolean indicates the value of the high bit of + // the constant which would cause it to be modified for this + // operation. + // + if (isValid && I.getOpcode() == Instruction::AShr) + isValid = Op0C->getValue()[TypeBits-1] == highBitSet; + + if (isValid) { + Constant *NewRHS = ConstantExpr::get(I.getOpcode(), Op0C, Op1); + + Value *NewShift = + Builder->CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1); + NewShift->takeName(Op0BO); + + return BinaryOperator::Create(Op0BO->getOpcode(), NewShift, + NewRHS); + } + } + } + } + + // Find out if this is a shift of a shift by a constant. + BinaryOperator *ShiftOp = dyn_cast<BinaryOperator>(Op0); + if (ShiftOp && !ShiftOp->isShift()) + ShiftOp = 0; + + if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) { + ConstantInt *ShiftAmt1C = cast<ConstantInt>(ShiftOp->getOperand(1)); + uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits); + uint32_t ShiftAmt2 = Op1->getLimitedValue(TypeBits); + assert(ShiftAmt2 != 0 && "Should have been simplified earlier"); + if (ShiftAmt1 == 0) return 0; // Will be simplified in the future. + Value *X = ShiftOp->getOperand(0); + + uint32_t AmtSum = ShiftAmt1+ShiftAmt2; // Fold into one big shift. + + const IntegerType *Ty = cast<IntegerType>(I.getType()); + + // Check for (X << c1) << c2 and (X >> c1) >> c2 + if (I.getOpcode() == ShiftOp->getOpcode()) { + // If this is oversized composite shift, then unsigned shifts get 0, ashr + // saturates. + if (AmtSum >= TypeBits) { + if (I.getOpcode() != Instruction::AShr) + return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType())); + AmtSum = TypeBits-1; // Saturate to 31 for i32 ashr. + } + + return BinaryOperator::Create(I.getOpcode(), X, + ConstantInt::get(Ty, AmtSum)); + } + + if (ShiftAmt1 == ShiftAmt2) { + // If we have ((X >>? C) << C), turn this into X & (-1 << C). 
+ if (I.getOpcode() == Instruction::Shl && + ShiftOp->getOpcode() != Instruction::Shl) { + APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt1)); + return BinaryOperator::CreateAnd(X, + ConstantInt::get(I.getContext(),Mask)); + } + // If we have ((X << C) >>u C), turn this into X & (-1 >>u C). + if (I.getOpcode() == Instruction::LShr && + ShiftOp->getOpcode() == Instruction::Shl) { + APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt1)); + return BinaryOperator::CreateAnd(X, + ConstantInt::get(I.getContext(), Mask)); + } + } else if (ShiftAmt1 < ShiftAmt2) { + uint32_t ShiftDiff = ShiftAmt2-ShiftAmt1; + + // (X >>? C1) << C2 --> X << (C2-C1) & (-1 << C2) + if (I.getOpcode() == Instruction::Shl && + ShiftOp->getOpcode() != Instruction::Shl) { + assert(ShiftOp->getOpcode() == Instruction::LShr || + ShiftOp->getOpcode() == Instruction::AShr); + Value *Shift = Builder->CreateShl(X, ConstantInt::get(Ty, ShiftDiff)); + + APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2)); + return BinaryOperator::CreateAnd(Shift, + ConstantInt::get(I.getContext(),Mask)); + } + + // (X << C1) >>u C2 --> X >>u (C2-C1) & (-1 >> C2) + if (I.getOpcode() == Instruction::LShr && + ShiftOp->getOpcode() == Instruction::Shl) { + assert(ShiftOp->getOpcode() == Instruction::Shl); + Value *Shift = Builder->CreateLShr(X, ConstantInt::get(Ty, ShiftDiff)); + + APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2)); + return BinaryOperator::CreateAnd(Shift, + ConstantInt::get(I.getContext(),Mask)); + } + + // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. + } else { + assert(ShiftAmt2 < ShiftAmt1); + uint32_t ShiftDiff = ShiftAmt1-ShiftAmt2; + + // (X >>? C1) << C2 --> X >>? 
(C1-C2) & (-1 << C2)
      if (I.getOpcode() == Instruction::Shl &&
          ShiftOp->getOpcode() != Instruction::Shl) {
        Value *Shift = Builder->CreateBinOp(ShiftOp->getOpcode(), X,
                                            ConstantInt::get(Ty, ShiftDiff));

        // The combined shift exposes the outer shl's cleared low bits; mask
        // them off explicitly so the result matches the original pair.
        APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2));
        return BinaryOperator::CreateAnd(Shift,
                                         ConstantInt::get(I.getContext(),Mask));
      }

      // (X << C1) >>u C2  --> X << (C1-C2) & (-1 >> C2)
      if (I.getOpcode() == Instruction::LShr &&
          ShiftOp->getOpcode() == Instruction::Shl) {
        Value *Shift = Builder->CreateShl(X, ConstantInt::get(Ty, ShiftDiff));

        APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
        return BinaryOperator::CreateAnd(Shift,
                                         ConstantInt::get(I.getContext(),Mask));
      }

      // We can't handle (X << C1) >>a C2, it shifts arbitrary bits in.
    }
  }
  return 0;
}

/// visitShl - InstCombine visitor for 'shl'.  Tries, in order:
///   1. instruction simplification (SimplifyShlInst),
///   2. the transforms shared by all three shift opcodes, and
///   3. inferring nuw/nsw flags when the shifted-out bits are known.
/// Returns the replacement instruction, &I if I was mutated in place, or
/// null if nothing changed.
Instruction *InstCombiner::visitShl(BinaryOperator &I) {
  if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1),
                                 I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
                                 TD))
    return ReplaceInstUsesWith(I, V);

  if (Instruction *V = commonShiftTransforms(I))
    return V;

  if (ConstantInt *Op1C = dyn_cast<ConstantInt>(I.getOperand(1))) {
    unsigned ShAmt = Op1C->getZExtValue();

    // If the shifted-out value is known-zero, then this is a NUW shift.
    if (!I.hasNoUnsignedWrap() &&
        MaskedValueIsZero(I.getOperand(0),
                          APInt::getHighBitsSet(Op1C->getBitWidth(), ShAmt))) {
          I.setHasNoUnsignedWrap();
          return &I;
        }

    // If the shifted out value is all signbits, this is a NSW shift.
    if (!I.hasNoSignedWrap() &&
        ComputeNumSignBits(I.getOperand(0)) > ShAmt) {
      I.setHasNoSignedWrap();
      return &I;
    }
  }

  return 0;
}

/// visitLShr - InstCombine visitor for 'lshr'.  Beyond the shared shift
/// transforms, folds a bit-count intrinsic shifted down to its top result bit
/// into an integer compare, and infers the 'exact' flag from known bits.
Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
  if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1),
                                  I.isExact(), TD))
    return ReplaceInstUsesWith(I, V);

  if (Instruction *R = commonShiftTransforms(I))
    return R;

  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);

  if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
    unsigned ShAmt = Op1C->getZExtValue();

    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op0)) {
      unsigned BitWidth = Op0->getType()->getScalarSizeInBits();
      // ctlz.i32(x)>>5  --> zext(x == 0)
      // cttz.i32(x)>>5  --> zext(x == 0)
      // ctpop.i32(x)>>5 --> zext(x == -1)
      // Shifting by log2(BitWidth) isolates the bit that is set only when the
      // count equals BitWidth, i.e. only for the extreme input value.
      if ((II->getIntrinsicID() == Intrinsic::ctlz ||
           II->getIntrinsicID() == Intrinsic::cttz ||
           II->getIntrinsicID() == Intrinsic::ctpop) &&
          isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == ShAmt) {
        bool isCtPop = II->getIntrinsicID() == Intrinsic::ctpop;
        Constant *RHS = ConstantInt::getSigned(Op0->getType(), isCtPop ? -1:0);
        Value *Cmp = Builder->CreateICmpEQ(II->getArgOperand(0), RHS);
        return new ZExtInst(Cmp, II->getType());
      }
    }

    // If the shifted-out value is known-zero, then this is an exact shift.
    if (!I.isExact() &&
        MaskedValueIsZero(Op0,APInt::getLowBitsSet(Op1C->getBitWidth(),ShAmt))){
      I.setIsExact();
      return &I;
    }
  }

  return 0;
}

/// visitAShr - InstCombine visitor for 'ashr'.  Recognizes the shl/ashr
/// sign-extension idiom, infers the 'exact' flag, converts to lshr when the
/// sign bit is known zero, and removes the shift when the input is all sign
/// bits (shifting it is a no-op).
Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
  if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1),
                                  I.isExact(), TD))
    return ReplaceInstUsesWith(I, V);

  if (Instruction *R = commonShiftTransforms(I))
    return R;

  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);

  if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
    unsigned ShAmt = Op1C->getZExtValue();

    // If the input is a SHL by the same constant (ashr (shl X, C), C), then we
    // have a sign-extend idiom.
    Value *X;
    if (match(Op0, m_Shl(m_Value(X), m_Specific(Op1)))) {
      // If the left shift is just shifting out partial signbits, delete the
      // extension.
      if (cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap())
        return ReplaceInstUsesWith(I, X);

      // If the input is an extension from the shifted amount value, e.g.
      //   %x = zext i8 %A to i32
      //   %y = shl i32 %x, 24
      //   %z = ashr %y, 24
      // then turn this into "z = sext i8 A to i32".
      if (ZExtInst *ZI = dyn_cast<ZExtInst>(X)) {
        uint32_t SrcBits = ZI->getOperand(0)->getType()->getScalarSizeInBits();
        uint32_t DestBits = ZI->getType()->getScalarSizeInBits();
        if (Op1C->getZExtValue() == DestBits-SrcBits)
          return new SExtInst(ZI->getOperand(0), ZI->getType());
      }
    }

    // If the shifted-out value is known-zero, then this is an exact shift.
    if (!I.isExact() &&
        MaskedValueIsZero(Op0,APInt::getLowBitsSet(Op1C->getBitWidth(),ShAmt))){
      I.setIsExact();
      return &I;
    }
  }

  // See if we can turn a signed shr into an unsigned shr.
  if (MaskedValueIsZero(Op0,
                        APInt::getSignBit(I.getType()->getScalarSizeInBits())))
    return BinaryOperator::CreateLShr(Op0, Op1);

  // Arithmetic shifting an all-sign-bit value is a no-op.
  unsigned NumSignBits = ComputeNumSignBits(Op0);
  if (NumSignBits == Op0->getType()->getScalarSizeInBits())
    return ReplaceInstUsesWith(I, Op0);

  return 0;
}

diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
new file mode 100644
index 0000000..bda8cea
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -0,0 +1,1133 @@
//===- InstCombineSimplifyDemanded.cpp ------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License.  See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains logic for simplifying instructions based on information
// about how they are used.
//
//===----------------------------------------------------------------------===//


#include "InstCombine.h"
#include "llvm/Target/TargetData.h"
#include "llvm/IntrinsicInst.h"

using namespace llvm;


/// ShrinkDemandedConstant - Check to see if the specified operand of the
/// specified instruction is a constant integer.  If so, check to see if there
/// are any bits set in the constant that are not demanded.  If so, shrink the
/// constant and return true (mutating I's operand in place); otherwise return
/// false and leave I untouched.
static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
                                   APInt Demanded) {
  assert(I && "No instruction?");
  assert(OpNo < I->getNumOperands() && "Operand index too large");

  // If the operand is not a constant integer, nothing to do.
  ConstantInt *OpC = dyn_cast<ConstantInt>(I->getOperand(OpNo));
  if (!OpC) return false;

  // If there are no bits set that aren't demanded, nothing to do.
  // Demanded may arrive at a different width than the constant, so bring it
  // to the constant's width before comparing.
  Demanded = Demanded.zextOrTrunc(OpC->getValue().getBitWidth());
  if ((~Demanded & OpC->getValue()) == 0)
    return false;

  // This instruction is producing bits that are not demanded. Shrink the RHS.
  Demanded &= OpC->getValue();
  I->setOperand(OpNo, ConstantInt::get(OpC->getType(), Demanded));
  return true;
}



/// SimplifyDemandedInstructionBits - Inst is an integer instruction that
/// SimplifyDemandedBits knows about.  See if the instruction has any
/// properties that allow us to simplify its operands.
bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
  unsigned BitWidth = Inst.getType()->getScalarSizeInBits();
  APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
  // At the root, every bit of the result is demanded.
  APInt DemandedMask(APInt::getAllOnesValue(BitWidth));

  Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask,
                                     KnownZero, KnownOne, 0);
  if (V == 0) return false;       // No simplification was possible.
  if (V == &Inst) return true;    // Operands were simplified in place.
  // Inst computes the same value as the simpler V; forward all uses to it.
  ReplaceInstUsesWith(Inst, V);
  return true;
}

/// SimplifyDemandedBits - This form of SimplifyDemandedBits simplifies the
/// specified instruction operand if possible, updating it in place.  It
/// returns true if it made any change and false otherwise.
bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask,
                                        APInt &KnownZero, APInt &KnownOne,
                                        unsigned Depth) {
  Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask,
                                          KnownZero, KnownOne, Depth);
  if (NewVal == 0) return false;
  // Rewrite only this use; other users of the old value are unaffected.
  U = NewVal;
  return true;
}


/// SimplifyDemandedUseBits - This function attempts to replace V with a simpler
/// value based on the demanded bits.  When this function is called, it is known
/// that only the bits set in DemandedMask of the result of V are ever used
/// downstream.  Consequently, depending on the mask and V, it may be possible
/// to replace V with a constant or one of its operands.  In such cases, this
/// function does the replacement and returns true.  In all other cases, it
/// returns false after analyzing the expression and setting KnownOne to the
/// bits that are known to be one in the expression.  KnownZero contains all the
/// bits that are known to be zero in the expression. These are provided to
/// potentially allow the caller (which might recursively be SimplifyDemandedBits
/// itself) to simplify the expression.  KnownOne and KnownZero always follow
/// the invariant that KnownOne & KnownZero == 0. That is, a bit can't be both
/// 1 and 0. Note that the bits in KnownOne and KnownZero may only be accurate
/// for those bits set
/// in DemandedMask.
Note also that the bitwidth of V, DemandedMask, KnownZero +/// and KnownOne must all be the same. +/// +/// This returns null if it did not change anything and it permits no +/// simplification. This returns V itself if it did some simplification of V's +/// operands based on the information about what bits are demanded. This returns +/// some other non-null value if it found out that V is equal to another value +/// in the context where the specified bits are demanded, but not for all users. +Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, + APInt &KnownZero, APInt &KnownOne, + unsigned Depth) { + assert(V != 0 && "Null pointer of Value???"); + assert(Depth <= 6 && "Limit Search Depth"); + uint32_t BitWidth = DemandedMask.getBitWidth(); + const Type *VTy = V->getType(); + assert((TD || !VTy->isPointerTy()) && + "SimplifyDemandedBits needs to know bit widths!"); + assert((!TD || TD->getTypeSizeInBits(VTy->getScalarType()) == BitWidth) && + (!VTy->isIntOrIntVectorTy() || + VTy->getScalarSizeInBits() == BitWidth) && + KnownZero.getBitWidth() == BitWidth && + KnownOne.getBitWidth() == BitWidth && + "Value *V, DemandedMask, KnownZero and KnownOne " + "must have same BitWidth"); + if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) { + // We know all of the bits for a constant! + KnownOne = CI->getValue() & DemandedMask; + KnownZero = ~KnownOne & DemandedMask; + return 0; + } + if (isa<ConstantPointerNull>(V)) { + // We know all of the bits for a constant! + KnownOne.clearAllBits(); + KnownZero = DemandedMask; + return 0; + } + + KnownZero.clearAllBits(); + KnownOne.clearAllBits(); + if (DemandedMask == 0) { // Not demanding any bits from V. + if (isa<UndefValue>(V)) + return 0; + return UndefValue::get(VTy); + } + + if (Depth == 6) // Limit search depth. 
+ return 0; + + APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0); + APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0); + + Instruction *I = dyn_cast<Instruction>(V); + if (!I) { + ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth); + return 0; // Only analyze instructions. + } + + // If there are multiple uses of this value and we aren't at the root, then + // we can't do any simplifications of the operands, because DemandedMask + // only reflects the bits demanded by *one* of the users. + if (Depth != 0 && !I->hasOneUse()) { + // Despite the fact that we can't simplify this instruction in all User's + // context, we can at least compute the knownzero/knownone bits, and we can + // do simplifications that apply to *just* the one user if we know that + // this instruction has a simpler value in that context. + if (I->getOpcode() == Instruction::And) { + // If either the LHS or the RHS are Zero, the result is zero. + ComputeMaskedBits(I->getOperand(1), DemandedMask, + RHSKnownZero, RHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownZero, + LHSKnownZero, LHSKnownOne, Depth+1); + + // If all of the demanded bits are known 1 on one side, return the other. + // These bits cannot contribute to the result of the 'and' in this + // context. + if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) == + (DemandedMask & ~LHSKnownZero)) + return I->getOperand(0); + if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) == + (DemandedMask & ~RHSKnownZero)) + return I->getOperand(1); + + // If all of the demanded bits in the inputs are known zeros, return zero. + if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask) + return Constant::getNullValue(VTy); + + } else if (I->getOpcode() == Instruction::Or) { + // We can simplify (X|Y) -> X or Y in the user's context if we know that + // only bits from X or Y are demanded. + + // If either the LHS or the RHS are One, the result is One. 
+ ComputeMaskedBits(I->getOperand(1), DemandedMask, + RHSKnownZero, RHSKnownOne, Depth+1); + ComputeMaskedBits(I->getOperand(0), DemandedMask & ~RHSKnownOne, + LHSKnownZero, LHSKnownOne, Depth+1); + + // If all of the demanded bits are known zero on one side, return the + // other. These bits cannot contribute to the result of the 'or' in this + // context. + if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) == + (DemandedMask & ~LHSKnownOne)) + return I->getOperand(0); + if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) == + (DemandedMask & ~RHSKnownOne)) + return I->getOperand(1); + + // If all of the potentially set bits on one side are known to be set on + // the other side, just use the 'other' side. + if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) == + (DemandedMask & (~RHSKnownZero))) + return I->getOperand(0); + if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) == + (DemandedMask & (~LHSKnownZero))) + return I->getOperand(1); + } + + // Compute the KnownZero/KnownOne bits to simplify things downstream. + ComputeMaskedBits(I, DemandedMask, KnownZero, KnownOne, Depth); + return 0; + } + + // If this is the root being simplified, allow it to have multiple uses, + // just set the DemandedMask to all bits so that we can try to simplify the + // operands. This allows visitTruncInst (for example) to simplify the + // operand of a trunc without duplicating all the logic below. + if (Depth == 0 && !V->hasOneUse()) + DemandedMask = APInt::getAllOnesValue(BitWidth); + + switch (I->getOpcode()) { + default: + ComputeMaskedBits(I, DemandedMask, KnownZero, KnownOne, Depth); + break; + case Instruction::And: + // If either the LHS or the RHS are Zero, the result is zero. 
+ if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, + RHSKnownZero, RHSKnownOne, Depth+1) || + SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownZero, + LHSKnownZero, LHSKnownOne, Depth+1)) + return I; + assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); + assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); + + // If all of the demanded bits are known 1 on one side, return the other. + // These bits cannot contribute to the result of the 'and'. + if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) == + (DemandedMask & ~LHSKnownZero)) + return I->getOperand(0); + if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) == + (DemandedMask & ~RHSKnownZero)) + return I->getOperand(1); + + // If all of the demanded bits in the inputs are known zeros, return zero. + if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask) + return Constant::getNullValue(VTy); + + // If the RHS is a constant, see if we can simplify it. + if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnownZero)) + return I; + + // Output known-1 bits are only known if set in both the LHS & RHS. + KnownOne = RHSKnownOne & LHSKnownOne; + // Output known-0 are known to be clear if zero in either the LHS | RHS. + KnownZero = RHSKnownZero | LHSKnownZero; + break; + case Instruction::Or: + // If either the LHS or the RHS are One, the result is One. + if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, + RHSKnownZero, RHSKnownOne, Depth+1) || + SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownOne, + LHSKnownZero, LHSKnownOne, Depth+1)) + return I; + assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); + assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); + + // If all of the demanded bits are known zero on one side, return the other. + // These bits cannot contribute to the result of the 'or'. 
+ if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) == + (DemandedMask & ~LHSKnownOne)) + return I->getOperand(0); + if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) == + (DemandedMask & ~RHSKnownOne)) + return I->getOperand(1); + + // If all of the potentially set bits on one side are known to be set on + // the other side, just use the 'other' side. + if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) == + (DemandedMask & (~RHSKnownZero))) + return I->getOperand(0); + if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) == + (DemandedMask & (~LHSKnownZero))) + return I->getOperand(1); + + // If the RHS is a constant, see if we can simplify it. + if (ShrinkDemandedConstant(I, 1, DemandedMask)) + return I; + + // Output known-0 bits are only known if clear in both the LHS & RHS. + KnownZero = RHSKnownZero & LHSKnownZero; + // Output known-1 are known to be set if set in either the LHS | RHS. + KnownOne = RHSKnownOne | LHSKnownOne; + break; + case Instruction::Xor: { + if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, + RHSKnownZero, RHSKnownOne, Depth+1) || + SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, + LHSKnownZero, LHSKnownOne, Depth+1)) + return I; + assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); + assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); + + // If all of the demanded bits are known zero on one side, return the other. + // These bits cannot contribute to the result of the 'xor'. + if ((DemandedMask & RHSKnownZero) == DemandedMask) + return I->getOperand(0); + if ((DemandedMask & LHSKnownZero) == DemandedMask) + return I->getOperand(1); + + // If all of the demanded bits are known to be zero on one side or the + // other, turn this into an *inclusive* or. + // e.g. 
(A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0 + if ((DemandedMask & ~RHSKnownZero & ~LHSKnownZero) == 0) { + Instruction *Or = + BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1), + I->getName()); + return InsertNewInstBefore(Or, *I); + } + + // If all of the demanded bits on one side are known, and all of the set + // bits on that side are also known to be set on the other side, turn this + // into an AND, as we know the bits will be cleared. + // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2 + if ((DemandedMask & (RHSKnownZero|RHSKnownOne)) == DemandedMask) { + // all known + if ((RHSKnownOne & LHSKnownOne) == RHSKnownOne) { + Constant *AndC = Constant::getIntegerValue(VTy, + ~RHSKnownOne & DemandedMask); + Instruction *And = + BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp"); + return InsertNewInstBefore(And, *I); + } + } + + // If the RHS is a constant, see if we can simplify it. + // FIXME: for XOR, we prefer to force bits to 1 if they will make a -1. + if (ShrinkDemandedConstant(I, 1, DemandedMask)) + return I; + + // If our LHS is an 'and' and if it has one use, and if any of the bits we + // are flipping are known to be set, then the xor is just resetting those + // bits to zero. We can just knock out bits from the 'and' and the 'xor', + // simplifying both of them. 
+ if (Instruction *LHSInst = dyn_cast<Instruction>(I->getOperand(0))) + if (LHSInst->getOpcode() == Instruction::And && LHSInst->hasOneUse() && + isa<ConstantInt>(I->getOperand(1)) && + isa<ConstantInt>(LHSInst->getOperand(1)) && + (LHSKnownOne & RHSKnownOne & DemandedMask) != 0) { + ConstantInt *AndRHS = cast<ConstantInt>(LHSInst->getOperand(1)); + ConstantInt *XorRHS = cast<ConstantInt>(I->getOperand(1)); + APInt NewMask = ~(LHSKnownOne & RHSKnownOne & DemandedMask); + + Constant *AndC = + ConstantInt::get(I->getType(), NewMask & AndRHS->getValue()); + Instruction *NewAnd = + BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp"); + InsertNewInstBefore(NewAnd, *I); + + Constant *XorC = + ConstantInt::get(I->getType(), NewMask & XorRHS->getValue()); + Instruction *NewXor = + BinaryOperator::CreateXor(NewAnd, XorC, "tmp"); + return InsertNewInstBefore(NewXor, *I); + } + + // Output known-0 bits are known if clear or set in both the LHS & RHS. + KnownZero= (RHSKnownZero & LHSKnownZero) | (RHSKnownOne & LHSKnownOne); + // Output known-1 are known to be set if set in only one of the LHS, RHS. + KnownOne = (RHSKnownZero & LHSKnownOne) | (RHSKnownOne & LHSKnownZero); + break; + } + case Instruction::Select: + if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, + RHSKnownZero, RHSKnownOne, Depth+1) || + SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, + LHSKnownZero, LHSKnownOne, Depth+1)) + return I; + assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?"); + assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?"); + + // If the operands are constants, see if we can simplify them. + if (ShrinkDemandedConstant(I, 1, DemandedMask) || + ShrinkDemandedConstant(I, 2, DemandedMask)) + return I; + + // Only known if known in both the LHS and RHS. 
+ KnownOne = RHSKnownOne & LHSKnownOne; + KnownZero = RHSKnownZero & LHSKnownZero; + break; + case Instruction::Trunc: { + unsigned truncBf = I->getOperand(0)->getType()->getScalarSizeInBits(); + DemandedMask = DemandedMask.zext(truncBf); + KnownZero = KnownZero.zext(truncBf); + KnownOne = KnownOne.zext(truncBf); + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, + KnownZero, KnownOne, Depth+1)) + return I; + DemandedMask = DemandedMask.trunc(BitWidth); + KnownZero = KnownZero.trunc(BitWidth); + KnownOne = KnownOne.trunc(BitWidth); + assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + break; + } + case Instruction::BitCast: + if (!I->getOperand(0)->getType()->isIntOrIntVectorTy()) + return 0; // vector->int or fp->int? + + if (const VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) { + if (const VectorType *SrcVTy = + dyn_cast<VectorType>(I->getOperand(0)->getType())) { + if (DstVTy->getNumElements() != SrcVTy->getNumElements()) + // Don't touch a bitcast between vectors of different element counts. + return 0; + } else + // Don't touch a scalar-to-vector bitcast. + return 0; + } else if (I->getOperand(0)->getType()->isVectorTy()) + // Don't touch a vector-to-scalar bitcast. + return 0; + + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, + KnownZero, KnownOne, Depth+1)) + return I; + assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + break; + case Instruction::ZExt: { + // Compute the bits in the result that are not present in the input. 
+ unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits(); + + DemandedMask = DemandedMask.trunc(SrcBitWidth); + KnownZero = KnownZero.trunc(SrcBitWidth); + KnownOne = KnownOne.trunc(SrcBitWidth); + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, + KnownZero, KnownOne, Depth+1)) + return I; + DemandedMask = DemandedMask.zext(BitWidth); + KnownZero = KnownZero.zext(BitWidth); + KnownOne = KnownOne.zext(BitWidth); + assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + // The top bits are known to be zero. + KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth); + break; + } + case Instruction::SExt: { + // Compute the bits in the result that are not present in the input. + unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits(); + + APInt InputDemandedBits = DemandedMask & + APInt::getLowBitsSet(BitWidth, SrcBitWidth); + + APInt NewBits(APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth)); + // If any of the sign extended bits are demanded, we know that the sign + // bit is demanded. + if ((NewBits & DemandedMask) != 0) + InputDemandedBits.setBit(SrcBitWidth-1); + + InputDemandedBits = InputDemandedBits.trunc(SrcBitWidth); + KnownZero = KnownZero.trunc(SrcBitWidth); + KnownOne = KnownOne.trunc(SrcBitWidth); + if (SimplifyDemandedBits(I->getOperandUse(0), InputDemandedBits, + KnownZero, KnownOne, Depth+1)) + return I; + InputDemandedBits = InputDemandedBits.zext(BitWidth); + KnownZero = KnownZero.zext(BitWidth); + KnownOne = KnownOne.zext(BitWidth); + assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + + // If the sign bit of the input is known set or clear, then we know the + // top bits of the result. + + // If the input sign bit is known zero, or if the NewBits are not demanded + // convert this into a zero extension. 
+ if (KnownZero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits) { + // Convert to ZExt cast + CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName()); + return InsertNewInstBefore(NewCast, *I); + } else if (KnownOne[SrcBitWidth-1]) { // Input sign bit known set + KnownOne |= NewBits; + } + break; + } + case Instruction::Add: { + // Figure out what the input bits are. If the top bits of the and result + // are not demanded, then the add doesn't demand them from its input + // either. + unsigned NLZ = DemandedMask.countLeadingZeros(); + + // If there is a constant on the RHS, there are a variety of xformations + // we can do. + if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) { + // If null, this should be simplified elsewhere. Some of the xforms here + // won't work if the RHS is zero. + if (RHS->isZero()) + break; + + // If the top bit of the output is demanded, demand everything from the + // input. Otherwise, we demand all the input bits except NLZ top bits. + APInt InDemandedBits(APInt::getLowBitsSet(BitWidth, BitWidth - NLZ)); + + // Find information about known zero/one bits in the input. + if (SimplifyDemandedBits(I->getOperandUse(0), InDemandedBits, + LHSKnownZero, LHSKnownOne, Depth+1)) + return I; + + // If the RHS of the add has bits set that can't affect the input, reduce + // the constant. + if (ShrinkDemandedConstant(I, 1, InDemandedBits)) + return I; + + // Avoid excess work. + if (LHSKnownZero == 0 && LHSKnownOne == 0) + break; + + // Turn it into OR if input bits are zero. + if ((LHSKnownZero & RHS->getValue()) == RHS->getValue()) { + Instruction *Or = + BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1), + I->getName()); + return InsertNewInstBefore(Or, *I); + } + + // We can say something about the output known-zero and known-one bits, + // depending on potential carries from the input constant and the + // unknowns. 
For example if the LHS is known to have at most the 0x0F0F0 + // bits set and the RHS constant is 0x01001, then we know we have a known + // one mask of 0x00001 and a known zero mask of 0xE0F0E. + + // To compute this, we first compute the potential carry bits. These are + // the bits which may be modified. I'm not aware of a better way to do + // this scan. + const APInt &RHSVal = RHS->getValue(); + APInt CarryBits((~LHSKnownZero + RHSVal) ^ (~LHSKnownZero ^ RHSVal)); + + // Now that we know which bits have carries, compute the known-1/0 sets. + + // Bits are known one if they are known zero in one operand and one in the + // other, and there is no input carry. + KnownOne = ((LHSKnownZero & RHSVal) | + (LHSKnownOne & ~RHSVal)) & ~CarryBits; + + // Bits are known zero if they are known zero in both operands and there + // is no input carry. + KnownZero = LHSKnownZero & ~RHSVal & ~CarryBits; + } else { + // If the high-bits of this ADD are not demanded, then it does not demand + // the high bits of its LHS or RHS. + if (DemandedMask[BitWidth-1] == 0) { + // Right fill the mask of bits for this ADD to demand the most + // significant bit and all those below it. + APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ)); + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps, + LHSKnownZero, LHSKnownOne, Depth+1) || + SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps, + LHSKnownZero, LHSKnownOne, Depth+1)) + return I; + } + } + break; + } + case Instruction::Sub: + // If the high-bits of this SUB are not demanded, then it does not demand + // the high bits of its LHS or RHS. + if (DemandedMask[BitWidth-1] == 0) { + // Right fill the mask of bits for this SUB to demand the most + // significant bit and all those below it. 
+ uint32_t NLZ = DemandedMask.countLeadingZeros(); + APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ)); + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps, + LHSKnownZero, LHSKnownOne, Depth+1) || + SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps, + LHSKnownZero, LHSKnownOne, Depth+1)) + return I; + } + // Otherwise just hand the sub off to ComputeMaskedBits to fill in + // the known zeros and ones. + ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth); + break; + case Instruction::Shl: + if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { + uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1); + APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt)); + + // If the shift is NUW/NSW, then it does demand the high bits. + ShlOperator *IOp = cast<ShlOperator>(I); + if (IOp->hasNoSignedWrap()) + DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); + else if (IOp->hasNoUnsignedWrap()) + DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt); + + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, + KnownZero, KnownOne, Depth+1)) + return I; + assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + KnownZero <<= ShiftAmt; + KnownOne <<= ShiftAmt; + // low bits known zero. + if (ShiftAmt) + KnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + } + break; + case Instruction::LShr: + // For a logical shift right + if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { + uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1); + + // Unsigned shift right. + APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt)); + + // If the shift is exact, then it does demand the low bits (and knows that + // they are zero). 
+ if (cast<LShrOperator>(I)->isExact()) + DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, + KnownZero, KnownOne, Depth+1)) + return I; + assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + KnownZero = APIntOps::lshr(KnownZero, ShiftAmt); + KnownOne = APIntOps::lshr(KnownOne, ShiftAmt); + if (ShiftAmt) { + // Compute the new bits that are at the top now. + APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt)); + KnownZero |= HighBits; // high bits known zero. + } + } + break; + case Instruction::AShr: + // If this is an arithmetic shift right and only the low-bit is set, we can + // always convert this into a logical shr, even if the shift amount is + // variable. The low bit of the shift cannot be an input sign bit unless + // the shift amount is >= the size of the datatype, which is undefined. + if (DemandedMask == 1) { + // Perform the logical shift right. + Instruction *NewVal = BinaryOperator::CreateLShr( + I->getOperand(0), I->getOperand(1), I->getName()); + return InsertNewInstBefore(NewVal, *I); + } + + // If the sign bit is the only bit demanded by this ashr, then there is no + // need to do it, the shift doesn't change the high bit. + if (DemandedMask.isSignBit()) + return I->getOperand(0); + + if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { + uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1); + + // Signed shift right. + APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt)); + // If any of the "high bits" are demanded, we should set the sign bit as + // demanded. + if (DemandedMask.countLeadingZeros() <= ShiftAmt) + DemandedMaskIn.setBit(BitWidth-1); + + // If the shift is exact, then it does demand the low bits (and knows that + // they are zero). 
+ if (cast<AShrOperator>(I)->isExact()) + DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt); + + if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, + KnownZero, KnownOne, Depth+1)) + return I; + assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + // Compute the new bits that are at the top now. + APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt)); + KnownZero = APIntOps::lshr(KnownZero, ShiftAmt); + KnownOne = APIntOps::lshr(KnownOne, ShiftAmt); + + // Handle the sign bits. + APInt SignBit(APInt::getSignBit(BitWidth)); + // Adjust to where it is now in the mask. + SignBit = APIntOps::lshr(SignBit, ShiftAmt); + + // If the input sign bit is known to be zero, or if none of the top bits + // are demanded, turn this into an unsigned shift right. + if (BitWidth <= ShiftAmt || KnownZero[BitWidth-ShiftAmt-1] || + (HighBits & ~DemandedMask) == HighBits) { + // Perform the logical shift right. + Instruction *NewVal = BinaryOperator::CreateLShr( + I->getOperand(0), SA, I->getName()); + return InsertNewInstBefore(NewVal, *I); + } else if ((KnownOne & SignBit) != 0) { // New bits are known one. + KnownOne |= HighBits; + } + } + break; + case Instruction::SRem: + if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) { + APInt RA = Rem->getValue().abs(); + if (RA.isPowerOf2()) { + if (DemandedMask.ult(RA)) // srem won't affect demanded bits + return I->getOperand(0); + + APInt LowBits = RA - 1; + APInt Mask2 = LowBits | APInt::getSignBit(BitWidth); + if (SimplifyDemandedBits(I->getOperandUse(0), Mask2, + LHSKnownZero, LHSKnownOne, Depth+1)) + return I; + + // The low bits of LHS are unchanged by the srem. + KnownZero = LHSKnownZero & LowBits; + KnownOne = LHSKnownOne & LowBits; + + // If LHS is non-negative or has all low bits zero, then the upper bits + // are all zero. 
+ if (LHSKnownZero[BitWidth-1] || ((LHSKnownZero & LowBits) == LowBits)) + KnownZero |= ~LowBits; + + // If LHS is negative and not all low bits are zero, then the upper bits + // are all one. + if (LHSKnownOne[BitWidth-1] && ((LHSKnownOne & LowBits) != 0)) + KnownOne |= ~LowBits; + + assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?"); + } + } + break; + case Instruction::URem: { + APInt KnownZero2(BitWidth, 0), KnownOne2(BitWidth, 0); + APInt AllOnes = APInt::getAllOnesValue(BitWidth); + if (SimplifyDemandedBits(I->getOperandUse(0), AllOnes, + KnownZero2, KnownOne2, Depth+1) || + SimplifyDemandedBits(I->getOperandUse(1), AllOnes, + KnownZero2, KnownOne2, Depth+1)) + return I; + + unsigned Leaders = KnownZero2.countLeadingOnes(); + Leaders = std::max(Leaders, + KnownZero2.countLeadingOnes()); + KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask; + break; + } + case Instruction::Call: + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::bswap: { + // If the only bits demanded come from one byte of the bswap result, + // just shift the input byte into position to eliminate the bswap. + unsigned NLZ = DemandedMask.countLeadingZeros(); + unsigned NTZ = DemandedMask.countTrailingZeros(); + + // Round NTZ down to the next byte. If we have 11 trailing zeros, then + // we need all the bits down to bit 8. Likewise, round NLZ. If we + // have 14 leading zeros, round to 8. + NLZ &= ~7; + NTZ &= ~7; + // If we need exactly one byte, we can do this transformation. + if (BitWidth-NLZ-NTZ == 8) { + unsigned ResultBit = NTZ; + unsigned InputBit = BitWidth-NTZ-8; + + // Replace this with either a left or right shift to get the byte into + // the right place. 
+ Instruction *NewVal; + if (InputBit > ResultBit) + NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0), + ConstantInt::get(I->getType(), InputBit-ResultBit)); + else + NewVal = BinaryOperator::CreateShl(II->getArgOperand(0), + ConstantInt::get(I->getType(), ResultBit-InputBit)); + NewVal->takeName(I); + return InsertNewInstBefore(NewVal, *I); + } + + // TODO: Could compute known zero/one bits based on the input. + break; + } + } + } + ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth); + break; + } + + // If the client is only demanding bits that we know, return the known + // constant. + if ((DemandedMask & (KnownZero|KnownOne)) == DemandedMask) + return Constant::getIntegerValue(VTy, KnownOne); + return 0; +} + + +/// SimplifyDemandedVectorElts - The specified value produces a vector with +/// any number of elements. DemandedElts contains the set of elements that are +/// actually used by the caller. This method analyzes which elements of the +/// operand are undef and returns that information in UndefElts. +/// +/// If the information about demanded elements can be used to simplify the +/// operation, the operation is simplified, then the resultant value is +/// returned. This returns null if no change was made. +Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, + APInt &UndefElts, + unsigned Depth) { + unsigned VWidth = cast<VectorType>(V->getType())->getNumElements(); + APInt EltMask(APInt::getAllOnesValue(VWidth)); + assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!"); + + if (isa<UndefValue>(V)) { + // If the entire vector is undefined, just return this info. + UndefElts = EltMask; + return 0; + } + + if (DemandedElts == 0) { // If nothing is demanded, provide undef. 
+ UndefElts = EltMask; + return UndefValue::get(V->getType()); + } + + UndefElts = 0; + if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) { + const Type *EltTy = cast<VectorType>(V->getType())->getElementType(); + Constant *Undef = UndefValue::get(EltTy); + + std::vector<Constant*> Elts; + for (unsigned i = 0; i != VWidth; ++i) + if (!DemandedElts[i]) { // If not demanded, set to undef. + Elts.push_back(Undef); + UndefElts.setBit(i); + } else if (isa<UndefValue>(CV->getOperand(i))) { // Already undef. + Elts.push_back(Undef); + UndefElts.setBit(i); + } else { // Otherwise, defined. + Elts.push_back(CV->getOperand(i)); + } + + // If we changed the constant, return it. + Constant *NewCP = ConstantVector::get(Elts); + return NewCP != CV ? NewCP : 0; + } + + if (isa<ConstantAggregateZero>(V)) { + // Simplify the CAZ to a ConstantVector where the non-demanded elements are + // set to undef. + + // Check if this is identity. If so, return 0 since we are not simplifying + // anything. + if (DemandedElts.isAllOnesValue()) + return 0; + + const Type *EltTy = cast<VectorType>(V->getType())->getElementType(); + Constant *Zero = Constant::getNullValue(EltTy); + Constant *Undef = UndefValue::get(EltTy); + std::vector<Constant*> Elts; + for (unsigned i = 0; i != VWidth; ++i) { + Constant *Elt = DemandedElts[i] ? Zero : Undef; + Elts.push_back(Elt); + } + UndefElts = DemandedElts ^ EltMask; + return ConstantVector::get(Elts); + } + + // Limit search depth. + if (Depth == 10) + return 0; + + // If multiple users are using the root value, procede with + // simplification conservatively assuming that all elements + // are needed. + if (!V->hasOneUse()) { + // Quit if we find multiple users of a non-root value though. + // They'll be handled when it's their turn to be visited by + // the main instcombine process. + if (Depth != 0) + // TODO: Just compute the UndefElts information recursively. + return 0; + + // Conservatively assume that all elements are needed. 
+ DemandedElts = EltMask; + } + + Instruction *I = dyn_cast<Instruction>(V); + if (!I) return 0; // Only analyze instructions. + + bool MadeChange = false; + APInt UndefElts2(VWidth, 0); + Value *TmpV; + switch (I->getOpcode()) { + default: break; + + case Instruction::InsertElement: { + // If this is a variable index, we don't know which element it overwrites. + // demand exactly the same input as we produce. + ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2)); + if (Idx == 0) { + // Note that we can't propagate undef elt info, because we don't know + // which elt is getting updated. + TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, + UndefElts2, Depth+1); + if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } + break; + } + + // If this is inserting an element that isn't demanded, remove this + // insertelement. + unsigned IdxNo = Idx->getZExtValue(); + if (IdxNo >= VWidth || !DemandedElts[IdxNo]) { + Worklist.Add(I); + return I->getOperand(0); + } + + // Otherwise, the element inserted overwrites whatever was there, so the + // input demanded set is simpler than the output set. + APInt DemandedElts2 = DemandedElts; + DemandedElts2.clearBit(IdxNo); + TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts2, + UndefElts, Depth+1); + if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } + + // The inserted element is defined. 
+ UndefElts.clearBit(IdxNo); + break; + } + case Instruction::ShuffleVector: { + ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I); + uint64_t LHSVWidth = + cast<VectorType>(Shuffle->getOperand(0)->getType())->getNumElements(); + APInt LeftDemanded(LHSVWidth, 0), RightDemanded(LHSVWidth, 0); + for (unsigned i = 0; i < VWidth; i++) { + if (DemandedElts[i]) { + unsigned MaskVal = Shuffle->getMaskValue(i); + if (MaskVal != -1u) { + assert(MaskVal < LHSVWidth * 2 && + "shufflevector mask index out of range!"); + if (MaskVal < LHSVWidth) + LeftDemanded.setBit(MaskVal); + else + RightDemanded.setBit(MaskVal - LHSVWidth); + } + } + } + + APInt UndefElts4(LHSVWidth, 0); + TmpV = SimplifyDemandedVectorElts(I->getOperand(0), LeftDemanded, + UndefElts4, Depth+1); + if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } + + APInt UndefElts3(LHSVWidth, 0); + TmpV = SimplifyDemandedVectorElts(I->getOperand(1), RightDemanded, + UndefElts3, Depth+1); + if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; } + + bool NewUndefElts = false; + for (unsigned i = 0; i < VWidth; i++) { + unsigned MaskVal = Shuffle->getMaskValue(i); + if (MaskVal == -1u) { + UndefElts.setBit(i); + } else if (MaskVal < LHSVWidth) { + if (UndefElts4[MaskVal]) { + NewUndefElts = true; + UndefElts.setBit(i); + } + } else { + if (UndefElts3[MaskVal - LHSVWidth]) { + NewUndefElts = true; + UndefElts.setBit(i); + } + } + } + + if (NewUndefElts) { + // Add additional discovered undefs. + std::vector<Constant*> Elts; + for (unsigned i = 0; i < VWidth; ++i) { + if (UndefElts[i]) + Elts.push_back(UndefValue::get(Type::getInt32Ty(I->getContext()))); + else + Elts.push_back(ConstantInt::get(Type::getInt32Ty(I->getContext()), + Shuffle->getMaskValue(i))); + } + I->setOperand(2, ConstantVector::get(Elts)); + MadeChange = true; + } + break; + } + case Instruction::BitCast: { + // Vector->vector casts only. 
+ const VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType()); + if (!VTy) break; + unsigned InVWidth = VTy->getNumElements(); + APInt InputDemandedElts(InVWidth, 0); + unsigned Ratio; + + if (VWidth == InVWidth) { + // If we are converting from <4 x i32> -> <4 x f32>, we demand the same + // elements as are demanded of us. + Ratio = 1; + InputDemandedElts = DemandedElts; + } else if (VWidth > InVWidth) { + // Untested so far. + break; + + // If there are more elements in the result than there are in the source, + // then an input element is live if any of the corresponding output + // elements are live. + Ratio = VWidth/InVWidth; + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) { + if (DemandedElts[OutIdx]) + InputDemandedElts.setBit(OutIdx/Ratio); + } + } else { + // Untested so far. + break; + + // If there are more elements in the source than there are in the result, + // then an input element is live if the corresponding output element is + // live. + Ratio = InVWidth/VWidth; + for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) + if (DemandedElts[InIdx/Ratio]) + InputDemandedElts.setBit(InIdx); + } + + // div/rem demand all inputs, because they don't want divide by zero. + TmpV = SimplifyDemandedVectorElts(I->getOperand(0), InputDemandedElts, + UndefElts2, Depth+1); + if (TmpV) { + I->setOperand(0, TmpV); + MadeChange = true; + } + + UndefElts = UndefElts2; + if (VWidth > InVWidth) { + llvm_unreachable("Unimp"); + // If there are more elements in the result than there are in the source, + // then an output element is undef if the corresponding input element is + // undef. + for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) + if (UndefElts2[OutIdx/Ratio]) + UndefElts.setBit(OutIdx); + } else if (VWidth < InVWidth) { + llvm_unreachable("Unimp"); + // If there are more elements in the source than there are in the result, + // then a result element is undef if all of the corresponding input + // elements are undef. 
+ UndefElts = ~0ULL >> (64-VWidth); // Start out all undef. + for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx) + if (!UndefElts2[InIdx]) // Not undef? + UndefElts.clearBit(InIdx/Ratio); // Clear undef bit. + } + break; + } + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + // div/rem demand all inputs, because they don't want divide by zero. + TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, + UndefElts, Depth+1); + if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } + TmpV = SimplifyDemandedVectorElts(I->getOperand(1), DemandedElts, + UndefElts2, Depth+1); + if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; } + + // Output elements are undefined if both are undefined. Consider things + // like undef&0. The result is known zero, not undef. + UndefElts &= UndefElts2; + break; + + case Instruction::Call: { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); + if (!II) break; + switch (II->getIntrinsicID()) { + default: break; + + // Binary vector operations that work column-wise. A dest element is a + // function of the corresponding input elements from the two inputs. + case Intrinsic::x86_sse_sub_ss: + case Intrinsic::x86_sse_mul_ss: + case Intrinsic::x86_sse_min_ss: + case Intrinsic::x86_sse_max_ss: + case Intrinsic::x86_sse2_sub_sd: + case Intrinsic::x86_sse2_mul_sd: + case Intrinsic::x86_sse2_min_sd: + case Intrinsic::x86_sse2_max_sd: + TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts, + UndefElts, Depth+1); + if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; } + TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts, + UndefElts2, Depth+1); + if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; } + + // If only the low elt is demanded and this is a scalarizable intrinsic, + // scalarize it now. 
+ if (DemandedElts == 1) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::x86_sse_sub_ss: + case Intrinsic::x86_sse_mul_ss: + case Intrinsic::x86_sse2_sub_sd: + case Intrinsic::x86_sse2_mul_sd: + // TODO: Lower MIN/MAX/ABS/etc + Value *LHS = II->getArgOperand(0); + Value *RHS = II->getArgOperand(1); + // Extract the element as scalars. + LHS = InsertNewInstBefore(ExtractElementInst::Create(LHS, + ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II); + RHS = InsertNewInstBefore(ExtractElementInst::Create(RHS, + ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II); + + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Case stmts out of sync!"); + case Intrinsic::x86_sse_sub_ss: + case Intrinsic::x86_sse2_sub_sd: + TmpV = InsertNewInstBefore(BinaryOperator::CreateFSub(LHS, RHS, + II->getName()), *II); + break; + case Intrinsic::x86_sse_mul_ss: + case Intrinsic::x86_sse2_mul_sd: + TmpV = InsertNewInstBefore(BinaryOperator::CreateFMul(LHS, RHS, + II->getName()), *II); + break; + } + + Instruction *New = + InsertElementInst::Create( + UndefValue::get(II->getType()), TmpV, + ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U, false), + II->getName()); + InsertNewInstBefore(New, *II); + return New; + } + } + + // Output elements are undefined if both are undefined. Consider things + // like undef&0. The result is known zero, not undef. + UndefElts &= UndefElts2; + break; + } + break; + } + } + return MadeChange ? 
I : 0; +} diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp new file mode 100644 index 0000000..5caa12d --- /dev/null +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -0,0 +1,567 @@ +//===- InstCombineVectorOps.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements instcombine for ExtractElement, InsertElement and +// ShuffleVector. +// +//===----------------------------------------------------------------------===// + +#include "InstCombine.h" +using namespace llvm; + +/// CheapToScalarize - Return true if the value is cheaper to scalarize than it +/// is to leave as a vector operation. +static bool CheapToScalarize(Value *V, bool isConstant) { + if (isa<ConstantAggregateZero>(V)) + return true; + if (ConstantVector *C = dyn_cast<ConstantVector>(V)) { + if (isConstant) return true; + // If all elts are the same, we can extract. + Constant *Op0 = C->getOperand(0); + for (unsigned i = 1; i < C->getNumOperands(); ++i) + if (C->getOperand(i) != Op0) + return false; + return true; + } + Instruction *I = dyn_cast<Instruction>(V); + if (!I) return false; + + // Insert element gets simplified to the inserted element or is deleted if + // this is constant idx extract element and its a constant idx insertelt. 
+ if (I->getOpcode() == Instruction::InsertElement && isConstant && + isa<ConstantInt>(I->getOperand(2))) + return true; + if (I->getOpcode() == Instruction::Load && I->hasOneUse()) + return true; + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) + if (BO->hasOneUse() && + (CheapToScalarize(BO->getOperand(0), isConstant) || + CheapToScalarize(BO->getOperand(1), isConstant))) + return true; + if (CmpInst *CI = dyn_cast<CmpInst>(I)) + if (CI->hasOneUse() && + (CheapToScalarize(CI->getOperand(0), isConstant) || + CheapToScalarize(CI->getOperand(1), isConstant))) + return true; + + return false; +} + +/// getShuffleMask - Read and decode a shufflevector mask. +/// Turn undef elements into negative values. +static std::vector<int> getShuffleMask(const ShuffleVectorInst *SVI) { + unsigned NElts = SVI->getType()->getNumElements(); + if (isa<ConstantAggregateZero>(SVI->getOperand(2))) + return std::vector<int>(NElts, 0); + if (isa<UndefValue>(SVI->getOperand(2))) + return std::vector<int>(NElts, -1); + + std::vector<int> Result; + const ConstantVector *CP = cast<ConstantVector>(SVI->getOperand(2)); + for (User::const_op_iterator i = CP->op_begin(), e = CP->op_end(); i!=e; ++i) + if (isa<UndefValue>(*i)) + Result.push_back(-1); // undef + else + Result.push_back(cast<ConstantInt>(*i)->getZExtValue()); + return Result; +} + +/// FindScalarElement - Given a vector and an element number, see if the scalar +/// value is already around as a register, for example if it were inserted then +/// extracted from the vector. +static Value *FindScalarElement(Value *V, unsigned EltNo) { + assert(V->getType()->isVectorTy() && "Not looking at a vector?"); + const VectorType *PTy = cast<VectorType>(V->getType()); + unsigned Width = PTy->getNumElements(); + if (EltNo >= Width) // Out of range access. 
+ return UndefValue::get(PTy->getElementType()); + + if (isa<UndefValue>(V)) + return UndefValue::get(PTy->getElementType()); + if (isa<ConstantAggregateZero>(V)) + return Constant::getNullValue(PTy->getElementType()); + if (ConstantVector *CP = dyn_cast<ConstantVector>(V)) + return CP->getOperand(EltNo); + + if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) { + // If this is an insert to a variable element, we don't know what it is. + if (!isa<ConstantInt>(III->getOperand(2))) + return 0; + unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue(); + + // If this is an insert to the element we are looking for, return the + // inserted value. + if (EltNo == IIElt) + return III->getOperand(1); + + // Otherwise, the insertelement doesn't modify the value, recurse on its + // vector input. + return FindScalarElement(III->getOperand(0), EltNo); + } + + if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) { + unsigned LHSWidth = + cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements(); + int InEl = getShuffleMask(SVI)[EltNo]; + if (InEl < 0) + return UndefValue::get(PTy->getElementType()); + if (InEl < (int)LHSWidth) + return FindScalarElement(SVI->getOperand(0), InEl); + return FindScalarElement(SVI->getOperand(1), InEl - LHSWidth); + } + + // Otherwise, we don't know. + return 0; +} + +Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { + // If vector val is undef, replace extract with scalar undef. + if (isa<UndefValue>(EI.getOperand(0))) + return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType())); + + // If vector val is constant 0, replace extract with scalar 0. + if (isa<ConstantAggregateZero>(EI.getOperand(0))) + return ReplaceInstUsesWith(EI, Constant::getNullValue(EI.getType())); + + if (ConstantVector *C = dyn_cast<ConstantVector>(EI.getOperand(0))) { + // If vector val is constant with all elements the same, replace EI with + // that element. 
When the elements are not identical, we cannot replace yet + // (we do that below, but only when the index is constant). + Constant *op0 = C->getOperand(0); + for (unsigned i = 1; i != C->getNumOperands(); ++i) + if (C->getOperand(i) != op0) { + op0 = 0; + break; + } + if (op0) + return ReplaceInstUsesWith(EI, op0); + } + + // If extracting a specified index from the vector, see if we can recursively + // find a previously computed scalar that was inserted into the vector. + if (ConstantInt *IdxC = dyn_cast<ConstantInt>(EI.getOperand(1))) { + unsigned IndexVal = IdxC->getZExtValue(); + unsigned VectorWidth = EI.getVectorOperandType()->getNumElements(); + + // If this is extracting an invalid index, turn this into undef, to avoid + // crashing the code below. + if (IndexVal >= VectorWidth) + return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType())); + + // This instruction only demands the single element from the input vector. + // If the input vector has a single use, simplify it based on this use + // property. + if (EI.getOperand(0)->hasOneUse() && VectorWidth != 1) { + APInt UndefElts(VectorWidth, 0); + APInt DemandedMask(VectorWidth, 0); + DemandedMask.setBit(IndexVal); + if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0), + DemandedMask, UndefElts)) { + EI.setOperand(0, V); + return &EI; + } + } + + if (Value *Elt = FindScalarElement(EI.getOperand(0), IndexVal)) + return ReplaceInstUsesWith(EI, Elt); + + // If the this extractelement is directly using a bitcast from a vector of + // the same number of elements, see if we can find the source element from + // it. In this case, we will end up needing to bitcast the scalars. 
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) { + if (const VectorType *VT = + dyn_cast<VectorType>(BCI->getOperand(0)->getType())) + if (VT->getNumElements() == VectorWidth) + if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal)) + return new BitCastInst(Elt, EI.getType()); + } + } + + if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) { + // Push extractelement into predecessor operation if legal and + // profitable to do so + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { + if (I->hasOneUse() && + CheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) { + Value *newEI0 = + Builder->CreateExtractElement(BO->getOperand(0), EI.getOperand(1), + EI.getName()+".lhs"); + Value *newEI1 = + Builder->CreateExtractElement(BO->getOperand(1), EI.getOperand(1), + EI.getName()+".rhs"); + return BinaryOperator::Create(BO->getOpcode(), newEI0, newEI1); + } + } else if (InsertElementInst *IE = dyn_cast<InsertElementInst>(I)) { + // Extracting the inserted element? + if (IE->getOperand(2) == EI.getOperand(1)) + return ReplaceInstUsesWith(EI, IE->getOperand(1)); + // If the inserted and extracted elements are constants, they must not + // be the same value, extract from the pre-inserted value instead. + if (isa<Constant>(IE->getOperand(2)) && isa<Constant>(EI.getOperand(1))) { + Worklist.AddValue(EI.getOperand(0)); + EI.setOperand(0, IE->getOperand(0)); + return &EI; + } + } else if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I)) { + // If this is extracting an element from a shufflevector, figure out where + // it came from and extract from the appropriate input element instead. 
+ if (ConstantInt *Elt = dyn_cast<ConstantInt>(EI.getOperand(1))) { + int SrcIdx = getShuffleMask(SVI)[Elt->getZExtValue()]; + Value *Src; + unsigned LHSWidth = + cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements(); + + if (SrcIdx < 0) + return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType())); + if (SrcIdx < (int)LHSWidth) + Src = SVI->getOperand(0); + else { + SrcIdx -= LHSWidth; + Src = SVI->getOperand(1); + } + const Type *Int32Ty = Type::getInt32Ty(EI.getContext()); + return ExtractElementInst::Create(Src, + ConstantInt::get(Int32Ty, + SrcIdx, false)); + } + } + // FIXME: Canonicalize extractelement(bitcast) -> bitcast(extractelement) + } + return 0; +} + +/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns +/// elements from either LHS or RHS, return the shuffle mask and true. +/// Otherwise, return false. +static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, + std::vector<Constant*> &Mask) { + assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() && + "Invalid CollectSingleShuffleElements"); + unsigned NumElts = cast<VectorType>(V->getType())->getNumElements(); + + if (isa<UndefValue>(V)) { + Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext()))); + return true; + } + + if (V == LHS) { + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i)); + return true; + } + + if (V == RHS) { + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), + i+NumElts)); + return true; + } + + if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) { + // If this is an insert of an extract from some other vector, include it. 
+ Value *VecOp = IEI->getOperand(0); + Value *ScalarOp = IEI->getOperand(1); + Value *IdxOp = IEI->getOperand(2); + + if (!isa<ConstantInt>(IdxOp)) + return false; + unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue(); + + if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector. + // Okay, we can handle this if the vector we are insertinting into is + // transitively ok. + if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { + // If so, update the mask to reflect the inserted undef. + Mask[InsertedIdx] = UndefValue::get(Type::getInt32Ty(V->getContext())); + return true; + } + } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){ + if (isa<ConstantInt>(EI->getOperand(1)) && + EI->getOperand(0)->getType() == V->getType()) { + unsigned ExtractedIdx = + cast<ConstantInt>(EI->getOperand(1))->getZExtValue(); + + // This must be extracting from either LHS or RHS. + if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) { + // Okay, we can handle this if the vector we are insertinting into is + // transitively ok. + if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { + // If so, update the mask to reflect the inserted value. + if (EI->getOperand(0) == LHS) { + Mask[InsertedIdx % NumElts] = + ConstantInt::get(Type::getInt32Ty(V->getContext()), + ExtractedIdx); + } else { + assert(EI->getOperand(0) == RHS); + Mask[InsertedIdx % NumElts] = + ConstantInt::get(Type::getInt32Ty(V->getContext()), + ExtractedIdx+NumElts); + } + return true; + } + } + } + } + } + // TODO: Handle shufflevector here! + + return false; +} + +/// CollectShuffleElements - We are building a shuffle of V, using RHS as the +/// RHS of the shuffle instruction, if it is not null. Return a shuffle mask +/// that computes V and the LHS value of the shuffle. 
+static Value *CollectShuffleElements(Value *V, std::vector<Constant*> &Mask, + Value *&RHS) { + assert(V->getType()->isVectorTy() && + (RHS == 0 || V->getType() == RHS->getType()) && + "Invalid shuffle!"); + unsigned NumElts = cast<VectorType>(V->getType())->getNumElements(); + + if (isa<UndefValue>(V)) { + Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext()))); + return V; + } else if (isa<ConstantAggregateZero>(V)) { + Mask.assign(NumElts, ConstantInt::get(Type::getInt32Ty(V->getContext()),0)); + return V; + } else if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) { + // If this is an insert of an extract from some other vector, include it. + Value *VecOp = IEI->getOperand(0); + Value *ScalarOp = IEI->getOperand(1); + Value *IdxOp = IEI->getOperand(2); + + if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) { + if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) && + EI->getOperand(0)->getType() == V->getType()) { + unsigned ExtractedIdx = + cast<ConstantInt>(EI->getOperand(1))->getZExtValue(); + unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue(); + + // Either the extracted from or inserted into vector must be RHSVec, + // otherwise we'd end up with a shuffle of three inputs. + if (EI->getOperand(0) == RHS || RHS == 0) { + RHS = EI->getOperand(0); + Value *V = CollectShuffleElements(VecOp, Mask, RHS); + Mask[InsertedIdx % NumElts] = + ConstantInt::get(Type::getInt32Ty(V->getContext()), + NumElts+ExtractedIdx); + return V; + } + + if (VecOp == RHS) { + Value *V = CollectShuffleElements(EI->getOperand(0), Mask, RHS); + // Everything but the extracted element is replaced with the RHS. + for (unsigned i = 0; i != NumElts; ++i) { + if (i != InsertedIdx) + Mask[i] = ConstantInt::get(Type::getInt32Ty(V->getContext()), + NumElts+i); + } + return V; + } + + // If this insertelement is a chain that comes from exactly these two + // vectors, return the vector and the effective shuffle. 
+ if (CollectSingleShuffleElements(IEI, EI->getOperand(0), RHS, Mask)) + return EI->getOperand(0); + } + } + } + // TODO: Handle shufflevector here! + + // Otherwise, can't do anything fancy. Return an identity vector. + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i)); + return V; +} + +Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { + Value *VecOp = IE.getOperand(0); + Value *ScalarOp = IE.getOperand(1); + Value *IdxOp = IE.getOperand(2); + + // Inserting an undef or into an undefined place, remove this. + if (isa<UndefValue>(ScalarOp) || isa<UndefValue>(IdxOp)) + ReplaceInstUsesWith(IE, VecOp); + + // If the inserted element was extracted from some other vector, and if the + // indexes are constant, try to turn this into a shufflevector operation. + if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) { + if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) && + EI->getOperand(0)->getType() == IE.getType()) { + unsigned NumVectorElts = IE.getType()->getNumElements(); + unsigned ExtractedIdx = + cast<ConstantInt>(EI->getOperand(1))->getZExtValue(); + unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue(); + + if (ExtractedIdx >= NumVectorElts) // Out of range extract. + return ReplaceInstUsesWith(IE, VecOp); + + if (InsertedIdx >= NumVectorElts) // Out of range insert. + return ReplaceInstUsesWith(IE, UndefValue::get(IE.getType())); + + // If we are extracting a value from a vector, then inserting it right + // back into the same place, just use the input vector. + if (EI->getOperand(0) == VecOp && ExtractedIdx == InsertedIdx) + return ReplaceInstUsesWith(IE, VecOp); + + // If this insertelement isn't used by some other insertelement, turn it + // (and any insertelements it points to), into one big shuffle. 
+ if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.use_back())) { + std::vector<Constant*> Mask; + Value *RHS = 0; + Value *LHS = CollectShuffleElements(&IE, Mask, RHS); + if (RHS == 0) RHS = UndefValue::get(LHS->getType()); + // We now have a shuffle of LHS, RHS, Mask. + return new ShuffleVectorInst(LHS, RHS, ConstantVector::get(Mask)); + } + } + } + + unsigned VWidth = cast<VectorType>(VecOp->getType())->getNumElements(); + APInt UndefElts(VWidth, 0); + APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); + if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) { + if (V != &IE) + return ReplaceInstUsesWith(IE, V); + return &IE; + } + + return 0; +} + + +Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { + Value *LHS = SVI.getOperand(0); + Value *RHS = SVI.getOperand(1); + std::vector<int> Mask = getShuffleMask(&SVI); + + bool MadeChange = false; + + // Undefined shuffle mask -> undefined value. + if (isa<UndefValue>(SVI.getOperand(2))) + return ReplaceInstUsesWith(SVI, UndefValue::get(SVI.getType())); + + unsigned VWidth = cast<VectorType>(SVI.getType())->getNumElements(); + + if (VWidth != cast<VectorType>(LHS->getType())->getNumElements()) + return 0; + + APInt UndefElts(VWidth, 0); + APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); + if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) { + if (V != &SVI) + return ReplaceInstUsesWith(SVI, V); + LHS = SVI.getOperand(0); + RHS = SVI.getOperand(1); + MadeChange = true; + } + + // Canonicalize shuffle(x ,x,mask) -> shuffle(x, undef,mask') + // Canonicalize shuffle(undef,x,mask) -> shuffle(x, undef,mask'). + if (LHS == RHS || isa<UndefValue>(LHS)) { + if (isa<UndefValue>(LHS) && LHS == RHS) { + // shuffle(undef,undef,mask) -> undef. + return ReplaceInstUsesWith(SVI, LHS); + } + + // Remap any references to RHS to use LHS. 
+ std::vector<Constant*> Elts; + for (unsigned i = 0, e = Mask.size(); i != e; ++i) { + if (Mask[i] < 0) + Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext()))); + else { + if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) || + (Mask[i] < (int)e && isa<UndefValue>(LHS))) { + Mask[i] = -1; // Turn into undef. + Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext()))); + } else { + Mask[i] = Mask[i] % e; // Force to LHS. + Elts.push_back(ConstantInt::get(Type::getInt32Ty(SVI.getContext()), + Mask[i])); + } + } + } + SVI.setOperand(0, SVI.getOperand(1)); + SVI.setOperand(1, UndefValue::get(RHS->getType())); + SVI.setOperand(2, ConstantVector::get(Elts)); + LHS = SVI.getOperand(0); + RHS = SVI.getOperand(1); + MadeChange = true; + } + + // Analyze the shuffle, are the LHS or RHS and identity shuffles? + bool isLHSID = true, isRHSID = true; + + for (unsigned i = 0, e = Mask.size(); i != e; ++i) { + if (Mask[i] < 0) continue; // Ignore undef values. + // Is this an identity shuffle of the LHS value? + isLHSID &= (Mask[i] == (int)i); + + // Is this an identity shuffle of the RHS value? + isRHSID &= (Mask[i]-e == i); + } + + // Eliminate identity shuffles. + if (isLHSID) return ReplaceInstUsesWith(SVI, LHS); + if (isRHSID) return ReplaceInstUsesWith(SVI, RHS); + + // If the LHS is a shufflevector itself, see if we can combine it with this + // one without producing an unusual shuffle. Here we are really conservative: + // we are absolutely afraid of producing a shuffle mask not in the input + // program, because the code gen may not be smart enough to turn a merged + // shuffle into two specific shuffles: it may produce worse code. As such, + // we only merge two shuffles if the result is either a splat or one of the + // two input shuffle masks. In this case, merging the shuffles just removes + // one instruction, which we know is safe. This is good for things like + // turning: (splat(splat)) -> splat. 
+ if (ShuffleVectorInst *LHSSVI = dyn_cast<ShuffleVectorInst>(LHS)) { + if (isa<UndefValue>(RHS)) { + std::vector<int> LHSMask = getShuffleMask(LHSSVI); + + if (LHSMask.size() == Mask.size()) { + std::vector<int> NewMask; + bool isSplat = true; + int SplatElt = -1; // undef + for (unsigned i = 0, e = Mask.size(); i != e; ++i) { + int MaskElt; + if (Mask[i] < 0 || Mask[i] >= (int)e) + MaskElt = -1; // undef + else + MaskElt = LHSMask[Mask[i]]; + // Check if this could still be a splat. + if (MaskElt >= 0) { + if (SplatElt >=0 && SplatElt != MaskElt) + isSplat = false; + SplatElt = MaskElt; + } + NewMask.push_back(MaskElt); + } + + // If the result mask is equal to the src shuffle or this + // shuffle mask, do the replacement. + if (isSplat || NewMask == LHSMask || NewMask == Mask) { + std::vector<Constant*> Elts; + const Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); + for (unsigned i = 0, e = NewMask.size(); i != e; ++i) { + if (NewMask[i] < 0) { + Elts.push_back(UndefValue::get(Int32Ty)); + } else { + Elts.push_back(ConstantInt::get(Int32Ty, NewMask[i])); + } + } + return new ShuffleVectorInst(LHSSVI->getOperand(0), + LHSSVI->getOperand(1), + ConstantVector::get(Elts)); + } + } + } + } + + return MadeChange ? &SVI : 0; +} diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h new file mode 100644 index 0000000..9100a85 --- /dev/null +++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h @@ -0,0 +1,105 @@ +//===- InstCombineWorklist.h - Worklist for the InstCombine pass ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef INSTCOMBINE_WORKLIST_H
+#define INSTCOMBINE_WORKLIST_H
+
+#define DEBUG_TYPE "instcombine"
+#include "llvm/Instruction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+/// InstCombineWorklist - This is the worklist management logic for
+/// InstCombine.  It is a de-duplicated LIFO worklist: the vector holds the
+/// queue (possibly with null holes left by Remove) and the map tracks each
+/// queued instruction's slot for O(1) membership tests and removal.
+class LLVM_LIBRARY_VISIBILITY InstCombineWorklist {
+  SmallVector<Instruction*, 256> Worklist;
+  DenseMap<Instruction*, unsigned> WorklistMap;
+
+  void operator=(const InstCombineWorklist&RHS); // DO NOT IMPLEMENT
+  InstCombineWorklist(const InstCombineWorklist&); // DO NOT IMPLEMENT
+public:
+  InstCombineWorklist() {}
+
+  bool isEmpty() const { return Worklist.empty(); }
+
+  /// Add - Add the specified instruction to the worklist if it isn't already
+  /// in it.
+  void Add(Instruction *I) {
+    // insert() fails (second == false) if I is already queued.
+    if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second) {
+      DEBUG(errs() << "IC: ADD: " << *I << '\n');
+      Worklist.push_back(I);
+    }
+  }
+
+  /// AddValue - Add V if it is an Instruction; other Values are ignored.
+  void AddValue(Value *V) {
+    if (Instruction *I = dyn_cast<Instruction>(V))
+      Add(I);
+  }
+
+  /// AddInitialGroup - Add the specified batch of stuff in reverse order.
+  /// which should only be done when the worklist is empty and when the group
+  /// has no duplicates.
+  void AddInitialGroup(Instruction *const *List, unsigned NumEntries) {
+    assert(Worklist.empty() && "Worklist must be empty to add initial group");
+    Worklist.reserve(NumEntries+16);
+    DEBUG(errs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n");
+    // Reverse order so List[0] ends up on top of the LIFO worklist.
+    for (; NumEntries; --NumEntries) {
+      Instruction *I = List[NumEntries-1];
+      WorklistMap.insert(std::make_pair(I, Worklist.size()));
+      Worklist.push_back(I);
+    }
+  }
+
+  // Remove - remove I from the worklist if it exists.
+  void Remove(Instruction *I) {
+    DenseMap<Instruction*, unsigned>::iterator It = WorklistMap.find(I);
+    if (It == WorklistMap.end()) return; // Not in worklist.
+
+    // Don't bother moving everything down, just null out the slot.
+    Worklist[It->second] = 0;
+
+    WorklistMap.erase(It);
+  }
+
+  /// RemoveOne - Pop the most recently added entry.  Because Remove() leaves
+  /// null holes in the vector, this may return null; callers are expected to
+  /// tolerate (skip) a null result.
+  Instruction *RemoveOne() {
+    Instruction *I = Worklist.back();
+    Worklist.pop_back();
+    WorklistMap.erase(I);
+    return I;
+  }
+
+  /// AddUsersToWorkList - When an instruction is simplified, add all users of
+  /// the instruction to the work lists because they might get more simplified
+  /// now.
+  ///
+  void AddUsersToWorkList(Instruction &I) {
+    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+         UI != UE; ++UI)
+      Add(cast<Instruction>(*UI));
+  }
+
+
+  /// Zap - check that the worklist is empty and nuke the backing store for
+  /// the map if it is large.
+  void Zap() {
+    assert(WorklistMap.empty() && "Worklist empty, but map not?");
+
+    // Do an explicit clear, this shrinks the map if needed.
+    WorklistMap.clear();
+  }
+};
+
+} // end namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
new file mode 100644
index 0000000..37123d0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -0,0 +1,1662 @@
+//===- InstructionCombining.cpp - Combine multiple instructions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// InstructionCombining - Combine instructions to form fewer, simple
+// instructions. This pass does not modify the CFG. This pass is where
+// algebraic simplification happens.
+// +// This pass combines things like: +// %Y = add i32 %X, 1 +// %Z = add i32 %Y, 1 +// into: +// %Z = add i32 %X, 2 +// +// This is a simple worklist driven algorithm. +// +// This pass guarantees that the following canonicalizations are performed on +// the program: +// 1. If a binary operator has a constant operand, it is moved to the RHS +// 2. Bitwise operators with constant operands are always grouped so that +// shifts are performed first, then or's, then and's, then xor's. +// 3. Compare instructions are converted from <,>,<=,>= to ==,!= if possible +// 4. All cmp instructions on boolean values are replaced with logical ops +// 5. add X, X is represented as (X*2) => (X << 1) +// 6. Multiplies with a power-of-two constant argument are transformed into +// shifts. +// ... etc. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "instcombine" +#include "llvm/Transforms/Scalar.h" +#include "InstCombine.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/PatternMatch.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm-c/Initialization.h" +#include <algorithm> +#include <climits> +using namespace llvm; +using namespace llvm::PatternMatch; + +STATISTIC(NumCombined , "Number of insts combined"); +STATISTIC(NumConstProp, "Number of constant folds"); +STATISTIC(NumDeadInst , "Number of dead inst eliminated"); +STATISTIC(NumSunkInst , "Number of instructions sunk"); +STATISTIC(NumExpand, "Number of expansions"); +STATISTIC(NumFactor , "Number of factorizations"); +STATISTIC(NumReassoc , "Number of reassociations"); + +// Initialization Routines 
+// Register the InstCombiner pass with the pass registry (C++ and C APIs).
+void llvm::initializeInstCombine(PassRegistry &Registry) {
+  initializeInstCombinerPass(Registry);
+}
+
+void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
+  initializeInstCombine(*unwrap(R));
+}
+
+char InstCombiner::ID = 0;
+INITIALIZE_PASS(InstCombiner, "instcombine",
+                "Combine redundant instructions", false, false)
+
+// InstCombine preserves the CFG (it never changes control flow) and LCSSA.
+void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreservedID(LCSSAID);
+  AU.setPreservesCFG();
+}
+
+
+/// ShouldChangeType - Return true if it is desirable to convert a computation
+/// from 'From' to 'To'. We don't want to convert from a legal to an illegal
+/// type for example, or from a smaller to a larger illegal type.
+bool InstCombiner::ShouldChangeType(const Type *From, const Type *To) const {
+  assert(From->isIntegerTy() && To->isIntegerTy());
+
+  // If we don't have TD, we don't know if the source/dest are legal.
+  if (!TD) return false;
+
+  unsigned FromWidth = From->getPrimitiveSizeInBits();
+  unsigned ToWidth = To->getPrimitiveSizeInBits();
+  bool FromLegal = TD->isLegalInteger(FromWidth);
+  bool ToLegal = TD->isLegalInteger(ToWidth);
+
+  // If this is a legal integer from type, and the result would be an illegal
+  // type, don't do the transformation.
+  if (FromLegal && !ToLegal)
+    return false;
+
+  // Otherwise, if both are illegal, do not increase the size of the result. We
+  // do allow things like i160 -> i64, but not i64 -> i160.
+  if (!FromLegal && !ToLegal && ToWidth > FromWidth)
+    return false;
+
+  return true;
+}
+
+
+/// SimplifyAssociativeOrCommutative - This performs a few simplifications for
+/// operators which are associative or commutative:
+//
+//  Commutative operators:
+//
+//  1. Order operands such that they are listed from right (least complex) to
+//     left (most complex). This puts constants before unary operators before
+//     binary operators.
+//
+//  Associative operators:
+//
+//  2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
+//  3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
+//
+//  Associative and commutative operators:
+//
+//  4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
+//  5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
+//  6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
+//     if C1 and C2 are constants.
+//
+bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
+  Instruction::BinaryOps Opcode = I.getOpcode();
+  bool Changed = false;
+
+  // Loop until no more of the transforms below fire; each successful
+  // transform 'continue's to retry from the top.
+  do {
+    // Order operands such that they are listed from right (least complex) to
+    // left (most complex). This puts constants before unary operators before
+    // binary operators.
+    if (I.isCommutative() && getComplexity(I.getOperand(0)) <
+        getComplexity(I.getOperand(1)))
+      Changed = !I.swapOperands();
+
+    BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0));
+    BinaryOperator *Op1 = dyn_cast<BinaryOperator>(I.getOperand(1));
+
+    if (I.isAssociative()) {
+      // Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
+      if (Op0 && Op0->getOpcode() == Opcode) {
+        Value *A = Op0->getOperand(0);
+        Value *B = Op0->getOperand(1);
+        Value *C = I.getOperand(1);
+
+        // Does "B op C" simplify?
+        if (Value *V = SimplifyBinOp(Opcode, B, C, TD)) {
+          // It simplifies to V. Form "A op V".
+          I.setOperand(0, A);
+          I.setOperand(1, V);
+          // Conservatively clear the optional flags, since they may not be
+          // preserved by the reassociation.
+          I.clearSubclassOptionalData();
+          Changed = true;
+          ++NumReassoc;
+          continue;
+        }
+      }
+
+      // Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
+      if (Op1 && Op1->getOpcode() == Opcode) {
+        Value *A = I.getOperand(0);
+        Value *B = Op1->getOperand(0);
+        Value *C = Op1->getOperand(1);
+
+        // Does "A op B" simplify?
+        if (Value *V = SimplifyBinOp(Opcode, A, B, TD)) {
+          // It simplifies to V. Form "V op C".
+          I.setOperand(0, V);
+          I.setOperand(1, C);
+          // Conservatively clear the optional flags, since they may not be
+          // preserved by the reassociation.
+          I.clearSubclassOptionalData();
+          Changed = true;
+          ++NumReassoc;
+          continue;
+        }
+      }
+    }
+
+    if (I.isAssociative() && I.isCommutative()) {
+      // Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
+      if (Op0 && Op0->getOpcode() == Opcode) {
+        Value *A = Op0->getOperand(0);
+        Value *B = Op0->getOperand(1);
+        Value *C = I.getOperand(1);
+
+        // Does "C op A" simplify?
+        if (Value *V = SimplifyBinOp(Opcode, C, A, TD)) {
+          // It simplifies to V. Form "V op B".
+          I.setOperand(0, V);
+          I.setOperand(1, B);
+          // Conservatively clear the optional flags, since they may not be
+          // preserved by the reassociation.
+          I.clearSubclassOptionalData();
+          Changed = true;
+          ++NumReassoc;
+          continue;
+        }
+      }
+
+      // Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
+      if (Op1 && Op1->getOpcode() == Opcode) {
+        Value *A = I.getOperand(0);
+        Value *B = Op1->getOperand(0);
+        Value *C = Op1->getOperand(1);
+
+        // Does "C op A" simplify?
+        if (Value *V = SimplifyBinOp(Opcode, C, A, TD)) {
+          // It simplifies to V. Form "B op V".
+          I.setOperand(0, B);
+          I.setOperand(1, V);
+          // Conservatively clear the optional flags, since they may not be
+          // preserved by the reassociation.
+          I.clearSubclassOptionalData();
+          Changed = true;
+          ++NumReassoc;
+          continue;
+        }
+      }
+
+      // Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
+      // if C1 and C2 are constants.
+      if (Op0 && Op1 &&
+          Op0->getOpcode() == Opcode && Op1->getOpcode() == Opcode &&
+          isa<Constant>(Op0->getOperand(1)) &&
+          isa<Constant>(Op1->getOperand(1)) &&
+          Op0->hasOneUse() && Op1->hasOneUse()) {
+        Value *A = Op0->getOperand(0);
+        Constant *C1 = cast<Constant>(Op0->getOperand(1));
+        Value *B = Op1->getOperand(0);
+        Constant *C2 = cast<Constant>(Op1->getOperand(1));
+
+        // Fold the two constants together and build "A op B" as a new
+        // instruction, queued for further combining.
+        Constant *Folded = ConstantExpr::get(Opcode, C1, C2);
+        Instruction *New = BinaryOperator::Create(Opcode, A, B, Op1->getName(),
+                                                  &I);
+        Worklist.Add(New);
+        I.setOperand(0, New);
+        I.setOperand(1, Folded);
+        // Conservatively clear the optional flags, since they may not be
+        // preserved by the reassociation.
+        I.clearSubclassOptionalData();
+        Changed = true;
+        continue;
+      }
+    }
+
+    // No further simplifications.
+    return Changed;
+  } while (1);
+}
+
+/// LeftDistributesOverRight - Whether "X LOp (Y ROp Z)" is always equal to
+/// "(X LOp Y) ROp (X LOp Z)".
+static bool LeftDistributesOverRight(Instruction::BinaryOps LOp,
+                                     Instruction::BinaryOps ROp) {
+  switch (LOp) {
+  default:
+    return false;
+
+  case Instruction::And:
+    // And distributes over Or and Xor.
+    switch (ROp) {
+    default:
+      return false;
+    case Instruction::Or:
+    case Instruction::Xor:
+      return true;
+    }
+
+  case Instruction::Mul:
+    // Multiplication distributes over addition and subtraction.
+    switch (ROp) {
+    default:
+      return false;
+    case Instruction::Add:
+    case Instruction::Sub:
+      return true;
+    }
+
+  case Instruction::Or:
+    // Or distributes over And.
+    switch (ROp) {
+    default:
+      return false;
+    case Instruction::And:
+      return true;
+    }
+  }
+}
+
+/// RightDistributesOverLeft - Whether "(X LOp Y) ROp Z" is always equal to
+/// "(X ROp Z) LOp (Y ROp Z)".
+static bool RightDistributesOverLeft(Instruction::BinaryOps LOp, + Instruction::BinaryOps ROp) { + if (Instruction::isCommutative(ROp)) + return LeftDistributesOverRight(ROp, LOp); + // TODO: It would be nice to handle division, aka "(X + Y)/Z = X/Z + Y/Z", + // but this requires knowing that the addition does not overflow and other + // such subtleties. + return false; +} + +/// SimplifyUsingDistributiveLaws - This tries to simplify binary operations +/// which some other binary operation distributes over either by factorizing +/// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this +/// results in simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is +/// a win). Returns the simplified value, or null if it didn't simplify. +Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS); + BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS); + Instruction::BinaryOps TopLevelOpcode = I.getOpcode(); // op + + // Factorization. + if (Op0 && Op1 && Op0->getOpcode() == Op1->getOpcode()) { + // The instruction has the form "(A op' B) op (C op' D)". Try to factorize + // a common term. + Value *A = Op0->getOperand(0), *B = Op0->getOperand(1); + Value *C = Op1->getOperand(0), *D = Op1->getOperand(1); + Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op' + + // Does "X op' Y" always equal "Y op' X"? + bool InnerCommutative = Instruction::isCommutative(InnerOpcode); + + // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"? + if (LeftDistributesOverRight(InnerOpcode, TopLevelOpcode)) + // Does the instruction have the form "(A op' B) op (A op' D)" or, in the + // commutative case, "(A op' B) op (C op' A)"? + if (A == C || (InnerCommutative && A == D)) { + if (A != C) + std::swap(C, D); + // Consider forming "A op' (B op D)". + // If "B op D" simplifies then it can be formed with no cost. 
+        Value *V = SimplifyBinOp(TopLevelOpcode, B, D, TD);
+        // If "B op D" doesn't simplify then only go on if both of the existing
+        // operations "A op' B" and "C op' D" will be zapped as no longer used.
+        if (!V && Op0->hasOneUse() && Op1->hasOneUse())
+          V = Builder->CreateBinOp(TopLevelOpcode, B, D, Op1->getName());
+        if (V) {
+          ++NumFactor;
+          V = Builder->CreateBinOp(InnerOpcode, A, V);
+          V->takeName(&I);
+          return V;
+        }
+      }
+
+    // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"?
+    if (RightDistributesOverLeft(TopLevelOpcode, InnerOpcode))
+      // Does the instruction have the form "(A op' B) op (C op' B)" or, in the
+      // commutative case, "(A op' B) op (B op' D)"?
+      if (B == D || (InnerCommutative && B == C)) {
+        // Normalize so the common term is B on both sides.
+        if (B != D)
+          std::swap(C, D);
+        // Consider forming "(A op C) op' B".
+        // If "A op C" simplifies then it can be formed with no cost.
+        Value *V = SimplifyBinOp(TopLevelOpcode, A, C, TD);
+        // If "A op C" doesn't simplify then only go on if both of the existing
+        // operations "A op' B" and "C op' D" will be zapped as no longer used.
+        if (!V && Op0->hasOneUse() && Op1->hasOneUse())
+          V = Builder->CreateBinOp(TopLevelOpcode, A, C, Op0->getName());
+        if (V) {
+          ++NumFactor;
+          V = Builder->CreateBinOp(InnerOpcode, V, B);
+          V->takeName(&I);
+          return V;
+        }
+      }
+  }
+
+  // Expansion.
+  if (Op0 && RightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
+    // The instruction has the form "(A op' B) op C". See if expanding it out
+    // to "(A op C) op' (B op C)" results in simplifications.
+    Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
+    Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op'
+
+    // Do "A op C" and "B op C" both simplify?
+    if (Value *L = SimplifyBinOp(TopLevelOpcode, A, C, TD))
+      if (Value *R = SimplifyBinOp(TopLevelOpcode, B, C, TD)) {
+        // They do! Return "L op' R".
+        ++NumExpand;
+        // If "L op' R" equals "A op' B" then "L op' R" is just the LHS.
+        if ((L == A && R == B) ||
+            (Instruction::isCommutative(InnerOpcode) && L == B && R == A))
+          return Op0;
+        // Otherwise return "L op' R" if it simplifies.
+        if (Value *V = SimplifyBinOp(InnerOpcode, L, R, TD))
+          return V;
+        // Otherwise, create a new instruction.
+        C = Builder->CreateBinOp(InnerOpcode, L, R);
+        C->takeName(&I);
+        return C;
+      }
+  }
+
+  // Mirrored expansion for the form "A op (B op' C)".
+  if (Op1 && LeftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
+    // The instruction has the form "A op (B op' C)". See if expanding it out
+    // to "(A op B) op' (A op C)" results in simplifications.
+    Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
+    Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op'
+
+    // Do "A op B" and "A op C" both simplify?
+    if (Value *L = SimplifyBinOp(TopLevelOpcode, A, B, TD))
+      if (Value *R = SimplifyBinOp(TopLevelOpcode, A, C, TD)) {
+        // They do! Return "L op' R".
+        ++NumExpand;
+        // If "L op' R" equals "B op' C" then "L op' R" is just the RHS.
+        if ((L == B && R == C) ||
+            (Instruction::isCommutative(InnerOpcode) && L == C && R == B))
+          return Op1;
+        // Otherwise return "L op' R" if it simplifies.
+        if (Value *V = SimplifyBinOp(InnerOpcode, L, R, TD))
+          return V;
+        // Otherwise, create a new instruction.
+        A = Builder->CreateBinOp(InnerOpcode, L, R);
+        A->takeName(&I);
+        return A;
+      }
+  }
+
+  return 0;
+}
+
+// dyn_castNegVal - Given a 'sub' instruction, return the RHS of the instruction
+// if the LHS is a constant zero (which is the 'negate' form).
+//
+Value *InstCombiner::dyn_castNegVal(Value *V) const {
+  if (BinaryOperator::isNeg(V))
+    return BinaryOperator::getNegArgument(V);
+
+  // Constants can be considered to be negated values if they can be folded.
+ if (ConstantInt *C = dyn_cast<ConstantInt>(V)) + return ConstantExpr::getNeg(C); + + if (ConstantVector *C = dyn_cast<ConstantVector>(V)) + if (C->getType()->getElementType()->isIntegerTy()) + return ConstantExpr::getNeg(C); + + return 0; +} + +// dyn_castFNegVal - Given a 'fsub' instruction, return the RHS of the +// instruction if the LHS is a constant negative zero (which is the 'negate' +// form). +// +Value *InstCombiner::dyn_castFNegVal(Value *V) const { + if (BinaryOperator::isFNeg(V)) + return BinaryOperator::getFNegArgument(V); + + // Constants can be considered to be negated values if they can be folded. + if (ConstantFP *C = dyn_cast<ConstantFP>(V)) + return ConstantExpr::getFNeg(C); + + if (ConstantVector *C = dyn_cast<ConstantVector>(V)) + if (C->getType()->getElementType()->isFloatingPointTy()) + return ConstantExpr::getFNeg(C); + + return 0; +} + +static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO, + InstCombiner *IC) { + if (CastInst *CI = dyn_cast<CastInst>(&I)) { + return IC->Builder->CreateCast(CI->getOpcode(), SO, I.getType()); + } + + // Figure out if the constant is the left or the right argument. 
+ bool ConstIsRHS = isa<Constant>(I.getOperand(1)); + Constant *ConstOperand = cast<Constant>(I.getOperand(ConstIsRHS)); + + if (Constant *SOC = dyn_cast<Constant>(SO)) { + if (ConstIsRHS) + return ConstantExpr::get(I.getOpcode(), SOC, ConstOperand); + return ConstantExpr::get(I.getOpcode(), ConstOperand, SOC); + } + + Value *Op0 = SO, *Op1 = ConstOperand; + if (!ConstIsRHS) + std::swap(Op0, Op1); + + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I)) + return IC->Builder->CreateBinOp(BO->getOpcode(), Op0, Op1, + SO->getName()+".op"); + if (ICmpInst *CI = dyn_cast<ICmpInst>(&I)) + return IC->Builder->CreateICmp(CI->getPredicate(), Op0, Op1, + SO->getName()+".cmp"); + if (FCmpInst *CI = dyn_cast<FCmpInst>(&I)) + return IC->Builder->CreateICmp(CI->getPredicate(), Op0, Op1, + SO->getName()+".cmp"); + llvm_unreachable("Unknown binary instruction type!"); +} + +// FoldOpIntoSelect - Given an instruction with a select as one operand and a +// constant as the other operand, try to fold the binary operator into the +// select arguments. This also works for Cast instructions, which obviously do +// not have a second operand. +Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { + // Don't modify shared select instructions + if (!SI->hasOneUse()) return 0; + Value *TV = SI->getOperand(1); + Value *FV = SI->getOperand(2); + + if (isa<Constant>(TV) || isa<Constant>(FV)) { + // Bool selects with constant operands can be folded to logical ops. + if (SI->getType()->isIntegerTy(1)) return 0; + + // If it's a bitcast involving vectors, make sure it has the same number of + // elements on both sides. + if (BitCastInst *BC = dyn_cast<BitCastInst>(&Op)) { + const VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy()); + const VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy()); + + // Verify that either both or neither are vectors. 
+ if ((SrcTy == NULL) != (DestTy == NULL)) return 0; + // If vectors, verify that they have the same number of elements. + if (SrcTy && SrcTy->getNumElements() != DestTy->getNumElements()) + return 0; + } + + Value *SelectTrueVal = FoldOperationIntoSelectOperand(Op, TV, this); + Value *SelectFalseVal = FoldOperationIntoSelectOperand(Op, FV, this); + + return SelectInst::Create(SI->getCondition(), + SelectTrueVal, SelectFalseVal); + } + return 0; +} + + +/// FoldOpIntoPhi - Given a binary operator, cast instruction, or select which +/// has a PHI node as operand #0, see if we can fold the instruction into the +/// PHI (which is only possible if all operands to the PHI are constants). +/// +Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { + PHINode *PN = cast<PHINode>(I.getOperand(0)); + unsigned NumPHIValues = PN->getNumIncomingValues(); + if (NumPHIValues == 0) + return 0; + + // We normally only transform phis with a single use. However, if a PHI has + // multiple uses and they are all the same operation, we can fold *all* of the + // uses into the PHI. + if (!PN->hasOneUse()) { + // Walk the use list for the instruction, comparing them to I. + for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); + UI != E; ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (User != &I && !I.isIdenticalTo(User)) + return 0; + } + // Otherwise, we can replace *all* users with the new PHI we form. + } + + // Check to see if all of the operands of the PHI are simple constants + // (constantint/constantfp/undef). If there is one non-constant value, + // remember the BB it is in. If there is more than one or if *it* is a PHI, + // bail out. We don't do arbitrary constant expressions here because moving + // their computation can be expensive without a cost model. 
+  BasicBlock *NonConstBB = 0;
+  for (unsigned i = 0; i != NumPHIValues; ++i) {
+    Value *InVal = PN->getIncomingValue(i);
+    if (isa<Constant>(InVal) && !isa<ConstantExpr>(InVal))
+      continue;
+
+    if (isa<PHINode>(InVal)) return 0; // Itself a phi.
+    if (NonConstBB) return 0; // More than one non-const value.
+
+    NonConstBB = PN->getIncomingBlock(i);
+
+    // If the InVal is an invoke at the end of the pred block, then we can't
+    // insert a computation after it without breaking the edge.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(InVal))
+      if (II->getParent() == NonConstBB)
+        return 0;
+
+    // If the incoming non-constant value is in I's block, we will remove one
+    // instruction, but insert another equivalent one, leading to infinite
+    // instcombine.
+    if (NonConstBB == I.getParent())
+      return 0;
+  }
+
+  // If there is exactly one non-constant value, we can insert a copy of the
+  // operation in that block. However, if this is a critical edge, we would be
+  // inserting the computation one some other paths (e.g. inside a loop). Only
+  // do this if the pred block is unconditionally branching into the phi block.
+  if (NonConstBB != 0) {
+    BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator());
+    if (!BI || !BI->isUnconditional()) return 0;
+  }
+
+  // Okay, we can do the transformation: create the new PHI node.
+  PHINode *NewPN = PHINode::Create(I.getType(), "");
+  NewPN->reserveOperandSpace(PN->getNumOperands()/2);
+  InsertNewInstBefore(NewPN, *PN);
+  NewPN->takeName(PN);
+
+  // If we are going to have to insert a new computation, do so right before the
+  // predecessors terminator.
+  if (NonConstBB)
+    Builder->SetInsertPoint(NonConstBB->getTerminator());
+
+  // Next, add all of the operands to the PHI.
+  if (SelectInst *SI = dyn_cast<SelectInst>(&I)) {
+    // We only currently try to fold the condition of a select when it is a phi,
+    // not the true/false values.
+    Value *TrueV = SI->getTrueValue();
+    Value *FalseV = SI->getFalseValue();
+    BasicBlock *PhiTransBB = PN->getParent();
+    for (unsigned i = 0; i != NumPHIValues; ++i) {
+      BasicBlock *ThisBB = PN->getIncomingBlock(i);
+      // Translate the select arms into each predecessor if they are phis.
+      Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB);
+      Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB);
+      Value *InV = 0;
+      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+        InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
+      else
+        InV = Builder->CreateSelect(PN->getIncomingValue(i),
+                                    TrueVInPred, FalseVInPred, "phitmp");
+      NewPN->addIncoming(InV, ThisBB);
+    }
+  } else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) {
+    Constant *C = cast<Constant>(I.getOperand(1));
+    for (unsigned i = 0; i != NumPHIValues; ++i) {
+      Value *InV = 0;
+      // Constant incoming values fold; the single non-constant one gets a
+      // compare materialized in its predecessor block.
+      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+        InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C);
+      else if (isa<ICmpInst>(CI))
+        InV = Builder->CreateICmp(CI->getPredicate(), PN->getIncomingValue(i),
+                                  C, "phitmp");
+      else
+        InV = Builder->CreateFCmp(CI->getPredicate(), PN->getIncomingValue(i),
+                                  C, "phitmp");
+      NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+    }
+  } else if (I.getNumOperands() == 2) {
+    // Generic binary operator with a constant RHS.
+    Constant *C = cast<Constant>(I.getOperand(1));
+    for (unsigned i = 0; i != NumPHIValues; ++i) {
+      Value *InV = 0;
+      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+        InV = ConstantExpr::get(I.getOpcode(), InC, C);
+      else
+        InV = Builder->CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
+                                   PN->getIncomingValue(i), C, "phitmp");
+      NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+    }
+  } else {
+    // The only remaining possibility is a cast instruction.
+    CastInst *CI = cast<CastInst>(&I);
+    const Type *RetTy = CI->getType();
+    for (unsigned i = 0; i != NumPHIValues; ++i) {
+      Value *InV;
+      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+        InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
+      else
+        InV = Builder->CreateCast(CI->getOpcode(),
+                                PN->getIncomingValue(i), I.getType(), "phitmp");
+      NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+    }
+  }
+
+  // Replace every identical user of PN (see hasOneUse check above) with the
+  // new PHI and delete it.
+  for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end();
+       UI != E; ) {
+    Instruction *User = cast<Instruction>(*UI++);
+    if (User == &I) continue;
+    ReplaceInstUsesWith(*User, NewPN);
+    EraseInstFromFunction(*User);
+  }
+  return ReplaceInstUsesWith(I, NewPN);
+}
+
+/// FindElementAtOffset - Given a type and a constant offset, determine whether
+/// or not there is a sequence of GEP indices into the type that will land us at
+/// the specified offset. If so, fill them into NewIndices and return the
+/// resultant element type, otherwise return null.
+const Type *InstCombiner::FindElementAtOffset(const Type *Ty, int64_t Offset,
+                                          SmallVectorImpl<Value*> &NewIndices) {
+  if (!TD) return 0;
+  if (!Ty->isSized()) return 0;
+
+  // Start with the index over the outer type. Note that the type size
+  // might be zero (even if the offset isn't zero) if the indexed type
+  // is something like [0 x {int, int}]
+  const Type *IntPtrTy = TD->getIntPtrType(Ty->getContext());
+  int64_t FirstIdx = 0;
+  if (int64_t TySize = TD->getTypeAllocSize(Ty)) {
+    FirstIdx = Offset/TySize;
+    Offset -= FirstIdx*TySize;
+
+    // Handle hosts where % returns negative instead of values [0..TySize).
+    if (Offset < 0) {
+      --FirstIdx;
+      Offset += TySize;
+      assert(Offset >= 0);
+    }
+    assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
+  }
+
+  NewIndices.push_back(ConstantInt::get(IntPtrTy, FirstIdx));
+
+  // Index into the types. If we fail, set OrigBase to null.
+  while (Offset) {
+    // Indexing into tail padding between struct/array elements.
+ if (uint64_t(Offset*8) >= TD->getTypeSizeInBits(Ty)) + return 0; + + if (const StructType *STy = dyn_cast<StructType>(Ty)) { + const StructLayout *SL = TD->getStructLayout(STy); + assert(Offset < (int64_t)SL->getSizeInBytes() && + "Offset must stay within the indexed type"); + + unsigned Elt = SL->getElementContainingOffset(Offset); + NewIndices.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()), + Elt)); + + Offset -= SL->getElementOffset(Elt); + Ty = STy->getElementType(Elt); + } else if (const ArrayType *AT = dyn_cast<ArrayType>(Ty)) { + uint64_t EltSize = TD->getTypeAllocSize(AT->getElementType()); + assert(EltSize && "Cannot index into a zero-sized array"); + NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize)); + Offset %= EltSize; + Ty = AT->getElementType(); + } else { + // Otherwise, we can't index into the middle of this atomic type, bail. + return 0; + } + } + + return Ty; +} + + + +Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { + SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end()); + + if (Value *V = SimplifyGEPInst(&Ops[0], Ops.size(), TD)) + return ReplaceInstUsesWith(GEP, V); + + Value *PtrOp = GEP.getOperand(0); + + // Eliminate unneeded casts for indices, and replace indices which displace + // by multiples of a zero size type with zero. + if (TD) { + bool MadeChange = false; + const Type *IntPtrTy = TD->getIntPtrType(GEP.getContext()); + + gep_type_iterator GTI = gep_type_begin(GEP); + for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); + I != E; ++I, ++GTI) { + // Skip indices into struct types. + const SequentialType *SeqTy = dyn_cast<SequentialType>(*GTI); + if (!SeqTy) continue; + + // If the element type has zero size then any index over it is equivalent + // to an index of zero, so replace it with zero if it is not zero already. 
+ if (SeqTy->getElementType()->isSized() && + TD->getTypeAllocSize(SeqTy->getElementType()) == 0) + if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) { + *I = Constant::getNullValue(IntPtrTy); + MadeChange = true; + } + + if ((*I)->getType() != IntPtrTy) { + // If we are using a wider index than needed for this platform, shrink + // it to what we need. If narrower, sign-extend it to what we need. + // This explicit cast can make subsequent optimizations more obvious. + *I = Builder->CreateIntCast(*I, IntPtrTy, true); + MadeChange = true; + } + } + if (MadeChange) return &GEP; + } + + // Combine Indices - If the source pointer to this getelementptr instruction + // is a getelementptr instruction, combine the indices of the two + // getelementptr instructions into a single instruction. + // + if (GEPOperator *Src = dyn_cast<GEPOperator>(PtrOp)) { + // Note that if our source is a gep chain itself that we wait for that + // chain to be resolved before we perform this transformation. This + // avoids us creating a TON of code in some cases. + // + if (GetElementPtrInst *SrcGEP = + dyn_cast<GetElementPtrInst>(Src->getOperand(0))) + if (SrcGEP->getNumOperands() == 2) + return 0; // Wait until our source is folded to completion. + + SmallVector<Value*, 8> Indices; + + // Find out whether the last index in the source GEP is a sequential idx. + bool EndsWithSequential = false; + for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src); + I != E; ++I) + EndsWithSequential = !(*I)->isStructTy(); + + // Can we combine the two pointer arithmetics offsets? + if (EndsWithSequential) { + // Replace: gep (gep %P, long B), long A, ... + // With: T = long A+B; gep %P, T, ... 
+ // + Value *Sum; + Value *SO1 = Src->getOperand(Src->getNumOperands()-1); + Value *GO1 = GEP.getOperand(1); + if (SO1 == Constant::getNullValue(SO1->getType())) { + Sum = GO1; + } else if (GO1 == Constant::getNullValue(GO1->getType())) { + Sum = SO1; + } else { + // If they aren't the same type, then the input hasn't been processed + // by the loop above yet (which canonicalizes sequential index types to + // intptr_t). Just avoid transforming this until the input has been + // normalized. + if (SO1->getType() != GO1->getType()) + return 0; + Sum = Builder->CreateAdd(SO1, GO1, PtrOp->getName()+".sum"); + } + + // Update the GEP in place if possible. + if (Src->getNumOperands() == 2) { + GEP.setOperand(0, Src->getOperand(0)); + GEP.setOperand(1, Sum); + return &GEP; + } + Indices.append(Src->op_begin()+1, Src->op_end()-1); + Indices.push_back(Sum); + Indices.append(GEP.op_begin()+2, GEP.op_end()); + } else if (isa<Constant>(*GEP.idx_begin()) && + cast<Constant>(*GEP.idx_begin())->isNullValue() && + Src->getNumOperands() != 1) { + // Otherwise we can do the fold if the first index of the GEP is a zero + Indices.append(Src->op_begin()+1, Src->op_end()); + Indices.append(GEP.idx_begin()+1, GEP.idx_end()); + } + + if (!Indices.empty()) + return (GEP.isInBounds() && Src->isInBounds()) ? + GetElementPtrInst::CreateInBounds(Src->getOperand(0), Indices.begin(), + Indices.end(), GEP.getName()) : + GetElementPtrInst::Create(Src->getOperand(0), Indices.begin(), + Indices.end(), GEP.getName()); + } + + // Handle gep(bitcast x) and gep(gep x, 0, 0, 0). + Value *StrippedPtr = PtrOp->stripPointerCasts(); + if (StrippedPtr != PtrOp) { + const PointerType *StrippedPtrTy =cast<PointerType>(StrippedPtr->getType()); + + bool HasZeroPointerIndex = false; + if (ConstantInt *C = dyn_cast<ConstantInt>(GEP.getOperand(1))) + HasZeroPointerIndex = C->isZero(); + + // Transform: GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... + // into : GEP [10 x i8]* X, i32 0, ... 
+ // + // Likewise, transform: GEP (bitcast i8* X to [0 x i8]*), i32 0, ... + // into : GEP i8* X, ... + // + // This occurs when the program declares an array extern like "int X[];" + if (HasZeroPointerIndex) { + const PointerType *CPTy = cast<PointerType>(PtrOp->getType()); + if (const ArrayType *CATy = + dyn_cast<ArrayType>(CPTy->getElementType())) { + // GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ? + if (CATy->getElementType() == StrippedPtrTy->getElementType()) { + // -> GEP i8* X, ... + SmallVector<Value*, 8> Idx(GEP.idx_begin()+1, GEP.idx_end()); + GetElementPtrInst *Res = + GetElementPtrInst::Create(StrippedPtr, Idx.begin(), + Idx.end(), GEP.getName()); + Res->setIsInBounds(GEP.isInBounds()); + return Res; + } + + if (const ArrayType *XATy = + dyn_cast<ArrayType>(StrippedPtrTy->getElementType())){ + // GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ? + if (CATy->getElementType() == XATy->getElementType()) { + // -> GEP [10 x i8]* X, i32 0, ... + // At this point, we know that the cast source type is a pointer + // to an array of the same type as the destination pointer + // array. Because the array type is never stepped over (there + // is a leading zero) we can fold the cast into this GEP. + GEP.setOperand(0, StrippedPtr); + return &GEP; + } + } + } + } else if (GEP.getNumOperands() == 2) { + // Transform things like: + // %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V + // into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast + const Type *SrcElTy = StrippedPtrTy->getElementType(); + const Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType(); + if (TD && SrcElTy->isArrayTy() && + TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()) == + TD->getTypeAllocSize(ResElTy)) { + Value *Idx[2]; + Idx[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext())); + Idx[1] = GEP.getOperand(1); + Value *NewGEP = GEP.isInBounds() ? 
+ Builder->CreateInBoundsGEP(StrippedPtr, Idx, Idx + 2, GEP.getName()) : + Builder->CreateGEP(StrippedPtr, Idx, Idx + 2, GEP.getName()); + // V and GEP are both pointer types --> BitCast + return new BitCastInst(NewGEP, GEP.getType()); + } + + // Transform things like: + // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp + // (where tmp = 8*tmp2) into: + // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast + + if (TD && SrcElTy->isArrayTy() && ResElTy->isIntegerTy(8)) { + uint64_t ArrayEltSize = + TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()); + + // Check to see if "tmp" is a scale by a multiple of ArrayEltSize. We + // allow either a mul, shift, or constant here. + Value *NewIdx = 0; + ConstantInt *Scale = 0; + if (ArrayEltSize == 1) { + NewIdx = GEP.getOperand(1); + Scale = ConstantInt::get(cast<IntegerType>(NewIdx->getType()), 1); + } else if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP.getOperand(1))) { + NewIdx = ConstantInt::get(CI->getType(), 1); + Scale = CI; + } else if (Instruction *Inst =dyn_cast<Instruction>(GEP.getOperand(1))){ + if (Inst->getOpcode() == Instruction::Shl && + isa<ConstantInt>(Inst->getOperand(1))) { + ConstantInt *ShAmt = cast<ConstantInt>(Inst->getOperand(1)); + uint32_t ShAmtVal = ShAmt->getLimitedValue(64); + Scale = ConstantInt::get(cast<IntegerType>(Inst->getType()), + 1ULL << ShAmtVal); + NewIdx = Inst->getOperand(0); + } else if (Inst->getOpcode() == Instruction::Mul && + isa<ConstantInt>(Inst->getOperand(1))) { + Scale = cast<ConstantInt>(Inst->getOperand(1)); + NewIdx = Inst->getOperand(0); + } + } + + // If the index will be to exactly the right offset with the scale taken + // out, perform the transformation. Note, we don't know whether Scale is + // signed or not. We'll use unsigned version of division/modulo + // operation after making sure Scale doesn't have the sign bit set. 
+ if (ArrayEltSize && Scale && Scale->getSExtValue() >= 0LL && + Scale->getZExtValue() % ArrayEltSize == 0) { + Scale = ConstantInt::get(Scale->getType(), + Scale->getZExtValue() / ArrayEltSize); + if (Scale->getZExtValue() != 1) { + Constant *C = ConstantExpr::getIntegerCast(Scale, NewIdx->getType(), + false /*ZExt*/); + NewIdx = Builder->CreateMul(NewIdx, C, "idxscale"); + } + + // Insert the new GEP instruction. + Value *Idx[2]; + Idx[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext())); + Idx[1] = NewIdx; + Value *NewGEP = GEP.isInBounds() ? + Builder->CreateInBoundsGEP(StrippedPtr, Idx, Idx + 2,GEP.getName()): + Builder->CreateGEP(StrippedPtr, Idx, Idx + 2, GEP.getName()); + // The NewGEP must be pointer typed, so must the old one -> BitCast + return new BitCastInst(NewGEP, GEP.getType()); + } + } + } + } + + /// See if we can simplify: + /// X = bitcast A* to B* + /// Y = gep X, <...constant indices...> + /// into a gep of the original struct. This is important for SROA and alias + /// analysis of unions. If "A" is also a bitcast, wait for A/X to be merged. + if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) { + if (TD && + !isa<BitCastInst>(BCI->getOperand(0)) && GEP.hasAllConstantIndices()) { + // Determine how much the GEP moves the pointer. We are guaranteed to get + // a constant back from EmitGEPOffset. + ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(&GEP)); + int64_t Offset = OffsetV->getSExtValue(); + + // If this GEP instruction doesn't move the pointer, just replace the GEP + // with a bitcast of the real input to the dest type. + if (Offset == 0) { + // If the bitcast is of an allocation, and the allocation will be + // converted to match the type of the cast, don't touch this. + if (isa<AllocaInst>(BCI->getOperand(0)) || + isMalloc(BCI->getOperand(0))) { + // See if the bitcast simplifies, if so, don't nuke this GEP yet. 
+ if (Instruction *I = visitBitCast(*BCI)) { + if (I != BCI) { + I->takeName(BCI); + BCI->getParent()->getInstList().insert(BCI, I); + ReplaceInstUsesWith(*BCI, I); + } + return &GEP; + } + } + return new BitCastInst(BCI->getOperand(0), GEP.getType()); + } + + // Otherwise, if the offset is non-zero, we need to find out if there is a + // field at Offset in 'A's type. If so, we can pull the cast through the + // GEP. + SmallVector<Value*, 8> NewIndices; + const Type *InTy = + cast<PointerType>(BCI->getOperand(0)->getType())->getElementType(); + if (FindElementAtOffset(InTy, Offset, NewIndices)) { + Value *NGEP = GEP.isInBounds() ? + Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices.begin(), + NewIndices.end()) : + Builder->CreateGEP(BCI->getOperand(0), NewIndices.begin(), + NewIndices.end()); + + if (NGEP->getType() == GEP.getType()) + return ReplaceInstUsesWith(GEP, NGEP); + NGEP->takeName(&GEP); + return new BitCastInst(NGEP, GEP.getType()); + } + } + } + + return 0; +} + + + +static bool IsOnlyNullComparedAndFreed(const Value &V) { + for (Value::const_use_iterator UI = V.use_begin(), UE = V.use_end(); + UI != UE; ++UI) { + const User *U = *UI; + if (isFreeCall(U)) + continue; + if (const ICmpInst *ICI = dyn_cast<ICmpInst>(U)) + if (ICI->isEquality() && isa<ConstantPointerNull>(ICI->getOperand(1))) + continue; + return false; + } + return true; +} + +Instruction *InstCombiner::visitMalloc(Instruction &MI) { + // If we have a malloc call which is only used in any amount of comparisons + // to null and free calls, delete the calls and replace the comparisons with + // true or false as appropriate. + if (IsOnlyNullComparedAndFreed(MI)) { + for (Value::use_iterator UI = MI.use_begin(), UE = MI.use_end(); + UI != UE;) { + // We can assume that every remaining use is a free call or an icmp eq/ne + // to null, so the cast is safe. + Instruction *I = cast<Instruction>(*UI); + + // Early increment here, as we're about to get rid of the user. 
+      ++UI;
+
+      if (isFreeCall(I)) {
+        EraseInstFromFunction(*cast<CallInst>(I));
+        continue;
+      }
+      // Again, the cast is safe.
+      ICmpInst *C = cast<ICmpInst>(I);
+      // The compare folds to its value for unequal operands: the dying
+      // allocation is treated as non-null, so "== null" becomes false and
+      // "!= null" becomes true.
+      ReplaceInstUsesWith(*C, ConstantInt::get(Type::getInt1Ty(C->getContext()),
+                                               C->isFalseWhenEqual()));
+      EraseInstFromFunction(*C);
+    }
+    return EraseInstFromFunction(MI);
+  }
+  return 0;
+}
+
+
+
+/// visitFree - Simplify calls to free().  "free undef" can never be executed,
+/// which we record by inserting a store to an undef address (the CFG cannot
+/// be modified from here); "free null" is a no-op and is simply deleted.
+Instruction *InstCombiner::visitFree(CallInst &FI) {
+  Value *Op = FI.getArgOperand(0);
+
+  // free undef -> unreachable.
+  if (isa<UndefValue>(Op)) {
+    // Insert a new store to null because we cannot modify the CFG here.
+    new StoreInst(ConstantInt::getTrue(FI.getContext()),
+                  UndefValue::get(Type::getInt1PtrTy(FI.getContext())), &FI);
+    return EraseInstFromFunction(FI);
+  }
+
+  // If we have 'free null' delete the instruction.  This can happen in stl
+  // code when lots of inlining happens.
+  if (isa<ConstantPointerNull>(Op))
+    return EraseInstFromFunction(FI);
+
+  return 0;
+}
+
+
+
+/// visitBranchInst - Canonicalize conditional branches: a branch on the
+/// inversion of a value becomes a branch on the value with the successors
+/// swapped, and single-use fcmp/icmp conditions with non-canonical predicates
+/// are inverted (again swapping the successors) so later passes see the
+/// canonical comparison form.
+Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
+  // Change br (not X), label True, label False to: br X, label False, True
+  Value *X = 0;
+  BasicBlock *TrueDest;
+  BasicBlock *FalseDest;
+  if (match(&BI, m_Br(m_Not(m_Value(X)), TrueDest, FalseDest)) &&
+      !isa<Constant>(X)) {
+    // Swap Destinations and condition...
+    BI.setCondition(X);
+    BI.setSuccessor(0, FalseDest);
+    BI.setSuccessor(1, TrueDest);
+    return &BI;
+  }
+
+  // Canonicalize fcmp_one -> fcmp_oeq
+  FCmpInst::Predicate FPred; Value *Y;
+  if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)),
+                      TrueDest, FalseDest)) &&
+      BI.getCondition()->hasOneUse())
+    if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE ||
+        FPred == FCmpInst::FCMP_OGE) {
+      FCmpInst *Cond = cast<FCmpInst>(BI.getCondition());
+      Cond->setPredicate(FCmpInst::getInversePredicate(FPred));
+
+      // Swap Destinations and condition.
+      BI.setSuccessor(0, FalseDest);
+      BI.setSuccessor(1, TrueDest);
+      Worklist.Add(Cond);
+      return &BI;
+    }
+
+  // Canonicalize icmp_ne -> icmp_eq
+  ICmpInst::Predicate IPred;
+  if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)),
+                      TrueDest, FalseDest)) &&
+      BI.getCondition()->hasOneUse())
+    if (IPred == ICmpInst::ICMP_NE || IPred == ICmpInst::ICMP_ULE ||
+        IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE ||
+        IPred == ICmpInst::ICMP_SGE) {
+      ICmpInst *Cond = cast<ICmpInst>(BI.getCondition());
+      Cond->setPredicate(ICmpInst::getInversePredicate(IPred));
+      // Swap Destinations and condition.
+      BI.setSuccessor(0, FalseDest);
+      BI.setSuccessor(1, TrueDest);
+      Worklist.Add(Cond);
+      return &BI;
+    }
+
+  return 0;
+}
+
+/// visitSwitchInst - If the switch condition is "X + C" for a constant C,
+/// fold the add away by switching on X directly and subtracting C from every
+/// case value.
+Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
+  Value *Cond = SI.getCondition();
+  if (Instruction *I = dyn_cast<Instruction>(Cond)) {
+    if (I->getOpcode() == Instruction::Add)
+      if (ConstantInt *AddRHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+        // change 'switch (X+4) case 1:' into 'switch (X) case -3'
+        // Operands 2, 4, 6, ... are the case values; rewrite each in place.
+        for (unsigned i = 2, e = SI.getNumOperands(); i != e; i += 2)
+          SI.setOperand(i,
+                   ConstantExpr::getSub(cast<Constant>(SI.getOperand(i)),
+                                        AddRHS));
+        SI.setOperand(0, I->getOperand(0));
+        Worklist.Add(I);
+        return &SI;
+      }
+  }
+  return 0;
+}
+
+/// visitExtractValueInst - Simplify extractvalue instructions.  An extract
+/// with no indices is the aggregate itself; extracts from constant aggregates
+/// (undef, zeroinitializer, array/struct constants) fold to the indexed
+/// element.  Further cases below handle insertvalue chains, multi-result
+/// intrinsics, and single-use loads.
+Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
+  Value *Agg = EV.getAggregateOperand();
+
+  if (!EV.hasIndices())
+    return ReplaceInstUsesWith(EV, Agg);
+
+  if (Constant *C = dyn_cast<Constant>(Agg)) {
+    if (isa<UndefValue>(C))
+      return ReplaceInstUsesWith(EV, UndefValue::get(EV.getType()));
+
+    if (isa<ConstantAggregateZero>(C))
+      return ReplaceInstUsesWith(EV, Constant::getNullValue(EV.getType()));
+
+    if (isa<ConstantArray>(C) || isa<ConstantStruct>(C)) {
+      // Extract the element indexed by the first index out of the constant
+      Value *V = C->getOperand(*EV.idx_begin());
+      if (EV.getNumIndices() > 1)
+        // Extract the remaining indices out of the
constant indexed by the + // first index + return ExtractValueInst::Create(V, EV.idx_begin() + 1, EV.idx_end()); + else + return ReplaceInstUsesWith(EV, V); + } + return 0; // Can't handle other constants + } + if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) { + // We're extracting from an insertvalue instruction, compare the indices + const unsigned *exti, *exte, *insi, *inse; + for (exti = EV.idx_begin(), insi = IV->idx_begin(), + exte = EV.idx_end(), inse = IV->idx_end(); + exti != exte && insi != inse; + ++exti, ++insi) { + if (*insi != *exti) + // The insert and extract both reference distinctly different elements. + // This means the extract is not influenced by the insert, and we can + // replace the aggregate operand of the extract with the aggregate + // operand of the insert. i.e., replace + // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1 + // %E = extractvalue { i32, { i32 } } %I, 0 + // with + // %E = extractvalue { i32, { i32 } } %A, 0 + return ExtractValueInst::Create(IV->getAggregateOperand(), + EV.idx_begin(), EV.idx_end()); + } + if (exti == exte && insi == inse) + // Both iterators are at the end: Index lists are identical. Replace + // %B = insertvalue { i32, { i32 } } %A, i32 42, 1, 0 + // %C = extractvalue { i32, { i32 } } %B, 1, 0 + // with "i32 42" + return ReplaceInstUsesWith(EV, IV->getInsertedValueOperand()); + if (exti == exte) { + // The extract list is a prefix of the insert list. i.e. replace + // %I = insertvalue { i32, { i32 } } %A, i32 42, 1, 0 + // %E = extractvalue { i32, { i32 } } %I, 1 + // with + // %X = extractvalue { i32, { i32 } } %A, 1 + // %E = insertvalue { i32 } %X, i32 42, 0 + // by switching the order of the insert and extract (though the + // insertvalue should be left in, since it may have other uses). 
+ Value *NewEV = Builder->CreateExtractValue(IV->getAggregateOperand(), + EV.idx_begin(), EV.idx_end()); + return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(), + insi, inse); + } + if (insi == inse) + // The insert list is a prefix of the extract list + // We can simply remove the common indices from the extract and make it + // operate on the inserted value instead of the insertvalue result. + // i.e., replace + // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1 + // %E = extractvalue { i32, { i32 } } %I, 1, 0 + // with + // %E extractvalue { i32 } { i32 42 }, 0 + return ExtractValueInst::Create(IV->getInsertedValueOperand(), + exti, exte); + } + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Agg)) { + // We're extracting from an intrinsic, see if we're the only user, which + // allows us to simplify multiple result intrinsics to simpler things that + // just get one value. + if (II->hasOneUse()) { + // Check if we're grabbing the overflow bit or the result of a 'with + // overflow' intrinsic. If it's the latter we can remove the intrinsic + // and replace it with a traditional binary instruction. + switch (II->getIntrinsicID()) { + case Intrinsic::uadd_with_overflow: + case Intrinsic::sadd_with_overflow: + if (*EV.idx_begin() == 0) { // Normal result. + Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); + II->replaceAllUsesWith(UndefValue::get(II->getType())); + EraseInstFromFunction(*II); + return BinaryOperator::CreateAdd(LHS, RHS); + } + + // If the normal result of the add is dead, and the RHS is a constant, + // we can transform this into a range comparison. 
+ // overflow = uadd a, -4 --> overflow = icmp ugt a, 3 + if (II->getIntrinsicID() == Intrinsic::uadd_with_overflow) + if (ConstantInt *CI = dyn_cast<ConstantInt>(II->getArgOperand(1))) + return new ICmpInst(ICmpInst::ICMP_UGT, II->getArgOperand(0), + ConstantExpr::getNot(CI)); + break; + case Intrinsic::usub_with_overflow: + case Intrinsic::ssub_with_overflow: + if (*EV.idx_begin() == 0) { // Normal result. + Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); + II->replaceAllUsesWith(UndefValue::get(II->getType())); + EraseInstFromFunction(*II); + return BinaryOperator::CreateSub(LHS, RHS); + } + break; + case Intrinsic::umul_with_overflow: + case Intrinsic::smul_with_overflow: + if (*EV.idx_begin() == 0) { // Normal result. + Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1); + II->replaceAllUsesWith(UndefValue::get(II->getType())); + EraseInstFromFunction(*II); + return BinaryOperator::CreateMul(LHS, RHS); + } + break; + default: + break; + } + } + } + if (LoadInst *L = dyn_cast<LoadInst>(Agg)) + // If the (non-volatile) load only has one use, we can rewrite this to a + // load from a GEP. This reduces the size of the load. + // FIXME: If a load is used only by extractvalue instructions then this + // could be done regardless of having multiple uses. + if (!L->isVolatile() && L->hasOneUse()) { + // extractvalue has integer indices, getelementptr has Value*s. Convert. + SmallVector<Value*, 4> Indices; + // Prefix an i32 0 since we need the first element. + Indices.push_back(Builder->getInt32(0)); + for (ExtractValueInst::idx_iterator I = EV.idx_begin(), E = EV.idx_end(); + I != E; ++I) + Indices.push_back(Builder->getInt32(*I)); + + // We need to insert these at the location of the old load, not at that of + // the extractvalue. 
+ Builder->SetInsertPoint(L->getParent(), L); + Value *GEP = Builder->CreateInBoundsGEP(L->getPointerOperand(), + Indices.begin(), Indices.end()); + // Returning the load directly will cause the main loop to insert it in + // the wrong spot, so use ReplaceInstUsesWith(). + return ReplaceInstUsesWith(EV, Builder->CreateLoad(GEP)); + } + // We could simplify extracts from other values. Note that nested extracts may + // already be simplified implicitly by the above: extract (extract (insert) ) + // will be translated into extract ( insert ( extract ) ) first and then just + // the value inserted, if appropriate. Similarly for extracts from single-use + // loads: extract (extract (load)) will be translated to extract (load (gep)) + // and if again single-use then via load (gep (gep)) to load (gep). + // However, double extracts from e.g. function arguments or return values + // aren't handled yet. + return 0; +} + + + + +/// TryToSinkInstruction - Try to move the specified instruction from its +/// current block into the beginning of DestBlock, which can only happen if it's +/// safe to move the instruction past all of the instructions between it and the +/// end of its block. +static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { + assert(I->hasOneUse() && "Invariants didn't hold!"); + + // Cannot move control-flow-involving, volatile loads, vaarg, etc. + if (isa<PHINode>(I) || I->mayHaveSideEffects() || isa<TerminatorInst>(I)) + return false; + + // Do not sink alloca instructions out of the entry block. + if (isa<AllocaInst>(I) && I->getParent() == + &DestBlock->getParent()->getEntryBlock()) + return false; + + // We can only sink load instructions if there is nothing between the load and + // the end of block that could change the value. 
+ if (I->mayReadFromMemory()) { + for (BasicBlock::iterator Scan = I, E = I->getParent()->end(); + Scan != E; ++Scan) + if (Scan->mayWriteToMemory()) + return false; + } + + BasicBlock::iterator InsertPos = DestBlock->getFirstNonPHI(); + + I->moveBefore(InsertPos); + ++NumSunkInst; + return true; +} + + +/// AddReachableCodeToWorklist - Walk the function in depth-first order, adding +/// all reachable code to the worklist. +/// +/// This has a couple of tricks to make the code faster and more powerful. In +/// particular, we constant fold and DCE instructions as we go, to avoid adding +/// them to the worklist (this significantly speeds up instcombine on code where +/// many instructions are dead or constant). Additionally, if we find a branch +/// whose condition is a known constant, we only visit the reachable successors. +/// +static bool AddReachableCodeToWorklist(BasicBlock *BB, + SmallPtrSet<BasicBlock*, 64> &Visited, + InstCombiner &IC, + const TargetData *TD) { + bool MadeIRChange = false; + SmallVector<BasicBlock*, 256> Worklist; + Worklist.push_back(BB); + + SmallVector<Instruction*, 128> InstrsForInstCombineWorklist; + SmallPtrSet<ConstantExpr*, 64> FoldedConstants; + + do { + BB = Worklist.pop_back_val(); + + // We have now visited this block! If we've already been here, ignore it. + if (!Visited.insert(BB)) continue; + + for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { + Instruction *Inst = BBI++; + + // DCE instruction if trivially dead. + if (isInstructionTriviallyDead(Inst)) { + ++NumDeadInst; + DEBUG(errs() << "IC: DCE: " << *Inst << '\n'); + Inst->eraseFromParent(); + continue; + } + + // ConstantProp instruction if trivially constant. 
+ if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0))) + if (Constant *C = ConstantFoldInstruction(Inst, TD)) { + DEBUG(errs() << "IC: ConstFold to: " << *C << " from: " + << *Inst << '\n'); + Inst->replaceAllUsesWith(C); + ++NumConstProp; + Inst->eraseFromParent(); + continue; + } + + if (TD) { + // See if we can constant fold its operands. + for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end(); + i != e; ++i) { + ConstantExpr *CE = dyn_cast<ConstantExpr>(i); + if (CE == 0) continue; + + // If we already folded this constant, don't try again. + if (!FoldedConstants.insert(CE)) + continue; + + Constant *NewC = ConstantFoldConstantExpression(CE, TD); + if (NewC && NewC != CE) { + *i = NewC; + MadeIRChange = true; + } + } + } + + InstrsForInstCombineWorklist.push_back(Inst); + } + + // Recursively visit successors. If this is a branch or switch on a + // constant, only visit the reachable successor. + TerminatorInst *TI = BB->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) { + bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue(); + BasicBlock *ReachableBB = BI->getSuccessor(!CondVal); + Worklist.push_back(ReachableBB); + continue; + } + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) { + // See if this is an explicit destination. + for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i) + if (SI->getCaseValue(i) == Cond) { + BasicBlock *ReachableBB = SI->getSuccessor(i); + Worklist.push_back(ReachableBB); + continue; + } + + // Otherwise it is the default destination. 
+ Worklist.push_back(SI->getSuccessor(0)); + continue; + } + } + + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) + Worklist.push_back(TI->getSuccessor(i)); + } while (!Worklist.empty()); + + // Once we've found all of the instructions to add to instcombine's worklist, + // add them in reverse order. This way instcombine will visit from the top + // of the function down. This jives well with the way that it adds all uses + // of instructions to the worklist after doing a transformation, thus avoiding + // some N^2 behavior in pathological cases. + IC.Worklist.AddInitialGroup(&InstrsForInstCombineWorklist[0], + InstrsForInstCombineWorklist.size()); + + return MadeIRChange; +} + +bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { + MadeIRChange = false; + + DEBUG(errs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " + << F.getNameStr() << "\n"); + + { + // Do a depth-first traversal of the function, populate the worklist with + // the reachable instructions. Ignore blocks that are not reachable. Keep + // track of which blocks we visit. + SmallPtrSet<BasicBlock*, 64> Visited; + MadeIRChange |= AddReachableCodeToWorklist(F.begin(), Visited, *this, TD); + + // Do a quick scan over the function. If we find any blocks that are + // unreachable, remove any instructions inside of them. This prevents + // the instcombine code from having to deal with some bad special cases. + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) + if (!Visited.count(BB)) { + Instruction *Term = BB->getTerminator(); + while (Term != BB->begin()) { // Remove instrs bottom-up + BasicBlock::iterator I = Term; --I; + + DEBUG(errs() << "IC: DCE: " << *I << '\n'); + // A debug intrinsic shouldn't force another iteration if we weren't + // going to do one without it. + if (!isa<DbgInfoIntrinsic>(I)) { + ++NumDeadInst; + MadeIRChange = true; + } + + // If I is not void type then replaceAllUsesWith undef. 
+ // This allows ValueHandlers and custom metadata to adjust itself. + if (!I->getType()->isVoidTy()) + I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->eraseFromParent(); + } + } + } + + while (!Worklist.isEmpty()) { + Instruction *I = Worklist.RemoveOne(); + if (I == 0) continue; // skip null values. + + // Check to see if we can DCE the instruction. + if (isInstructionTriviallyDead(I)) { + DEBUG(errs() << "IC: DCE: " << *I << '\n'); + EraseInstFromFunction(*I); + ++NumDeadInst; + MadeIRChange = true; + continue; + } + + // Instruction isn't dead, see if we can constant propagate it. + if (!I->use_empty() && isa<Constant>(I->getOperand(0))) + if (Constant *C = ConstantFoldInstruction(I, TD)) { + DEBUG(errs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n'); + + // Add operands to the worklist. + ReplaceInstUsesWith(*I, C); + ++NumConstProp; + EraseInstFromFunction(*I); + MadeIRChange = true; + continue; + } + + // See if we can trivially sink this instruction to a successor basic block. + if (I->hasOneUse()) { + BasicBlock *BB = I->getParent(); + Instruction *UserInst = cast<Instruction>(I->use_back()); + BasicBlock *UserParent; + + // Get the block the use occurs in. + if (PHINode *PN = dyn_cast<PHINode>(UserInst)) + UserParent = PN->getIncomingBlock(I->use_begin().getUse()); + else + UserParent = UserInst->getParent(); + + if (UserParent != BB) { + bool UserIsSuccessor = false; + // See if the user is one of our successors. + for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) + if (*SI == UserParent) { + UserIsSuccessor = true; + break; + } + + // If the user is one of our immediate successors, and if that successor + // only has us as a predecessors (we'd have to split the critical edge + // otherwise), we can keep going. + if (UserIsSuccessor && UserParent->getSinglePredecessor()) + // Okay, the CFG is simple enough, try to sink this instruction. 
+ MadeIRChange |= TryToSinkInstruction(I, UserParent); + } + } + + // Now that we have an instruction, try combining it to simplify it. + Builder->SetInsertPoint(I->getParent(), I); + +#ifndef NDEBUG + std::string OrigI; +#endif + DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str();); + DEBUG(errs() << "IC: Visiting: " << OrigI << '\n'); + + if (Instruction *Result = visit(*I)) { + ++NumCombined; + // Should we replace the old instruction with a new one? + if (Result != I) { + DEBUG(errs() << "IC: Old = " << *I << '\n' + << " New = " << *Result << '\n'); + + Result->setDebugLoc(I->getDebugLoc()); + // Everything uses the new instruction now. + I->replaceAllUsesWith(Result); + + // Push the new instruction and any users onto the worklist. + Worklist.Add(Result); + Worklist.AddUsersToWorkList(*Result); + + // Move the name to the new instruction first. + Result->takeName(I); + + // Insert the new instruction into the basic block... + BasicBlock *InstParent = I->getParent(); + BasicBlock::iterator InsertPos = I; + + if (!isa<PHINode>(Result)) // If combining a PHI, don't insert + while (isa<PHINode>(InsertPos)) // middle of a block of PHIs. + ++InsertPos; + + InstParent->getInstList().insert(InsertPos, Result); + + EraseInstFromFunction(*I); + } else { +#ifndef NDEBUG + DEBUG(errs() << "IC: Mod = " << OrigI << '\n' + << " New = " << *I << '\n'); +#endif + + // If the instruction was modified, it's possible that it is now dead. + // if so, remove it. + if (isInstructionTriviallyDead(I)) { + EraseInstFromFunction(*I); + } else { + Worklist.Add(I); + Worklist.AddUsersToWorkList(*I); + } + } + MadeIRChange = true; + } + } + + Worklist.Zap(); + return MadeIRChange; +} + + +bool InstCombiner::runOnFunction(Function &F) { + MustPreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + TD = getAnalysisIfAvailable<TargetData>(); + + + /// Builder - This is an IRBuilder that automatically inserts new + /// instructions into the worklist when they are created. 
+ IRBuilder<true, TargetFolder, InstCombineIRInserter> + TheBuilder(F.getContext(), TargetFolder(TD), + InstCombineIRInserter(Worklist)); + Builder = &TheBuilder; + + bool EverMadeChange = false; + + // Iterate while there is work to do. + unsigned Iteration = 0; + while (DoOneIteration(F, Iteration++)) + EverMadeChange = true; + + Builder = 0; + return EverMadeChange; +} + +FunctionPass *llvm::createInstructionCombiningPass() { + return new InstCombiner(); +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/EdgeProfiling.cpp new file mode 100644 index 0000000..1d31fcc --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/EdgeProfiling.cpp @@ -0,0 +1,117 @@ +//===- EdgeProfiling.cpp - Insert counters for edge profiling -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass instruments the specified program with counters for edge profiling. +// Edge profiling can give a reasonable approximation of the hot paths through a +// program, and is used for a wide variety of program transformations. +// +// Note that this implementation is very naive. We insert a counter for *every* +// edge in the program, instead of using control flow information to prune the +// number of counters inserted. 
+// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "insert-edge-profiling" + +#include "ProfilingUtils.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/ADT/Statistic.h" +#include <set> +using namespace llvm; + +STATISTIC(NumEdgesInserted, "The # of edges inserted."); + +namespace { + class EdgeProfiler : public ModulePass { + bool runOnModule(Module &M); + public: + static char ID; // Pass identification, replacement for typeid + EdgeProfiler() : ModulePass(ID) { + initializeEdgeProfilerPass(*PassRegistry::getPassRegistry()); + } + + virtual const char *getPassName() const { + return "Edge Profiler"; + } + }; +} + +char EdgeProfiler::ID = 0; +INITIALIZE_PASS(EdgeProfiler, "insert-edge-profiling", + "Insert instrumentation for edge profiling", false, false) + +ModulePass *llvm::createEdgeProfilerPass() { return new EdgeProfiler(); } + +bool EdgeProfiler::runOnModule(Module &M) { + Function *Main = M.getFunction("main"); + if (Main == 0) { + errs() << "WARNING: cannot insert edge profiling into a module" + << " with no main function!\n"; + return false; // No main, no instrumentation! + } + + std::set<BasicBlock*> BlocksToInstrument; + unsigned NumEdges = 0; + for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { + if (F->isDeclaration()) continue; + // Reserve space for (0,entry) edge. + ++NumEdges; + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + // Keep track of which blocks need to be instrumented. We don't want to + // instrument blocks that are added as the result of breaking critical + // edges! 
+ BlocksToInstrument.insert(BB); + NumEdges += BB->getTerminator()->getNumSuccessors(); + } + } + + const Type *ATy = ArrayType::get(Type::getInt32Ty(M.getContext()), NumEdges); + GlobalVariable *Counters = + new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage, + Constant::getNullValue(ATy), "EdgeProfCounters"); + NumEdgesInserted = NumEdges; + + // Instrument all of the edges... + unsigned i = 0; + for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { + if (F->isDeclaration()) continue; + // Create counter for (0,entry) edge. + IncrementCounterInBlock(&F->getEntryBlock(), i++, Counters); + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) + if (BlocksToInstrument.count(BB)) { // Don't instrument inserted blocks + // Okay, we have to add a counter of each outgoing edge. If the + // outgoing edge is not critical don't split it, just insert the counter + // in the source or destination of the edge. + TerminatorInst *TI = BB->getTerminator(); + for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) { + // If the edge is critical, split it. + SplitCriticalEdge(TI, s, this); + + // Okay, we are guaranteed that the edge is no longer critical. If we + // only have a single successor, insert the counter in this block, + // otherwise insert it in the successor block. + if (TI->getNumSuccessors() == 1) { + // Insert counter at the start of the block + IncrementCounterInBlock(BB, i++, Counters, false); + } else { + // Insert counter at the start of the block + IncrementCounterInBlock(TI->getSuccessor(s), i++, Counters); + } + } + } + } + + // Add the initialization call to main. 
+ InsertProfilingInitCall(Main, "llvm_start_edge_profiling", Counters); + return true; +} + diff --git a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp new file mode 100644 index 0000000..96ed4fa --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -0,0 +1,32 @@ +//===-- Instrumentation.cpp - TransformUtils Infrastructure ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the common initialization infrastructure for the +// Instrumentation library. +// +//===----------------------------------------------------------------------===// + +#include "llvm/InitializePasses.h" +#include "llvm-c/Initialization.h" + +using namespace llvm; + +/// initializeInstrumentation - Initialize all passes in the TransformUtils +/// library. +void llvm::initializeInstrumentation(PassRegistry &Registry) { + initializeEdgeProfilerPass(Registry); + initializeOptimalEdgeProfilerPass(Registry); + initializePathProfilerPass(Registry); +} + +/// LLVMInitializeInstrumentation - C binding for +/// initializeInstrumentation. +void LLVMInitializeInstrumentation(LLVMPassRegistryRef R) { + initializeInstrumentation(*unwrap(R)); +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h b/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h new file mode 100644 index 0000000..829da6b --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h @@ -0,0 +1,108 @@ +//===- llvm/Analysis/MaximumSpanningTree.h - Interface ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This module privides means for calculating a maximum spanning tree for a +// given set of weighted edges. The type parameter T is the type of a node. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H +#define LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H + +#include "llvm/BasicBlock.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include <vector> +#include <algorithm> + +namespace llvm { + + /// MaximumSpanningTree - A MST implementation. + /// The type parameter T determines the type of the nodes of the graph. + template <typename T> + class MaximumSpanningTree { + + // A comparing class for comparing weighted edges. + template <typename CT> + struct EdgeWeightCompare { + bool operator()(typename MaximumSpanningTree<CT>::EdgeWeight X, + typename MaximumSpanningTree<CT>::EdgeWeight Y) const { + if (X.second > Y.second) return true; + if (X.second < Y.second) return false; + if (const BasicBlock *BBX = dyn_cast<BasicBlock>(X.first.first)) { + if (const BasicBlock *BBY = dyn_cast<BasicBlock>(Y.first.first)) { + if (BBX->size() > BBY->size()) return true; + if (BBX->size() < BBY->size()) return false; + } + } + if (const BasicBlock *BBX = dyn_cast<BasicBlock>(X.first.second)) { + if (const BasicBlock *BBY = dyn_cast<BasicBlock>(Y.first.second)) { + if (BBX->size() > BBY->size()) return true; + if (BBX->size() < BBY->size()) return false; + } + } + return false; + } + }; + + public: + typedef std::pair<const T*, const T*> Edge; + typedef std::pair<Edge, double> EdgeWeight; + typedef std::vector<EdgeWeight> EdgeWeights; + protected: + typedef std::vector<Edge> MaxSpanTree; + + MaxSpanTree MST; + + public: + static char ID; // Class identification, replacement for typeinfo + + /// MaximumSpanningTree() - Takes a vector of weighted edges and returns a + /// spanning tree. 
+ MaximumSpanningTree(EdgeWeights &EdgeVector) { + + std::stable_sort(EdgeVector.begin(), EdgeVector.end(), EdgeWeightCompare<T>()); + + // Create spanning tree, Forest contains a special data structure + // that makes checking if two nodes are already in a common (sub-)tree + // fast and cheap. + EquivalenceClasses<const T*> Forest; + for (typename EdgeWeights::iterator EWi = EdgeVector.begin(), + EWe = EdgeVector.end(); EWi != EWe; ++EWi) { + Edge e = (*EWi).first; + + Forest.insert(e.first); + Forest.insert(e.second); + } + + // Iterate over the sorted edges, biggest first. + for (typename EdgeWeights::iterator EWi = EdgeVector.begin(), + EWe = EdgeVector.end(); EWi != EWe; ++EWi) { + Edge e = (*EWi).first; + + if (Forest.findLeader(e.first) != Forest.findLeader(e.second)) { + Forest.unionSets(e.first, e.second); + // So we know now that the edge is not already in a subtree, so we push + // the edge to the MST. + MST.push_back(e); + } + } + } + + typename MaxSpanTree::iterator begin() { + return MST.begin(); + } + + typename MaxSpanTree::iterator end() { + return MST.end(); + } + }; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp new file mode 100644 index 0000000..c85a1a9 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp @@ -0,0 +1,225 @@ +//===- OptimalEdgeProfiling.cpp - Insert counters for opt. edge profiling -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass instruments the specified program with counters for edge profiling. 
+// Edge profiling can give a reasonable approximation of the hot paths through a +// program, and is used for a wide variety of program transformations. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "insert-optimal-edge-profiling" +#include "ProfilingUtils.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ProfileInfo.h" +#include "llvm/Analysis/ProfileInfoLoader.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Statistic.h" +#include "MaximumSpanningTree.h" +#include <set> +using namespace llvm; + +STATISTIC(NumEdgesInserted, "The # of edges inserted."); + +namespace { + class OptimalEdgeProfiler : public ModulePass { + bool runOnModule(Module &M); + public: + static char ID; // Pass identification, replacement for typeid + OptimalEdgeProfiler() : ModulePass(ID) { + initializeOptimalEdgeProfilerPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequiredID(ProfileEstimatorPassID); + AU.addRequired<ProfileInfo>(); + } + + virtual const char *getPassName() const { + return "Optimal Edge Profiler"; + } + }; +} + +char OptimalEdgeProfiler::ID = 0; +INITIALIZE_PASS_BEGIN(OptimalEdgeProfiler, "insert-optimal-edge-profiling", + "Insert optimal instrumentation for edge profiling", + false, false) +INITIALIZE_PASS_DEPENDENCY(ProfileEstimatorPass) +INITIALIZE_AG_DEPENDENCY(ProfileInfo) +INITIALIZE_PASS_END(OptimalEdgeProfiler, "insert-optimal-edge-profiling", + "Insert optimal instrumentation for edge profiling", + false, false) + +ModulePass *llvm::createOptimalEdgeProfilerPass() { + return new OptimalEdgeProfiler(); +} + +inline static void printEdgeCounter(ProfileInfo::Edge e, + BasicBlock* b, + unsigned i) { + DEBUG(dbgs() << 
"--Edge Counter for " << (e) << " in " \ + << ((b)?(b)->getNameStr():"0") << " (# " << (i) << ")\n"); +} + +bool OptimalEdgeProfiler::runOnModule(Module &M) { + Function *Main = M.getFunction("main"); + if (Main == 0) { + errs() << "WARNING: cannot insert edge profiling into a module" + << " with no main function!\n"; + return false; // No main, no instrumentation! + } + + // NumEdges counts all the edges that may be instrumented. Later on its + // decided which edges to actually instrument, to achieve optimal profiling. + // For the entry block a virtual edge (0,entry) is reserved, for each block + // with no successors an edge (BB,0) is reserved. These edges are necessary + // to calculate a truly optimal maximum spanning tree and thus an optimal + // instrumentation. + unsigned NumEdges = 0; + + for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { + if (F->isDeclaration()) continue; + // Reserve space for (0,entry) edge. + ++NumEdges; + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + // Keep track of which blocks need to be instrumented. We don't want to + // instrument blocks that are added as the result of breaking critical + // edges! + if (BB->getTerminator()->getNumSuccessors() == 0) { + // Reserve space for (BB,0) edge. + ++NumEdges; + } else { + NumEdges += BB->getTerminator()->getNumSuccessors(); + } + } + } + + // In the profiling output a counter for each edge is reserved, but only few + // are used. This is done to be able to read back in the profile without + // calulating the maximum spanning tree again, instead each edge counter that + // is not used is initialised with -1 to signal that this edge counter has to + // be calculated from other edge counters on reading the profile info back + // in. 
+ + const Type *Int32 = Type::getInt32Ty(M.getContext()); + const ArrayType *ATy = ArrayType::get(Int32, NumEdges); + GlobalVariable *Counters = + new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage, + Constant::getNullValue(ATy), "OptEdgeProfCounters"); + NumEdgesInserted = 0; + + std::vector<Constant*> Initializer(NumEdges); + Constant* Zero = ConstantInt::get(Int32, 0); + Constant* Uncounted = ConstantInt::get(Int32, ProfileInfoLoader::Uncounted); + + // Instrument all of the edges not in MST... + unsigned i = 0; + for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { + if (F->isDeclaration()) continue; + DEBUG(dbgs()<<"Working on "<<F->getNameStr()<<"\n"); + + // Calculate a Maximum Spanning Tree with the edge weights determined by + // ProfileEstimator. ProfileEstimator also assign weights to the virtual + // edges (0,entry) and (BB,0) (for blocks with no successors) and this + // edges also participate in the maximum spanning tree calculation. + // The third parameter of MaximumSpanningTree() has the effect that not the + // actual MST is returned but the edges _not_ in the MST. + + ProfileInfo::EdgeWeights ECs = + getAnalysis<ProfileInfo>(*F).getEdgeWeights(F); + std::vector<ProfileInfo::EdgeWeight> EdgeVector(ECs.begin(), ECs.end()); + MaximumSpanningTree<BasicBlock> MST (EdgeVector); + std::stable_sort(MST.begin(),MST.end()); + + // Check if (0,entry) not in the MST. If not, instrument edge + // (IncrementCounterInBlock()) and set the counter initially to zero, if + // the edge is in the MST the counter is initialised to -1. 
+ + BasicBlock *entry = &(F->getEntryBlock()); + ProfileInfo::Edge edge = ProfileInfo::getEdge(0,entry); + if (!std::binary_search(MST.begin(), MST.end(), edge)) { + printEdgeCounter(edge,entry,i); + IncrementCounterInBlock(entry, i, Counters); ++NumEdgesInserted; + Initializer[i++] = (Zero); + } else{ + Initializer[i++] = (Uncounted); + } + + // InsertedBlocks contains all blocks that were inserted for splitting an + // edge, this blocks do not have to be instrumented. + DenseSet<BasicBlock*> InsertedBlocks; + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + // Check if block was not inserted and thus does not have to be + // instrumented. + if (InsertedBlocks.count(BB)) continue; + + // Okay, we have to add a counter of each outgoing edge not in MST. If + // the outgoing edge is not critical don't split it, just insert the + // counter in the source or destination of the edge. Also, if the block + // has no successors, the virtual edge (BB,0) is processed. + TerminatorInst *TI = BB->getTerminator(); + if (TI->getNumSuccessors() == 0) { + ProfileInfo::Edge edge = ProfileInfo::getEdge(BB,0); + if (!std::binary_search(MST.begin(), MST.end(), edge)) { + printEdgeCounter(edge,BB,i); + IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted; + Initializer[i++] = (Zero); + } else{ + Initializer[i++] = (Uncounted); + } + } + for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) { + BasicBlock *Succ = TI->getSuccessor(s); + ProfileInfo::Edge edge = ProfileInfo::getEdge(BB,Succ); + if (!std::binary_search(MST.begin(), MST.end(), edge)) { + + // If the edge is critical, split it. + bool wasInserted = SplitCriticalEdge(TI, s, this); + Succ = TI->getSuccessor(s); + if (wasInserted) + InsertedBlocks.insert(Succ); + + // Okay, we are guaranteed that the edge is no longer critical. If + // we only have a single successor, insert the counter in this block, + // otherwise insert it in the successor block. 
+ if (TI->getNumSuccessors() == 1) { + // Insert counter at the start of the block + printEdgeCounter(edge,BB,i); + IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted; + } else { + // Insert counter at the start of the block + printEdgeCounter(edge,Succ,i); + IncrementCounterInBlock(Succ, i, Counters); ++NumEdgesInserted; + } + Initializer[i++] = (Zero); + } else { + Initializer[i++] = (Uncounted); + } + } + } + } + + // Check if the number of edges counted at first was the number of edges we + // considered for instrumentation. + assert(i==NumEdges && "the number of edges in counting array is wrong"); + + // Assing the now completely defined initialiser to the array. + Constant *init = ConstantArray::get(ATy, Initializer); + Counters->setInitializer(init); + + // Add the initialization call to main. + InsertProfilingInitCall(Main, "llvm_start_opt_edge_profiling", Counters); + return true; +} + diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PathProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PathProfiling.cpp new file mode 100644 index 0000000..6449b39 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/PathProfiling.cpp @@ -0,0 +1,1423 @@ +//===- PathProfiling.cpp - Inserts counters for path profiling ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass instruments functions for Ball-Larus path profiling. Ball-Larus +// profiling converts the CFG into a DAG by replacing backedges with edges +// from entry to the start block and from the end block to exit. The paths +// along the new DAG are enumrated, i.e. each path is given a path number. 
+// Edges are instrumented to increment the path number register, such that the
+// path number register will equal the path number of the path taken at the
+// exit.
+//
+// This file defines classes for building a CFG for use with different stages
+// in the Ball-Larus path profiling instrumentation [Ball96]. The
+// requirements are formatting the llvm CFG into the Ball-Larus DAG, path
+// numbering, finding a spanning tree, moving increments from the spanning
+// tree to chords.
+//
+// Terms:
+// DAG - Directed Acyclic Graph.
+// Ball-Larus DAG - A CFG with an entry node, an exit node, and backedges
+// removed in the following manner. For every backedge
+// v->w, insert edge ENTRY->w and edge v->EXIT.
+// Path Number - The number corresponding to a specific path through a
+// Ball-Larus DAG.
+// Spanning Tree - A subgraph, S, is a spanning tree if S covers all
+// vertices and is a tree.
+// Chord - An edge not in the spanning tree.
+//
+// [Ball96]
+// T. Ball and J. R. Larus. "Efficient Path Profiling."
+// International Symposium on Microarchitecture, pages 46-57, 1996.
+// http://portal.acm.org/citation.cfm?id=243857
+//
+// [Ball94]
+// Thomas Ball. "Efficiently Counting Program Events with Support for
+// On-line queries."
+// ACM Transactions on Programming Languages and Systems, Vol 16, No 5,
+// September 1994, Pages 1399-1410. 
+//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "insert-path-profiling" + +#include "llvm/DerivedTypes.h" +#include "ProfilingUtils.h" +#include "llvm/Analysis/PathNumbering.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/InstrTypes.h" +#include "llvm/Instructions.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/TypeBuilder.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Instrumentation.h" +#include <map> +#include <vector> + +#define HASH_THRESHHOLD 100000 + +using namespace llvm; + +namespace { +class BLInstrumentationNode; +class BLInstrumentationEdge; +class BLInstrumentationDag; + +// --------------------------------------------------------------------------- +// BLInstrumentationNode extends BallLarusNode with member used by the +// instrumentation algortihms. +// --------------------------------------------------------------------------- +class BLInstrumentationNode : public BallLarusNode { +public: + // Creates a new BLInstrumentationNode from a BasicBlock. + BLInstrumentationNode(BasicBlock* BB); + + // Get/sets the Value corresponding to the pathNumber register, + // constant or phinode. Used by the instrumentation code to remember + // path number Values. + Value* getStartingPathNumber(); + void setStartingPathNumber(Value* pathNumber); + + Value* getEndingPathNumber(); + void setEndingPathNumber(Value* pathNumber); + + // Get/set the PHINode Instruction for this node. + PHINode* getPathPHI(); + void setPathPHI(PHINode* pathPHI); + +private: + + Value* _startingPathNumber; // The Value for the current pathNumber. + Value* _endingPathNumber; // The Value for the current pathNumber. 
+ PHINode* _pathPHI; // The PHINode for current pathNumber. +}; + +// -------------------------------------------------------------------------- +// BLInstrumentationEdge extends BallLarusEdge with data about the +// instrumentation that will end up on each edge. +// -------------------------------------------------------------------------- +class BLInstrumentationEdge : public BallLarusEdge { +public: + BLInstrumentationEdge(BLInstrumentationNode* source, + BLInstrumentationNode* target); + + // Sets the target node of this edge. Required to split edges. + void setTarget(BallLarusNode* node); + + // Get/set whether edge is in the spanning tree. + bool isInSpanningTree() const; + void setIsInSpanningTree(bool isInSpanningTree); + + // Get/ set whether this edge will be instrumented with a path number + // initialization. + bool isInitialization() const; + void setIsInitialization(bool isInitialization); + + // Get/set whether this edge will be instrumented with a path counter + // increment. Notice this is incrementing the path counter + // corresponding to the path number register. The path number + // increment is determined by getIncrement(). + bool isCounterIncrement() const; + void setIsCounterIncrement(bool isCounterIncrement); + + // Get/set the path number increment that this edge will be instrumented + // with. This is distinct from the path counter increment and the + // weight. The counter increment counts the number of executions of + // some path, whereas the path number keeps track of which path number + // the program is on. + long getIncrement() const; + void setIncrement(long increment); + + // Get/set whether the edge has been instrumented. + bool hasInstrumentation(); + void setHasInstrumentation(bool hasInstrumentation); + + // Returns the successor number of this edge in the source. + unsigned getSuccessorNumber(); + +private: + // The increment that the code will be instrumented with. 
+ long long _increment; + + // Whether this edge is in the spanning tree. + bool _isInSpanningTree; + + // Whether this edge is an initialiation of the path number. + bool _isInitialization; + + // Whether this edge is a path counter increment. + bool _isCounterIncrement; + + // Whether this edge has been instrumented. + bool _hasInstrumentation; +}; + +// --------------------------------------------------------------------------- +// BLInstrumentationDag extends BallLarusDag with algorithms that +// determine where instrumentation should be placed. +// --------------------------------------------------------------------------- +class BLInstrumentationDag : public BallLarusDag { +public: + BLInstrumentationDag(Function &F); + + // Returns the Exit->Root edge. This edge is required for creating + // directed cycles in the algorithm for moving instrumentation off of + // the spanning tree + BallLarusEdge* getExitRootEdge(); + + // Returns an array of phony edges which mark those nodes + // with function calls + BLEdgeVector getCallPhonyEdges(); + + // Gets/sets the path counter array + GlobalVariable* getCounterArray(); + void setCounterArray(GlobalVariable* c); + + // Calculates the increments for the chords, thereby removing + // instrumentation from the spanning tree edges. Implementation is based + // on the algorithm in Figure 4 of [Ball94] + void calculateChordIncrements(); + + // Updates the state when an edge has been split + void splitUpdate(BLInstrumentationEdge* formerEdge, BasicBlock* newBlock); + + // Calculates a spanning tree of the DAG ignoring cycles. Whichever + // edges are in the spanning tree will not be instrumented, but this + // implementation does not try to minimize the instrumentation overhead + // by trying to find hot edges. + void calculateSpanningTree(); + + // Pushes initialization further down in order to group the first + // increment and initialization. 
  void pushInitialization();

  // Pushes the path counter increments up in order to group the last path
  // number increment.
  void pushCounters();

  // Removes phony edges from the successor list of the source, and the
  // predecessor list of the target.
  void unlinkPhony();

  // Generate dot graph for the function
  void generateDotGraph();

protected:
  // BLInstrumentationDag creates BLInstrumentationNode objects in this
  // method overriding the creation of BallLarusNode objects.
  //
  // Allows subclasses to determine which type of Node is created.
  // Override this method to produce subclasses of BallLarusNode if
  // necessary.
  virtual BallLarusNode* createNode(BasicBlock* BB);

  // BLInstrumentationDag create BLInstrumentationEdges.
  //
  // Allows subclasses to determine which type of Edge is created.
  // Override this method to produce subclasses of BallLarusEdge if
  // necessary.  Parameters source and target will have been created by
  // createNode and can be cast to the subclass of BallLarusNode*
  // returned by createNode.
  virtual BallLarusEdge* createEdge(
    BallLarusNode* source, BallLarusNode* target, unsigned edgeNumber);

private:
  BLEdgeVector _treeEdges; // All edges in the spanning tree.
  BLEdgeVector _chordEdges; // All edges not in the spanning tree.
  GlobalVariable* _counterArray; // Array to store path counters

  // Removes the edge from the appropriate predecessor and successor lists.
  void unlinkEdge(BallLarusEdge* edge);

  // Makes an edge part of the spanning tree.
  void makeEdgeSpanning(BLInstrumentationEdge* edge);

  // Pushes initialization and calls itself recursively.
  void pushInitializationFromEdge(BLInstrumentationEdge* edge);

  // Pushes path counter increments up recursively.
  void pushCountersFromEdge(BLInstrumentationEdge* edge);

  // Depth first algorithm for determining the chord increments.
  void calculateChordIncrementsDfs(
    long weight, BallLarusNode* v, BallLarusEdge* e);

  // Determines the relative direction of two edges.
  int calculateChordIncrementsDir(BallLarusEdge* e, BallLarusEdge* f);
};

// ---------------------------------------------------------------------------
// PathProfiler is a module pass which instruments path profiling instructions
// ---------------------------------------------------------------------------
class PathProfiler : public ModulePass {
private:
  // Current context for multi threading support.
  LLVMContext* Context;

  // Which function are we currently instrumenting
  unsigned currentFunctionNumber;

  // The function prototype in the profiling runtime for incrementing a
  // single path counter in a hash table.
  Constant* llvmIncrementHashFunction;
  Constant* llvmDecrementHashFunction;

  // Instruments each function with path profiling.  'main' is instrumented
  // with code to save the profile to disk.
  bool runOnModule(Module &M);

  // Analyzes the function for Ball-Larus path profiling, and inserts code.
  void runOnFunction(std::vector<Constant*> &ftInit, Function &F, Module &M);

  // Creates an increment constant representing incr.
  ConstantInt* createIncrementConstant(long incr, int bitsize);

  // Creates an increment constant representing the value in
  // edge->getIncrement().
  ConstantInt* createIncrementConstant(BLInstrumentationEdge* edge);

  // Finds the insertion point after pathNumber in block.  PathNumber may
  // be NULL.
  BasicBlock::iterator getInsertionPoint(
    BasicBlock* block, Value* pathNumber);

  // Inserts source's pathNumber Value* into target.  Target may or may not
  // have multiple predecessors, and may or may not have its phiNode
  // initialized.
  void pushValueIntoNode(
    BLInstrumentationNode* source, BLInstrumentationNode* target);

  // Inserts source's pathNumber Value* into the appropriate slot of
  // target's phiNode.
  void pushValueIntoPHI(
    BLInstrumentationNode* target, BLInstrumentationNode* source);

  // The Value* in node, oldVal, is updated with a Value* corresponding to
  // oldVal + addition.
  void insertNumberIncrement(BLInstrumentationNode* node, Value* addition,
                             bool atBeginning);

  // Creates a counter increment in the given node.  The Value* in node is
  // taken as the index into a hash table.
  void insertCounterIncrement(
    Value* incValue,
    BasicBlock::iterator insertPoint,
    BLInstrumentationDag* dag,
    bool increment = true);

  // A PHINode is created in the node, and its values initialized to -1U.
  void preparePHI(BLInstrumentationNode* node);

  // Inserts instrumentation for the given edge
  //
  // Pre: The edge's source node has pathNumber set if edge is non zero
  //      path number increment.
  //
  // Post: Edge's target node has a pathNumber set to the path number Value
  //       corresponding to the value of the path register after edge's
  //       execution.
  void insertInstrumentationStartingAt(
    BLInstrumentationEdge* edge,
    BLInstrumentationDag* dag);

  // If this edge is a critical edge, then inserts a node at this edge.
  // This edge becomes the first edge, and a new BallLarusEdge is created.
  bool splitCritical(BLInstrumentationEdge* edge, BLInstrumentationDag* dag);

  // Inserts instrumentation according to the marked edges in dag.  Phony
  // edges must be unlinked from the DAG, but accessible from the
  // backedges.  Dag must have initializations, path number increments, and
  // counter increments present.
  //
  // Counter storage is created here.
  void insertInstrumentation( BLInstrumentationDag& dag, Module &M);

public:
  static char ID; // Pass identification, replacement for typeid
  PathProfiler() : ModulePass(ID) {
    initializePathProfilerPass(*PassRegistry::getPassRegistry());
  }

  virtual const char *getPassName() const {
    return "Path Profiler";
  }
};
} // end anonymous namespace

// Should we print the dot-graphs
static cl::opt<bool> DotPathDag("path-profile-pathdag", cl::Hidden,
        cl::desc("Output the path profiling DAG for each function."));

// Register the path profiler as a pass
char PathProfiler::ID = 0;
INITIALIZE_PASS(PathProfiler, "insert-path-profiling",
                "Insert instrumentation for Ball-Larus path profiling",
                false, false)

ModulePass *llvm::createPathProfilerPass() { return new PathProfiler(); }

namespace llvm {
  class PathProfilingFunctionTable {};

  // Type for global array storing references to hashes or arrays
  template<bool xcompile> class TypeBuilder<PathProfilingFunctionTable,
                                            xcompile> {
  public:
    static const StructType *get(LLVMContext& C) {
      return( StructType::get(
                C, TypeBuilder<types::i<32>, xcompile>::get(C), // type
                TypeBuilder<types::i<32>, xcompile>::get(C), // array size
                TypeBuilder<types::i<8>*, xcompile>::get(C), // array/hash ptr
                NULL));
    }
  };

  typedef TypeBuilder<PathProfilingFunctionTable, true>
  ftEntryTypeBuilder;

  // BallLarusEdge << operator overloading
  raw_ostream& operator<<(raw_ostream& os,
                          const BLInstrumentationEdge& edge) {
    os << "[" << edge.getSource()->getName() << " -> "
       << edge.getTarget()->getName() << "] init: "
       << (edge.isInitialization() ? "yes" : "no")
       << " incr:" << edge.getIncrement() << " cinc: "
       << (edge.isCounterIncrement() ? "yes" : "no");
    return(os);
  }
}

// Creates a new BLInstrumentationNode from a BasicBlock.
+BLInstrumentationNode::BLInstrumentationNode(BasicBlock* BB) : + BallLarusNode(BB), + _startingPathNumber(NULL), _endingPathNumber(NULL), _pathPHI(NULL) {} + +// Constructor for BLInstrumentationEdge. +BLInstrumentationEdge::BLInstrumentationEdge(BLInstrumentationNode* source, + BLInstrumentationNode* target) + : BallLarusEdge(source, target, 0), + _increment(0), _isInSpanningTree(false), _isInitialization(false), + _isCounterIncrement(false), _hasInstrumentation(false) {} + +// Sets the target node of this edge. Required to split edges. +void BLInstrumentationEdge::setTarget(BallLarusNode* node) { + _target = node; +} + +// Returns whether this edge is in the spanning tree. +bool BLInstrumentationEdge::isInSpanningTree() const { + return(_isInSpanningTree); +} + +// Sets whether this edge is in the spanning tree. +void BLInstrumentationEdge::setIsInSpanningTree(bool isInSpanningTree) { + _isInSpanningTree = isInSpanningTree; +} + +// Returns whether this edge will be instrumented with a path number +// initialization. +bool BLInstrumentationEdge::isInitialization() const { + return(_isInitialization); +} + +// Sets whether this edge will be instrumented with a path number +// initialization. +void BLInstrumentationEdge::setIsInitialization(bool isInitialization) { + _isInitialization = isInitialization; +} + +// Returns whether this edge will be instrumented with a path counter +// increment. Notice this is incrementing the path counter +// corresponding to the path number register. The path number +// increment is determined by getIncrement(). +bool BLInstrumentationEdge::isCounterIncrement() const { + return(_isCounterIncrement); +} + +// Sets whether this edge will be instrumented with a path counter +// increment. +void BLInstrumentationEdge::setIsCounterIncrement(bool isCounterIncrement) { + _isCounterIncrement = isCounterIncrement; +} + +// Gets the path number increment that this edge will be instrumented +// with. 
This is distinct from the path counter increment and the +// weight. The counter increment is counts the number of executions of +// some path, whereas the path number keeps track of which path number +// the program is on. +long BLInstrumentationEdge::getIncrement() const { + return(_increment); +} + +// Set whether this edge will be instrumented with a path number +// increment. +void BLInstrumentationEdge::setIncrement(long increment) { + _increment = increment; +} + +// True iff the edge has already been instrumented. +bool BLInstrumentationEdge::hasInstrumentation() { + return(_hasInstrumentation); +} + +// Set whether this edge has been instrumented. +void BLInstrumentationEdge::setHasInstrumentation(bool hasInstrumentation) { + _hasInstrumentation = hasInstrumentation; +} + +// Returns the successor number of this edge in the source. +unsigned BLInstrumentationEdge::getSuccessorNumber() { + BallLarusNode* sourceNode = getSource(); + BallLarusNode* targetNode = getTarget(); + BasicBlock* source = sourceNode->getBlock(); + BasicBlock* target = targetNode->getBlock(); + + if(source == NULL || target == NULL) + return(0); + + TerminatorInst* terminator = source->getTerminator(); + + unsigned i; + for(i=0; i < terminator->getNumSuccessors(); i++) { + if(terminator->getSuccessor(i) == target) + break; + } + + return(i); +} + +// BLInstrumentationDag constructor initializes a DAG for the given Function. +BLInstrumentationDag::BLInstrumentationDag(Function &F) : BallLarusDag(F), + _counterArray(0) { +} + +// Returns the Exit->Root edge. 
This edge is required for creating +// directed cycles in the algorithm for moving instrumentation off of +// the spanning tree +BallLarusEdge* BLInstrumentationDag::getExitRootEdge() { + BLEdgeIterator erEdge = getExit()->succBegin(); + return(*erEdge); +} + +BLEdgeVector BLInstrumentationDag::getCallPhonyEdges () { + BLEdgeVector callEdges; + + for( BLEdgeIterator edge = _edges.begin(), end = _edges.end(); + edge != end; edge++ ) { + if( (*edge)->getType() == BallLarusEdge::CALLEDGE_PHONY ) + callEdges.push_back(*edge); + } + + return callEdges; +} + +// Gets the path counter array +GlobalVariable* BLInstrumentationDag::getCounterArray() { + return _counterArray; +} + +void BLInstrumentationDag::setCounterArray(GlobalVariable* c) { + _counterArray = c; +} + +// Calculates the increment for the chords, thereby removing +// instrumentation from the spanning tree edges. Implementation is based on +// the algorithm in Figure 4 of [Ball94] +void BLInstrumentationDag::calculateChordIncrements() { + calculateChordIncrementsDfs(0, getRoot(), NULL); + + BLInstrumentationEdge* chord; + for(BLEdgeIterator chordEdge = _chordEdges.begin(), + end = _chordEdges.end(); chordEdge != end; chordEdge++) { + chord = (BLInstrumentationEdge*) *chordEdge; + chord->setIncrement(chord->getIncrement() + chord->getWeight()); + } +} + +// Updates the state when an edge has been split +void BLInstrumentationDag::splitUpdate(BLInstrumentationEdge* formerEdge, + BasicBlock* newBlock) { + BallLarusNode* oldTarget = formerEdge->getTarget(); + BallLarusNode* newNode = addNode(newBlock); + formerEdge->setTarget(newNode); + newNode->addPredEdge(formerEdge); + + DEBUG(dbgs() << " Edge split: " << *formerEdge << "\n"); + + oldTarget->removePredEdge(formerEdge); + BallLarusEdge* newEdge = addEdge(newNode, oldTarget,0); + + if( formerEdge->getType() == BallLarusEdge::BACKEDGE || + formerEdge->getType() == BallLarusEdge::SPLITEDGE) { + newEdge->setType(formerEdge->getType()); + 
newEdge->setPhonyRoot(formerEdge->getPhonyRoot()); + newEdge->setPhonyExit(formerEdge->getPhonyExit()); + formerEdge->setType(BallLarusEdge::NORMAL); + formerEdge->setPhonyRoot(NULL); + formerEdge->setPhonyExit(NULL); + } +} + +// Calculates a spanning tree of the DAG ignoring cycles. Whichever +// edges are in the spanning tree will not be instrumented, but this +// implementation does not try to minimize the instrumentation overhead +// by trying to find hot edges. +void BLInstrumentationDag::calculateSpanningTree() { + std::stack<BallLarusNode*> dfsStack; + + for(BLNodeIterator nodeIt = _nodes.begin(), end = _nodes.end(); + nodeIt != end; nodeIt++) { + (*nodeIt)->setColor(BallLarusNode::WHITE); + } + + dfsStack.push(getRoot()); + while(dfsStack.size() > 0) { + BallLarusNode* node = dfsStack.top(); + dfsStack.pop(); + + if(node->getColor() == BallLarusNode::WHITE) + continue; + + BallLarusNode* nextNode; + bool forward = true; + BLEdgeIterator succEnd = node->succEnd(); + + node->setColor(BallLarusNode::WHITE); + // first iterate over successors then predecessors + for(BLEdgeIterator edge = node->succBegin(), predEnd = node->predEnd(); + edge != predEnd; edge++) { + if(edge == succEnd) { + edge = node->predBegin(); + forward = false; + } + + // Ignore split edges + if ((*edge)->getType() == BallLarusEdge::SPLITEDGE) + continue; + + nextNode = forward? 
(*edge)->getTarget(): (*edge)->getSource(); + if(nextNode->getColor() != BallLarusNode::WHITE) { + nextNode->setColor(BallLarusNode::WHITE); + makeEdgeSpanning((BLInstrumentationEdge*)(*edge)); + } + } + } + + for(BLEdgeIterator edge = _edges.begin(), end = _edges.end(); + edge != end; edge++) { + BLInstrumentationEdge* instEdge = (BLInstrumentationEdge*) (*edge); + // safe since createEdge is overriden + if(!instEdge->isInSpanningTree() && (*edge)->getType() + != BallLarusEdge::SPLITEDGE) + _chordEdges.push_back(instEdge); + } +} + +// Pushes initialization further down in order to group the first +// increment and initialization. +void BLInstrumentationDag::pushInitialization() { + BLInstrumentationEdge* exitRootEdge = + (BLInstrumentationEdge*) getExitRootEdge(); + exitRootEdge->setIsInitialization(true); + pushInitializationFromEdge(exitRootEdge); +} + +// Pushes the path counter increments up in order to group the last path +// number increment. +void BLInstrumentationDag::pushCounters() { + BLInstrumentationEdge* exitRootEdge = + (BLInstrumentationEdge*) getExitRootEdge(); + exitRootEdge->setIsCounterIncrement(true); + pushCountersFromEdge(exitRootEdge); +} + +// Removes phony edges from the successor list of the source, and the +// predecessor list of the target. +void BLInstrumentationDag::unlinkPhony() { + BallLarusEdge* edge; + + for(BLEdgeIterator next = _edges.begin(), + end = _edges.end(); next != end; next++) { + edge = (*next); + + if( edge->getType() == BallLarusEdge::BACKEDGE_PHONY || + edge->getType() == BallLarusEdge::SPLITEDGE_PHONY || + edge->getType() == BallLarusEdge::CALLEDGE_PHONY ) { + unlinkEdge(edge); + } + } +} + +// Generate a .dot graph to represent the DAG and pathNumbers +void BLInstrumentationDag::generateDotGraph() { + std::string errorInfo; + std::string functionName = getFunction().getNameStr(); + std::string filename = "pathdag." 
+ functionName + ".dot"; + + DEBUG (dbgs() << "Writing '" << filename << "'...\n"); + raw_fd_ostream dotFile(filename.c_str(), errorInfo); + + if (!errorInfo.empty()) { + errs() << "Error opening '" << filename.c_str() <<"' for writing!"; + errs() << "\n"; + return; + } + + dotFile << "digraph " << functionName << " {\n"; + + for( BLEdgeIterator edge = _edges.begin(), end = _edges.end(); + edge != end; edge++) { + std::string sourceName = (*edge)->getSource()->getName(); + std::string targetName = (*edge)->getTarget()->getName(); + + dotFile << "\t\"" << sourceName.c_str() << "\" -> \"" + << targetName.c_str() << "\" "; + + long inc = ((BLInstrumentationEdge*)(*edge))->getIncrement(); + + switch( (*edge)->getType() ) { + case BallLarusEdge::NORMAL: + dotFile << "[label=" << inc << "] [color=black];\n"; + break; + + case BallLarusEdge::BACKEDGE: + dotFile << "[color=cyan];\n"; + break; + + case BallLarusEdge::BACKEDGE_PHONY: + dotFile << "[label=" << inc + << "] [color=blue];\n"; + break; + + case BallLarusEdge::SPLITEDGE: + dotFile << "[color=violet];\n"; + break; + + case BallLarusEdge::SPLITEDGE_PHONY: + dotFile << "[label=" << inc << "] [color=red];\n"; + break; + + case BallLarusEdge::CALLEDGE_PHONY: + dotFile << "[label=" << inc << "] [color=green];\n"; + break; + } + } + + dotFile << "}\n"; +} + +// Allows subclasses to determine which type of Node is created. +// Override this method to produce subclasses of BallLarusNode if +// necessary. The destructor of BallLarusDag will call free on each pointer +// created. +BallLarusNode* BLInstrumentationDag::createNode(BasicBlock* BB) { + return( new BLInstrumentationNode(BB) ); +} + +// Allows subclasses to determine which type of Edge is created. +// Override this method to produce subclasses of BallLarusEdge if +// necessary. The destructor of BallLarusDag will call free on each pointer +// created. 
+BallLarusEdge* BLInstrumentationDag::createEdge(BallLarusNode* source, + BallLarusNode* target, unsigned edgeNumber) { + // One can cast from BallLarusNode to BLInstrumentationNode since createNode + // is overriden to produce BLInstrumentationNode. + return( new BLInstrumentationEdge((BLInstrumentationNode*)source, + (BLInstrumentationNode*)target) ); +} + +// Sets the Value corresponding to the pathNumber register, constant, +// or phinode. Used by the instrumentation code to remember path +// number Values. +Value* BLInstrumentationNode::getStartingPathNumber(){ + return(_startingPathNumber); +} + +// Sets the Value of the pathNumber. Used by the instrumentation code. +void BLInstrumentationNode::setStartingPathNumber(Value* pathNumber) { + DEBUG(dbgs() << " SPN-" << getName() << " <-- " << (pathNumber ? + pathNumber->getNameStr() : "unused") << "\n"); + _startingPathNumber = pathNumber; +} + +Value* BLInstrumentationNode::getEndingPathNumber(){ + return(_endingPathNumber); +} + +void BLInstrumentationNode::setEndingPathNumber(Value* pathNumber) { + DEBUG(dbgs() << " EPN-" << getName() << " <-- " + << (pathNumber ? pathNumber->getNameStr() : "unused") << "\n"); + _endingPathNumber = pathNumber; +} + +// Get the PHINode Instruction for this node. Used by instrumentation +// code. +PHINode* BLInstrumentationNode::getPathPHI() { + return(_pathPHI); +} + +// Set the PHINode Instruction for this node. Used by instrumentation +// code. +void BLInstrumentationNode::setPathPHI(PHINode* pathPHI) { + _pathPHI = pathPHI; +} + +// Removes the edge from the appropriate predecessor and successor +// lists. +void BLInstrumentationDag::unlinkEdge(BallLarusEdge* edge) { + if(edge == getExitRootEdge()) + DEBUG(dbgs() << " Removing exit->root edge\n"); + + edge->getSource()->removeSuccEdge(edge); + edge->getTarget()->removePredEdge(edge); +} + +// Makes an edge part of the spanning tree. 
+void BLInstrumentationDag::makeEdgeSpanning(BLInstrumentationEdge* edge) { + edge->setIsInSpanningTree(true); + _treeEdges.push_back(edge); +} + +// Pushes initialization and calls itself recursively. +void BLInstrumentationDag::pushInitializationFromEdge( + BLInstrumentationEdge* edge) { + BallLarusNode* target; + + target = edge->getTarget(); + if( target->getNumberPredEdges() > 1 || target == getExit() ) { + return; + } else { + for(BLEdgeIterator next = target->succBegin(), + end = target->succEnd(); next != end; next++) { + BLInstrumentationEdge* intoEdge = (BLInstrumentationEdge*) *next; + + // Skip split edges + if (intoEdge->getType() == BallLarusEdge::SPLITEDGE) + continue; + + intoEdge->setIncrement(intoEdge->getIncrement() + + edge->getIncrement()); + intoEdge->setIsInitialization(true); + pushInitializationFromEdge(intoEdge); + } + + edge->setIncrement(0); + edge->setIsInitialization(false); + } +} + +// Pushes path counter increments up recursively. +void BLInstrumentationDag::pushCountersFromEdge(BLInstrumentationEdge* edge) { + BallLarusNode* source; + + source = edge->getSource(); + if(source->getNumberSuccEdges() > 1 || source == getRoot() + || edge->isInitialization()) { + return; + } else { + for(BLEdgeIterator previous = source->predBegin(), + end = source->predEnd(); previous != end; previous++) { + BLInstrumentationEdge* fromEdge = (BLInstrumentationEdge*) *previous; + + // Skip split edges + if (fromEdge->getType() == BallLarusEdge::SPLITEDGE) + continue; + + fromEdge->setIncrement(fromEdge->getIncrement() + + edge->getIncrement()); + fromEdge->setIsCounterIncrement(true); + pushCountersFromEdge(fromEdge); + } + + edge->setIncrement(0); + edge->setIsCounterIncrement(false); + } +} + +// Depth first algorithm for determining the chord increments. 
+void BLInstrumentationDag::calculateChordIncrementsDfs(long weight, + BallLarusNode* v, BallLarusEdge* e) { + BLInstrumentationEdge* f; + + for(BLEdgeIterator treeEdge = _treeEdges.begin(), + end = _treeEdges.end(); treeEdge != end; treeEdge++) { + f = (BLInstrumentationEdge*) *treeEdge; + if(e != f && v == f->getTarget()) { + calculateChordIncrementsDfs( + calculateChordIncrementsDir(e,f)*(weight) + + f->getWeight(), f->getSource(), f); + } + if(e != f && v == f->getSource()) { + calculateChordIncrementsDfs( + calculateChordIncrementsDir(e,f)*(weight) + + f->getWeight(), f->getTarget(), f); + } + } + + for(BLEdgeIterator chordEdge = _chordEdges.begin(), + end = _chordEdges.end(); chordEdge != end; chordEdge++) { + f = (BLInstrumentationEdge*) *chordEdge; + if(v == f->getSource() || v == f->getTarget()) { + f->setIncrement(f->getIncrement() + + calculateChordIncrementsDir(e,f)*weight); + } + } +} + +// Determines the relative direction of two edges. +int BLInstrumentationDag::calculateChordIncrementsDir(BallLarusEdge* e, + BallLarusEdge* f) { + if( e == NULL) + return(1); + else if(e->getSource() == f->getTarget() + || e->getTarget() == f->getSource()) + return(1); + + return(-1); +} + +// Creates an increment constant representing incr. +ConstantInt* PathProfiler::createIncrementConstant(long incr, + int bitsize) { + return(ConstantInt::get(IntegerType::get(*Context, 32), incr)); +} + +// Creates an increment constant representing the value in +// edge->getIncrement(). +ConstantInt* PathProfiler::createIncrementConstant( + BLInstrumentationEdge* edge) { + return(createIncrementConstant(edge->getIncrement(), 32)); +} + +// Finds the insertion point after pathNumber in block. PathNumber may +// be NULL. 
// Finds the point in 'block' directly after the instruction that defines
// pathNumber.  Falls back to the first non-PHI instruction when pathNumber
// is NULL, is a constant, or is defined in a different block.
BasicBlock::iterator PathProfiler::getInsertionPoint(BasicBlock* block, Value*
                                                     pathNumber) {
  if(pathNumber == NULL || isa<ConstantInt>(pathNumber)
     || (((Instruction*)(pathNumber))->getParent()) != block) {
    return(block->getFirstNonPHI());
  } else {
    Instruction* pathNumberInst = (Instruction*) (pathNumber);
    BasicBlock::iterator insertPoint;
    BasicBlock::iterator end = block->end();

    // Linear scan for the defining instruction; insert just after it.
    for(insertPoint = block->begin();
        insertPoint != end; insertPoint++) {
      Instruction* insertInst = &(*insertPoint);

      if(insertInst == pathNumberInst)
        return(++insertPoint);
    }

    // Not found (shouldn't happen since getParent() == block): append.
    return(insertPoint);
  }
}

// A PHINode is created in the node, and its values initialized to -1U.
// The phi becomes both the starting and ending path number of the node;
// real incoming values are filled in later by pushValueIntoPHI.
void PathProfiler::preparePHI(BLInstrumentationNode* node) {
  BasicBlock* block = node->getBlock();
  BasicBlock::iterator insertPoint = block->getFirstNonPHI();
  PHINode* phi = PHINode::Create(Type::getInt32Ty(*Context), "pathNumber",
                                 insertPoint );
  node->setPathPHI(phi);
  node->setStartingPathNumber(phi);
  node->setEndingPathNumber(phi);

  // Seed every predecessor slot with -1 (0xffffffff) as a placeholder.
  for(pred_iterator predIt = pred_begin(node->getBlock()),
        end = pred_end(node->getBlock()); predIt != end; predIt++) {
    BasicBlock* pred = (*predIt);

    if(pred != NULL)
      phi->addIncoming(createIncrementConstant((long)-1, 32), pred);
  }
}

// Inserts source's pathNumber Value* into target.  Target may or may not
// have multiple predecessors, and may or may not have its phiNode
// initialized.
void PathProfiler::pushValueIntoNode(BLInstrumentationNode* source,
                                     BLInstrumentationNode* target) {
  // Phony nodes have no block; nothing to push into.
  if(target->getBlock() == NULL)
    return;


  if(target->getNumberPredEdges() <= 1) {
    // Single predecessor: the value flows straight through.
    assert(target->getStartingPathNumber() == NULL &&
           "Target already has path number");
    target->setStartingPathNumber(source->getEndingPathNumber());
    target->setEndingPathNumber(source->getEndingPathNumber());
    DEBUG(dbgs() << " Passing path number"
          << (source->getEndingPathNumber() ? "" : " (null)")
          << " value through.\n");
  } else {
    // Multiple predecessors: merge the value through a phi, creating it
    // lazily on first use.
    if(target->getPathPHI() == NULL) {
      DEBUG(dbgs() << " Initializing PHI node for block '"
            << target->getName() << "'\n");
      preparePHI(target);
    }
    pushValueIntoPHI(target, source);
    DEBUG(dbgs() << " Passing number value into PHI for block '"
          << target->getName() << "'\n");
  }
}

// Inserts source's pathNumber Value* into the appropriate slot of
// target's phiNode, replacing the -1 placeholder for source's block.
void PathProfiler::pushValueIntoPHI(BLInstrumentationNode* target,
                                    BLInstrumentationNode* source) {
  PHINode* phi = target->getPathPHI();
  assert(phi != NULL && " Tried to push value into node with PHI, but node"
         " actually had no PHI.");
  phi->removeIncomingValue(source->getBlock(), false);
  phi->addIncoming(source->getEndingPathNumber(), source->getBlock());
}

// The Value* in node, oldVal, is updated with a Value* corresponding to
// oldVal + addition.  The add is placed either before the first non-PHI
// instruction (atBeginning) or before the terminator.
void PathProfiler::insertNumberIncrement(BLInstrumentationNode* node,
                                         Value* addition, bool atBeginning) {
  BasicBlock* block = node->getBlock();
  assert(node->getStartingPathNumber() != NULL);
  assert(node->getEndingPathNumber() != NULL);

  BasicBlock::iterator insertPoint;

  if( atBeginning )
    insertPoint = block->getFirstNonPHI();
  else
    insertPoint = block->getTerminator();

  DEBUG(errs() << " Creating addition instruction.\n");
  Value* newpn = BinaryOperator::Create(Instruction::Add,
                                        node->getStartingPathNumber(),
                                        addition, "pathNumber", insertPoint);

  node->setEndingPathNumber(newpn);

  // When inserted at the top of the block, the new value also becomes the
  // starting path number for any later increments in this block.
  if( atBeginning )
    node->setStartingPathNumber(newpn);
}

// Creates a counter increment in the given node.  The Value* in node is
// taken as the index into an array or hash table.  The hash table access
// is a call to the runtime.
+void PathProfiler::insertCounterIncrement(Value* incValue, + BasicBlock::iterator insertPoint, + BLInstrumentationDag* dag, + bool increment) { + // Counter increment for array + if( dag->getNumberOfPaths() <= HASH_THRESHHOLD ) { + // Get pointer to the array location + std::vector<Value*> gepIndices(2); + gepIndices[0] = Constant::getNullValue(Type::getInt32Ty(*Context)); + gepIndices[1] = incValue; + + GetElementPtrInst* pcPointer = + GetElementPtrInst::Create(dag->getCounterArray(), + gepIndices.begin(), gepIndices.end(), + "counterInc", insertPoint); + + // Load from the array - call it oldPC + LoadInst* oldPc = new LoadInst(pcPointer, "oldPC", insertPoint); + + // Test to see whether adding 1 will overflow the counter + ICmpInst* isMax = new ICmpInst(insertPoint, CmpInst::ICMP_ULT, oldPc, + createIncrementConstant(0xffffffff, 32), + "isMax"); + + // Select increment for the path counter based on overflow + SelectInst* inc = + SelectInst::Create( isMax, createIncrementConstant(increment?1:-1,32), + createIncrementConstant(0,32), + "pathInc", insertPoint); + + // newPc = oldPc + inc + BinaryOperator* newPc = BinaryOperator::Create(Instruction::Add, + oldPc, inc, "newPC", + insertPoint); + + // Store back in to the array + new StoreInst(newPc, pcPointer, insertPoint); + } else { // Counter increment for hash + std::vector<Value*> args(2); + args[0] = ConstantInt::get(Type::getInt32Ty(*Context), + currentFunctionNumber); + args[1] = incValue; + + CallInst::Create( + increment ? llvmIncrementHashFunction : llvmDecrementHashFunction, + args.begin(), args.end(), "", insertPoint); + } +} + +// Inserts instrumentation for the given edge +// +// Pre: The edge's source node has pathNumber set if edge is non zero +// path number increment. +// +// Post: Edge's target node has a pathNumber set to the path number Value +// corresponding to the value of the path register after edge's +// execution. +// +// FIXME: This should be reworked so it's not recursive. 
// Recursively instruments 'edge' and then every un-instrumented successor
// edge of its target.  Chooses a block to host the instructions (the
// source when it has a single successor, otherwise the target when it has
// a single predecessor, splitting critical edges as needed), emits the
// path-number arithmetic / counter increments marked on the edge, and
// threads the running path-number Value into the next node.
void PathProfiler::insertInstrumentationStartingAt(BLInstrumentationEdge* edge,
                                                   BLInstrumentationDag* dag) {
  // Mark the edge as instrumented
  edge->setHasInstrumentation(true);
  DEBUG(dbgs() << "\nInstrumenting edge: " << (*edge) << "\n");

  // create a new node for this edge's instrumentation
  splitCritical(edge, dag);

  BLInstrumentationNode* sourceNode = (BLInstrumentationNode*)edge->getSource();
  BLInstrumentationNode* targetNode = (BLInstrumentationNode*)edge->getTarget();
  BLInstrumentationNode* instrumentNode;
  BLInstrumentationNode* nextSourceNode;

  bool atBeginning = false;

  // Source node has only 1 successor so any information can be simply
  // inserted in to it without splitting
  if( sourceNode->getBlock() && sourceNode->getNumberSuccEdges() <= 1) {
    DEBUG(dbgs() << " Potential instructions to be placed in: "
          << sourceNode->getName() << " (at end)\n");
    instrumentNode = sourceNode;
    nextSourceNode = targetNode; // ... since we never made any new nodes
  }

  // The target node only has one predecessor, so we can safely insert edge
  // instrumentation into it. If there was splitting, it must have been
  // successful.
  else if( targetNode->getNumberPredEdges() == 1 ) {
    DEBUG(dbgs() << " Potential instructions to be placed in: "
          << targetNode->getName() << " (at beginning)\n");
    pushValueIntoNode(sourceNode, targetNode);
    instrumentNode = targetNode;
    nextSourceNode = NULL; // ... otherwise we'll just keep splitting
    atBeginning = true;
  }

  // Somehow, splitting must have failed.
  else {
    errs() << "Instrumenting could not split a critical edge.\n";
    DEBUG(dbgs() << " Couldn't split edge " << (*edge) << ".\n");
    return;
  }

  // Insert instrumentation if this is a back or split edge.  Such edges
  // stand in for a phony exit edge (counter increment) followed by a phony
  // root edge (path-number initialization), so both are emitted here.
  if( edge->getType() == BallLarusEdge::BACKEDGE ||
      edge->getType() == BallLarusEdge::SPLITEDGE ) {
    BLInstrumentationEdge* top =
      (BLInstrumentationEdge*) edge->getPhonyRoot();
    BLInstrumentationEdge* bottom =
      (BLInstrumentationEdge*) edge->getPhonyExit();

    assert( top->isInitialization() && " Top phony edge did not"
            " contain a path number initialization.");
    assert( bottom->isCounterIncrement() && " Bottom phony edge"
            " did not contain a path counter increment.");

    // split edge has yet to be initialized
    if( !instrumentNode->getEndingPathNumber() ) {
      instrumentNode->setStartingPathNumber(createIncrementConstant(0,32));
      instrumentNode->setEndingPathNumber(createIncrementConstant(0,32));
    }

    BasicBlock::iterator insertPoint = atBeginning ?
      instrumentNode->getBlock()->getFirstNonPHI() :
      instrumentNode->getBlock()->getTerminator();

    // add information from the bottom edge, if it exists
    if( bottom->getIncrement() ) {
      Value* newpn =
        BinaryOperator::Create(Instruction::Add,
                               instrumentNode->getStartingPathNumber(),
                               createIncrementConstant(bottom),
                               "pathNumber", insertPoint);
      instrumentNode->setEndingPathNumber(newpn);
    }

    // Count the path that just finished at the back/split edge.
    insertCounterIncrement(instrumentNode->getEndingPathNumber(),
                           insertPoint, dag);

    // Restart the path number from the top phony edge's initialization.
    if( atBeginning )
      instrumentNode->setStartingPathNumber(createIncrementConstant(top));

    instrumentNode->setEndingPathNumber(createIncrementConstant(top));

    // Check for path counter increments
    if( top->isCounterIncrement() ) {
      insertCounterIncrement(instrumentNode->getEndingPathNumber(),
                             instrumentNode->getBlock()->getTerminator(),dag);
      instrumentNode->setEndingPathNumber(0);
    }
  }

  // Insert instrumentation if this is a normal edge
  else {
    BasicBlock::iterator insertPoint = atBeginning ?
      instrumentNode->getBlock()->getFirstNonPHI() :
      instrumentNode->getBlock()->getTerminator();

    if( edge->isInitialization() ) { // initialize path number
      instrumentNode->setEndingPathNumber(createIncrementConstant(edge));
    } else if( edge->getIncrement() ) {// increment path number
      Value* newpn =
        BinaryOperator::Create(Instruction::Add,
                               instrumentNode->getStartingPathNumber(),
                               createIncrementConstant(edge),
                               "pathNumber", insertPoint);
      instrumentNode->setEndingPathNumber(newpn);

      if( atBeginning )
        instrumentNode->setStartingPathNumber(newpn);
    }

    // Check for path counter increments
    if( edge->isCounterIncrement() ) {
      insertCounterIncrement(instrumentNode->getEndingPathNumber(),
                             insertPoint, dag);
      instrumentNode->setEndingPathNumber(0);
    }
  }

  // Push it along
  if (nextSourceNode && instrumentNode->getEndingPathNumber())
    pushValueIntoNode(instrumentNode, nextSourceNode);

  // Add all the successors
  for( BLEdgeIterator next = targetNode->succBegin(),
         end = targetNode->succEnd(); next != end; next++ ) {
    // So long as it is un-instrumented, add it to the list
    if( !((BLInstrumentationEdge*)(*next))->hasInstrumentation() )
      insertInstrumentationStartingAt((BLInstrumentationEdge*)*next,dag);
    else
      DEBUG(dbgs() << " Edge " << *(BLInstrumentationEdge*)(*next)
            << " already instrumented.\n");
  }
}

// Inserts instrumentation according to the marked edges in dag.  Phony edges
// must be unlinked from the DAG, but accessible from the backedges.  Dag
// must have initializations, path number increments, and counter increments
// present.
//
// Counter storage is created here.
void PathProfiler::insertInstrumentation(
  BLInstrumentationDag& dag, Module &M) {

  // Start the recursive instrumentation walk from the Exit->Root edge.
  BLInstrumentationEdge* exitRootEdge =
    (BLInstrumentationEdge*) dag.getExitRootEdge();
  insertInstrumentationStartingAt(exitRootEdge, &dag);

  // Iterate through each call edge and apply the appropriate hash increment
  // and decrement functions
  BLEdgeVector callEdges = dag.getCallPhonyEdges();
  for( BLEdgeIterator edge = callEdges.begin(),
         end = callEdges.end(); edge != end; edge++ ) {
    BLInstrumentationNode* node =
      (BLInstrumentationNode*)(*edge)->getSource();
    BasicBlock::iterator insertPoint = node->getBlock()->getFirstNonPHI();

    // Find the first function call
    while( ((Instruction&)(*insertPoint)).getOpcode() != Instruction::Call )
      insertPoint++;

    DEBUG(dbgs() << "\nInstrumenting method call block '"
          << node->getBlock()->getNameStr() << "'\n");
    DEBUG(dbgs() << " Path number initialized: "
          << ((node->getStartingPathNumber()) ? "yes" : "no") << "\n");

    // The partial path ending at the call edge: either the node's running
    // path number plus the edge's increment, or the increment alone.
    Value* newpn;
    if( node->getStartingPathNumber() ) {
      long inc = ((BLInstrumentationEdge*)(*edge))->getIncrement();
      if ( inc )
        newpn = BinaryOperator::Create(Instruction::Add,
                                       node->getStartingPathNumber(),
                                       createIncrementConstant(inc,32),
                                       "pathNumber", insertPoint);
      else
        newpn = node->getStartingPathNumber();
    } else {
      newpn = (Value*)createIncrementConstant(
        ((BLInstrumentationEdge*)(*edge))->getIncrement(), 32);
    }

    // Count the partial path before the call, and cancel that count after
    // the block's terminator path is counted normally.
    insertCounterIncrement(newpn, insertPoint, &dag);
    insertCounterIncrement(newpn, node->getBlock()->getTerminator(),
                           &dag, false);
  }
}

// Instruments a single function with Ball-Larus path profiling and appends
// its entry to the global function table initializer (ftInit).
void PathProfiler::runOnFunction(std::vector<Constant*> &ftInit,
                                 Function &F, Module &M) {
  // Build DAG from CFG
  BLInstrumentationDag dag = BLInstrumentationDag(F);
  dag.init();

  // give each path a unique integer value
  dag.calculatePathNumbers();

  // modify path increments to increase the efficiency
  // of instrumentation
  dag.calculateSpanningTree();
  dag.calculateChordIncrements();
  dag.pushInitialization();
  dag.pushCounters();
  dag.unlinkPhony();

  // potentially generate .dot graph for the dag
  if (DotPathDag)
    dag.generateDotGraph ();

  // Should we store the information in an array or hash
  if( dag.getNumberOfPaths() <= HASH_THRESHHOLD ) {
    // Few enough paths: dedicate a zero-initialized i32 array to them.
    const Type* t = ArrayType::get(Type::getInt32Ty(*Context),
                                   dag.getNumberOfPaths());

    dag.setCounterArray(new GlobalVariable(M, t, false,
                                           GlobalValue::InternalLinkage,
                                           Constant::getNullValue(t), ""));
  }

  insertInstrumentation(dag, M);

  // Add to global function reference table
  unsigned type;
  const Type* voidPtr = TypeBuilder<types::i<8>*, true>::get(*Context);

  if( dag.getNumberOfPaths() <= HASH_THRESHHOLD )
    type = ProfilingArray;
  else
    type = ProfilingHash;

  // Table entry: { storage type, number of paths, pointer to storage }.
  std::vector<Constant*> entryArray(3);
  entryArray[0] = createIncrementConstant(type,32);
  entryArray[1] = createIncrementConstant(dag.getNumberOfPaths(),32);
  entryArray[2] = dag.getCounterArray() ?
    ConstantExpr::getBitCast(dag.getCounterArray(), voidPtr) :
    Constant::getNullValue(voidPtr);

  const StructType* at = ftEntryTypeBuilder::get(*Context);
  ConstantStruct* functionEntry =
    (ConstantStruct*)ConstantStruct::get(at, entryArray);
  ftInit.push_back(functionEntry);
}

// Output the bitcode if we want to observe instrumentation changes
#define PRINT_MODULE dbgs() << \
  "\n\n============= MODULE BEGIN ===============\n" << M << \
  "\n============== MODULE END ================\n"

// Entry point of the pass: instruments every defined function and wires up
// the runtime (hash helpers plus the profile-saving call in 'main').
bool PathProfiler::runOnModule(Module &M) {
  Context = &M.getContext();

  DEBUG(dbgs()
        << "****************************************\n"
        << "****************************************\n"
        << "** **\n"
        << "** PATH PROFILING INSTRUMENTATION **\n"
        << "** **\n"
        << "****************************************\n"
        << "****************************************\n");

  // No main, no instrumentation!
  Function *Main = M.getFunction("main");

  // Using fortran? ... this kind of works
  if (!Main)
    Main = M.getFunction("MAIN__");

  if (!Main) {
    errs() << "WARNING: cannot insert path profiling into a module"
           << " with no main function!\n";
    return false;
  }

  BasicBlock::iterator insertPoint = Main->getEntryBlock().getFirstNonPHI();

  // Declare (or fetch) the runtime's hash-table counter helpers.
  llvmIncrementHashFunction = M.getOrInsertFunction(
    "llvm_increment_path_count",
    Type::getVoidTy(*Context), // return type
    Type::getInt32Ty(*Context), // function number
    Type::getInt32Ty(*Context), // path number
    NULL );

  llvmDecrementHashFunction = M.getOrInsertFunction(
    "llvm_decrement_path_count",
    Type::getVoidTy(*Context), // return type
    Type::getInt32Ty(*Context), // function number
    Type::getInt32Ty(*Context), // path number
    NULL );

  std::vector<Constant*> ftInit;
  unsigned functionNumber = 0;
  for (Module::iterator F = M.begin(), E = M.end(); F != E; F++) {
    if (F->isDeclaration())
      continue;

    DEBUG(dbgs() << "Function: " << F->getNameStr() << "\n");
    functionNumber++;

    // set function number
    currentFunctionNumber = functionNumber;
    runOnFunction(ftInit, *F, M);
  }

  // Materialize the per-function table and register it with the runtime
  // via a call inserted at the top of main.
  const Type *t = ftEntryTypeBuilder::get(*Context);
  const ArrayType* ftArrayType = ArrayType::get(t, ftInit.size());
  Constant* ftInitConstant = ConstantArray::get(ftArrayType, ftInit);

  DEBUG(dbgs() << " ftArrayType:" << *ftArrayType << "\n");

  GlobalVariable* functionTable =
    new GlobalVariable(M, ftArrayType, false, GlobalValue::InternalLinkage,
                       ftInitConstant, "functionPathTable");
  const Type *eltType = ftArrayType->getTypeAtIndex((unsigned)0);
  InsertProfilingInitCall(Main, "llvm_start_path_profiling", functionTable,
                          PointerType::getUnqual(eltType));

  DEBUG(PRINT_MODULE);

  return true;
}

// If this edge is a critical edge, then inserts a node at this edge.
// This edge becomes the first edge, and a new BallLarusEdge is created.
+// Returns true if the edge was split +bool PathProfiler::splitCritical(BLInstrumentationEdge* edge, + BLInstrumentationDag* dag) { + unsigned succNum = edge->getSuccessorNumber(); + BallLarusNode* sourceNode = edge->getSource(); + BallLarusNode* targetNode = edge->getTarget(); + BasicBlock* sourceBlock = sourceNode->getBlock(); + BasicBlock* targetBlock = targetNode->getBlock(); + + if(sourceBlock == NULL || targetBlock == NULL + || sourceNode->getNumberSuccEdges() <= 1 + || targetNode->getNumberPredEdges() == 1 ) { + return(false); + } + + TerminatorInst* terminator = sourceBlock->getTerminator(); + + if( SplitCriticalEdge(terminator, succNum, this, false)) { + BasicBlock* newBlock = terminator->getSuccessor(succNum); + dag->splitUpdate(edge, newBlock); + return(true); + } else + return(false); +} diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.cpp new file mode 100644 index 0000000..b57bbf6 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.cpp @@ -0,0 +1,133 @@ +//===- ProfilingUtils.cpp - Helper functions shared by profilers ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a few helper functions which are used by profile +// instrumentation code to instrument the code. This allows the profiler pass +// to worry about *what* to insert, and these functions take care of *how* to do +// it. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "ProfilingUtils.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+
+void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName,
+                                   GlobalValue *Array,
+                                   PointerType *arrayType) {
+  LLVMContext &Context = MainFn->getContext();
+  const Type *ArgVTy =
+    PointerType::getUnqual(Type::getInt8PtrTy(Context));
+  const PointerType *UIntPtr = arrayType ? arrayType :
+    Type::getInt32PtrTy(Context);
+  Module &M = *MainFn->getParent();
+  Constant *InitFn = M.getOrInsertFunction(FnName, Type::getInt32Ty(Context),
+                                           Type::getInt32Ty(Context),
+                                           ArgVTy, UIntPtr,
+                                           Type::getInt32Ty(Context),
+                                           (Type *)0);
+
+  // This could force argc and argv into programs that wouldn't otherwise have
+  // them, but instead we just pass null values in.
+  std::vector<Value*> Args(4);
+  Args[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+  Args[1] = Constant::getNullValue(ArgVTy);
+
+  // Skip over any allocas in the entry block.
+  BasicBlock *Entry = MainFn->begin();
+  BasicBlock::iterator InsertPos = Entry->begin();
+  while (isa<AllocaInst>(InsertPos)) ++InsertPos;
+
+  std::vector<Constant*> GEPIndices(2,
+                          Constant::getNullValue(Type::getInt32Ty(Context)));
+  unsigned NumElements = 0;
+  if (Array) {
+    Args[2] = ConstantExpr::getGetElementPtr(Array, &GEPIndices[0],
+                                             GEPIndices.size());
+    NumElements =
+      cast<ArrayType>(Array->getType()->getElementType())->getNumElements();
+  } else {
+    // If this profiling instrumentation doesn't have a constant array, just
+    // pass null.
+    Args[2] = ConstantPointerNull::get(UIntPtr);
+  }
+  Args[3] = ConstantInt::get(Type::getInt32Ty(Context), NumElements);
+
+  CallInst *InitCall = CallInst::Create(InitFn, Args.begin(), Args.end(),
+                                        "newargc", InsertPos);
+
+  // If argc or argv are not available in main, just pass null values in.
+  Function::arg_iterator AI;
+  switch (MainFn->arg_size()) {
+  default:
+  case 2:
+    AI = MainFn->arg_begin(); ++AI;
+    if (AI->getType() != ArgVTy) {
+      Instruction::CastOps opcode = CastInst::getCastOpcode(AI, false, ArgVTy,
+                                                            false);
+      InitCall->setArgOperand(1,
+          CastInst::Create(opcode, AI, ArgVTy, "argv.cast", InitCall));
+    } else {
+      InitCall->setArgOperand(1, AI);
+    }
+    /* FALL THROUGH */
+
+  case 1:
+    AI = MainFn->arg_begin();
+    // If the program looked at argc, have it look at the return value of the
+    // init call instead.
+    if (!AI->getType()->isIntegerTy(32)) {
+      Instruction::CastOps opcode;
+      if (!AI->use_empty()) {
+        opcode = CastInst::getCastOpcode(InitCall, true, AI->getType(), true);
+        AI->replaceAllUsesWith(
+          CastInst::Create(opcode, InitCall, AI->getType(), "", InsertPos));
+      }
+      opcode = CastInst::getCastOpcode(AI, true,
+                                       Type::getInt32Ty(Context), true);
+      InitCall->setArgOperand(0,
+          CastInst::Create(opcode, AI, Type::getInt32Ty(Context),
+                           "argc.cast", InitCall));
+    } else {
+      AI->replaceAllUsesWith(InitCall);
+      InitCall->setArgOperand(0, AI);
+    }
+    /* FALL THROUGH */
+  case 0: break;
+  }
+}
+
+void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
+                                   GlobalValue *CounterArray, bool beginning) {
+  // Insert the increment after any alloca or PHI instructions...
+  BasicBlock::iterator InsertPos = beginning ? BB->getFirstNonPHI() :
+                                   BB->getTerminator();
+  while (isa<AllocaInst>(InsertPos))
+    ++InsertPos;
+
+  LLVMContext &Context = BB->getContext();
+
+  // Create the getelementptr constant expression
+  std::vector<Constant*> Indices(2);
+  Indices[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+  Indices[1] = ConstantInt::get(Type::getInt32Ty(Context), CounterNum);
+  Constant *ElementPtr =
+    ConstantExpr::getGetElementPtr(CounterArray, &Indices[0],
+                                   Indices.size());
+
+  // Load, increment and store the value back.
+  Value *OldVal = new LoadInst(ElementPtr, "OldFuncCounter", InsertPos);
+  Value *NewVal = BinaryOperator::Create(Instruction::Add, OldVal,
+                                 ConstantInt::get(Type::getInt32Ty(Context), 1),
+                                         "NewFuncCounter", InsertPos);
+  new StoreInst(NewVal, ElementPtr, InsertPos);
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.h b/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.h
new file mode 100644
index 0000000..a76e357
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.h
@@ -0,0 +1,34 @@
+//===- ProfilingUtils.h - Helper functions shared by profilers --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a few helper functions which are used by profile
+// instrumentation code to instrument the code.  This allows the profiler pass
+// to worry about *what* to insert, and these functions take care of *how* to do
+// it.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PROFILINGUTILS_H
+#define PROFILINGUTILS_H
+
+namespace llvm {
+  class Function;
+  class GlobalValue;
+  class BasicBlock;
+  class PointerType;
+
+  void InsertProfilingInitCall(Function *MainFn, const char *FnName,
+                               GlobalValue *Arr = 0,
+                               PointerType *arrayType = 0);
+  void IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
+                               GlobalValue *CounterArray,
+                               bool beginning = true);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
new file mode 100644
index 0000000..a5adb5e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -0,0 +1,97 @@
+//===- ADCE.cpp - Code to perform dead code elimination -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Aggressive Dead Code Elimination pass.  This pass
+// optimistically assumes that all instructions are dead until proven otherwise,
+// allowing it to eliminate dead computations that other DCE passes do not
+// catch, particularly involving loop computations.
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "adce" +#include "llvm/Transforms/Scalar.h" +#include "llvm/BasicBlock.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumRemoved, "Number of instructions removed"); + +namespace { + struct ADCE : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + ADCE() : FunctionPass(ID) { + initializeADCEPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function& F); + + virtual void getAnalysisUsage(AnalysisUsage& AU) const { + AU.setPreservesCFG(); + } + + }; +} + +char ADCE::ID = 0; +INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false) + +bool ADCE::runOnFunction(Function& F) { + SmallPtrSet<Instruction*, 128> alive; + SmallVector<Instruction*, 128> worklist; + + // Collect the set of "root" instructions that are known live. + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) + if (isa<TerminatorInst>(I.getInstructionIterator()) || + isa<DbgInfoIntrinsic>(I.getInstructionIterator()) || + I->mayHaveSideEffects()) { + alive.insert(I.getInstructionIterator()); + worklist.push_back(I.getInstructionIterator()); + } + + // Propagate liveness backwards to operands. + while (!worklist.empty()) { + Instruction* curr = worklist.pop_back_val(); + + for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end(); + OI != OE; ++OI) + if (Instruction* Inst = dyn_cast<Instruction>(OI)) + if (alive.insert(Inst)) + worklist.push_back(Inst); + } + + // The inverse of the live set is the dead set. 
These are those instructions + // which have no side effects and do not influence the control flow or return + // value of the function, and may therefore be deleted safely. + // NOTE: We reuse the worklist vector here for memory efficiency. + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) + if (!alive.count(I.getInstructionIterator())) { + worklist.push_back(I.getInstructionIterator()); + I->dropAllReferences(); + } + + for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(), + E = worklist.end(); I != E; ++I) { + ++NumRemoved; + (*I)->eraseFromParent(); + } + + return !worklist.empty(); +} + +FunctionPass *llvm::createAggressiveDCEPass() { + return new ADCE(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp new file mode 100644 index 0000000..cee5502 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp @@ -0,0 +1,152 @@ +//===-- BasicBlockPlacement.cpp - Basic Block Code Layout optimization ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a very simple profile guided basic block placement +// algorithm. The idea is to put frequently executed blocks together at the +// start of the function, and hopefully increase the number of fall-through +// conditional branches. If there is no profile information for a particular +// function, this pass basically orders blocks in depth-first order +// +// The algorithm implemented here is basically "Algo1" from "Profile Guided Code +// Positioning" by Pettis and Hansen, except that it uses basic block counts +// instead of edge counts. This should be improved in many ways, but is very +// simple for now. 
+//
+// Basically we "place" the entry block, then loop over all successors in a DFO,
+// placing the most frequently executed successor until we run out of blocks.  I
+// told you this was _extremely_ simplistic. :) This is also much slower than it
+// could be.  When it becomes important, this pass will be rewritten to use a
+// better algorithm, and then we can worry about efficiency.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "block-placement"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Scalar.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumMoved, "Number of basic blocks moved");
+
+namespace {
+  struct BlockPlacement : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    BlockPlacement() : FunctionPass(ID) {
+      initializeBlockPlacementPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<ProfileInfo>();
+      //AU.addPreserved<ProfileInfo>();  // Does this work?
+    }
+  private:
+    /// PI - The profile information that is guiding us.
+    ///
+    ProfileInfo *PI;
+
+    /// NumMovedBlocks - Every time we move a block, increment this counter.
+    ///
+    unsigned NumMovedBlocks;
+
+    /// PlacedBlocks - Every time we place a block, remember it so we don't get
+    /// into infinite loops.
+    std::set<BasicBlock*> PlacedBlocks;
+
+    /// InsertPos - This is an iterator to the next place we want to insert a
+    /// block.
+    Function::iterator InsertPos;
+
+    /// PlaceBlocks - Recursively place the specified blocks and any unplaced
+    /// successors.
+    void PlaceBlocks(BasicBlock *BB);
+  };
+}
+
+char BlockPlacement::ID = 0;
+INITIALIZE_PASS_BEGIN(BlockPlacement, "block-placement",
+                "Profile Guided Basic Block Placement", false, false)
+INITIALIZE_AG_DEPENDENCY(ProfileInfo)
+INITIALIZE_PASS_END(BlockPlacement, "block-placement",
+                "Profile Guided Basic Block Placement", false, false)
+
+FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); }
+
+bool BlockPlacement::runOnFunction(Function &F) {
+  PI = &getAnalysis<ProfileInfo>();
+
+  NumMovedBlocks = 0;
+  InsertPos = F.begin();
+
+  // Recursively place all blocks.
+  PlaceBlocks(F.begin());
+
+  PlacedBlocks.clear();
+  NumMoved += NumMovedBlocks;
+  return NumMovedBlocks != 0;
+}
+
+
+/// PlaceBlocks - Recursively place the specified blocks and any unplaced
+/// successors.
+void BlockPlacement::PlaceBlocks(BasicBlock *BB) {
+  assert(!PlacedBlocks.count(BB) && "Already placed this block!");
+  PlacedBlocks.insert(BB);
+
+  // Place the specified block.
+  if (&*InsertPos != BB) {
+    // Use splice to move the block into the right place.  This avoids having to
+    // remove the block from the function then re-add it, which causes a bunch
+    // of symbol table traffic that is entirely pointless.
+    Function::BasicBlockListType &Blocks = BB->getParent()->getBasicBlockList();
+    Blocks.splice(InsertPos, Blocks, BB);
+
+    ++NumMovedBlocks;
+  } else {
+    // This block is already in the right place, we don't have to do anything.
+    ++InsertPos;
+  }
+
+  // Keep placing successors until we run out of ones to place.  Note that this
+  // loop is very inefficient (N^2) for blocks with many successors, like switch
+  // statements.  FIXME!
+  while (1) {
+    // Okay, now place any unplaced successors.
+    succ_iterator SI = succ_begin(BB), E = succ_end(BB);
+
+    // Scan for the first unplaced successor.
+    for (; SI != E && PlacedBlocks.count(*SI); ++SI)
+      /*empty*/;
+    if (SI == E) return;  // No more successors to place.
+
+    double MaxExecutionCount = PI->getExecutionCount(*SI);
+    BasicBlock *MaxSuccessor = *SI;
+
+    // Scan for more frequently executed successors
+    for (; SI != E; ++SI)
+      if (!PlacedBlocks.count(*SI)) {
+        double Count = PI->getExecutionCount(*SI);
+        if (Count > MaxExecutionCount ||
+            // Prefer to not disturb the code.
+            (Count == MaxExecutionCount && *SI == &*InsertPos)) {
+          MaxExecutionCount = Count;
+          MaxSuccessor = *SI;
+        }
+      }
+
+    // Now that we picked the maximally executed successor, place it.
+    PlaceBlocks(MaxSuccessor);
+  }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp
new file mode 100644
index 0000000..9536939
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -0,0 +1,1104 @@
+//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass munges the code in the input function to better prepare it for
+// SelectionDAG-based code generation. This works around limitations in its
+// basic-block-at-a-time approach. It should eventually be removed.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "codegenprepare"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Transforms/Utils/AddrModeMatcher.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/PatternMatch.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/ValueHandle.h"
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+STATISTIC(NumBlocksElim, "Number of blocks eliminated");
+STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated");
+STATISTIC(NumGEPsElim, "Number of GEPs converted to casts");
+STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
+                      "sunken Cmps");
+STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
+                       "of sunken Casts");
+STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
+                          "computations were sunk");
+STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
+STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
+
+static cl::opt<bool>
+CriticalEdgeSplit("cgp-critical-edge-splitting",
+                  cl::desc("Split critical edges during codegen prepare"),
+                  cl::init(false), cl::Hidden);
+
+namespace {
+  class CodeGenPrepare : public FunctionPass {
+    /// TLI - Keep a pointer to a TargetLowering to consult for determining
+    /// transformation profitability.
+    const TargetLowering *TLI;
+    DominatorTree *DT;
+    ProfileInfo *PFI;
+
+    /// CurInstIterator - As we scan instructions optimizing them, this is the
+    /// next instruction to optimize.  Xforms that can invalidate this should
+    /// update it.
+    BasicBlock::iterator CurInstIterator;
+
+    /// BackEdges - Keep a set of all the loop back edges.
+    ///
+    SmallSet<std::pair<const BasicBlock*, const BasicBlock*>, 8> BackEdges;
+
+    // Keeps track of non-local addresses that have been sunk into a block. This
+    // allows us to avoid inserting duplicate code for blocks with multiple
+    // load/stores of the same address.
+    DenseMap<Value*, Value*> SunkAddrs;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit CodeGenPrepare(const TargetLowering *tli = 0)
+      : FunctionPass(ID), TLI(tli) {
+        initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
+      }
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<ProfileInfo>();
+    }
+
+    virtual void releaseMemory() {
+      BackEdges.clear();
+    }
+
+  private:
+    bool EliminateMostlyEmptyBlocks(Function &F);
+    bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
+    void EliminateMostlyEmptyBlock(BasicBlock *BB);
+    bool OptimizeBlock(BasicBlock &BB);
+    bool OptimizeInst(Instruction *I);
+    bool OptimizeMemoryInst(Instruction *I, Value *Addr, const Type *AccessTy);
+    bool OptimizeInlineAsmInst(CallInst *CS);
+    bool OptimizeCallInst(CallInst *CI);
+    bool MoveExtToFormExtLoad(Instruction *I);
+    bool OptimizeExtUses(Instruction *I);
+    void findLoopBackEdges(const Function &F);
+  };
+}
+
+char CodeGenPrepare::ID = 0;
+INITIALIZE_PASS(CodeGenPrepare, "codegenprepare",
+                "Optimize for code generation", false, false)
+
+FunctionPass *llvm::createCodeGenPreparePass(const TargetLowering *TLI) {
+  return new CodeGenPrepare(TLI);
+}
+
+/// findLoopBackEdges - Do a DFS walk to find loop back edges.
+///
+void CodeGenPrepare::findLoopBackEdges(const Function &F) {
+  SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
+  FindFunctionBackedges(F, Edges);
+
+  BackEdges.insert(Edges.begin(), Edges.end());
+}
+
+
+bool CodeGenPrepare::runOnFunction(Function &F) {
+  bool EverMadeChange = false;
+
+  DT = getAnalysisIfAvailable<DominatorTree>();
+  PFI = getAnalysisIfAvailable<ProfileInfo>();
+  // First pass, eliminate blocks that contain only PHI nodes and an
+  // unconditional branch.
+  EverMadeChange |= EliminateMostlyEmptyBlocks(F);
+
+  // Now find loop back edges, but only if they are being used to decide which
+  // critical edges to split.
+  if (CriticalEdgeSplit)
+    findLoopBackEdges(F);
+
+  bool MadeChange = true;
+  while (MadeChange) {
+    MadeChange = false;
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+      MadeChange |= OptimizeBlock(*BB);
+    EverMadeChange |= MadeChange;
+  }
+
+  SunkAddrs.clear();
+
+  return EverMadeChange;
+}
+
+/// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes,
+/// debug info directives, and an unconditional branch.  Passes before isel
+/// (e.g. LSR/loopsimplify) often split edges in ways that are non-optimal for
+/// isel.  Start by eliminating these blocks so we can split them the way we
+/// want them.
+bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) {
+  bool MadeChange = false;
+  // Note that this intentionally skips the entry block.
+  for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) {
+    BasicBlock *BB = I++;
+
+    // If this block doesn't end with an uncond branch, ignore it.
+    BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+    if (!BI || !BI->isUnconditional())
+      continue;
+
+    // If the instruction before the branch (skipping debug info) isn't a phi
+    // node, then other stuff is happening here.
+    BasicBlock::iterator BBI = BI;
+    if (BBI != BB->begin()) {
+      --BBI;
+      while (isa<DbgInfoIntrinsic>(BBI)) {
+        if (BBI == BB->begin())
+          break;
+        --BBI;
+      }
+      if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
+        continue;
+    }
+
+    // Do not break infinite loops.
+    BasicBlock *DestBB = BI->getSuccessor(0);
+    if (DestBB == BB)
+      continue;
+
+    if (!CanMergeBlocks(BB, DestBB))
+      continue;
+
+    EliminateMostlyEmptyBlock(BB);
+    MadeChange = true;
+  }
+  return MadeChange;
+}
+
+/// CanMergeBlocks - Return true if we can merge BB into DestBB if there is a
+/// single uncond branch between them, and BB contains no other non-phi
+/// instructions.
+bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB,
+                                    const BasicBlock *DestBB) const {
+  // We only want to eliminate blocks whose phi nodes are used by phi nodes in
+  // the successor.  If there is a more complex condition (e.g. preheaders),
+  // don't mess around with them.
+  BasicBlock::const_iterator BBI = BB->begin();
+  while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
+    for (Value::const_use_iterator UI = PN->use_begin(), E = PN->use_end();
+         UI != E; ++UI) {
+      const Instruction *User = cast<Instruction>(*UI);
+      if (User->getParent() != DestBB || !isa<PHINode>(User))
+        return false;
+      // If User is inside DestBB block and it is a PHINode then check
+      // incoming value. If incoming value is not from BB then this is
+      // a complex condition (e.g. preheaders) we want to avoid here.
+      if (User->getParent() == DestBB) {
+        if (const PHINode *UPN = dyn_cast<PHINode>(User))
+          for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
+            Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
+            if (Insn && Insn->getParent() == BB &&
+                Insn->getParent() != UPN->getIncomingBlock(I))
+              return false;
+          }
+      }
+    }
+  }
+
+  // If BB and DestBB contain any common predecessors, then the phi nodes in BB
+  // and DestBB may have conflicting incoming values for the block.  If so, we
+  // can't merge the block.
+  const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
+  if (!DestBBPN) return true;  // no conflict.
+
+  // Collect the preds of BB.
+  SmallPtrSet<const BasicBlock*, 16> BBPreds;
+  if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
+    // It is faster to get preds from a PHI than with pred_iterator.
+    for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
+      BBPreds.insert(BBPN->getIncomingBlock(i));
+  } else {
+    BBPreds.insert(pred_begin(BB), pred_end(BB));
+  }
+
+  // Walk the preds of DestBB.
+  for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
+    BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
+    if (BBPreds.count(Pred)) {   // Common predecessor?
+      BBI = DestBB->begin();
+      while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
+        const Value *V1 = PN->getIncomingValueForBlock(Pred);
+        const Value *V2 = PN->getIncomingValueForBlock(BB);
+
+        // If V2 is a phi node in BB, look up what the mapped value will be.
+        if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
+          if (V2PN->getParent() == BB)
+            V2 = V2PN->getIncomingValueForBlock(Pred);
+
+        // If there is a conflict, bail out.
+        if (V1 != V2) return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+
+/// EliminateMostlyEmptyBlock - Eliminate a basic block that has only phi's and
+/// an unconditional branch in it.
+void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
+  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
+  BasicBlock *DestBB = BI->getSuccessor(0);
+
+  DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB);
+
+  // If the destination block has a single pred, then this is a trivial edge,
+  // just collapse it.
+  if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
+    if (SinglePred != DestBB) {
+      // Remember if SinglePred was the entry block of the function.  If so, we
+      // will need to move BB back to the entry position.
+      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
+      MergeBasicBlockIntoOnlyPred(DestBB, this);
+
+      if (isEntry && BB != &BB->getParent()->getEntryBlock())
+        BB->moveBefore(&BB->getParent()->getEntryBlock());
+
+      DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
+      return;
+    }
+  }
+
+  // Otherwise, we have multiple predecessors of BB.  Update the PHIs in DestBB
+  // to handle the new incoming edges it is about to have.
+  PHINode *PN;
+  for (BasicBlock::iterator BBI = DestBB->begin();
+       (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+    // Remove the incoming value for BB, and remember it.
+    Value *InVal = PN->removeIncomingValue(BB, false);
+
+    // Two options: either the InVal is a phi node defined in BB or it is some
+    // value that dominates BB.
+    PHINode *InValPhi = dyn_cast<PHINode>(InVal);
+    if (InValPhi && InValPhi->getParent() == BB) {
+      // Add all of the input values of the input PHI as inputs of this phi.
+      for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
+        PN->addIncoming(InValPhi->getIncomingValue(i),
+                        InValPhi->getIncomingBlock(i));
+    } else {
+      // Otherwise, add one instance of the dominating value for each edge that
+      // we will be adding.
+      if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
+        for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
+          PN->addIncoming(InVal, BBPN->getIncomingBlock(i));
+      } else {
+        for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+          PN->addIncoming(InVal, *PI);
+      }
+    }
+  }
+
+  // The PHIs are now updated, change everything that refers to BB to use
+  // DestBB and remove BB.
+  BB->replaceAllUsesWith(DestBB);
+  if (DT) {
+    BasicBlock *BBIDom = DT->getNode(BB)->getIDom()->getBlock();
+    BasicBlock *DestBBIDom = DT->getNode(DestBB)->getIDom()->getBlock();
+    BasicBlock *NewIDom = DT->findNearestCommonDominator(BBIDom, DestBBIDom);
+    DT->changeImmediateDominator(DestBB, NewIDom);
+    DT->eraseNode(BB);
+  }
+  if (PFI) {
+    PFI->replaceAllUses(BB, DestBB);
+    PFI->removeEdge(ProfileInfo::getEdge(BB, DestBB));
+  }
+  BB->eraseFromParent();
+  ++NumBlocksElim;
+
+  DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
+}
+
+/// FindReusablePredBB - Check all of the predecessors of the block DestPHI
+/// lives in to see if there is a block that we can reuse as a critical edge
+/// from TIBB.
+static BasicBlock *FindReusablePredBB(PHINode *DestPHI, BasicBlock *TIBB) {
+  BasicBlock *Dest = DestPHI->getParent();
+
+  /// TIPHIValues - This array is lazily computed to determine the values of
+  /// PHIs in Dest that TI would provide.
+  SmallVector<Value*, 32> TIPHIValues;
+
+  /// TIBBEntryNo - This is a cache to speed up pred queries for TIBB.
+  unsigned TIBBEntryNo = 0;
+
+  // Check to see if Dest has any blocks that can be used as a split edge for
+  // this terminator.
+  for (unsigned pi = 0, e = DestPHI->getNumIncomingValues(); pi != e; ++pi) {
+    BasicBlock *Pred = DestPHI->getIncomingBlock(pi);
+    // To be usable, the pred has to end with an uncond branch to the dest.
+    BranchInst *PredBr = dyn_cast<BranchInst>(Pred->getTerminator());
+    if (!PredBr || !PredBr->isUnconditional())
+      continue;
+    // Must be empty other than the branch and debug info.
+    BasicBlock::iterator I = Pred->begin();
+    while (isa<DbgInfoIntrinsic>(I))
+      I++;
+    if (&*I != PredBr)
+      continue;
+    // Cannot be the entry block; its label does not get emitted.
+    if (Pred == &Dest->getParent()->getEntryBlock())
+      continue;
+
+    // Finally, since we know that Dest has phi nodes in it, we have to make
+    // sure that jumping to Pred will have the same effect as going to Dest in
+    // terms of PHI values.
+    PHINode *PN;
+    unsigned PHINo = 0;
+    unsigned PredEntryNo = pi;
+
+    bool FoundMatch = true;
+    for (BasicBlock::iterator I = Dest->begin();
+         (PN = dyn_cast<PHINode>(I)); ++I, ++PHINo) {
+      if (PHINo == TIPHIValues.size()) {
+        if (PN->getIncomingBlock(TIBBEntryNo) != TIBB)
+          TIBBEntryNo = PN->getBasicBlockIndex(TIBB);
+        TIPHIValues.push_back(PN->getIncomingValue(TIBBEntryNo));
+      }
+
+      // If the PHI entry doesn't work, we can't use this pred.
+      if (PN->getIncomingBlock(PredEntryNo) != Pred)
+        PredEntryNo = PN->getBasicBlockIndex(Pred);
+
+      if (TIPHIValues[PHINo] != PN->getIncomingValue(PredEntryNo)) {
+        FoundMatch = false;
+        break;
+      }
+    }
+
+    // If we found a workable predecessor, change TI to branch to Succ.
+    if (FoundMatch)
+      return Pred;
+  }
+  return 0;
+}
+
+
+/// SplitEdgeNicely - Split the critical edge from TI to its specified
+/// successor if it will improve codegen.  We only do this if the successor has
+/// phi nodes (otherwise critical edges are ok).  If there is already another
+/// predecessor of the succ that is empty (and thus has no phi nodes), use it
+/// instead of introducing a new block.
+static void SplitEdgeNicely(TerminatorInst *TI, unsigned SuccNum,
+                            SmallSet<std::pair<const BasicBlock*,
+                                     const BasicBlock*>, 8> &BackEdges,
+                            Pass *P) {
+  BasicBlock *TIBB = TI->getParent();
+  BasicBlock *Dest = TI->getSuccessor(SuccNum);
+  assert(isa<PHINode>(Dest->begin()) &&
+         "This should only be called if Dest has a PHI!");
+  PHINode *DestPHI = cast<PHINode>(Dest->begin());
+
+  // Do not split edges to EH landing pads.
+  if (InvokeInst *Invoke = dyn_cast<InvokeInst>(TI))
+    if (Invoke->getSuccessor(1) == Dest)
+      return;
+
+  // As a hack, never split backedges of loops.  Even though the copy for any
+  // PHIs inserted on the backedge would be dead for exits from the loop, we
+  // assume that the cost of *splitting* the backedge would be too high.
+  if (BackEdges.count(std::make_pair(TIBB, Dest)))
+    return;
+
+  if (BasicBlock *ReuseBB = FindReusablePredBB(DestPHI, TIBB)) {
+    ProfileInfo *PFI = P->getAnalysisIfAvailable<ProfileInfo>();
+    if (PFI)
+      PFI->splitEdge(TIBB, Dest, ReuseBB);
+    Dest->removePredecessor(TIBB);
+    TI->setSuccessor(SuccNum, ReuseBB);
+    return;
+  }
+
+  SplitCriticalEdge(TI, SuccNum, P, true);
+}
+
+
+/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop
+/// copy (e.g. it's casting from one pointer type to another, i32->i8 on PPC),
+/// sink it into user blocks to reduce the number of virtual
+/// registers that must be created and coalesced.
+///
+/// Return true if any changes are made.
+///
+static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
+  // If this is a noop copy,
+  EVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
+  EVT DstVT = TLI.getValueType(CI->getType());
+
+  // This is an fp<->int conversion?
+  if (SrcVT.isInteger() != DstVT.isInteger())
+    return false;
+
+  // If this is an extension, it will be a zero or sign extension, which
+  // isn't a noop.
+  if (SrcVT.bitsLT(DstVT)) return false;
+
+  // If these values will be promoted, find out what they will be promoted
+  // to.  This helps us consider truncates on PPC as noop copies when they
+  // are.
+  if (TLI.getTypeAction(SrcVT) == TargetLowering::Promote)
+    SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
+  if (TLI.getTypeAction(DstVT) == TargetLowering::Promote)
+    DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
+
+  // If, after promotion, these are the same types, this is a noop copy.
+  if (SrcVT != DstVT)
+    return false;
+
+  BasicBlock *DefBB = CI->getParent();
+
+  /// InsertedCasts - Only insert a cast in each block once.
+  DenseMap<BasicBlock*, CastInst*> InsertedCasts;
+
+  bool MadeChange = false;
+  for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end();
+       UI != E; ) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Figure out which BB this cast is used in.  For PHI's this is the
+    // appropriate predecessor block.
+    BasicBlock *UserBB = User->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(User)) {
+      UserBB = PN->getIncomingBlock(UI);
+    }
+
+    // Preincrement use iterator so we don't invalidate it.
+    ++UI;
+
+    // If this user is in the same block as the cast, don't change the cast.
+    if (UserBB == DefBB) continue;
+
+    // If we have already inserted a cast into this block, use it.
+    CastInst *&InsertedCast = InsertedCasts[UserBB];
+
+    if (!InsertedCast) {
+      BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
+
+      InsertedCast =
+        CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "",
+                         InsertPt);
+      MadeChange = true;
+    }
+
+    // Replace a use of the cast with a use of the new cast.
+    TheUse = InsertedCast;
+    ++NumCastUses;
+  }
+
+  // If we removed all uses, nuke the cast.
+  if (CI->use_empty()) {
+    CI->eraseFromParent();
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+/// OptimizeCmpExpression - Sink the given CmpInst into user blocks to reduce
+/// the number of virtual registers that must be created and coalesced.  This is
+/// a clear win except on targets with multiple condition code registers
+/// (PowerPC), where it might lose; some adjustment may be wanted there.
+///
+/// Return true if any changes are made.
+static bool OptimizeCmpExpression(CmpInst *CI) {
+  BasicBlock *DefBB = CI->getParent();
+
+  /// InsertedCmp - Only insert a cmp in each block once.
+  DenseMap<BasicBlock*, CmpInst*> InsertedCmps;
+
+  bool MadeChange = false;
+  for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end();
+       UI != E; ) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Preincrement use iterator so we don't invalidate it.
+    ++UI;
+
+    // Don't bother for PHI nodes.
+    if (isa<PHINode>(User))
+      continue;
+
+    // Figure out which BB this cmp is used in.
+    BasicBlock *UserBB = User->getParent();
+
+    // If this user is in the same block as the cmp, don't change the cmp.
+    if (UserBB == DefBB) continue;
+
+    // If we have already inserted a cmp into this block, use it.
+    CmpInst *&InsertedCmp = InsertedCmps[UserBB];
+
+    if (!InsertedCmp) {
+      BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
+
+      InsertedCmp =
+        CmpInst::Create(CI->getOpcode(),
+                        CI->getPredicate(), CI->getOperand(0),
+                        CI->getOperand(1), "", InsertPt);
+      MadeChange = true;
+    }
+
+    // Replace a use of the cmp with a use of the new cmp.
+    TheUse = InsertedCmp;
+    ++NumCmpUses;
+  }
+
+  // If we removed all uses, nuke the cmp.
+  if (CI->use_empty())
+    CI->eraseFromParent();
+
+  return MadeChange;
+}
+
+namespace {
+class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls {
+protected:
+  void replaceCall(Value *With) {
+    CI->replaceAllUsesWith(With);
+    CI->eraseFromParent();
+  }
+  bool isFoldable(unsigned SizeCIOp, unsigned, bool) const {
+    if (ConstantInt *SizeCI =
+        dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp)))
+      return SizeCI->isAllOnesValue();
+    return false;
+  }
+};
+} // end anonymous namespace
+
+bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
+  BasicBlock *BB = CI->getParent();
+
+  // Lower inline assembly if we can.
+  // If we found an inline asm expression, and if the target knows how to
+  // lower it to normal LLVM code, do so now.
+  if (TLI && isa<InlineAsm>(CI->getCalledValue())) {
+    if (TLI->ExpandInlineAsm(CI)) {
+      // Avoid invalidating the iterator.
+      CurInstIterator = BB->begin();
+      // Avoid processing instructions out of order, which could cause
+      // reuse before a value is defined.
+      SunkAddrs.clear();
+      return true;
+    }
+    // Sink address computing for memory operands into the block.
+    if (OptimizeInlineAsmInst(CI))
+      return true;
+  }
+
+  // Lower all uses of llvm.objectsize.*
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+  if (II && II->getIntrinsicID() == Intrinsic::objectsize) {
+    bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
+    const Type *ReturnTy = CI->getType();
+    Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
+
+    // Substituting this can cause recursive simplifications, which can
+    // invalidate our iterator.  Use a WeakVH to hold onto it in case this
+    // happens.
+    WeakVH IterHandle(CurInstIterator);
+
+    ReplaceAndSimplifyAllUses(CI, RetVal, TLI ? TLI->getTargetData() : 0, DT);
+
+    // If the iterator instruction was recursively deleted, start over at the
+    // start of the block.
+    if (IterHandle != CurInstIterator) {
+      CurInstIterator = BB->begin();
+      SunkAddrs.clear();
+    }
+    return true;
+  }
+
+  // From here on out we're working with named functions.
+  if (CI->getCalledFunction() == 0) return false;
+
+  // We'll need TargetData from here on out.
+  const TargetData *TD = TLI ? TLI->getTargetData() : 0;
+  if (!TD) return false;
+
+  // Lower all default uses of _chk calls.  This is very similar
+  // to what InstCombineCalls does, but here we are only lowering calls
+  // that have the default "don't know" as the objectsize.  Anything else
+  // should be left alone.
+  CodeGenPrepareFortifiedLibCalls Simplifier;
+  return Simplifier.fold(CI, TD);
+}
+
+//===----------------------------------------------------------------------===//
+// Memory Optimization
+//===----------------------------------------------------------------------===//
+
+/// IsNonLocalValue - Return true if the specified value is defined in a
+/// different basic block than BB.
+static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    return I->getParent() != BB;
+  return false;
+}
+
+/// OptimizeMemoryInst - Load and Store Instructions often have
+/// addressing modes that can do significant amounts of computation.  As such,
+/// instruction selection will try to get the load or store to do as much
+/// computation as possible for the program.  The problem is that isel can only
+/// see within a single block.  As such, we sink as much legal addressing mode
+/// stuff into the block as possible.
+///
+/// This method is used to optimize both load/store and inline asms with memory
+/// operands.
+bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
+                                        const Type *AccessTy) {
+  Value *Repl = Addr;
+
+  // Try to collapse single-value PHI nodes.  This is necessary to undo
+  // unprofitable PRE transformations.
+  SmallVector<Value*, 8> worklist;
+  SmallPtrSet<Value*, 16> Visited;
+  worklist.push_back(Addr);
+
+  // Use a worklist to iteratively look through PHI nodes, and ensure that
+  // the addressing mode obtained from the non-PHI roots of the graph
+  // are equivalent.
+  Value *Consensus = 0;
+  unsigned NumUses = 0;
+  SmallVector<Instruction*, 16> AddrModeInsts;
+  ExtAddrMode AddrMode;
+  while (!worklist.empty()) {
+    Value *V = worklist.back();
+    worklist.pop_back();
+
+    // Break use-def graph loops.
+    if (Visited.count(V)) {
+      Consensus = 0;
+      break;
+    }
+
+    Visited.insert(V);
+
+    // For a PHI node, push all of its incoming values.
+    if (PHINode *P = dyn_cast<PHINode>(V)) {
+      for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i)
+        worklist.push_back(P->getIncomingValue(i));
+      continue;
+    }
+
+    // For non-PHIs, determine the addressing mode being computed.
+    SmallVector<Instruction*, 16> NewAddrModeInsts;
+    ExtAddrMode NewAddrMode =
+      AddressingModeMatcher::Match(V, AccessTy,MemoryInst,
+                                   NewAddrModeInsts, *TLI);
+
+    // Ensure that the obtained addressing mode is equivalent to that obtained
+    // for all other roots of the PHI traversal.  Also, when choosing one
+    // such root as representative, select the one with the most uses in order
+    // to keep the cost modeling heuristics in AddressingModeMatcher applicable.
+    if (!Consensus || NewAddrMode == AddrMode) {
+      if (V->getNumUses() > NumUses) {
+        Consensus = V;
+        NumUses = V->getNumUses();
+        AddrMode = NewAddrMode;
+        AddrModeInsts = NewAddrModeInsts;
+      }
+      continue;
+    }
+
+    Consensus = 0;
+    break;
+  }
+
+  // If the addressing mode couldn't be determined, or if multiple different
+  // ones were determined, bail out now.
+  if (!Consensus) return false;
+
+  // Check to see if any of the instructions subsumed by this addr mode are
+  // non-local to I's BB.
+  bool AnyNonLocal = false;
+  for (unsigned i = 0, e = AddrModeInsts.size(); i != e; ++i) {
+    if (IsNonLocalValue(AddrModeInsts[i], MemoryInst->getParent())) {
+      AnyNonLocal = true;
+      break;
+    }
+  }
+
+  // If all the instructions matched are already in this BB, don't do anything.
+  if (!AnyNonLocal) {
+    DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode << "\n");
+    return false;
+  }
+
+  // Insert this computation right after this user.  Since our caller is
+  // scanning from the top of the BB to the bottom, reuses of the expr are
+  // guaranteed to happen later.
+  BasicBlock::iterator InsertPt = MemoryInst;
+
+  // Now that we have determined the addressing expression we want to use, and
+  // know that we have to sink it into this block, check to see if we have
+  // already done this for some other load/store instr in this block.  If so,
+  // reuse the computation.
+  Value *&SunkAddr = SunkAddrs[Addr];
+  if (SunkAddr) {
+    DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "
+                 << *MemoryInst);
+    if (SunkAddr->getType() != Addr->getType())
+      SunkAddr = new BitCastInst(SunkAddr, Addr->getType(), "tmp", InsertPt);
+  } else {
+    DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
+                 << *MemoryInst);
+    const Type *IntPtrTy =
+          TLI->getTargetData()->getIntPtrType(AccessTy->getContext());
+
+    Value *Result = 0;
+
+    // Start with the base register. Do this first so that subsequent address
+    // matching finds it last, which will prevent it from trying to match it
+    // as the scaled value in case it happens to be a mul. That would be
+    // problematic if we've sunk a different mul for the scale, because then
+    // we'd end up sinking both muls.
+    if (AddrMode.BaseReg) {
+      Value *V = AddrMode.BaseReg;
+      if (V->getType()->isPointerTy())
+        V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      if (V->getType() != IntPtrTy)
+        V = CastInst::CreateIntegerCast(V, IntPtrTy, /*isSigned=*/true,
+                                        "sunkaddr", InsertPt);
+      Result = V;
+    }
+
+    // Add the scale value.
+    if (AddrMode.Scale) {
+      Value *V = AddrMode.ScaledReg;
+      if (V->getType() == IntPtrTy) {
+        // done.
+      } else if (V->getType()->isPointerTy()) {
+        V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
+                 cast<IntegerType>(V->getType())->getBitWidth()) {
+        V = new TruncInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      } else {
+        V = new SExtInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      }
+      if (AddrMode.Scale != 1)
+        V = BinaryOperator::CreateMul(V, ConstantInt::get(IntPtrTy,
+                                                          AddrMode.Scale),
+                                      "sunkaddr", InsertPt);
+      if (Result)
+        Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
+      else
+        Result = V;
+    }
+
+    // Add in the BaseGV if present.
+    if (AddrMode.BaseGV) {
+      Value *V = new PtrToIntInst(AddrMode.BaseGV, IntPtrTy, "sunkaddr",
+                                  InsertPt);
+      if (Result)
+        Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
+      else
+        Result = V;
+    }
+
+    // Add in the Base Offset if present.
+    if (AddrMode.BaseOffs) {
+      Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
+      if (Result)
+        Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
+      else
+        Result = V;
+    }
+
+    if (Result == 0)
+      SunkAddr = Constant::getNullValue(Addr->getType());
+    else
+      SunkAddr = new IntToPtrInst(Result, Addr->getType(), "sunkaddr",InsertPt);
+  }
+
+  MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
+
+  if (Repl->use_empty()) {
+    RecursivelyDeleteTriviallyDeadInstructions(Repl);
+    // This address is now available for reassignment, so erase the table entry;
+    // we don't want to match some completely different instruction.
+    SunkAddrs[Addr] = 0;
+  }
+  ++NumMemoryInsts;
+  return true;
+}
+
+/// OptimizeInlineAsmInst - If there are any memory operands, use
+/// OptimizeMemoryInst to sink their address computing into the block when
+/// possible / profitable.
+bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) {
+  bool MadeChange = false;
+
+  TargetLowering::AsmOperandInfoVector
+    TargetConstraints = TLI->ParseConstraints(CS);
+  unsigned ArgNo = 0;
+  for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
+    TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
+
+    // Compute the constraint code and ConstraintType to use.
+    TLI->ComputeConstraintToUse(OpInfo, SDValue());
+
+    if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
+        OpInfo.isIndirect) {
+      Value *OpVal = CS->getArgOperand(ArgNo++);
+      MadeChange |= OptimizeMemoryInst(CS, OpVal, OpVal->getType());
+    } else if (OpInfo.Type == InlineAsm::isInput)
+      ArgNo++;
+  }
+
+  return MadeChange;
+}
+
+/// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same
+/// basic block as the load, unless conditions are unfavorable. This allows
+/// SelectionDAG to fold the extend into the load.
+///
+bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) {
+  // Look for a load being extended.
+  LoadInst *LI = dyn_cast<LoadInst>(I->getOperand(0));
+  if (!LI) return false;
+
+  // If they're already in the same block, there's nothing to do.
+  if (LI->getParent() == I->getParent())
+    return false;
+
+  // If the load has other users and the truncate is not free, this probably
+  // isn't worthwhile.
+  if (!LI->hasOneUse() &&
+      TLI && (TLI->isTypeLegal(TLI->getValueType(LI->getType())) ||
+              !TLI->isTypeLegal(TLI->getValueType(I->getType()))) &&
+      !TLI->isTruncateFree(I->getType(), LI->getType()))
+    return false;
+
+  // Check whether the target supports casts folded into loads.
+  unsigned LType;
+  if (isa<ZExtInst>(I))
+    LType = ISD::ZEXTLOAD;
+  else {
+    assert(isa<SExtInst>(I) && "Unexpected ext type!");
+    LType = ISD::SEXTLOAD;
+  }
+  if (TLI && !TLI->isLoadExtLegal(LType, TLI->getValueType(LI->getType())))
+    return false;
+
+  // Move the extend into the same block as the load, so that SelectionDAG
+  // can fold it.
+  I->removeFromParent();
+  I->insertAfter(LI);
+  ++NumExtsMoved;
+  return true;
+}
+
+bool CodeGenPrepare::OptimizeExtUses(Instruction *I) {
+  BasicBlock *DefBB = I->getParent();
+
+  // If the result of a {s|z}ext and its source are both live out, rewrite all
+  // other uses of the source with result of extension.
+  Value *Src = I->getOperand(0);
+  if (Src->hasOneUse())
+    return false;
+
+  // Only do this xform if truncating is free.
+  if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType()))
+    return false;
+
+  // Only safe to perform the optimization if the source is also defined in
+  // this block.
+  if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
+    return false;
+
+  bool DefIsLiveOut = false;
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+       UI != E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Figure out which BB this ext is used in.
+    BasicBlock *UserBB = User->getParent();
+    if (UserBB == DefBB) continue;
+    DefIsLiveOut = true;
+    break;
+  }
+  if (!DefIsLiveOut)
+    return false;
+
+  // Make sure none of the uses are PHI nodes.
+  for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end();
+       UI != E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+    BasicBlock *UserBB = User->getParent();
+    if (UserBB == DefBB) continue;
+    // Be conservative. We don't want this xform to end up introducing
+    // reloads just before load / store instructions.
+    if (isa<PHINode>(User) || isa<LoadInst>(User) || isa<StoreInst>(User))
+      return false;
+  }
+
+  // InsertedTruncs - Only insert one trunc in each block once.
+  DenseMap<BasicBlock*, Instruction*> InsertedTruncs;
+
+  bool MadeChange = false;
+  for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end();
+       UI != E; ++UI) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Figure out which BB this ext is used in.
+    BasicBlock *UserBB = User->getParent();
+    if (UserBB == DefBB) continue;
+
+    // Both src and def are live in this block. Rewrite the use.
+    Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
+
+    if (!InsertedTrunc) {
+      BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
+
+      InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt);
+    }
+
+    // Replace a use of the {s|z}ext source with a use of the result.
+    TheUse = InsertedTrunc;
+    ++NumExtUses;
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+bool CodeGenPrepare::OptimizeInst(Instruction *I) {
+  if (PHINode *P = dyn_cast<PHINode>(I)) {
+    // It is possible for very late stage optimizations (such as SimplifyCFG)
+    // to introduce PHI nodes too late to be cleaned up.  If we detect such a
+    // trivial PHI, go ahead and zap it here.
+    if (Value *V = SimplifyInstruction(P)) {
+      P->replaceAllUsesWith(V);
+      P->eraseFromParent();
+      ++NumPHIsElim;
+      return true;
+    }
+    return false;
+  }
+
+  if (CastInst *CI = dyn_cast<CastInst>(I)) {
+    // If the source of the cast is a constant, then this should have
+    // already been constant folded.  The only reason NOT to constant fold
+    // it is if something (e.g. LSR) was careful to place the constant
+    // evaluation in a block other than the one that uses it (e.g. to hoist
+    // the address of globals out of a loop).  If this is the case, we don't
+    // want to forward-subst the cast.
+    if (isa<Constant>(CI->getOperand(0)))
+      return false;
+
+    if (TLI && OptimizeNoopCopyExpression(CI, *TLI))
+      return true;
+
+    if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
+      bool MadeChange = MoveExtToFormExtLoad(I);
+      return MadeChange | OptimizeExtUses(I);
+    }
+    return false;
+  }
+
+  if (CmpInst *CI = dyn_cast<CmpInst>(I))
+    return OptimizeCmpExpression(CI);
+
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    if (TLI)
+      return OptimizeMemoryInst(I, I->getOperand(0), LI->getType());
+    return false;
+  }
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    if (TLI)
+      return OptimizeMemoryInst(I, SI->getOperand(1),
+                                SI->getOperand(0)->getType());
+    return false;
+  }
+
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+    if (GEPI->hasAllZeroIndices()) {
+      /// The GEP operand must be a pointer, so must its result -> BitCast
+      Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
+                                        GEPI->getName(), GEPI);
+      GEPI->replaceAllUsesWith(NC);
+      GEPI->eraseFromParent();
+      ++NumGEPsElim;
+      OptimizeInst(NC);
+      return true;
+    }
+    return false;
+  }
+
+  if (CallInst *CI = dyn_cast<CallInst>(I))
+    return OptimizeCallInst(CI);
+
+  return false;
+}
+
+// In this pass we look for GEP and cast instructions that are used
+// across basic blocks and rewrite them to improve basic-block-at-a-time
+// selection.
+bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
+  bool MadeChange = false;
+
+  // Split all critical edges where the dest block has a PHI.
+  if (CriticalEdgeSplit) {
+    TerminatorInst *BBTI = BB.getTerminator();
+    if (BBTI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(BBTI)) {
+      for (unsigned i = 0, e = BBTI->getNumSuccessors(); i != e; ++i) {
+        BasicBlock *SuccBB = BBTI->getSuccessor(i);
+        if (isa<PHINode>(SuccBB->begin()) && isCriticalEdge(BBTI, i, true))
+          SplitEdgeNicely(BBTI, i, BackEdges, this);
+      }
+    }
+  }
+
+  SunkAddrs.clear();
+
+  CurInstIterator = BB.begin();
+  for (BasicBlock::iterator E = BB.end(); CurInstIterator != E; )
+    MadeChange |= OptimizeInst(CurInstIterator++);
+
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
new file mode 100644
index 0000000..664c3f6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
@@ -0,0 +1,91 @@
+//===- ConstantProp.cpp - Code to perform Simple Constant Propagation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements constant propagation and merging:
+//
+// Specifically, this:
+//   * Converts instructions like "add int 1, 2" into 3
+//
+// Notice that:
+//   * This pass has a habit of making definitions be dead.  It is a good idea
+//     to run a DIE pass sometime after running this pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "constprop"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Constant.h"
+#include "llvm/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumInstKilled, "Number of instructions killed");
+
+namespace {
+  struct ConstantPropagation : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    ConstantPropagation() : FunctionPass(ID) {
+      initializeConstantPropagationPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+    }
+  };
+}
+
+char ConstantPropagation::ID = 0;
+INITIALIZE_PASS(ConstantPropagation, "constprop",
+                "Simple constant propagation", false, false)
+
+FunctionPass *llvm::createConstantPropagationPass() {
+  return new ConstantPropagation();
+}
+
+
+bool ConstantPropagation::runOnFunction(Function &F) {
+  // Initialize the worklist to all of the instructions ready to process...
+  std::set<Instruction*> WorkList;
+  for(inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    WorkList.insert(&*i);
+  }
+  bool Changed = false;
+
+  while (!WorkList.empty()) {
+    Instruction *I = *WorkList.begin();
+    WorkList.erase(WorkList.begin());    // Get an element from the worklist...
+
+    if (!I->use_empty())                 // Don't muck with dead instructions...
+      if (Constant *C = ConstantFoldInstruction(I)) {
+        // Add all of the users of this instruction to the worklist, they might
+        // be constant propagatable now...
+        for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+             UI != UE; ++UI)
+          WorkList.insert(cast<Instruction>(*UI));
+
+        // Replace all of the uses of the instruction with uses of the constant.
+        I->replaceAllUsesWith(C);
+
+        // Remove the dead instruction.
+        WorkList.erase(I);
+        I->eraseFromParent();
+
+        // We made a change to the function...
+        Changed = true;
+        ++NumInstKilled;
+      }
+  }
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
new file mode 100644
index 0000000..be12973
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -0,0 +1,206 @@
+//===- CorrelatedValuePropagation.cpp - Propagate CFG-derived info --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Correlated Value Propagation pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "correlated-value-propagation"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumPhis, "Number of phis propagated");
+STATISTIC(NumSelects, "Number of selects propagated");
+STATISTIC(NumMemAccess, "Number of memory access targets propagated");
+STATISTIC(NumCmps, "Number of comparisons propagated");
+
+namespace {
+  class CorrelatedValuePropagation : public FunctionPass {
+    LazyValueInfo *LVI;
+
+    bool processSelect(SelectInst *SI);
+    bool processPHI(PHINode *P);
+    bool processMemAccess(Instruction *I);
+    bool processCmp(CmpInst *C);
+
+  public:
+    static char ID;
+    CorrelatedValuePropagation(): FunctionPass(ID) {
+      initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<LazyValueInfo>();
+    }
+  };
+}
+
+char CorrelatedValuePropagation::ID = 0;
+INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
+                "Value Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfo)
+INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
+                "Value Propagation", false, false)
+
+// Public interface to the Value Propagation pass
+Pass *llvm::createCorrelatedValuePropagationPass() {
+  return new CorrelatedValuePropagation();
+}
+
+bool CorrelatedValuePropagation::processSelect(SelectInst *S) {
+  if (S->getType()->isVectorTy()) return false;
+  if (isa<Constant>(S->getOperand(0))) return false;
+
+  Constant *C = LVI->getConstant(S->getOperand(0), S->getParent());
+  if (!C) return false;
+
+  ConstantInt *CI = dyn_cast<ConstantInt>(C);
+  if (!CI) return false;
+
+  Value *ReplaceWith = S->getOperand(1);
+  Value *Other = S->getOperand(2);
+  if (!CI->isOne()) std::swap(ReplaceWith, Other);
+  if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType());
+
+  S->replaceAllUsesWith(ReplaceWith);
+  S->eraseFromParent();
+
+  ++NumSelects;
+
+  return true;
+}
+
+bool CorrelatedValuePropagation::processPHI(PHINode *P) {
+  bool Changed = false;
+
+  BasicBlock *BB = P->getParent();
+  for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
+    Value *Incoming = P->getIncomingValue(i);
+    if (isa<Constant>(Incoming)) continue;
+
+    Constant *C = LVI->getConstantOnEdge(P->getIncomingValue(i),
+                                         P->getIncomingBlock(i),
+                                         BB);
+    if (!C) continue;
+
+    P->setIncomingValue(i, C);
+    Changed = true;
+  }
+
+  if (Value *V = SimplifyInstruction(P)) {
+    P->replaceAllUsesWith(V);
+    P->eraseFromParent();
+    Changed = true;
+  }
+
+  ++NumPhis;
+
+  return Changed;
+}
+
+bool CorrelatedValuePropagation::processMemAccess(Instruction *I) {
+  Value *Pointer = 0;
+  if (LoadInst *L = dyn_cast<LoadInst>(I))
+    Pointer = L->getPointerOperand();
+  else
+    Pointer = cast<StoreInst>(I)->getPointerOperand();
+
+  if (isa<Constant>(Pointer)) return false;
+
+  Constant *C = LVI->getConstant(Pointer, I->getParent());
+  if (!C) return false;
+
+  ++NumMemAccess;
+  I->replaceUsesOfWith(Pointer, C);
+  return true;
+}
+
+/// processCmp - If the value of this comparison could be determined locally,
+/// constant propagation would already have figured it out. Instead, walk
+/// the predecessors and statically evaluate the comparison based on information
+/// available on that edge. If a given static evaluation is true on ALL
+/// incoming edges, then it's true universally and we can simplify the compare.
+bool CorrelatedValuePropagation::processCmp(CmpInst *C) {
+  Value *Op0 = C->getOperand(0);
+  if (isa<Instruction>(Op0) &&
+      cast<Instruction>(Op0)->getParent() == C->getParent())
+    return false;
+
+  Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
+  if (!Op1) return false;
+
+  pred_iterator PI = pred_begin(C->getParent()), PE = pred_end(C->getParent());
+  if (PI == PE) return false;
+
+  LazyValueInfo::Tristate Result = LVI->getPredicateOnEdge(C->getPredicate(),
+                    C->getOperand(0), Op1, *PI, C->getParent());
+  if (Result == LazyValueInfo::Unknown) return false;
+
+  ++PI;
+  while (PI != PE) {
+    LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(C->getPredicate(),
+                    C->getOperand(0), Op1, *PI, C->getParent());
+    if (Res != Result) return false;
+    ++PI;
+  }
+
+  ++NumCmps;
+
+  if (Result == LazyValueInfo::True)
+    C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext()));
+  else
+    C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext()));
+
+  C->eraseFromParent();
+
+  return true;
+}
+
+bool CorrelatedValuePropagation::runOnFunction(Function &F) {
+  LVI = &getAnalysis<LazyValueInfo>();
+
+  bool FnChanged = false;
+
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+    bool BBChanged = false;
+    for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) {
+      Instruction *II = BI++;
+      switch (II->getOpcode()) {
+      case Instruction::Select:
+        BBChanged |= processSelect(cast<SelectInst>(II));
+        break;
+      case Instruction::PHI:
+        BBChanged |= processPHI(cast<PHINode>(II));
+        break;
+      case Instruction::ICmp:
+      case Instruction::FCmp:
+        BBChanged |= processCmp(cast<CmpInst>(II));
+        break;
+      case Instruction::Load:
+      case Instruction::Store:
+        BBChanged |= processMemAccess(II);
+        break;
+      }
+    }
+
+    FnChanged |= BBChanged;
+  }
+
+  return FnChanged;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
new file mode 100644
index 0000000..dbb68f3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -0,0 +1,136 @@
+//===- DCE.cpp - Code to perform dead code elimination --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead inst elimination and dead code elimination.
+//
+// Dead Inst Elimination performs a single pass over the function removing
+// instructions that are obviously dead.  Dead Code Elimination is similar, but
+// it rechecks instructions that were used by removed instructions to see if
+// they are newly dead.
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "dce" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Instruction.h" +#include "llvm/Pass.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/ADT/Statistic.h" +#include <set> +using namespace llvm; + +STATISTIC(DIEEliminated, "Number of insts removed by DIE pass"); +STATISTIC(DCEEliminated, "Number of insts removed"); + +namespace { + //===--------------------------------------------------------------------===// + // DeadInstElimination pass implementation + // + struct DeadInstElimination : public BasicBlockPass { + static char ID; // Pass identification, replacement for typeid + DeadInstElimination() : BasicBlockPass(ID) { + initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); + } + virtual bool runOnBasicBlock(BasicBlock &BB) { + bool Changed = false; + for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { + Instruction *Inst = DI++; + if (isInstructionTriviallyDead(Inst)) { + Inst->eraseFromParent(); + Changed = true; + ++DIEEliminated; + } + } + return Changed; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + } + }; +} + +char DeadInstElimination::ID = 0; +INITIALIZE_PASS(DeadInstElimination, "die", + "Dead Instruction Elimination", false, false) + +Pass *llvm::createDeadInstEliminationPass() { + return new DeadInstElimination(); +} + + +namespace { + //===--------------------------------------------------------------------===// + // DeadCodeElimination pass implementation + // + struct DCE : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + DCE() : FunctionPass(ID) { + initializeDCEPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + } + }; +} + +char DCE::ID = 0; 
+INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false) + +bool DCE::runOnFunction(Function &F) { + // Start out with all of the instructions in the worklist... + std::vector<Instruction*> WorkList; + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) + WorkList.push_back(&*i); + + // Loop over the worklist finding instructions that are dead. If they are + // dead make them drop all of their uses, making other instructions + // potentially dead, and work until the worklist is empty. + // + bool MadeChange = false; + while (!WorkList.empty()) { + Instruction *I = WorkList.back(); + WorkList.pop_back(); + + if (isInstructionTriviallyDead(I)) { // If the instruction is dead. + // Loop over all of the values that the instruction uses, if there are + // instructions being used, add them to the worklist, because they might + // go dead after this one is removed. + // + for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) + if (Instruction *Used = dyn_cast<Instruction>(*OI)) + WorkList.push_back(Used); + + // Remove the instruction. + I->eraseFromParent(); + + // Remove the instruction from the worklist if it still exists in it. + for (std::vector<Instruction*>::iterator WI = WorkList.begin(); + WI != WorkList.end(); ) { + if (*WI == I) + WI = WorkList.erase(WI); + else + ++WI; + } + + MadeChange = true; + ++DCEEliminated; + } + } + return MadeChange; +} + +FunctionPass *llvm::createDeadCodeEliminationPass() { + return new DCE(); +} + diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp new file mode 100644 index 0000000..867a06a --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -0,0 +1,730 @@ +//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a trivial dead store elimination that only considers +// basic-block local redundant stores. +// +// FIXME: This should eventually be extended to be a post-dominator tree +// traversal. Doing so would be pretty trivial. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "dse" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumFastStores, "Number of stores deleted"); +STATISTIC(NumFastOther , "Number of other instrs removed"); + +namespace { + struct DSE : public FunctionPass { + AliasAnalysis *AA; + MemoryDependenceAnalysis *MD; + + static char ID; // Pass identification, replacement for typeid + DSE() : FunctionPass(ID), AA(0), MD(0) { + initializeDSEPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function &F) { + AA = &getAnalysis<AliasAnalysis>(); + MD = &getAnalysis<MemoryDependenceAnalysis>(); + DominatorTree &DT = getAnalysis<DominatorTree>(); + + bool Changed = false; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) + // Only check non-dead blocks. Dead blocks may have strange pointer + // cycles that will confuse alias analysis. 
+ if (DT.isReachableFromEntry(I)) + Changed |= runOnBasicBlock(*I); + + AA = 0; MD = 0; + return Changed; + } + + bool runOnBasicBlock(BasicBlock &BB); + bool HandleFree(CallInst *F); + bool handleEndBlock(BasicBlock &BB); + void RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, + SmallPtrSet<Value*, 16> &DeadStackObjects); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<DominatorTree>(); + AU.addRequired<AliasAnalysis>(); + AU.addRequired<MemoryDependenceAnalysis>(); + AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<DominatorTree>(); + AU.addPreserved<MemoryDependenceAnalysis>(); + } + }; +} + +char DSE::ID = 0; +INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false) + +FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } + +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + +/// DeleteDeadInstruction - Delete this instruction. Before we do, go through +/// and zero out all the operands of this instruction. If any of them become +/// dead, delete them and the computation tree that feeds them. +/// +/// If ValueSet is non-null, remove any deleted instructions from it as well. +/// +static void DeleteDeadInstruction(Instruction *I, + MemoryDependenceAnalysis &MD, + SmallPtrSet<Value*, 16> *ValueSet = 0) { + SmallVector<Instruction*, 32> NowDeadInsts; + + NowDeadInsts.push_back(I); + --NumFastOther; + + // Before we touch this instruction, remove it from memdep! + do { + Instruction *DeadInst = NowDeadInsts.pop_back_val(); + ++NumFastOther; + + // This instruction is dead, zap it, in stages. 
Start by removing it from + // MemDep, which needs to know the operands and needs it to be in the + // function. + MD.removeInstruction(DeadInst); + + for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { + Value *Op = DeadInst->getOperand(op); + DeadInst->setOperand(op, 0); + + // If this operand just became dead, add it to the NowDeadInsts list. + if (!Op->use_empty()) continue; + + if (Instruction *OpI = dyn_cast<Instruction>(Op)) + if (isInstructionTriviallyDead(OpI)) + NowDeadInsts.push_back(OpI); + } + + DeadInst->eraseFromParent(); + + if (ValueSet) ValueSet->erase(DeadInst); + } while (!NowDeadInsts.empty()); +} + + +/// hasMemoryWrite - Does this instruction write some memory? This only returns +/// true for things that we can analyze with other helpers below. +static bool hasMemoryWrite(Instruction *I) { + if (isa<StoreInst>(I)) + return true; + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: + return false; + case Intrinsic::memset: + case Intrinsic::memmove: + case Intrinsic::memcpy: + case Intrinsic::init_trampoline: + case Intrinsic::lifetime_end: + return true; + } + } + return false; +} + +/// getLocForWrite - Return a Location stored to by the specified instruction. +static AliasAnalysis::Location +getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + return AA.getLocation(SI); + + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) { + // memcpy/memmove/memset. + AliasAnalysis::Location Loc = AA.getLocationForDest(MI); + // If we don't have target data around, an unknown size in Location means + // that we should use the size of the pointee type. This isn't valid for + // memset/memcpy, which writes more than an i8. 
+ if (Loc.Size == AliasAnalysis::UnknownSize && AA.getTargetData() == 0) + return AliasAnalysis::Location(); + return Loc; + } + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); + if (II == 0) return AliasAnalysis::Location(); + + switch (II->getIntrinsicID()) { + default: return AliasAnalysis::Location(); // Unhandled intrinsic. + case Intrinsic::init_trampoline: + // If we don't have target data around, an unknown size in Location means + // that we should use the size of the pointee type. This isn't valid for + // init.trampoline, which writes more than an i8. + if (AA.getTargetData() == 0) return AliasAnalysis::Location(); + + // FIXME: We don't know the size of the trampoline, so we can't really + // handle it here. + return AliasAnalysis::Location(II->getArgOperand(0)); + case Intrinsic::lifetime_end: { + uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue(); + return AliasAnalysis::Location(II->getArgOperand(1), Len); + } + } +} + +/// getLocForRead - Return the location read by the specified "hasMemoryWrite" +/// instruction if any. +static AliasAnalysis::Location +getLocForRead(Instruction *Inst, AliasAnalysis &AA) { + assert(hasMemoryWrite(Inst) && "Unknown instruction case"); + + // The only instructions that both read and write are the mem transfer + // instructions (memcpy/memmove). + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) + return AA.getLocationForSource(MTI); + return AliasAnalysis::Location(); +} + + +/// isRemovable - If the value of this instruction and the memory it writes to +/// is unused, may we delete this instruction? +static bool isRemovable(Instruction *I) { + // Don't remove volatile stores. + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return !SI->isVolatile(); + + IntrinsicInst *II = cast<IntrinsicInst>(I); + switch (II->getIntrinsicID()) { + default: assert(0 && "doesn't pass 'hasMemoryWrite' predicate"); + case Intrinsic::lifetime_end: + // Never remove dead lifetime_end's, e.g. 
because it is followed by a + // free. + return false; + case Intrinsic::init_trampoline: + // Always safe to remove init_trampoline. + return true; + + case Intrinsic::memset: + case Intrinsic::memmove: + case Intrinsic::memcpy: + // Don't remove volatile memory intrinsics. + return !cast<MemIntrinsic>(II)->isVolatile(); + } +} + +/// getStoredPointerOperand - Return the pointer that is being written to. +static Value *getStoredPointerOperand(Instruction *I) { + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->getPointerOperand(); + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) + return MI->getDest(); + + IntrinsicInst *II = cast<IntrinsicInst>(I); + switch (II->getIntrinsicID()) { + default: assert(false && "Unexpected intrinsic!"); + case Intrinsic::init_trampoline: + return II->getArgOperand(0); + } +} + +static uint64_t getPointerSize(Value *V, AliasAnalysis &AA) { + const TargetData *TD = AA.getTargetData(); + if (TD == 0) + return AliasAnalysis::UnknownSize; + + if (AllocaInst *A = dyn_cast<AllocaInst>(V)) { + // Get size information for the alloca + if (ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize())) + return C->getZExtValue() * TD->getTypeAllocSize(A->getAllocatedType()); + return AliasAnalysis::UnknownSize; + } + + assert(isa<Argument>(V) && "Expected AllocaInst or Argument!"); + const PointerType *PT = cast<PointerType>(V->getType()); + return TD->getTypeAllocSize(PT->getElementType()); +} + +/// isObjectPointerWithTrustworthySize - Return true if the specified Value* is +/// pointing to an object with a pointer size we can trust. 
+static bool isObjectPointerWithTrustworthySize(const Value *V) { + if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) + return !AI->isArrayAllocation(); + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) + return !GV->mayBeOverridden(); + if (const Argument *A = dyn_cast<Argument>(V)) + return A->hasByValAttr(); + return false; +} + +/// isCompleteOverwrite - Return true if a store to the 'Later' location +/// completely overwrites a store to the 'Earlier' location. +static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, + const AliasAnalysis::Location &Earlier, + AliasAnalysis &AA) { + const Value *P1 = Earlier.Ptr->stripPointerCasts(); + const Value *P2 = Later.Ptr->stripPointerCasts(); + + // If the start pointers are the same, we just have to compare sizes to see if + // the later store was larger than the earlier store. + if (P1 == P2) { + // If we don't know the sizes of either access, then we can't do a + // comparison. + if (Later.Size == AliasAnalysis::UnknownSize || + Earlier.Size == AliasAnalysis::UnknownSize) { + // If we have no TargetData information around, then the size of the store + // is inferrable from the pointee type. If they are the same type, then + // we know that the store is safe. + if (AA.getTargetData() == 0) + return Later.Ptr->getType() == Earlier.Ptr->getType(); + return false; + } + + // Make sure that the Later size is >= the Earlier size. + if (Later.Size < Earlier.Size) + return false; + return true; + } + + // Otherwise, we have to have size information, and the later store has to be + // larger than the earlier one. + if (Later.Size == AliasAnalysis::UnknownSize || + Earlier.Size == AliasAnalysis::UnknownSize || + Later.Size <= Earlier.Size || AA.getTargetData() == 0) + return false; + + // Check to see if the later store is to the entire object (either a global, + // an alloca, or a byval argument). If so, then it clearly overwrites any + // other store to the same object. 
+ const TargetData &TD = *AA.getTargetData(); + + const Value *UO1 = GetUnderlyingObject(P1, &TD), + *UO2 = GetUnderlyingObject(P2, &TD); + + // If we can't resolve the same pointers to the same object, then we can't + // analyze them at all. + if (UO1 != UO2) + return false; + + // If the "Later" store is to a recognizable object, get its size. + if (isObjectPointerWithTrustworthySize(UO2)) { + uint64_t ObjectSize = + TD.getTypeAllocSize(cast<PointerType>(UO2->getType())->getElementType()); + if (ObjectSize == Later.Size) + return true; + } + + // Okay, we have stores to two completely different pointers. Try to + // decompose the pointer into a "base + constant_offset" form. If the base + // pointers are equal, then we can reason about the two stores. + int64_t Off1 = 0, Off2 = 0; + const Value *BP1 = GetPointerBaseWithConstantOffset(P1, Off1, TD); + const Value *BP2 = GetPointerBaseWithConstantOffset(P2, Off2, TD); + + // If the base pointers still differ, we have two completely different stores. + if (BP1 != BP2) + return false; + + // Otherwise, we might have a situation like: + // store i16 -> P + 1 Byte + // store i32 -> P + // In this case, we see if the later store completely overlaps all bytes + // stored by the previous store. + if (Off1 < Off2 || // Earlier starts before Later. + Off1+Earlier.Size > Off2+Later.Size) // Earlier goes beyond Later. + return false; + // Otherwise, we have complete overlap. + return true; +} + +/// isPossibleSelfRead - If 'Inst' might be a self read (i.e. a noop copy of a +/// memory region into an identical pointer) then it doesn't actually make its +/// input dead in the traditional sense. Consider this case: +/// +/// memcpy(A <- B) +/// memcpy(A <- A) +/// +/// In this case, the second store to A does not make the first store to A dead. +/// The usual situation isn't an explicit A<-A store like this (which can be +/// trivially removed) but a case where two pointers may alias. 
+/// 
+/// This function detects when it is unsafe to remove a dependent instruction 
+/// because the DSE inducing instruction may be a self-read. 
+static bool isPossibleSelfRead(Instruction *Inst, 
+ const AliasAnalysis::Location &InstStoreLoc, 
+ Instruction *DepWrite, AliasAnalysis &AA) { 
+ // Self reads can only happen for instructions that read memory. Get the 
+ // location read. 
+ AliasAnalysis::Location InstReadLoc = getLocForRead(Inst, AA); 
+ if (InstReadLoc.Ptr == 0) return false; // Not a reading instruction. 
+ 
+ // If the read and written loc obviously don't alias, it isn't a read. 
+ if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false; 
+ 
+ // Okay, 'Inst' may copy over itself. However, we can still remove the 
+ // DepWrite instruction if we can prove that it reads from the same location 
+ // as Inst. This handles useful cases like: 
+ // memcpy(A <- B) 
+ // memcpy(A <- B) 
+ // Here we don't know if A/B may alias, but we do know that B/B are must 
+ // aliases, so removing the first memcpy is safe (assuming it writes <= # 
+ // bytes as the second one). 
+ AliasAnalysis::Location DepReadLoc = getLocForRead(DepWrite, AA); 
+ 
+ if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr)) 
+ return false; 
+ 
+ // If DepWrite doesn't read memory or if we can't prove it is a must alias, 
+ // then it can't be considered dead. 
+ return true; 
+} 
+ 
+ 
+//===----------------------------------------------------------------------===// 
+// DSE Pass 
+//===----------------------------------------------------------------------===// 
+ 
+bool DSE::runOnBasicBlock(BasicBlock &BB) { 
+ bool MadeChange = false; 
+ 
+ // Do a top-down walk on the BB. 
+ for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { 
+ Instruction *Inst = BBI++; 
+ 
+ // Handle 'free' calls specially. 
+ if (CallInst *F = isFreeCall(Inst)) { 
+ MadeChange |= HandleFree(F); 
+ continue; 
+ } 
+ 
+ // If we find something that writes memory, get its memory dependence. 
+ if (!hasMemoryWrite(Inst)) 
+ continue; 
+ 
+ MemDepResult InstDep = MD->getDependency(Inst); 
+ 
+ // Ignore non-local store liveness. 
+ // FIXME: cross-block DSE would be fun. :) 
+ if (InstDep.isNonLocal() || 
+ // Ignore self dependence, which happens in the entry block of the 
+ // function. 
+ InstDep.getInst() == Inst) 
+ continue; 
+ 
+ // If we're storing the same value back to a pointer that we just 
+ // loaded from, then the store can be removed. 
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { 
+ if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) { 
+ if (SI->getPointerOperand() == DepLoad->getPointerOperand() && 
+ SI->getOperand(0) == DepLoad && !SI->isVolatile()) { 
+ DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n " 
+ << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); 
+ 
+ // DeleteDeadInstruction can delete the current instruction. Save BBI 
+ // in case we need it. 
+ WeakVH NextInst(BBI); 
+ 
+ DeleteDeadInstruction(SI, *MD); 
+ 
+ if (NextInst == 0) // Next instruction deleted. 
+ BBI = BB.begin(); 
+ else if (BBI != BB.begin()) // Revisit this instruction if possible. 
+ --BBI; 
+ ++NumFastStores; 
+ MadeChange = true; 
+ continue; 
+ } 
+ } 
+ } 
+ 
+ // Figure out what location is being stored to. 
+ AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA); 
+ 
+ // If we didn't get a useful location, fail. 
+ if (Loc.Ptr == 0) 
+ continue; 
+ 
+ while (!InstDep.isNonLocal()) { 
+ // Get the memory clobbered by the instruction we depend on. MemDep will 
+ // skip any instructions that 'Loc' clearly doesn't interact with. If we 
+ // end up depending on a may- or must-aliased load, then we can't optimize 
+ // away the store and we bail out. However, if we depend on something 
+ // that overwrites the memory location we *can* potentially optimize it. 
+ // 
+ // Find out what memory location the dependent instruction stores. 
+ Instruction *DepWrite = InstDep.getInst(); + AliasAnalysis::Location DepLoc = getLocForWrite(DepWrite, *AA); + // If we didn't get a useful location, or if it isn't a size, bail out. + if (DepLoc.Ptr == 0) + break; + + // If we find a write that is a) removable (i.e., non-volatile), b) is + // completely obliterated by the store to 'Loc', and c) which we know that + // 'Inst' doesn't load from, then we can remove it. + if (isRemovable(DepWrite) && isCompleteOverwrite(Loc, DepLoc, *AA) && + !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { + DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " + << *DepWrite << "\n KILLER: " << *Inst << '\n'); + + // Delete the store and now-dead instructions that feed it. + DeleteDeadInstruction(DepWrite, *MD); + ++NumFastStores; + MadeChange = true; + + // DeleteDeadInstruction can delete the current instruction in loop + // cases, reset BBI. + BBI = Inst; + if (BBI != BB.begin()) + --BBI; + break; + } + + // If this is a may-aliased store that is clobbering the store value, we + // can keep searching past it for another must-aliased pointer that stores + // to the same location. For example, in: + // store -> P + // store -> Q + // store -> P + // we can remove the first store to P even though we don't know if P and Q + // alias. + if (DepWrite == &BB.front()) break; + + // Can't look past this instruction if it might read 'Loc'. + if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref) + break; + + InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB); + } + } + + // If this block ends in a return, unwind, or unreachable, all allocas are + // dead at its end, which means stores to them are also dead. + if (BB.getTerminator()->getNumSuccessors() == 0) + MadeChange |= handleEndBlock(BB); + + return MadeChange; +} + +/// HandleFree - Handle frees of entire structures whose dependency is a store +/// to a field of that structure. 
+bool DSE::HandleFree(CallInst *F) { + MemDepResult Dep = MD->getDependency(F); + do { + if (Dep.isNonLocal()) return false; + + Instruction *Dependency = Dep.getInst(); + if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency)) + return false; + + Value *DepPointer = + GetUnderlyingObject(getStoredPointerOperand(Dependency)); + + // Check for aliasing. + if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) + return false; + + // DCE instructions only used to calculate that store + DeleteDeadInstruction(Dependency, *MD); + ++NumFastStores; + + // Inst's old Dependency is now deleted. Compute the next dependency, + // which may also be dead, as in + // s[0] = 0; + // s[1] = 0; // This has just been deleted. + // free(s); + Dep = MD->getDependency(F); + } while (!Dep.isNonLocal()); + + return true; +} + +/// handleEndBlock - Remove dead stores to stack-allocated locations in the +/// function end block. Ex: +/// %A = alloca i32 +/// ... +/// store i32 1, i32* %A +/// ret void +bool DSE::handleEndBlock(BasicBlock &BB) { + bool MadeChange = false; + + // Keep track of all of the stack objects that are dead at the end of the + // function. + SmallPtrSet<Value*, 16> DeadStackObjects; + + // Find all of the alloca'd pointers in the entry block. + BasicBlock *Entry = BB.getParent()->begin(); + for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) + DeadStackObjects.insert(AI); + + // Treat byval arguments the same, stores to them are dead at the end of the + // function. + for (Function::arg_iterator AI = BB.getParent()->arg_begin(), + AE = BB.getParent()->arg_end(); AI != AE; ++AI) + if (AI->hasByValAttr()) + DeadStackObjects.insert(AI); + + // Scan the basic block backwards + for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){ + --BBI; + + // If we find a store, check to see if it points into a dead stack value. 
+ if (hasMemoryWrite(BBI) && isRemovable(BBI)) { + // See through pointer-to-pointer bitcasts + Value *Pointer = GetUnderlyingObject(getStoredPointerOperand(BBI)); + + // Stores to stack values are valid candidates for removal. + if (DeadStackObjects.count(Pointer)) { + Instruction *Dead = BBI++; + + DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " + << *Dead << "\n Object: " << *Pointer << '\n'); + + // DCE instructions only used to calculate that store. + DeleteDeadInstruction(Dead, *MD, &DeadStackObjects); + ++NumFastStores; + MadeChange = true; + continue; + } + } + + // Remove any dead non-memory-mutating instructions. + if (isInstructionTriviallyDead(BBI)) { + Instruction *Inst = BBI++; + DeleteDeadInstruction(Inst, *MD, &DeadStackObjects); + ++NumFastOther; + MadeChange = true; + continue; + } + + if (AllocaInst *A = dyn_cast<AllocaInst>(BBI)) { + DeadStackObjects.erase(A); + continue; + } + + if (CallSite CS = cast<Value>(BBI)) { + // If this call does not access memory, it can't be loading any of our + // pointers. + if (AA->doesNotAccessMemory(CS)) + continue; + + unsigned NumModRef = 0, NumOther = 0; + + // If the call might load from any of our allocas, then any store above + // the call is live. + SmallVector<Value*, 8> LiveAllocas; + for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), + E = DeadStackObjects.end(); I != E; ++I) { + // If we detect that our AA is imprecise, it's not worth it to scan the + // rest of the DeadPointers set. Just assume that the AA will return + // ModRef for everything, and go ahead and bail out. + if (NumModRef >= 16 && NumOther == 0) + return MadeChange; + + // See if the call site touches it. 
+ AliasAnalysis::ModRefResult A = + AA->getModRefInfo(CS, *I, getPointerSize(*I, *AA)); + + if (A == AliasAnalysis::ModRef) + ++NumModRef; + else + ++NumOther; + + if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref) + LiveAllocas.push_back(*I); + } + + for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), + E = LiveAllocas.end(); I != E; ++I) + DeadStackObjects.erase(*I); + + // If all of the allocas were clobbered by the call then we're not going + // to find anything else to process. + if (DeadStackObjects.empty()) + return MadeChange; + + continue; + } + + AliasAnalysis::Location LoadedLoc; + + // If we encounter a use of the pointer, it is no longer considered dead + if (LoadInst *L = dyn_cast<LoadInst>(BBI)) { + LoadedLoc = AA->getLocation(L); + } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) { + LoadedLoc = AA->getLocation(V); + } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) { + LoadedLoc = AA->getLocationForSource(MTI); + } else { + // Not a loading instruction. + continue; + } + + // Remove any allocas from the DeadPointer set that are loaded, as this + // makes any stores above the access live. + RemoveAccessedObjects(LoadedLoc, DeadStackObjects); + + // If all of the allocas were clobbered by the access then we're not going + // to find anything else to process. + if (DeadStackObjects.empty()) + break; + } + + return MadeChange; +} + +/// RemoveAccessedObjects - Check to see if the specified location may alias any +/// of the stack objects in the DeadStackObjects set. If so, they become live +/// because the location is being loaded. +void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, + SmallPtrSet<Value*, 16> &DeadStackObjects) { + const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr); + + // A constant can't be in the dead pointer set. 
+ if (isa<Constant>(UnderlyingPointer)) + return; + + // If the kill pointer can be easily reduced to an alloca, don't bother doing + // extraneous AA queries. + if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) { + DeadStackObjects.erase(const_cast<Value*>(UnderlyingPointer)); + return; + } + + SmallVector<Value*, 16> NowLive; + for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), + E = DeadStackObjects.end(); I != E; ++I) { + // See if the loaded location could alias the stack location. + AliasAnalysis::Location StackLoc(*I, getPointerSize(*I, *AA)); + if (!AA->isNoAlias(StackLoc, LoadedLoc)) + NowLive.push_back(*I); + } + + for (SmallVector<Value*, 16>::iterator I = NowLive.begin(), E = NowLive.end(); + I != E; ++I) + DeadStackObjects.erase(*I); +} + diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp new file mode 100644 index 0000000..3d3f17b --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -0,0 +1,470 @@ +//===- EarlyCSE.cpp - Simple and fast CSE pass ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass performs a simple dominator tree walk that eliminates trivially +// redundant instructions. 
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "early-cse"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Instructions.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/RecyclingAllocator.h"
#include "llvm/ADT/ScopedHashTable.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;

STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
STATISTIC(NumCSE,      "Number of instructions CSE'd");
STATISTIC(NumCSELoad,  "Number of load instructions CSE'd");
STATISTIC(NumCSECall,  "Number of call instructions CSE'd");
STATISTIC(NumDSE,      "Number of trivial dead stores removed");

/// getHash - Hash a pointer by identity.  This is the common building block
/// for the expression hashes below; operands are hashed as opaque pointers.
static unsigned getHash(const void *V) {
  return DenseMapInfo<const void*>::getHashValue(V);
}

//===----------------------------------------------------------------------===//
// SimpleValue
//===----------------------------------------------------------------------===//

namespace {
  /// SimpleValue - Instances of this struct represent available values in the
  /// scoped hash table.  A SimpleValue wraps an instruction accepted by
  /// canHandle below, i.e. one whose result is determined purely by its
  /// opcode, type and operands.
  struct SimpleValue {
    Instruction *Inst;

    SimpleValue(Instruction *I) : Inst(I) {
      assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
    }

    /// isSentinel - True if this wraps one of the two reserved DenseMap key
    /// values (empty/tombstone) rather than a real instruction.
    bool isSentinel() const {
      return Inst == DenseMapInfo<Instruction*>::getEmptyKey() ||
             Inst == DenseMapInfo<Instruction*>::getTombstoneKey();
    }

    /// canHandle - Return true if this kind of instruction can be value
    /// numbered as a SimpleValue.
    static bool canHandle(Instruction *Inst) {
      // This can only handle non-void readnone functions.
      if (CallInst *CI = dyn_cast<CallInst>(Inst))
        return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
      return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) ||
             isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) ||
             isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
             isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
             isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst);
    }
  };
}

namespace llvm {
// SimpleValue is POD.
template<> struct isPodLike<SimpleValue> {
  static const bool value = true;
};

template<> struct DenseMapInfo<SimpleValue> {
  static inline SimpleValue getEmptyKey() {
    return DenseMapInfo<Instruction*>::getEmptyKey();
  }
  static inline SimpleValue getTombstoneKey() {
    return DenseMapInfo<Instruction*>::getTombstoneKey();
  }
  static unsigned getHashValue(SimpleValue Val);
  static bool isEqual(SimpleValue LHS, SimpleValue RHS);
};
}

unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
  Instruction *Inst = Val.Inst;

  // Hash in all of the operands as pointers.  The per-operand shift makes the
  // hash sensitive to operand order.
  unsigned Res = 0;
  for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i)
    Res ^= getHash(Inst->getOperand(i)) << i;

  // Hash in any per-instruction state that the operand list doesn't capture:
  // the destination type of a cast, the predicate of a compare, or the index
  // list of an extract/insertvalue.
  if (CastInst *CI = dyn_cast<CastInst>(Inst))
    Res ^= getHash(CI->getType());
  else if (CmpInst *CI = dyn_cast<CmpInst>(Inst))
    Res ^= CI->getPredicate();
  else if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst)) {
    for (ExtractValueInst::idx_iterator I = EVI->idx_begin(),
         E = EVI->idx_end(); I != E; ++I)
      Res ^= *I;
  } else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst)) {
    for (InsertValueInst::idx_iterator I = IVI->idx_begin(),
         E = IVI->idx_end(); I != E; ++I)
      Res ^= *I;
  } else {
    // nothing extra to hash in.
    assert((isa<CallInst>(Inst) ||
            isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) ||
            isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
            isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst)) &&
           "Invalid/unknown instruction");
  }

  // Mix in the opcode.
  return (Res << 1) ^ Inst->getOpcode();
}

bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
  Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;

  // Sentinel keys only compare equal to themselves.
  if (LHS.isSentinel() || RHS.isSentinel())
    return LHSI == RHSI;

  if (LHSI->getOpcode() != RHSI->getOpcode()) return false;
  return LHSI->isIdenticalTo(RHSI);
}

//===----------------------------------------------------------------------===//
// CallValue
//===----------------------------------------------------------------------===//

namespace {
  /// CallValue - Instances of this struct represent available call values in
  /// the scoped hash table.
  struct CallValue {
    Instruction *Inst;

    CallValue(Instruction *I) : Inst(I) {
      assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
    }

    /// isSentinel - True if this wraps one of the two reserved DenseMap key
    /// values (empty/tombstone).
    bool isSentinel() const {
      return Inst == DenseMapInfo<Instruction*>::getEmptyKey() ||
             Inst == DenseMapInfo<Instruction*>::getTombstoneKey();
    }

    /// canHandle - Only non-void calls that do not write memory are value
    /// numbered as CallValues.
    static bool canHandle(Instruction *Inst) {
      // Don't value number anything that returns void.
      if (Inst->getType()->isVoidTy())
        return false;

      CallInst *CI = dyn_cast<CallInst>(Inst);
      if (CI == 0 || !CI->onlyReadsMemory())
        return false;
      return true;
    }
  };
}

namespace llvm {
  // CallValue is POD.
+ template<> struct isPodLike<CallValue> { + static const bool value = true; + }; + + template<> struct DenseMapInfo<CallValue> { + static inline CallValue getEmptyKey() { + return DenseMapInfo<Instruction*>::getEmptyKey(); + } + static inline CallValue getTombstoneKey() { + return DenseMapInfo<Instruction*>::getTombstoneKey(); + } + static unsigned getHashValue(CallValue Val); + static bool isEqual(CallValue LHS, CallValue RHS); + }; +} +unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { + Instruction *Inst = Val.Inst; + // Hash in all of the operands as pointers. + unsigned Res = 0; + for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) { + assert(!Inst->getOperand(i)->getType()->isMetadataTy() && + "Cannot value number calls with metadata operands"); + Res ^= getHash(Inst->getOperand(i)) << i; + } + + // Mix in the opcode. + return (Res << 1) ^ Inst->getOpcode(); +} + +bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { + Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst; + if (LHS.isSentinel() || RHS.isSentinel()) + return LHSI == RHSI; + return LHSI->isIdenticalTo(RHSI); +} + + +//===----------------------------------------------------------------------===// +// EarlyCSE pass. +//===----------------------------------------------------------------------===// + +namespace { + +/// EarlyCSE - This pass does a simple depth-first walk over the dominator +/// tree, eliminating trivially redundant instructions and using instsimplify +/// to canonicalize things as it goes. It is intended to be fast and catch +/// obvious cases so that instcombine and other passes are more effective. It +/// is expected that a later pass of GVN will catch the interesting/hard +/// cases. 
class EarlyCSE : public FunctionPass {
public:
  const TargetData *TD;  // May be null; set via getAnalysisIfAvailable.
  DominatorTree *DT;
  typedef RecyclingAllocator<BumpPtrAllocator,
                      ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy;
  typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>,
                          AllocatorTy> ScopedHTType;

  /// AvailableValues - This scoped hash table contains the current values of
  /// all of our simple scalar expressions.  As we walk down the domtree, we
  /// look to see if instructions are in this: if so, we replace them with what
  /// we find, otherwise we insert them so that dominated values can succeed in
  /// their lookup.
  ScopedHTType *AvailableValues;

  /// AvailableLoads - This scoped hash table contains the current values
  /// of loads.  This allows us to get efficient access to dominating loads
  /// when we have a fully redundant load.  In addition to the most recent
  /// load, we keep track of a generation count of the read, which is compared
  /// against the current generation count.  The current generation count is
  /// incremented after every possibly writing memory operation, which ensures
  /// that we only CSE loads with other loads that have no intervening store.
  typedef RecyclingAllocator<BumpPtrAllocator,
    ScopedHashTableVal<Value*, std::pair<Value*, unsigned> > > LoadMapAllocator;
  typedef ScopedHashTable<Value*, std::pair<Value*, unsigned>,
                          DenseMapInfo<Value*>, LoadMapAllocator> LoadHTType;
  LoadHTType *AvailableLoads;

  /// AvailableCalls - This scoped hash table contains the current values
  /// of read-only call values.  It uses the same generation count as loads.
  typedef ScopedHashTable<CallValue, std::pair<Value*, unsigned> > CallHTType;
  CallHTType *AvailableCalls;

  /// CurrentGeneration - This is the current generation of the memory value.
+ unsigned CurrentGeneration; + + static char ID; + explicit EarlyCSE() : FunctionPass(ID) { + initializeEarlyCSEPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + +private: + + bool processNode(DomTreeNode *Node); + + // This transformation requires dominator postdominator info + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTree>(); + AU.setPreservesCFG(); + } +}; +} + +char EarlyCSE::ID = 0; + +// createEarlyCSEPass - The public interface to this file. +FunctionPass *llvm::createEarlyCSEPass() { + return new EarlyCSE(); +} + +INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false) + +bool EarlyCSE::processNode(DomTreeNode *Node) { + // Define a scope in the scoped hash table. When we are done processing this + // domtree node and recurse back up to our parent domtree node, this will pop + // off all the values we install. + ScopedHTType::ScopeTy Scope(*AvailableValues); + + // Define a scope for the load values so that anything we add will get + // popped when we recurse back up to our parent domtree node. + LoadHTType::ScopeTy LoadScope(*AvailableLoads); + + // Define a scope for the call values so that anything we add will get + // popped when we recurse back up to our parent domtree node. + CallHTType::ScopeTy CallScope(*AvailableCalls); + + BasicBlock *BB = Node->getBlock(); + + // If this block has a single predecessor, then the predecessor is the parent + // of the domtree node and all of the live out memory values are still current + // in this block. If this block has multiple predecessors, then they could + // have invalidated the live-out memory values of our parent value. For now, + // just be conservative and invalidate memory if this block has multiple + // predecessors. 
+ if (BB->getSinglePredecessor() == 0) + ++CurrentGeneration; + + /// LastStore - Keep track of the last non-volatile store that we saw... for + /// as long as there in no instruction that reads memory. If we see a store + /// to the same location, we delete the dead store. This zaps trivial dead + /// stores which can occur in bitfield code among other things. + StoreInst *LastStore = 0; + + bool Changed = false; + + // See if any instructions in the block can be eliminated. If so, do it. If + // not, add them to AvailableValues. + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { + Instruction *Inst = I++; + + // Dead instructions should just be removed. + if (isInstructionTriviallyDead(Inst)) { + DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n'); + Inst->eraseFromParent(); + Changed = true; + ++NumSimplify; + continue; + } + + // If the instruction can be simplified (e.g. X+0 = X) then replace it with + // its simpler value. + if (Value *V = SimplifyInstruction(Inst, TD, DT)) { + DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); + Inst->replaceAllUsesWith(V); + Inst->eraseFromParent(); + Changed = true; + ++NumSimplify; + continue; + } + + // If this is a simple instruction that we can value number, process it. + if (SimpleValue::canHandle(Inst)) { + // See if the instruction has an available value. If so, use it. + if (Value *V = AvailableValues->lookup(Inst)) { + DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n'); + Inst->replaceAllUsesWith(V); + Inst->eraseFromParent(); + Changed = true; + ++NumCSE; + continue; + } + + // Otherwise, just remember that this value is available. + AvailableValues->insert(Inst, Inst); + continue; + } + + // If this is a non-volatile load, process it. + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + // Ignore volatile loads. 
+ if (LI->isVolatile()) { + LastStore = 0; + continue; + } + + // If we have an available version of this load, and if it is the right + // generation, replace this instruction. + std::pair<Value*, unsigned> InVal = + AvailableLoads->lookup(Inst->getOperand(0)); + if (InVal.first != 0 && InVal.second == CurrentGeneration) { + DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst << " to: " + << *InVal.first << '\n'); + if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); + Inst->eraseFromParent(); + Changed = true; + ++NumCSELoad; + continue; + } + + // Otherwise, remember that we have this instruction. + AvailableLoads->insert(Inst->getOperand(0), + std::pair<Value*, unsigned>(Inst, CurrentGeneration)); + LastStore = 0; + continue; + } + + // If this instruction may read from memory, forget LastStore. + if (Inst->mayReadFromMemory()) + LastStore = 0; + + // If this is a read-only call, process it. + if (CallValue::canHandle(Inst)) { + // If we have an available version of this call, and if it is the right + // generation, replace this instruction. + std::pair<Value*, unsigned> InVal = AvailableCalls->lookup(Inst); + if (InVal.first != 0 && InVal.second == CurrentGeneration) { + DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst << " to: " + << *InVal.first << '\n'); + if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); + Inst->eraseFromParent(); + Changed = true; + ++NumCSECall; + continue; + } + + // Otherwise, remember that we have this instruction. + AvailableCalls->insert(Inst, + std::pair<Value*, unsigned>(Inst, CurrentGeneration)); + continue; + } + + // Okay, this isn't something we can CSE at all. Check to see if it is + // something that could modify memory. If so, our available memory values + // cannot be used so bump the generation count. 
+ if (Inst->mayWriteToMemory()) { + ++CurrentGeneration; + + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + // We do a trivial form of DSE if there are two stores to the same + // location with no intervening loads. Delete the earlier store. + if (LastStore && + LastStore->getPointerOperand() == SI->getPointerOperand()) { + DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore << " due to: " + << *Inst << '\n'); + LastStore->eraseFromParent(); + Changed = true; + ++NumDSE; + LastStore = 0; + continue; + } + + // Okay, we just invalidated anything we knew about loaded values. Try + // to salvage *something* by remembering that the stored value is a live + // version of the pointer. It is safe to forward from volatile stores + // to non-volatile loads, so we don't have to check for volatility of + // the store. + AvailableLoads->insert(SI->getPointerOperand(), + std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration)); + + // Remember that this was the last store we saw for DSE. + if (!SI->isVolatile()) + LastStore = SI; + } + } + } + + unsigned LiveOutGeneration = CurrentGeneration; + for (DomTreeNode::iterator I = Node->begin(), E = Node->end(); I != E; ++I) { + Changed |= processNode(*I); + // Pop any generation changes off the stack from the recursive walk. + CurrentGeneration = LiveOutGeneration; + } + return Changed; +} + + +bool EarlyCSE::runOnFunction(Function &F) { + TD = getAnalysisIfAvailable<TargetData>(); + DT = &getAnalysis<DominatorTree>(); + + // Tables that the pass uses when walking the domtree. 
+ ScopedHTType AVTable; + AvailableValues = &AVTable; + LoadHTType LoadTable; + AvailableLoads = &LoadTable; + CallHTType CallTable; + AvailableCalls = &CallTable; + + CurrentGeneration = 0; + return processNode(DT->getRootNode()); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/GEPSplitter.cpp b/contrib/llvm/lib/Transforms/Scalar/GEPSplitter.cpp new file mode 100644 index 0000000..4c3d188 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/GEPSplitter.cpp @@ -0,0 +1,83 @@ +//===- GEPSplitter.cpp - Split complex GEPs into simple ones --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This function breaks GEPs with more than 2 non-zero operands into smaller +// GEPs each with no more than 2 non-zero operands. This exposes redundancy +// between GEPs with common initial operand sequences. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "split-geps" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/Pass.h" +using namespace llvm; + +namespace { + class GEPSplitter : public FunctionPass { + virtual bool runOnFunction(Function &F); + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + public: + static char ID; // Pass identification, replacement for typeid + explicit GEPSplitter() : FunctionPass(ID) { + initializeGEPSplitterPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char GEPSplitter::ID = 0; +INITIALIZE_PASS(GEPSplitter, "split-geps", + "split complex GEPs into simple GEPs", false, false) + +FunctionPass *llvm::createGEPSplitterPass() { + return new GEPSplitter(); +} + +bool GEPSplitter::runOnFunction(Function &F) { + bool Changed = false; + + // Visit each GEP instruction. 
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) + for (BasicBlock::iterator II = I->begin(), IE = I->end(); II != IE; ) + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(II++)) { + unsigned NumOps = GEP->getNumOperands(); + // Ignore GEPs which are already simple. + if (NumOps <= 2) + continue; + bool FirstIndexIsZero = isa<ConstantInt>(GEP->getOperand(1)) && + cast<ConstantInt>(GEP->getOperand(1))->isZero(); + if (NumOps == 3 && FirstIndexIsZero) + continue; + // The first index is special and gets expanded with a 2-operand GEP + // (unless it's zero, in which case we can skip this). + Value *NewGEP = FirstIndexIsZero ? + GEP->getOperand(0) : + GetElementPtrInst::Create(GEP->getOperand(0), GEP->getOperand(1), + "tmp", GEP); + // All remaining indices get expanded with a 3-operand GEP with zero + // as the second operand. + Value *Idxs[2]; + Idxs[0] = ConstantInt::get(Type::getInt64Ty(F.getContext()), 0); + for (unsigned i = 2; i != NumOps; ++i) { + Idxs[1] = GEP->getOperand(i); + NewGEP = GetElementPtrInst::Create(NewGEP, Idxs, Idxs+2, "tmp", GEP); + } + GEP->replaceAllUsesWith(NewGEP); + GEP->eraseFromParent(); + Changed = true; + } + + return Changed; +} + +void GEPSplitter::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp new file mode 100644 index 0000000..a0123f5 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -0,0 +1,2044 @@ +//===- GVN.cpp - Eliminate redundant values and loads ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass performs global value numbering to eliminate fully redundant +// instructions. It also performs simple dead load elimination. 
+// +// Note that this pass does the value numbering itself; it does not use the +// ValueNumbering analysis passes. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "gvn" +#include "llvm/Transforms/Scalar.h" +#include "llvm/GlobalVariable.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/PHITransAddr.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/IRBuilder.h" +using namespace llvm; + +STATISTIC(NumGVNInstr, "Number of instructions deleted"); +STATISTIC(NumGVNLoad, "Number of loads deleted"); +STATISTIC(NumGVNPRE, "Number of instructions PRE'd"); +STATISTIC(NumGVNBlocks, "Number of blocks merged"); +STATISTIC(NumPRELoad, "Number of loads PRE'd"); + +static cl::opt<bool> EnablePRE("enable-pre", + cl::init(true), cl::Hidden); +static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true)); + +//===----------------------------------------------------------------------===// +// ValueTable Class +//===----------------------------------------------------------------------===// + +/// This class holds the mapping between values and value numbers. 
It is used +/// as an efficient mechanism to determine the expression-wise equivalence of +/// two values. +namespace { + struct Expression { + uint32_t opcode; + const Type* type; + SmallVector<uint32_t, 4> varargs; + + Expression() { } + Expression(uint32_t o) : opcode(o) { } + + bool operator==(const Expression &other) const { + if (opcode != other.opcode) + return false; + else if (opcode == ~0U || opcode == ~1U) + return true; + else if (type != other.type) + return false; + else if (varargs != other.varargs) + return false; + return true; + } + }; + + class ValueTable { + private: + DenseMap<Value*, uint32_t> valueNumbering; + DenseMap<Expression, uint32_t> expressionNumbering; + AliasAnalysis* AA; + MemoryDependenceAnalysis* MD; + DominatorTree* DT; + + uint32_t nextValueNumber; + + Expression create_expression(Instruction* I); + uint32_t lookup_or_add_call(CallInst* C); + public: + ValueTable() : nextValueNumber(1) { } + uint32_t lookup_or_add(Value *V); + uint32_t lookup(Value *V) const; + void add(Value *V, uint32_t num); + void clear(); + void erase(Value *v); + void setAliasAnalysis(AliasAnalysis* A) { AA = A; } + AliasAnalysis *getAliasAnalysis() const { return AA; } + void setMemDep(MemoryDependenceAnalysis* M) { MD = M; } + void setDomTree(DominatorTree* D) { DT = D; } + uint32_t getNextUnusedValueNumber() { return nextValueNumber; } + void verifyRemoved(const Value *) const; + }; +} + +namespace llvm { +template <> struct DenseMapInfo<Expression> { + static inline Expression getEmptyKey() { + return ~0U; + } + + static inline Expression getTombstoneKey() { + return ~1U; + } + + static unsigned getHashValue(const Expression e) { + unsigned hash = e.opcode; + + hash = ((unsigned)((uintptr_t)e.type >> 4) ^ + (unsigned)((uintptr_t)e.type >> 9)); + + for (SmallVector<uint32_t, 4>::const_iterator I = e.varargs.begin(), + E = e.varargs.end(); I != E; ++I) + hash = *I + hash * 37; + + return hash; + } + static bool isEqual(const Expression &LHS, const 
Expression &RHS) { + return LHS == RHS; + } +}; + +} + +//===----------------------------------------------------------------------===// +// ValueTable Internal Functions +//===----------------------------------------------------------------------===// + + +Expression ValueTable::create_expression(Instruction *I) { + Expression e; + e.type = I->getType(); + e.opcode = I->getOpcode(); + for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end(); + OI != OE; ++OI) + e.varargs.push_back(lookup_or_add(*OI)); + + if (CmpInst *C = dyn_cast<CmpInst>(I)) + e.opcode = (C->getOpcode() << 8) | C->getPredicate(); + else if (ExtractValueInst *E = dyn_cast<ExtractValueInst>(I)) { + for (ExtractValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); + II != IE; ++II) + e.varargs.push_back(*II); + } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) { + for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); + II != IE; ++II) + e.varargs.push_back(*II); + } + + return e; +} + +//===----------------------------------------------------------------------===// +// ValueTable External Functions +//===----------------------------------------------------------------------===// + +/// add - Insert a value into the table with a specified value number. 
void ValueTable::add(Value *V, uint32_t num) {
  valueNumbering.insert(std::make_pair(V, num));
}

/// lookup_or_add_call - Value number the call C.  Readnone calls are numbered
/// structurally like any other expression.  Readonly calls only reuse an
/// existing number when memory dependence proves a dominating identical call
/// with no intervening write; every other call gets a fresh number.
uint32_t ValueTable::lookup_or_add_call(CallInst* C) {
  if (AA->doesNotAccessMemory(C)) {
    // Readnone call: congruent to any structurally identical call.
    Expression exp = create_expression(C);
    uint32_t& e = expressionNumbering[exp];
    if (!e) e = nextValueNumber++;
    valueNumbering[C] = e;
    return e;
  } else if (AA->onlyReadsMemory(C)) {
    Expression exp = create_expression(C);
    uint32_t& e = expressionNumbering[exp];
    if (!e) {
      // First call of this shape: just hand out a fresh number.
      e = nextValueNumber++;
      valueNumbering[C] = e;
      return e;
    }
    if (!MD) {
      // Without memory dependence info we can't prove the earlier call's
      // result is still valid, so pessimistically assign a fresh number.
      e = nextValueNumber++;
      valueNumbering[C] = e;
      return e;
    }

    MemDepResult local_dep = MD->getDependency(C);

    if (!local_dep.isDef() && !local_dep.isNonLocal()) {
      // Clobbered (or otherwise unanalyzable) within this block.
      valueNumbering[C] = nextValueNumber;
      return nextValueNumber++;
    }

    if (local_dep.isDef()) {
      // Local defining call: reuse its number only if it is an identical call
      // whose arguments all carry the same value numbers.
      CallInst* local_cdep = cast<CallInst>(local_dep.getInst());

      if (local_cdep->getNumArgOperands() != C->getNumArgOperands()) {
        valueNumbering[C] = nextValueNumber;
        return nextValueNumber++;
      }

      for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
        uint32_t c_vn = lookup_or_add(C->getArgOperand(i));
        uint32_t cd_vn = lookup_or_add(local_cdep->getArgOperand(i));
        if (c_vn != cd_vn) {
          valueNumbering[C] = nextValueNumber;
          return nextValueNumber++;
        }
      }

      uint32_t v = lookup_or_add(local_cdep);
      valueNumbering[C] = v;
      return v;
    }

    // Non-local case.
    const MemoryDependenceAnalysis::NonLocalDepInfo &deps =
      MD->getNonLocalCallDependency(CallSite(C));
    // FIXME: call/call dependencies for readonly calls should return def, not
    // clobber!  Move the checking logic to MemDep!
    CallInst* cdep = 0;

    // Check to see if we have a single dominating call instruction that is
    // identical to C.
    for (unsigned i = 0, e = deps.size(); i != e; ++i) {
      const NonLocalDepEntry *I = &deps[i];
      // Ignore non-local dependencies.
      if (I->getResult().isNonLocal())
        continue;

      // We don't handle non-dependencies.  If we already have a call, reject
      // instruction dependencies.
      if (I->getResult().isClobber() || cdep != 0) {
        cdep = 0;
        break;
      }

      CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->getResult().getInst());
      // FIXME: All duplicated with non-local case.
      if (NonLocalDepCall && DT->properlyDominates(I->getBB(), C->getParent())){
        cdep = NonLocalDepCall;
        continue;
      }

      cdep = 0;
      break;
    }

    if (!cdep) {
      valueNumbering[C] = nextValueNumber;
      return nextValueNumber++;
    }

    // As in the local case, reuse the dominating call's number only if the
    // argument value numbers all match.
    if (cdep->getNumArgOperands() != C->getNumArgOperands()) {
      valueNumbering[C] = nextValueNumber;
      return nextValueNumber++;
    }
    for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
      uint32_t c_vn = lookup_or_add(C->getArgOperand(i));
      uint32_t cd_vn = lookup_or_add(cdep->getArgOperand(i));
      if (c_vn != cd_vn) {
        valueNumbering[C] = nextValueNumber;
        return nextValueNumber++;
      }
    }

    uint32_t v = lookup_or_add(cdep);
    valueNumbering[C] = v;
    return v;

  } else {
    // The call may write memory: never congruent to anything else.
    valueNumbering[C] = nextValueNumber;
    return nextValueNumber++;
  }
}

/// lookup_or_add - Returns the value number for the specified value, assigning
/// it a new number if it did not have one before.
+uint32_t ValueTable::lookup_or_add(Value *V) { + DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V); + if (VI != valueNumbering.end()) + return VI->second; + + if (!isa<Instruction>(V)) { + valueNumbering[V] = nextValueNumber; + return nextValueNumber++; + } + + Instruction* I = cast<Instruction>(V); + Expression exp; + switch (I->getOpcode()) { + case Instruction::Call: + return lookup_or_add_call(cast<CallInst>(I)); + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or : + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: + case Instruction::Select: + case Instruction::ExtractElement: + case Instruction::InsertElement: + case Instruction::ShuffleVector: + case Instruction::ExtractValue: + case Instruction::InsertValue: + case Instruction::GetElementPtr: + exp = create_expression(I); + break; + default: + valueNumbering[V] = nextValueNumber; + return nextValueNumber++; + } + + uint32_t& e = expressionNumbering[exp]; + if (!e) e = nextValueNumber++; + valueNumbering[V] = e; + return e; +} + +/// lookup - Returns the value number of the specified value. Fails if +/// the value has not yet been numbered. 
uint32_t ValueTable::lookup(Value *V) const {
  DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
  assert(VI != valueNumbering.end() && "Value not numbered?");
  return VI->second;
}

/// clear - Remove all entries from the ValueTable.
void ValueTable::clear() {
  valueNumbering.clear();
  expressionNumbering.clear();
  // Numbering restarts from 1; 0 is reserved to mean "no number assigned".
  nextValueNumber = 1;
}

/// erase - Remove a value from the value numbering.
void ValueTable::erase(Value *V) {
  valueNumbering.erase(V);
}

/// verifyRemoved - Verify that the value is removed from all internal data
/// structures.
void ValueTable::verifyRemoved(const Value *V) const {
  for (DenseMap<Value*, uint32_t>::const_iterator
         I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
    assert(I->first != V && "Inst still occurs in value numbering map!");
  }
}

//===----------------------------------------------------------------------===//
//                                GVN Pass
//===----------------------------------------------------------------------===//

namespace {

  class GVN : public FunctionPass {
    bool runOnFunction(Function &F);
  public:
    static char ID; // Pass identification, replacement for typeid
    explicit GVN(bool noloads = false)
        : FunctionPass(ID), NoLoads(noloads), MD(0) {
      initializeGVNPass(*PassRegistry::getPassRegistry());
    }

  private:
    // NoLoads - When true, load analysis is skipped and
    // MemoryDependenceAnalysis is not requested (see getAnalysisUsage).
    bool NoLoads;
    MemoryDependenceAnalysis *MD;
    DominatorTree *DT;
    const TargetData* TD;  // May be null; set via getAnalysisIfAvailable.

    ValueTable VN;

    /// LeaderTable - A mapping from value numbers to lists of Value*'s that
    /// have that value number.  Use findLeader to query it.
    struct LeaderTableEntry {
      Value *Val;
      BasicBlock *BB;
      LeaderTableEntry *Next;
    };
    DenseMap<uint32_t, LeaderTableEntry> LeaderTable;
    // Backing storage for the chained LeaderTableEntry nodes.
    BumpPtrAllocator TableAllocator;

    /// addToLeaderTable - Push a new Value to the LeaderTable onto the list for
    /// its value number.
+ void addToLeaderTable(uint32_t N, Value *V, BasicBlock *BB) { + LeaderTableEntry& Curr = LeaderTable[N]; + if (!Curr.Val) { + Curr.Val = V; + Curr.BB = BB; + return; + } + + LeaderTableEntry* Node = TableAllocator.Allocate<LeaderTableEntry>(); + Node->Val = V; + Node->BB = BB; + Node->Next = Curr.Next; + Curr.Next = Node; + } + + /// removeFromLeaderTable - Scan the list of values corresponding to a given + /// value number, and remove the given value if encountered. + void removeFromLeaderTable(uint32_t N, Value *V, BasicBlock *BB) { + LeaderTableEntry* Prev = 0; + LeaderTableEntry* Curr = &LeaderTable[N]; + + while (Curr->Val != V || Curr->BB != BB) { + Prev = Curr; + Curr = Curr->Next; + } + + if (Prev) { + Prev->Next = Curr->Next; + } else { + if (!Curr->Next) { + Curr->Val = 0; + Curr->BB = 0; + } else { + LeaderTableEntry* Next = Curr->Next; + Curr->Val = Next->Val; + Curr->BB = Next->BB; + Curr->Next = Next->Next; + } + } + } + + // List of critical edges to be split between iterations. 
    SmallVector<std::pair<TerminatorInst*, unsigned>, 4> toSplit;

    // This transformation requires dominator and memory-dependence info, and
    // preserves the dominator tree and alias analysis.
    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.addRequired<DominatorTree>();
      if (!NoLoads)
        AU.addRequired<MemoryDependenceAnalysis>();
      AU.addRequired<AliasAnalysis>();

      AU.addPreserved<DominatorTree>();
      AU.addPreserved<AliasAnalysis>();
    }

    // Helper functions
    // FIXME: eliminate or document these better
    bool processLoad(LoadInst* L,
                     SmallVectorImpl<Instruction*> &toErase);
    bool processInstruction(Instruction *I,
                            SmallVectorImpl<Instruction*> &toErase);
    bool processNonLocalLoad(LoadInst* L,
                             SmallVectorImpl<Instruction*> &toErase);
    bool processBlock(BasicBlock *BB);
    void dump(DenseMap<uint32_t, Value*>& d);
    bool iterateOnFunction(Function &F);
    bool performPRE(Function& F);
    Value *findLeader(BasicBlock *BB, uint32_t num);
    void cleanupGlobalSets();
    void verifyRemoved(const Instruction *I) const;
    bool splitCriticalEdges();
  };

  char GVN::ID = 0;
}

// createGVNPass - The public interface to this file...
FunctionPass *llvm::createGVNPass(bool NoLoads) {
  return new GVN(NoLoads);
}

INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false)

// dump - Debugging helper: print a value-number -> Value map to stderr.
void GVN::dump(DenseMap<uint32_t, Value*>& d) {
  errs() << "{\n";
  for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
       E = d.end(); I != E; ++I) {
    errs() << I->first << "\n";
    I->second->dump();
  }
  errs() << "}\n";
}

/// IsValueFullyAvailableInBlock - Return true if we can prove that the value
/// we're analyzing is fully available in the specified block.  As we go, keep
/// track of which blocks we know are fully alive in FullyAvailableBlocks.  This
/// map is actually a tri-state map with the following values:
///   0) we know the block *is not* fully available.
///   1) we know the block *is* fully available.
///   2) we do not know whether the block is fully available or not, but we are
///      currently speculating that it will be.
///   3) we are speculating for this block and have used that to speculate for
///      other blocks.
static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
                            DenseMap<BasicBlock*, char> &FullyAvailableBlocks) {
  // Optimistically assume that the block is fully available (state 2) and
  // check to see if we already know about this block in one lookup.
  std::pair<DenseMap<BasicBlock*, char>::iterator, char> IV =
    FullyAvailableBlocks.insert(std::make_pair(BB, 2));

  // If the entry already existed for this block, return the precomputed value.
  if (!IV.second) {
    // If this is a speculative "available" value, mark it as being used for
    // speculation of other blocks.
    if (IV.first->second == 2)
      IV.first->second = 3;
    return IV.first->second != 0;
  }

  // Otherwise, see if it is fully available in all predecessors.
  pred_iterator PI = pred_begin(BB), PE = pred_end(BB);

  // If this block has no predecessors, it isn't live-in here.
  if (PI == PE)
    goto SpeculationFailure;

  for (; PI != PE; ++PI)
    // If the value isn't fully available in one of our predecessors, then it
    // isn't fully available in this block either.  Undo our previous
    // optimistic assumption and bail out.
    if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks))
      goto SpeculationFailure;

  return true;

// SpeculationFailure - If we get here, we found out that this is not, after
// all, a fully-available block.  We have a problem if we speculated on this and
// used the speculation to mark other blocks as available.
SpeculationFailure:
  char &BBVal = FullyAvailableBlocks[BB];

  // If we didn't speculate on this, just return with it set to false.
  if (BBVal == 2) {
    BBVal = 0;
    return false;
  }

  // If we did speculate on this value (state 3), we could have blocks set to 1
  // that are incorrect.  Walk the (transitive) successors of this block and
  // mark them as 0 if set to one.
  SmallVector<BasicBlock*, 32> BBWorklist;
  BBWorklist.push_back(BB);

  do {
    BasicBlock *Entry = BBWorklist.pop_back_val();
    // Note that this sets blocks to 0 (unavailable) if they happen to not
    // already be in FullyAvailableBlocks.  This is safe.
    char &EntryVal = FullyAvailableBlocks[Entry];
    if (EntryVal == 0) continue;  // Already unavailable.

    // Mark as unavailable.
    EntryVal = 0;

    for (succ_iterator I = succ_begin(Entry), E = succ_end(Entry); I != E; ++I)
      BBWorklist.push_back(*I);
  } while (!BBWorklist.empty());

  return false;
}


/// CanCoerceMustAliasedValueToLoad - Return true if
/// CoerceAvailableValueToLoadType will succeed.
static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
                                            const Type *LoadTy,
                                            const TargetData &TD) {
  // If the loaded or stored value is a first-class array or struct, don't try
  // to transform them.  We need to be able to bitcast to integer.
  if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
      StoredVal->getType()->isStructTy() ||
      StoredVal->getType()->isArrayTy())
    return false;

  // The store has to be at least as big as the load.
  if (TD.getTypeSizeInBits(StoredVal->getType()) <
        TD.getTypeSizeInBits(LoadTy))
    return false;

  return true;
}


/// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and
/// then a load from a must-aliased pointer of a different type, try to coerce
/// the stored value.  LoadedTy is the type of the load we want to replace and
/// InsertPt is the place to insert new instructions.
///
/// If we can't do it, return null.
static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
                                             const Type *LoadedTy,
                                             Instruction *InsertPt,
                                             const TargetData &TD) {
  if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD))
    return 0;

  const Type *StoredValTy = StoredVal->getType();

  uint64_t StoreSize = TD.getTypeStoreSizeInBits(StoredValTy);
  uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy);

  // If the store and reload are the same size, we can always reuse it.
  if (StoreSize == LoadSize) {
    if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy()) {
      // Pointer to Pointer -> use bitcast.
      return new BitCastInst(StoredVal, LoadedTy, "", InsertPt);
    }

    // Convert source pointers to integers, which can be bitcast.
    if (StoredValTy->isPointerTy()) {
      StoredValTy = TD.getIntPtrType(StoredValTy->getContext());
      StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt);
    }

    const Type *TypeToCastTo = LoadedTy;
    if (TypeToCastTo->isPointerTy())
      TypeToCastTo = TD.getIntPtrType(StoredValTy->getContext());

    if (StoredValTy != TypeToCastTo)
      StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt);

    // Cast to pointer if the load needs a pointer type.
    if (LoadedTy->isPointerTy())
      StoredVal = new IntToPtrInst(StoredVal, LoadedTy, "", InsertPt);

    return StoredVal;
  }

  // If the loaded value is smaller than the available value, then we can
  // extract out a piece from it.  If the available value is too small, then we
  // can't do anything.
  assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail");

  // Convert source pointers to integers, which can be manipulated.
  if (StoredValTy->isPointerTy()) {
    StoredValTy = TD.getIntPtrType(StoredValTy->getContext());
    StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt);
  }

  // Convert vectors and fp to integer, which can be manipulated.
  if (!StoredValTy->isIntegerTy()) {
    StoredValTy = IntegerType::get(StoredValTy->getContext(), StoreSize);
    StoredVal = new BitCastInst(StoredVal, StoredValTy, "", InsertPt);
  }

  // If this is a big-endian system, we need to shift the value down to the low
  // bits so that a truncate will work.
  if (TD.isBigEndian()) {
    Constant *Val = ConstantInt::get(StoredVal->getType(), StoreSize-LoadSize);
    StoredVal = BinaryOperator::CreateLShr(StoredVal, Val, "tmp", InsertPt);
  }

  // Truncate the integer to the right size now.
  const Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize);
  StoredVal = new TruncInst(StoredVal, NewIntTy, "trunc", InsertPt);

  if (LoadedTy == NewIntTy)
    return StoredVal;

  // If the result is a pointer, inttoptr.
  if (LoadedTy->isPointerTy())
    return new IntToPtrInst(StoredVal, LoadedTy, "inttoptr", InsertPt);

  // Otherwise, bitcast.
  return new BitCastInst(StoredVal, LoadedTy, "bitcast", InsertPt);
}

/// AnalyzeLoadFromClobberingWrite - This function is called when we have a
/// memdep query of a load that ends up being a clobbering memory write (store,
/// memset, memcpy, memmove).  This means that the write *may* provide bits used
/// by the load but we can't be sure because the pointers don't mustalias.
///
/// Check this case to see if there is anything more we can do before we give
/// up.  This returns -1 if we have to give up, or a byte number in the stored
/// value of the piece that feeds the load.
static int AnalyzeLoadFromClobberingWrite(const Type *LoadTy, Value *LoadPtr,
                                          Value *WritePtr,
                                          uint64_t WriteSizeInBits,
                                          const TargetData &TD) {
  // If the loaded or stored value is a first-class array or struct, don't try
  // to transform them.  We need to be able to bitcast to integer.
  if (LoadTy->isStructTy() || LoadTy->isArrayTy())
    return -1;

  // Both pointers must decompose to the same base object plus a constant
  // byte offset; otherwise we cannot relate the two accesses.
  int64_t StoreOffset = 0, LoadOffset = 0;
  Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr, StoreOffset,TD);
  Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, TD);
  if (StoreBase != LoadBase)
    return -1;

  // If the load and store are to the exact same address, they should have been
  // a must alias.  AA must have gotten confused.
  // FIXME: Study to see if/when this happens.  One case is forwarding a memset
  // to a load from the base of the memset.
#if 0
  if (LoadOffset == StoreOffset) {
    dbgs() << "STORE/LOAD DEP WITH COMMON POINTER MISSED:\n"
           << "Base       = " << *StoreBase << "\n"
           << "Store Ptr  = " << *WritePtr << "\n"
           << "Store Offs = " << StoreOffset << "\n"
           << "Load Ptr   = " << *LoadPtr << "\n";
    abort();
  }
#endif

  // If the load and store don't overlap at all, the store doesn't provide
  // anything to the load.  In this case, they really don't alias at all, AA
  // must have gotten confused.
  uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy);

  // Only byte-sized accesses are handled; bail out on sub-byte widths.
  if ((WriteSizeInBits & 7) | (LoadSize & 7))
    return -1;
  uint64_t StoreSize = WriteSizeInBits >> 3;  // Convert to bytes.
  LoadSize >>= 3;


  bool isAAFailure = false;
  if (StoreOffset < LoadOffset)
    isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset;
  else
    isAAFailure = LoadOffset+int64_t(LoadSize) <= StoreOffset;

  if (isAAFailure) {
#if 0
    dbgs() << "STORE LOAD DEP WITH COMMON BASE:\n"
           << "Base       = " << *StoreBase << "\n"
           << "Store Ptr  = " << *WritePtr << "\n"
           << "Store Offs = " << StoreOffset << "\n"
           << "Load Ptr   = " << *LoadPtr << "\n";
    abort();
#endif
    return -1;
  }

  // If the Load isn't completely contained within the stored bits, we don't
  // have all the bits to feed it.  We could do something crazy in the future
  // (issue a smaller load then merge the bits in) but this seems unlikely to be
  // valuable.
  if (StoreOffset > LoadOffset ||
      StoreOffset+StoreSize < LoadOffset+LoadSize)
    return -1;

  // Okay, we can do this transformation.  Return the number of bytes into the
  // store that the load is.
  return LoadOffset-StoreOffset;
}

/// AnalyzeLoadFromClobberingStore - This function is called when we have a
/// memdep query of a load that ends up being a clobbering store.
static int AnalyzeLoadFromClobberingStore(const Type *LoadTy, Value *LoadPtr,
                                          StoreInst *DepSI,
                                          const TargetData &TD) {
  // Cannot handle reading from store of first-class aggregate yet.
  if (DepSI->getValueOperand()->getType()->isStructTy() ||
      DepSI->getValueOperand()->getType()->isArrayTy())
    return -1;

  Value *StorePtr = DepSI->getPointerOperand();
  uint64_t StoreSize =TD.getTypeSizeInBits(DepSI->getValueOperand()->getType());
  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
                                        StorePtr, StoreSize, TD);
}

/// AnalyzeLoadFromClobberingMemInst - Like AnalyzeLoadFromClobberingStore,
/// but for a clobbering memset/memcpy/memmove intrinsic.
static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr,
                                            MemIntrinsic *MI,
                                            const TargetData &TD) {
  // If the mem operation is a non-constant size, we can't handle it.
  ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
  if (SizeCst == 0) return -1;
  uint64_t MemSizeInBits = SizeCst->getZExtValue()*8;

  // If this is memset, we just need to see if the offset is valid in the size
  // of the memset.
  if (MI->getIntrinsicID() == Intrinsic::memset)
    return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
                                          MemSizeInBits, TD);

  // If we have a memcpy/memmove, the only case we can handle is if this is a
  // copy from constant memory.  In that case, we can read directly from the
  // constant memory.
  MemTransferInst *MTI = cast<MemTransferInst>(MI);

  Constant *Src = dyn_cast<Constant>(MTI->getSource());
  if (Src == 0) return -1;

  GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &TD));
  if (GV == 0 || !GV->isConstant()) return -1;

  // See if the access is within the bounds of the transfer.
  int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
                                              MI->getDest(), MemSizeInBits, TD);
  if (Offset == -1)
    return Offset;

  // Otherwise, see if we can constant fold a load from the constant with the
  // offset applied as appropriate.
  Src = ConstantExpr::getBitCast(Src,
                                 llvm::Type::getInt8PtrTy(Src->getContext()));
  Constant *OffsetCst =
    ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
  Src = ConstantExpr::getGetElementPtr(Src, &OffsetCst, 1);
  Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
  if (ConstantFoldLoadFromConstPtr(Src, &TD))
    return Offset;
  return -1;
}


/// GetStoreValueForLoad - This function is called when we have a
/// memdep query of a load that ends up being a clobbering store.  This means
/// that the store *may* provide bits used by the load but we can't be sure
/// because the pointers don't mustalias.  Check this case to see if there is
/// anything more we can do before we give up.
static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
                                   const Type *LoadTy,
                                   Instruction *InsertPt, const TargetData &TD){
  LLVMContext &Ctx = SrcVal->getType()->getContext();

  // Sizes in bytes, rounded up to cover partial bytes.
  uint64_t StoreSize = (TD.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
  uint64_t LoadSize = (TD.getTypeSizeInBits(LoadTy) + 7) / 8;

  IRBuilder<> Builder(InsertPt->getParent(), InsertPt);

  // Compute which bits of the stored value are being used by the load.  Convert
  // to an integer type to start with.
  if (SrcVal->getType()->isPointerTy())
    SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx), "tmp");
  if (!SrcVal->getType()->isIntegerTy())
    SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8),
                                   "tmp");

  // Shift the bits to the least significant depending on endianness.
  unsigned ShiftAmt;
  if (TD.isLittleEndian())
    ShiftAmt = Offset*8;
  else
    ShiftAmt = (StoreSize-LoadSize-Offset)*8;

  if (ShiftAmt)
    SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt, "tmp");

  if (LoadSize != StoreSize)
    SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8),
                                 "tmp");

  // Final type adjustment (int -> pointer/fp/vector) happens here.
  return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD);
}

/// GetMemInstValueForLoad - This function is called when we have a
/// memdep query of a load that ends up being a clobbering mem intrinsic.
static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
                                     const Type *LoadTy, Instruction *InsertPt,
                                     const TargetData &TD){
  LLVMContext &Ctx = LoadTy->getContext();
  uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8;

  IRBuilder<> Builder(InsertPt->getParent(), InsertPt);

  // We know that this method is only called when the mem transfer fully
  // provides the bits for the load.
  if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
    // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
    // independently of what the offset is.
    Value *Val = MSI->getValue();
    if (LoadSize != 1)
      Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8));

    Value *OneElt = Val;

    // Splat the value out to the right number of bits.
    for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) {
      // If we can double the number of bytes set, do it.
      if (NumBytesSet*2 <= LoadSize) {
        Value *ShVal = Builder.CreateShl(Val, NumBytesSet*8);
        Val = Builder.CreateOr(Val, ShVal);
        NumBytesSet <<= 1;
        continue;
      }

      // Otherwise insert one byte at a time.
      Value *ShVal = Builder.CreateShl(Val, 1*8);
      Val = Builder.CreateOr(OneElt, ShVal);
      ++NumBytesSet;
    }

    return CoerceAvailableValueToLoadType(Val, LoadTy, InsertPt, TD);
  }

  // Otherwise, this is a memcpy/memmove from a constant global.
  MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
  Constant *Src = cast<Constant>(MTI->getSource());

  // Otherwise, see if we can constant fold a load from the constant with the
  // offset applied as appropriate.
  Src = ConstantExpr::getBitCast(Src,
                                 llvm::Type::getInt8PtrTy(Src->getContext()));
  Constant *OffsetCst =
    ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
  Src = ConstantExpr::getGetElementPtr(Src, &OffsetCst, 1);
  Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
  return ConstantFoldLoadFromConstPtr(Src, &TD);
}

namespace {

/// AvailableValueInBlock - Represents a particular available value that we
/// know how to materialize at the end of a given block: either a plain Value
/// (possibly at a byte offset within a wider value) or the result of reading
/// through a clobbering memory intrinsic.
struct AvailableValueInBlock {
  /// BB - The basic block in question.
  BasicBlock *BB;
  enum ValType {
    SimpleVal,  // A simple offsetted value that is accessed.
    MemIntrin   // A memory intrinsic which is loaded from.
  };

  /// V - The value that is live out of the block.  The int bit discriminates
  /// between the two ValType cases.
  PointerIntPair<Value *, 1, ValType> Val;

  /// Offset - The byte offset in Val that is interesting for the load query.
  unsigned Offset;

  static AvailableValueInBlock get(BasicBlock *BB, Value *V,
                                   unsigned Offset = 0) {
    AvailableValueInBlock Res;
    Res.BB = BB;
    Res.Val.setPointer(V);
    Res.Val.setInt(SimpleVal);
    Res.Offset = Offset;
    return Res;
  }

  static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI,
                                     unsigned Offset = 0) {
    AvailableValueInBlock Res;
    Res.BB = BB;
    Res.Val.setPointer(MI);
    Res.Val.setInt(MemIntrin);
    Res.Offset = Offset;
    return Res;
  }

  bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
  Value *getSimpleValue() const {
    assert(isSimpleValue() && "Wrong accessor");
    return Val.getPointer();
  }

  MemIntrinsic *getMemIntrinValue() const {
    assert(!isSimpleValue() && "Wrong accessor");
    return cast<MemIntrinsic>(Val.getPointer());
  }

  /// MaterializeAdjustedValue - Emit code into this block to adjust the value
  /// defined here to the specified type.  This handles various coercion cases.
  Value *MaterializeAdjustedValue(const Type *LoadTy,
                                  const TargetData *TD) const {
    Value *Res;
    if (isSimpleValue()) {
      Res = getSimpleValue();
      if (Res->getType() != LoadTy) {
        assert(TD && "Need target data to handle type mismatch case");
        Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(),
                                   *TD);

        DEBUG(errs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << "  "
                     << *getSimpleValue() << '\n'
                     << *Res << '\n' << "\n\n\n");
      }
    } else {
      // NOTE(review): TD is dereferenced unconditionally here (no assert as in
      // the branch above); callers only create MemIntrin entries when TD is
      // non-null -- confirm before relying on this with a null TD.
      Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset,
                                   LoadTy, BB->getTerminator(), *TD);
      DEBUG(errs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
                   << "  " << *getMemIntrinValue() << '\n'
                   << *Res << '\n' << "\n\n\n");
    }
    return Res;
  }
};

}

/// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock,
/// construct SSA form, allowing us to eliminate LI.  This returns the value
/// that should be used at LI's definition site.
+static Value *ConstructSSAForLoadSet(LoadInst *LI, + SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock, + const TargetData *TD, + const DominatorTree &DT, + AliasAnalysis *AA) { + // Check for the fully redundant, dominating load case. In this case, we can + // just use the dominating value directly. + if (ValuesPerBlock.size() == 1 && + DT.properlyDominates(ValuesPerBlock[0].BB, LI->getParent())) + return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), TD); + + // Otherwise, we have to construct SSA form. + SmallVector<PHINode*, 8> NewPHIs; + SSAUpdater SSAUpdate(&NewPHIs); + SSAUpdate.Initialize(LI->getType(), LI->getName()); + + const Type *LoadTy = LI->getType(); + + for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { + const AvailableValueInBlock &AV = ValuesPerBlock[i]; + BasicBlock *BB = AV.BB; + + if (SSAUpdate.HasValueForBlock(BB)) + continue; + + SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, TD)); + } + + // Perform PHI construction. + Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); + + // If new PHI nodes were created, notify alias analysis. + if (V->getType()->isPointerTy()) + for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) + AA->copyValue(LI, NewPHIs[i]); + + // Now that we've copied information to the new PHIs, scan through + // them again and inform alias analysis that we've added potentially + // escaping uses to any values that are operands to these PHIs. 
+ for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) { + PHINode *P = NewPHIs[i]; + for (unsigned ii = 0, ee = P->getNumIncomingValues(); ii != ee; ++ii) + AA->addEscapingUse(P->getOperandUse(2*ii)); + } + + return V; +} + +static bool isLifetimeStart(const Instruction *Inst) { + if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst)) + return II->getIntrinsicID() == Intrinsic::lifetime_start; + return false; +} + +/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are +/// non-local by performing PHI construction. +bool GVN::processNonLocalLoad(LoadInst *LI, + SmallVectorImpl<Instruction*> &toErase) { + // Find the non-local dependencies of the load. + SmallVector<NonLocalDepResult, 64> Deps; + AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); + MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps); + //DEBUG(dbgs() << "INVESTIGATING NONLOCAL LOAD: " + // << Deps.size() << *LI << '\n'); + + // If we had to process more than one hundred blocks to find the + // dependencies, this load isn't worth worrying about. Optimizing + // it will be too expensive. + if (Deps.size() > 100) + return false; + + // If we had a phi translation failure, we'll have a single entry which is a + // clobber in the current block. Reject this early. + if (Deps.size() == 1 && Deps[0].getResult().isClobber()) { + DEBUG( + dbgs() << "GVN: non-local load "; + WriteAsOperand(dbgs(), LI); + dbgs() << " is clobbered by " << *Deps[0].getResult().getInst() << '\n'; + ); + return false; + } + + // Filter out useless results (non-locals, etc). Keep track of the blocks + // where we have a value available in repl, also keep track of whether we see + // dependencies that produce an unknown value for the load (such as a call + // that could potentially clobber the load). 
+ SmallVector<AvailableValueInBlock, 16> ValuesPerBlock; + SmallVector<BasicBlock*, 16> UnavailableBlocks; + + for (unsigned i = 0, e = Deps.size(); i != e; ++i) { + BasicBlock *DepBB = Deps[i].getBB(); + MemDepResult DepInfo = Deps[i].getResult(); + + if (DepInfo.isClobber()) { + // The address being loaded in this non-local block may not be the same as + // the pointer operand of the load if PHI translation occurs. Make sure + // to consider the right address. + Value *Address = Deps[i].getAddress(); + + // If the dependence is to a store that writes to a superset of the bits + // read by the load, we can extract the bits we need for the load from the + // stored value. + if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) { + if (TD && Address) { + int Offset = AnalyzeLoadFromClobberingStore(LI->getType(), Address, + DepSI, *TD); + if (Offset != -1) { + ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, + DepSI->getValueOperand(), + Offset)); + continue; + } + } + } + + // If the clobbering value is a memset/memcpy/memmove, see if we can + // forward a value on from it. + if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) { + if (TD && Address) { + int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address, + DepMI, *TD); + if (Offset != -1) { + ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI, + Offset)); + continue; + } + } + } + + UnavailableBlocks.push_back(DepBB); + continue; + } + + Instruction *DepInst = DepInfo.getInst(); + + // Loading the allocation -> undef. + if (isa<AllocaInst>(DepInst) || isMalloc(DepInst) || + // Loading immediately after lifetime begin -> undef. + isLifetimeStart(DepInst)) { + ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, + UndefValue::get(LI->getType()))); + continue; + } + + if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) { + // Reject loads and stores that are to the same address but are of + // different types if we have to. 
+ if (S->getValueOperand()->getType() != LI->getType()) { + // If the stored value is larger or equal to the loaded value, we can + // reuse it. + if (TD == 0 || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(), + LI->getType(), *TD)) { + UnavailableBlocks.push_back(DepBB); + continue; + } + } + + ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, + S->getValueOperand())); + continue; + } + + if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) { + // If the types mismatch and we can't handle it, reject reuse of the load. + if (LD->getType() != LI->getType()) { + // If the stored value is larger or equal to the loaded value, we can + // reuse it. + if (TD == 0 || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*TD)){ + UnavailableBlocks.push_back(DepBB); + continue; + } + } + ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, LD)); + continue; + } + + UnavailableBlocks.push_back(DepBB); + continue; + } + + // If we have no predecessors that produce a known value for this load, exit + // early. + if (ValuesPerBlock.empty()) return false; + + // If all of the instructions we depend on produce a known value for this + // load, then it is fully redundant and we can use PHI insertion to compute + // its value. Insert PHIs and remove the fully redundant value now. + if (UnavailableBlocks.empty()) { + DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); + + // Perform PHI construction. + Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, TD, *DT, + VN.getAliasAnalysis()); + LI->replaceAllUsesWith(V); + + if (isa<PHINode>(V)) + V->takeName(LI); + if (V->getType()->isPointerTy()) + MD->invalidateCachedPointerInfo(V); + VN.erase(LI); + toErase.push_back(LI); + ++NumGVNLoad; + return true; + } + + if (!EnablePRE || !EnableLoadPRE) + return false; + + // Okay, we have *some* definitions of the value. This means that the value + // is available in some of our (transitive) predecessors. Lets think about + // doing PRE of this load. 
This will involve inserting a new load into the + // predecessor when it's not available. We could do this in general, but + // prefer to not increase code size. As such, we only do this when we know + // that we only have to insert *one* load (which means we're basically moving + // the load, not inserting a new one). + + SmallPtrSet<BasicBlock *, 4> Blockers; + for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) + Blockers.insert(UnavailableBlocks[i]); + + // Lets find first basic block with more than one predecessor. Walk backwards + // through predecessors if needed. + BasicBlock *LoadBB = LI->getParent(); + BasicBlock *TmpBB = LoadBB; + + bool isSinglePred = false; + bool allSingleSucc = true; + while (TmpBB->getSinglePredecessor()) { + isSinglePred = true; + TmpBB = TmpBB->getSinglePredecessor(); + if (TmpBB == LoadBB) // Infinite (unreachable) loop. + return false; + if (Blockers.count(TmpBB)) + return false; + + // If any of these blocks has more than one successor (i.e. if the edge we + // just traversed was critical), then there are other paths through this + // block along which the load may not be anticipated. Hoisting the load + // above this block would be adding the load to execution paths along + // which it was not previously executed. + if (TmpBB->getTerminator()->getNumSuccessors() != 1) + return false; + } + + assert(TmpBB); + LoadBB = TmpBB; + + // FIXME: It is extremely unclear what this loop is doing, other than + // artificially restricting loadpre. + if (isSinglePred) { + bool isHot = false; + for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { + const AvailableValueInBlock &AV = ValuesPerBlock[i]; + if (AV.isSimpleValue()) + // "Hot" Instruction is in some loop (because it dominates its dep. + // instruction). + if (Instruction *I = dyn_cast<Instruction>(AV.getSimpleValue())) + if (DT->dominates(LI, I)) { + isHot = true; + break; + } + } + + // We are interested only in "hot" instructions. 
We don't want to do any + // mis-optimizations here. + if (!isHot) + return false; + } + + // Check to see how many predecessors have the loaded value fully + // available. + DenseMap<BasicBlock*, Value*> PredLoads; + DenseMap<BasicBlock*, char> FullyAvailableBlocks; + for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) + FullyAvailableBlocks[ValuesPerBlock[i].BB] = true; + for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) + FullyAvailableBlocks[UnavailableBlocks[i]] = false; + + SmallVector<std::pair<TerminatorInst*, unsigned>, 4> NeedToSplit; + for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); + PI != E; ++PI) { + BasicBlock *Pred = *PI; + if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks)) { + continue; + } + PredLoads[Pred] = 0; + + if (Pred->getTerminator()->getNumSuccessors() != 1) { + if (isa<IndirectBrInst>(Pred->getTerminator())) { + DEBUG(dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '" + << Pred->getName() << "': " << *LI << '\n'); + return false; + } + unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB); + NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum)); + } + } + if (!NeedToSplit.empty()) { + toSplit.append(NeedToSplit.begin(), NeedToSplit.end()); + return false; + } + + // Decide whether PRE is profitable for this load. + unsigned NumUnavailablePreds = PredLoads.size(); + assert(NumUnavailablePreds != 0 && + "Fully available value should be eliminated above!"); + + // If this load is unavailable in multiple predecessors, reject it. + // FIXME: If we could restructure the CFG, we could make a common pred with + // all the preds that don't have an available LI and insert a new load into + // that one block. + if (NumUnavailablePreds != 1) + return false; + + // Check if the load can safely be moved to all the unavailable predecessors. 
+ bool CanDoPRE = true; + SmallVector<Instruction*, 8> NewInsts; + for (DenseMap<BasicBlock*, Value*>::iterator I = PredLoads.begin(), + E = PredLoads.end(); I != E; ++I) { + BasicBlock *UnavailablePred = I->first; + + // Do PHI translation to get its value in the predecessor if necessary. The + // returned pointer (if non-null) is guaranteed to dominate UnavailablePred. + + // If all preds have a single successor, then we know it is safe to insert + // the load on the pred (?!?), so we can insert code to materialize the + // pointer if it is not available. + PHITransAddr Address(LI->getPointerOperand(), TD); + Value *LoadPtr = 0; + if (allSingleSucc) { + LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, + *DT, NewInsts); + } else { + Address.PHITranslateValue(LoadBB, UnavailablePred, DT); + LoadPtr = Address.getAddr(); + } + + // If we couldn't find or insert a computation of this phi translated value, + // we fail PRE. + if (LoadPtr == 0) { + DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: " + << *LI->getPointerOperand() << "\n"); + CanDoPRE = false; + break; + } + + // Make sure it is valid to move this load here. We have to watch out for: + // @1 = getelementptr (i8* p, ... + // test p and branch if == 0 + // load @1 + // It is valid to have the getelementptr before the test, even if p can + // be 0, as getelementptr only does address arithmetic. + // If we are not pushing the value through any multiple-successor blocks + // we do not have this case. Otherwise, check that the load is safe to + // put anywhere; this can be improved, but should be conservatively safe. + if (!allSingleSucc && + // FIXME: REEVALUTE THIS. 
+ !isSafeToLoadUnconditionally(LoadPtr, + UnavailablePred->getTerminator(), + LI->getAlignment(), TD)) { + CanDoPRE = false; + break; + } + + I->second = LoadPtr; + } + + if (!CanDoPRE) { + while (!NewInsts.empty()) { + Instruction *I = NewInsts.pop_back_val(); + if (MD) MD->removeInstruction(I); + I->eraseFromParent(); + } + return false; + } + + // Okay, we can eliminate this load by inserting a reload in the predecessor + // and using PHI construction to get the value in the other predecessors, do + // it. + DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n'); + DEBUG(if (!NewInsts.empty()) + dbgs() << "INSERTED " << NewInsts.size() << " INSTS: " + << *NewInsts.back() << '\n'); + + // Assign value numbers to the new instructions. + for (unsigned i = 0, e = NewInsts.size(); i != e; ++i) { + // FIXME: We really _ought_ to insert these value numbers into their + // parent's availability map. However, in doing so, we risk getting into + // ordering issues. If a block hasn't been processed yet, we would be + // marking a value as AVAIL-IN, which isn't what we intend. + VN.lookup_or_add(NewInsts[i]); + } + + for (DenseMap<BasicBlock*, Value*>::iterator I = PredLoads.begin(), + E = PredLoads.end(); I != E; ++I) { + BasicBlock *UnavailablePred = I->first; + Value *LoadPtr = I->second; + + Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false, + LI->getAlignment(), + UnavailablePred->getTerminator()); + + // Transfer the old load's TBAA tag to the new load. + if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) + NewLoad->setMetadata(LLVMContext::MD_tbaa, Tag); + + // Add the newly created load. + ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred, + NewLoad)); + MD->invalidateCachedPointerInfo(LoadPtr); + DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n'); + } + + // Perform PHI construction. 
  // (Tail of the non-local load PRE path.)  Perform PHI construction to merge
  // the per-block available values, then retire the original load.
  Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, TD, *DT,
                                    VN.getAliasAnalysis());
  LI->replaceAllUsesWith(V);
  if (isa<PHINode>(V))
    V->takeName(LI);
  if (V->getType()->isPointerTy())
    // The replacement pointer has taken over the load's uses, so memdep's
    // cached information about it is now stale.
    MD->invalidateCachedPointerInfo(V);
  VN.erase(LI);
  toErase.push_back(LI);
  ++NumPRELoad;
  return true;
}

/// processLoad - Attempt to eliminate a load, first by eliminating it
/// locally, and then attempting non-local elimination if that fails.
/// Returns true if the load was eliminated (the dead load is appended to
/// toErase for the caller to delete; this function does not erase it itself).
bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) {
  // Without memory dependence analysis (-gvn-no-loads) we cannot reason about
  // loads at all.
  if (!MD)
    return false;

  if (L->isVolatile())
    return false;

  // ... to a pointer that has been loaded from before...
  MemDepResult Dep = MD->getDependency(L);

  // If the value isn't available, don't do anything!
  if (Dep.isClobber()) {
    // Check to see if we have something like this:
    //   store i32 123, i32* %P
    //   %A = bitcast i32* %P to i8*
    //   %B = gep i8* %A, i32 1
    //   %C = load i8* %B
    //
    // We could do that by recognizing if the clobber instructions are obviously
    // a common base + constant offset, and if the previous store (or memset)
    // completely covers this load.  This sort of thing can happen in bitfield
    // access code.
    Value *AvailVal = 0;
    // Case 1: clobbered by a wider/offset store that still fully covers this
    // load.  Requires TargetData to reason about sizes and offsets.
    if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst()))
      if (TD) {
        int Offset = AnalyzeLoadFromClobberingStore(L->getType(),
                                                    L->getPointerOperand(),
                                                    DepSI, *TD);
        if (Offset != -1)
          AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset,
                                          L->getType(), L, *TD);
      }

    // If the clobbering value is a memset/memcpy/memmove, see if we can forward
    // a value on from it.
    if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) {
      if (TD) {
        int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(),
                                                      L->getPointerOperand(),
                                                      DepMI, *TD);
        if (Offset != -1)
          AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *TD);
      }
    }

    if (AvailVal) {
      DEBUG(dbgs() << "GVN COERCED INST:\n" << *Dep.getInst() << '\n'
            << *AvailVal << '\n' << *L << "\n\n\n");

      // Replace the load!
      L->replaceAllUsesWith(AvailVal);
      if (AvailVal->getType()->isPointerTy())
        MD->invalidateCachedPointerInfo(AvailVal);
      VN.erase(L);
      toErase.push_back(L);
      ++NumGVNLoad;
      return true;
    }

    DEBUG(
      // fast print dep, using operator<< on instruction would be too slow
      dbgs() << "GVN: load ";
      WriteAsOperand(dbgs(), L);
      Instruction *I = Dep.getInst();
      dbgs() << " is clobbered by " << *I << '\n';
    );
    return false;
  }

  // If it is defined in another block, try harder.
  if (Dep.isNonLocal())
    return processNonLocalLoad(L, toErase);

  Instruction *DepInst = Dep.getInst();
  if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
    Value *StoredVal = DepSI->getValueOperand();

    // The store and load are to a must-aliased pointer, but they may not
    // actually have the same type.  See if we know how to reuse the stored
    // value (depending on its type).
    if (StoredVal->getType() != L->getType()) {
      if (TD) {
        StoredVal = CoerceAvailableValueToLoadType(StoredVal, L->getType(),
                                                   L, *TD);
        if (StoredVal == 0)
          return false;

        DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal
                     << '\n' << *L << "\n\n\n");
      }
      else
        // No TargetData, so we cannot prove the coercion is safe.
        return false;
    }

    // Remove it!
    L->replaceAllUsesWith(StoredVal);
    if (StoredVal->getType()->isPointerTy())
      MD->invalidateCachedPointerInfo(StoredVal);
    VN.erase(L);
    toErase.push_back(L);
    ++NumGVNLoad;
    return true;
  }

  if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
    Value *AvailableVal = DepLI;

    // The loads are of a must-aliased pointer, but they may not actually have
    // the same type.  See if we know how to reuse the previously loaded value
    // (depending on its type).
    if (DepLI->getType() != L->getType()) {
      if (TD) {
        AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), L, *TD);
        if (AvailableVal == 0)
          return false;

        DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal
                     << "\n" << *L << "\n\n\n");
      }
      else
        return false;
    }

    // Remove it!
    L->replaceAllUsesWith(AvailableVal);
    if (DepLI->getType()->isPointerTy())
      MD->invalidateCachedPointerInfo(DepLI);
    VN.erase(L);
    toErase.push_back(L);
    ++NumGVNLoad;
    return true;
  }

  // If this load really doesn't depend on anything, then we must be loading an
  // undef value.  This can happen when loading for a fresh allocation with no
  // intervening stores, for example.
  if (isa<AllocaInst>(DepInst) || isMalloc(DepInst)) {
    L->replaceAllUsesWith(UndefValue::get(L->getType()));
    VN.erase(L);
    toErase.push_back(L);
    ++NumGVNLoad;
    return true;
  }

  // If this load occurs right after a lifetime.start intrinsic, then the
  // loaded value is undefined (nothing has been stored since the object's
  // lifetime began).
  if (IntrinsicInst* II = dyn_cast<IntrinsicInst>(DepInst)) {
    if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
      L->replaceAllUsesWith(UndefValue::get(L->getType()));
      VN.erase(L);
      toErase.push_back(L);
      ++NumGVNLoad;
      return true;
    }
  }

  return false;
}

// findLeader - In order to find a leader for a given value number at a
// specific basic block, we first obtain the list of all Values for that number,
// and then scan the list to find one whose block dominates the block in
// question.  This is fast because dominator tree queries consist of only
// a few comparisons of DFS numbers.
// Returns null if no entry for 'num' dominates 'BB'.  Constants are preferred
// over instructions, so the scan keeps going after a non-constant match in
// case a dominating constant entry exists later in the chain.
Value *GVN::findLeader(BasicBlock *BB, uint32_t num) {
  LeaderTableEntry Vals = LeaderTable[num];
  if (!Vals.Val) return 0;

  Value *Val = 0;
  if (DT->dominates(Vals.BB, BB)) {
    Val = Vals.Val;
    if (isa<Constant>(Val)) return Val;
  }

  LeaderTableEntry* Next = Vals.Next;
  while (Next) {
    if (DT->dominates(Next->BB, BB)) {
      if (isa<Constant>(Next->Val)) return Next->Val;
      if (!Val) Val = Next->Val;
    }

    Next = Next->Next;
  }

  return Val;
}


/// processInstruction - When calculating availability, handle an instruction
/// by inserting it into the appropriate sets.  Returns true if the
/// instruction was made redundant and queued on toErase.
bool GVN::processInstruction(Instruction *I,
                             SmallVectorImpl<Instruction*> &toErase) {
  // Ignore dbg info intrinsics.
  if (isa<DbgInfoIntrinsic>(I))
    return false;

  // If the instruction can be easily simplified then do so now in preference
  // to value numbering it.  Value numbering often exposes redundancies, for
  // example if it determines that %y is equal to %x then the instruction
  // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
  if (Value *V = SimplifyInstruction(I, TD, DT)) {
    I->replaceAllUsesWith(V);
    if (MD && V->getType()->isPointerTy())
      MD->invalidateCachedPointerInfo(V);
    VN.erase(I);
    toErase.push_back(I);
    return true;
  }

  // Loads get the full load-elimination treatment; only add them to the
  // leader table if they survive.
  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
    bool Changed = processLoad(LI, toErase);

    if (!Changed) {
      unsigned Num = VN.lookup_or_add(LI);
      addToLeaderTable(Num, LI, LI->getParent());
    }

    return Changed;
  }

  // For conditional branches, we can perform simple conditional propagation on
  // the condition value itself: in a successor reached only when the branch is
  // taken (resp. not taken), the condition's value number is known to be
  // true (resp. false).
  if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
    if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
      return false;

    Value *BranchCond = BI->getCondition();
    uint32_t CondVN = VN.lookup_or_add(BranchCond);

    BasicBlock *TrueSucc = BI->getSuccessor(0);
    BasicBlock *FalseSucc = BI->getSuccessor(1);

    // Only safe when the successor has no other predecessors; otherwise the
    // condition's value is not known on all incoming edges.
    if (TrueSucc->getSinglePredecessor())
      addToLeaderTable(CondVN,
                       ConstantInt::getTrue(TrueSucc->getContext()),
                       TrueSucc);
    if (FalseSucc->getSinglePredecessor())
      addToLeaderTable(CondVN,
                       ConstantInt::getFalse(TrueSucc->getContext()),
                       FalseSucc);

    return false;
  }

  // Instructions with void type don't return a value, so there's
  // no point in trying to find redundancies in them.
  if (I->getType()->isVoidTy()) return false;

  // Remember the next unused number BEFORE numbering I, so we can tell below
  // whether I received a brand-new value number.
  uint32_t NextNum = VN.getNextUnusedValueNumber();
  unsigned Num = VN.lookup_or_add(I);

  // Allocations are always uniquely numbered, so we can save time and memory
  // by fast failing them.
  if (isa<AllocaInst>(I) || isa<TerminatorInst>(I) || isa<PHINode>(I)) {
    addToLeaderTable(Num, I, I->getParent());
    return false;
  }

  // If the number we were assigned was a brand new VN, then we don't
  // need to do a lookup to see if the number already exists
  // somewhere in the domtree: it can't!
  if (Num == NextNum) {
    addToLeaderTable(Num, I, I->getParent());
    return false;
  }

  // Perform fast-path value-number based elimination of values inherited from
  // dominators.
  Value *repl = findLeader(I->getParent(), Num);
  if (repl == 0) {
    // Failure, just remember this instance for future use.
    addToLeaderTable(Num, I, I->getParent());
    return false;
  }

  // Remove it!
  VN.erase(I);
  I->replaceAllUsesWith(repl);
  if (MD && repl->getType()->isPointerTy())
    MD->invalidateCachedPointerInfo(repl);
  toErase.push_back(I);
  return true;
}

/// runOnFunction - This is the main transformation entry point for a function.
bool GVN::runOnFunction(Function& F) {
  // Cache the analyses; MD is left null under -gvn-no-loads, and the rest of
  // the pass checks for that.
  if (!NoLoads)
    MD = &getAnalysis<MemoryDependenceAnalysis>();
  DT = &getAnalysis<DominatorTree>();
  TD = getAnalysisIfAvailable<TargetData>();
  VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>());
  VN.setMemDep(MD);
  VN.setDomTree(DT);

  bool Changed = false;
  bool ShouldContinue = true;

  // Merge unconditional branches, allowing PRE to catch more
  // optimization opportunities.
  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
    // Advance before merging: MergeBlockIntoPredecessor may delete BB.
    BasicBlock *BB = FI++;

    bool removedBlock = MergeBlockIntoPredecessor(BB, this);
    if (removedBlock) ++NumGVNBlocks;

    Changed |= removedBlock;
  }

  // Iterate to a fixed point; edge splitting can expose further redundancies.
  unsigned Iteration = 0;
  while (ShouldContinue) {
    DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
    ShouldContinue = iterateOnFunction(F);
    if (splitCriticalEdges())
      ShouldContinue = true;
    Changed |= ShouldContinue;
    ++Iteration;
  }

  if (EnablePRE) {
    bool PREChanged = true;
    while (PREChanged) {
      PREChanged = performPRE(F);
      Changed |= PREChanged;
    }
  }
  // FIXME: Should perform GVN again after PRE does something. PRE can move
  // computations into blocks where they become fully redundant. Note that
  // we can't do this until PRE's critical edge splitting updates memdep.
+ // Actually, when this happens, we should just fully integrate PRE into GVN. + + cleanupGlobalSets(); + + return Changed; +} + + +bool GVN::processBlock(BasicBlock *BB) { + // FIXME: Kill off toErase by doing erasing eagerly in a helper function (and + // incrementing BI before processing an instruction). + SmallVector<Instruction*, 8> toErase; + bool ChangedFunction = false; + + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); + BI != BE;) { + ChangedFunction |= processInstruction(BI, toErase); + if (toErase.empty()) { + ++BI; + continue; + } + + // If we need some instructions deleted, do it now. + NumGVNInstr += toErase.size(); + + // Avoid iterator invalidation. + bool AtStart = BI == BB->begin(); + if (!AtStart) + --BI; + + for (SmallVector<Instruction*, 4>::iterator I = toErase.begin(), + E = toErase.end(); I != E; ++I) { + DEBUG(dbgs() << "GVN removed: " << **I << '\n'); + if (MD) MD->removeInstruction(*I); + (*I)->eraseFromParent(); + DEBUG(verifyRemoved(*I)); + } + toErase.clear(); + + if (AtStart) + BI = BB->begin(); + else + ++BI; + } + + return ChangedFunction; +} + +/// performPRE - Perform a purely local form of PRE that looks for diamond +/// control flow patterns and attempts to perform simple PRE at the join point. +bool GVN::performPRE(Function &F) { + bool Changed = false; + DenseMap<BasicBlock*, Value*> predMap; + for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()), + DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) { + BasicBlock *CurrentBlock = *DI; + + // Nothing to PRE in the entry block. 
+ if (CurrentBlock == &F.getEntryBlock()) continue; + + for (BasicBlock::iterator BI = CurrentBlock->begin(), + BE = CurrentBlock->end(); BI != BE; ) { + Instruction *CurInst = BI++; + + if (isa<AllocaInst>(CurInst) || + isa<TerminatorInst>(CurInst) || isa<PHINode>(CurInst) || + CurInst->getType()->isVoidTy() || + CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || + isa<DbgInfoIntrinsic>(CurInst)) + continue; + + // We don't currently value number ANY inline asm calls. + if (CallInst *CallI = dyn_cast<CallInst>(CurInst)) + if (CallI->isInlineAsm()) + continue; + + uint32_t ValNo = VN.lookup(CurInst); + + // Look for the predecessors for PRE opportunities. We're + // only trying to solve the basic diamond case, where + // a value is computed in the successor and one predecessor, + // but not the other. We also explicitly disallow cases + // where the successor is its own predecessor, because they're + // more complicated to get right. + unsigned NumWith = 0; + unsigned NumWithout = 0; + BasicBlock *PREPred = 0; + predMap.clear(); + + for (pred_iterator PI = pred_begin(CurrentBlock), + PE = pred_end(CurrentBlock); PI != PE; ++PI) { + BasicBlock *P = *PI; + // We're not interested in PRE where the block is its + // own predecessor, or in blocks with predecessors + // that are not reachable. + if (P == CurrentBlock) { + NumWithout = 2; + break; + } else if (!DT->dominates(&F.getEntryBlock(), P)) { + NumWithout = 2; + break; + } + + Value* predV = findLeader(P, ValNo); + if (predV == 0) { + PREPred = P; + ++NumWithout; + } else if (predV == CurInst) { + NumWithout = 2; + } else { + predMap[P] = predV; + ++NumWith; + } + } + + // Don't do PRE when it might increase code size, i.e. when + // we would need to insert instructions in more than one pred. + if (NumWithout != 1 || NumWith == 0) + continue; + + // Don't do PRE across indirect branch. 
+ if (isa<IndirectBrInst>(PREPred->getTerminator())) + continue; + + // We can't do PRE safely on a critical edge, so instead we schedule + // the edge to be split and perform the PRE the next time we iterate + // on the function. + unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock); + if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) { + toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum)); + continue; + } + + // Instantiate the expression in the predecessor that lacked it. + // Because we are going top-down through the block, all value numbers + // will be available in the predecessor by the time we need them. Any + // that weren't originally present will have been instantiated earlier + // in this loop. + Instruction *PREInstr = CurInst->clone(); + bool success = true; + for (unsigned i = 0, e = CurInst->getNumOperands(); i != e; ++i) { + Value *Op = PREInstr->getOperand(i); + if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) + continue; + + if (Value *V = findLeader(PREPred, VN.lookup(Op))) { + PREInstr->setOperand(i, V); + } else { + success = false; + break; + } + } + + // Fail out if we encounter an operand that is not available in + // the PRE predecessor. This is typically because of loads which + // are not value numbered precisely. + if (!success) { + delete PREInstr; + DEBUG(verifyRemoved(PREInstr)); + continue; + } + + PREInstr->insertBefore(PREPred->getTerminator()); + PREInstr->setName(CurInst->getName() + ".pre"); + predMap[PREPred] = PREInstr; + VN.add(PREInstr, ValNo); + ++NumGVNPRE; + + // Update the availability map to include the new instruction. + addToLeaderTable(ValNo, PREInstr, PREPred); + + // Create a PHI to make the value available in this block. 
+ PHINode* Phi = PHINode::Create(CurInst->getType(), + CurInst->getName() + ".pre-phi", + CurrentBlock->begin()); + for (pred_iterator PI = pred_begin(CurrentBlock), + PE = pred_end(CurrentBlock); PI != PE; ++PI) { + BasicBlock *P = *PI; + Phi->addIncoming(predMap[P], P); + } + + VN.add(Phi, ValNo); + addToLeaderTable(ValNo, Phi, CurrentBlock); + + CurInst->replaceAllUsesWith(Phi); + if (Phi->getType()->isPointerTy()) { + // Because we have added a PHI-use of the pointer value, it has now + // "escaped" from alias analysis' perspective. We need to inform + // AA of this. + for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; ++ii) + VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(2*ii)); + + if (MD) + MD->invalidateCachedPointerInfo(Phi); + } + VN.erase(CurInst); + removeFromLeaderTable(ValNo, CurInst, CurrentBlock); + + DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n'); + if (MD) MD->removeInstruction(CurInst); + CurInst->eraseFromParent(); + DEBUG(verifyRemoved(CurInst)); + Changed = true; + } + } + + if (splitCriticalEdges()) + Changed = true; + + return Changed; +} + +/// splitCriticalEdges - Split critical edges found during the previous +/// iteration that may enable further optimization. +bool GVN::splitCriticalEdges() { + if (toSplit.empty()) + return false; + do { + std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val(); + SplitCriticalEdge(Edge.first, Edge.second, this); + } while (!toSplit.empty()); + if (MD) MD->invalidateCachedPredecessors(); + return true; +} + +/// iterateOnFunction - Executes one iteration of GVN +bool GVN::iterateOnFunction(Function &F) { + cleanupGlobalSets(); + + // Top-down walk of the dominator tree + bool Changed = false; +#if 0 + // Needed for value numbering with phi construction to work. 
+ ReversePostOrderTraversal<Function*> RPOT(&F); + for (ReversePostOrderTraversal<Function*>::rpo_iterator RI = RPOT.begin(), + RE = RPOT.end(); RI != RE; ++RI) + Changed |= processBlock(*RI); +#else + for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()), + DE = df_end(DT->getRootNode()); DI != DE; ++DI) + Changed |= processBlock(DI->getBlock()); +#endif + + return Changed; +} + +void GVN::cleanupGlobalSets() { + VN.clear(); + LeaderTable.clear(); + TableAllocator.Reset(); +} + +/// verifyRemoved - Verify that the specified instruction does not occur in our +/// internal data structures. +void GVN::verifyRemoved(const Instruction *Inst) const { + VN.verifyRemoved(Inst); + + // Walk through the value number scope to make sure the instruction isn't + // ferreted away in it. + for (DenseMap<uint32_t, LeaderTableEntry>::const_iterator + I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) { + const LeaderTableEntry *Node = &I->second; + assert(Node->Val != Inst && "Inst still in value numbering scope!"); + + while (Node->Next) { + Node = Node->Next; + assert(Node->Val != Inst && "Inst still in value numbering scope!"); + } + } +} diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp new file mode 100644 index 0000000..0fb6798 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -0,0 +1,1043 @@ +//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This transformation analyzes and transforms the induction variables (and +// computations derived from them) into simpler forms suitable for subsequent +// analysis and transformation. 
+// +// This transformation makes the following changes to each loop with an +// identifiable induction variable: +// 1. All loops are transformed to have a SINGLE canonical induction variable +// which starts at zero and steps by one. +// 2. The canonical induction variable is guaranteed to be the first PHI node +// in the loop header block. +// 3. The canonical induction variable is guaranteed to be in a wide enough +// type so that IV expressions need not be (directly) zero-extended or +// sign-extended. +// 4. Any pointer arithmetic recurrences are raised to use array subscripts. +// +// If the trip count of a loop is computable, this pass also makes the following +// changes: +// 1. The exit condition for the loop is canonicalized to compare the +// induction value against the exit value. This turns loops like: +// 'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)' +// 2. Any use outside of the loop of an expression derived from the indvar +// is changed to compute the derived value outside of the loop, eliminating +// the dependence on the exit value of the induction variable. If the only +// purpose of the loop is to compute the exit value of some derived +// expression, this transformation will make the loop dead. +// +// This transformation should be followed by strength reduction after all of the +// desired loop transformations have been performed. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "indvars" +#include "llvm/Transforms/Scalar.h" +#include "llvm/BasicBlock.h" +#include "llvm/Constants.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Type.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/IVUsers.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +using namespace llvm; + +STATISTIC(NumRemoved , "Number of aux indvars removed"); +STATISTIC(NumInserted, "Number of canonical indvars added"); +STATISTIC(NumReplaced, "Number of exit values replaced"); +STATISTIC(NumLFTR , "Number of loop exit tests replaced"); + +namespace { + class IndVarSimplify : public LoopPass { + IVUsers *IU; + LoopInfo *LI; + ScalarEvolution *SE; + DominatorTree *DT; + bool Changed; + public: + + static char ID; // Pass identification, replacement for typeid + IndVarSimplify() : LoopPass(ID) { + initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnLoop(Loop *L, LPPassManager &LPM); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTree>(); + AU.addRequired<LoopInfo>(); + AU.addRequired<ScalarEvolution>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addRequired<IVUsers>(); + AU.addPreserved<ScalarEvolution>(); + AU.addPreservedID(LoopSimplifyID); + AU.addPreservedID(LCSSAID); + AU.addPreserved<IVUsers>(); + AU.setPreservesCFG(); + } + + private: + + void EliminateIVComparisons(); + void 
EliminateIVRemainders();
    void RewriteNonIntegerIVs(Loop *L);

    ICmpInst *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount,
                                        PHINode *IndVar,
                                        BasicBlock *ExitingBlock,
                                        BranchInst *BI,
                                        SCEVExpander &Rewriter);
    void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);

    void RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter);

    void SinkUnusedInvariants(Loop *L);

    void HandleFloatingPointIV(Loop *L, PHINode *PH);
  };
}

char IndVarSimplify::ID = 0;
INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars",
                "Canonicalize Induction Variables", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_DEPENDENCY(IVUsers)
INITIALIZE_PASS_END(IndVarSimplify, "indvars",
                "Canonicalize Induction Variables", false, false)

Pass *llvm::createIndVarSimplifyPass() {
  return new IndVarSimplify();
}

/// LinearFunctionTestReplace - This method rewrites the exit condition of the
/// loop to be a canonical != comparison against the incremented loop induction
/// variable.  This pass is able to rewrite the exit tests of any loop where the
/// SCEV analysis can determine a loop-invariant trip count of the loop, which
/// is actually a much broader range than just linear tests.
/// Returns the new comparison instruction, or null if the rewrite was not
/// performed.
ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
                                   const SCEV *BackedgeTakenCount,
                                   PHINode *IndVar,
                                   BasicBlock *ExitingBlock,
                                   BranchInst *BI,
                                   SCEVExpander &Rewriter) {
  // Special case: If the backedge-taken count is a UDiv, it's very likely a
  // UDiv that ScalarEvolution produced in order to compute a precise
  // expression, rather than a UDiv from the user's code.  If we can't find a
  // UDiv in the code with some simple searching, assume the former and forego
  // rewriting the loop.
  if (isa<SCEVUDivExpr>(BackedgeTakenCount)) {
    ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition());
    if (!OrigCond) return 0;
    // Accept the rewrite only if one of the comparison's operands, minus one,
    // is the backedge-taken count itself (i.e. the UDiv appears in the code).
    const SCEV *R = SE->getSCEV(OrigCond->getOperand(1));
    R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1));
    if (R != BackedgeTakenCount) {
      const SCEV *L = SE->getSCEV(OrigCond->getOperand(0));
      L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1));
      if (L != BackedgeTakenCount)
        return 0;
    }
  }

  // If the exiting block is not the same as the backedge block, we must compare
  // against the preincremented value, otherwise we prefer to compare against
  // the post-incremented value.
  Value *CmpIndVar;
  const SCEV *RHS = BackedgeTakenCount;
  if (ExitingBlock == L->getLoopLatch()) {
    // Add one to the "backedge-taken" count to get the trip count.
    // If this addition may overflow, we have to be more pessimistic and
    // cast the induction variable before doing the add.
    const SCEV *Zero = SE->getConstant(BackedgeTakenCount->getType(), 0);
    const SCEV *N =
      SE->getAddExpr(BackedgeTakenCount,
                     SE->getConstant(BackedgeTakenCount->getType(), 1));
    if ((isa<SCEVConstant>(N) && !N->isZero()) ||
        SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) {
      // No overflow. Cast the sum.
      RHS = SE->getTruncateOrZeroExtend(N, IndVar->getType());
    } else {
      // Potential overflow. Cast before doing the add.
      RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount,
                                        IndVar->getType());
      RHS = SE->getAddExpr(RHS,
                           SE->getConstant(IndVar->getType(), 1));
    }

    // The BackedgeTaken expression contains the number of times that the
    // backedge branches to the loop header.  This is one less than the
    // number of times the loop executes, so use the incremented indvar.
    CmpIndVar = IndVar->getIncomingValueForBlock(ExitingBlock);
  } else {
    // We have to use the preincremented value...
    RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount,
                                      IndVar->getType());
    CmpIndVar = IndVar;
  }

  // Expand the code for the iteration count.
  assert(SE->isLoopInvariant(RHS, L) &&
         "Computed iteration count is not loop invariant!");
  Value *ExitCnt = Rewriter.expandCodeFor(RHS, IndVar->getType(), BI);

  // Insert a new icmp_ne or icmp_eq instruction before the branch.
  // The predicate depends on which successor stays in the loop.
  ICmpInst::Predicate Opcode;
  if (L->contains(BI->getSuccessor(0)))
    Opcode = ICmpInst::ICMP_NE;
  else
    Opcode = ICmpInst::ICMP_EQ;

  DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
               << " LHS:" << *CmpIndVar << '\n'
               << " op:\t"
               << (Opcode == ICmpInst::ICMP_NE ? "!=" : "==") << "\n"
               << " RHS:\t" << *RHS << "\n");

  ICmpInst *Cond = new ICmpInst(BI, Opcode, CmpIndVar, ExitCnt, "exitcond");

  Value *OrigCond = BI->getCondition();
  // It's tempting to use replaceAllUsesWith here to fully replace the old
  // comparison, but that's not immediately safe, since users of the old
  // comparison may not be dominated by the new comparison.  Instead, just
  // update the branch to use the new comparison; in the common case this
  // will make old comparison dead.
  BI->setCondition(Cond);
  RecursivelyDeleteTriviallyDeadInstructions(OrigCond);

  ++NumLFTR;
  Changed = true;
  return Cond;
}

/// RewriteLoopExitValues - Check to see if this loop has a computable
/// loop-invariant execution count.  If so, this means that we can compute the
/// final value of any expressions that are recurrent in the loop, and
/// substitute the exit values from the loop into any instructions outside of
/// the loop that use the final values of the current expressions.
///
/// This is mostly redundant with the regular IndVarSimplify activities that
/// happen later, except that it's more powerful in some cases, because it's
/// able to brute-force evaluate arbitrary instructions as long as they have
/// constant operands at the beginning of the loop.
void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
  // Verify the input to the pass is already in LCSSA form.
  assert(L->isLCSSAForm(*DT));

  SmallVector<BasicBlock*, 8> ExitBlocks;
  L->getUniqueExitBlocks(ExitBlocks);

  // Find all values that are computed inside the loop, but used outside of it.
  // Because of LCSSA, these values will only occur in LCSSA PHI Nodes.  Scan
  // the exit blocks of the loop to find them.
  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
    BasicBlock *ExitBB = ExitBlocks[i];

    // If there are no PHI nodes in this exit block, then no values defined
    // inside the loop are used on this path, skip it.
    PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
    if (!PN) continue;

    unsigned NumPreds = PN->getNumIncomingValues();

    // Iterate over all of the PHI nodes.
    BasicBlock::iterator BBI = ExitBB->begin();
    while ((PN = dyn_cast<PHINode>(BBI++))) {
      if (PN->use_empty())
        continue; // dead use, don't replace it

      // SCEV only supports integer expressions for now.
      if (!PN->getType()->isIntegerTy() && !PN->getType()->isPointerTy())
        continue;

      // It's necessary to tell ScalarEvolution about this explicitly so that
      // it can walk the def-use list and forget all SCEVs, as it may not be
      // watching the PHI itself.  Once the new exit value is in place, there
      // may not be a def-use connection between the loop and every instruction
      // which got a SCEVAddRecExpr for that loop.
      SE->forgetValue(PN);

      // Iterate over all of the values in all the PHI nodes.
      for (unsigned i = 0; i != NumPreds; ++i) {
        // If the value being merged in is not integer or is not defined
        // in the loop, skip it.
        Value *InVal = PN->getIncomingValue(i);
        if (!isa<Instruction>(InVal))
          continue;

        // If this pred is for a subloop, not L itself, skip it.
        if (LI->getLoopFor(PN->getIncomingBlock(i)) != L)
          continue; // The Block is in a subloop, skip it.

        // Check that InVal is defined in the loop.
        Instruction *Inst = cast<Instruction>(InVal);
        if (!L->contains(Inst))
          continue;

        // Okay, this instruction has a user outside of the current loop
        // and varies predictably *inside* the loop.  Evaluate the value it
        // contains when the loop exits, if possible.
        const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
        if (!SE->isLoopInvariant(ExitValue, L))
          continue;

        Changed = true;
        ++NumReplaced;

        Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst);

        DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n'
                     << "  LoopVal = " << *Inst << "\n");

        PN->setIncomingValue(i, ExitVal);

        // If this instruction is dead now, delete it.
        RecursivelyDeleteTriviallyDeadInstructions(Inst);

        if (NumPreds == 1) {
          // Completely replace a single-pred PHI. This is safe, because the
          // NewVal won't be variant in the loop, so we don't need an LCSSA phi
          // node anymore.
          PN->replaceAllUsesWith(ExitVal);
          RecursivelyDeleteTriviallyDeadInstructions(PN);
        }
      }
      if (NumPreds != 1) {
        // Clone the PHI and delete the original one. This lets IVUsers and
        // any other maps purge the original user from their records.
        PHINode *NewPN = cast<PHINode>(PN->clone());
        NewPN->takeName(PN);
        NewPN->insertBefore(PN);
        PN->replaceAllUsesWith(NewPN);
        PN->eraseFromParent();
      }
    }
  }

  // The insertion point instruction may have been deleted; clear it out
  // so that the rewriter doesn't trip over it later.
  Rewriter.clearInsertPoint();
}

void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) {
  // First step.  Check to see if there are any floating-point recurrences.
  // If there are, change them into integer recurrences, permitting analysis by
  // the SCEV routines.
+ // + BasicBlock *Header = L->getHeader(); + + SmallVector<WeakVH, 8> PHIs; + for (BasicBlock::iterator I = Header->begin(); + PHINode *PN = dyn_cast<PHINode>(I); ++I) + PHIs.push_back(PN); + + for (unsigned i = 0, e = PHIs.size(); i != e; ++i) + if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i])) + HandleFloatingPointIV(L, PN); + + // If the loop previously had floating-point IV, ScalarEvolution + // may not have been able to compute a trip count. Now that we've done some + // re-writing, the trip count may be computable. + if (Changed) + SE->forgetLoop(L); +} + +void IndVarSimplify::EliminateIVComparisons() { + SmallVector<WeakVH, 16> DeadInsts; + + // Look for ICmp users. + for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E; ++I) { + IVStrideUse &UI = *I; + ICmpInst *ICmp = dyn_cast<ICmpInst>(UI.getUser()); + if (!ICmp) continue; + + bool Swapped = UI.getOperandValToReplace() == ICmp->getOperand(1); + ICmpInst::Predicate Pred = ICmp->getPredicate(); + if (Swapped) Pred = ICmpInst::getSwappedPredicate(Pred); + + // Get the SCEVs for the ICmp operands. + const SCEV *S = IU->getReplacementExpr(UI); + const SCEV *X = SE->getSCEV(ICmp->getOperand(!Swapped)); + + // Simplify unnecessary loops away. + const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent()); + S = SE->getSCEVAtScope(S, ICmpLoop); + X = SE->getSCEVAtScope(X, ICmpLoop); + + // If the condition is always true or always false, replace it with + // a constant value. + if (SE->isKnownPredicate(Pred, S, X)) + ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); + else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) + ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); + else + continue; + + DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); + DeadInsts.push_back(ICmp); + } + + // Now that we're done iterating through lists, clean up any instructions + // which are now dead. 
+ while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); +} + +void IndVarSimplify::EliminateIVRemainders() { + SmallVector<WeakVH, 16> DeadInsts; + + // Look for SRem and URem users. + for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E; ++I) { + IVStrideUse &UI = *I; + BinaryOperator *Rem = dyn_cast<BinaryOperator>(UI.getUser()); + if (!Rem) continue; + + bool isSigned = Rem->getOpcode() == Instruction::SRem; + if (!isSigned && Rem->getOpcode() != Instruction::URem) + continue; + + // We're only interested in the case where we know something about + // the numerator. + if (UI.getOperandValToReplace() != Rem->getOperand(0)) + continue; + + // Get the SCEVs for the ICmp operands. + const SCEV *S = SE->getSCEV(Rem->getOperand(0)); + const SCEV *X = SE->getSCEV(Rem->getOperand(1)); + + // Simplify unnecessary loops away. + const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent()); + S = SE->getSCEVAtScope(S, ICmpLoop); + X = SE->getSCEVAtScope(X, ICmpLoop); + + // i % n --> i if i is in [0,n). + if ((!isSigned || SE->isKnownNonNegative(S)) && + SE->isKnownPredicate(isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, + S, X)) + Rem->replaceAllUsesWith(Rem->getOperand(0)); + else { + // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n). + const SCEV *LessOne = + SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); + if ((!isSigned || SE->isKnownNonNegative(LessOne)) && + SE->isKnownPredicate(isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, + LessOne, X)) { + ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, + Rem->getOperand(0), Rem->getOperand(1), + "tmp"); + SelectInst *Sel = + SelectInst::Create(ICmp, + ConstantInt::get(Rem->getType(), 0), + Rem->getOperand(0), "tmp", Rem); + Rem->replaceAllUsesWith(Sel); + } else + continue; + } + + // Inform IVUsers about the new users. 
+ if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) + IU->AddUsersIfInteresting(I); + + DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); + DeadInsts.push_back(Rem); + } + + // Now that we're done iterating through lists, clean up any instructions + // which are now dead. + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); +} + +bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { + // If LoopSimplify form is not available, stay out of trouble. Some notes: + // - LSR currently only supports LoopSimplify-form loops. Indvars' + // canonicalization can be a pessimization without LSR to "clean up" + // afterwards. + // - We depend on having a preheader; in particular, + // Loop::getCanonicalInductionVariable only supports loops with preheaders, + // and we're in trouble if we can't find the induction variable even when + // we've manually inserted one. + if (!L->isLoopSimplifyForm()) + return false; + + IU = &getAnalysis<IVUsers>(); + LI = &getAnalysis<LoopInfo>(); + SE = &getAnalysis<ScalarEvolution>(); + DT = &getAnalysis<DominatorTree>(); + Changed = false; + + // If there are any floating-point recurrences, attempt to + // transform them to use integer recurrences. + RewriteNonIntegerIVs(L); + + BasicBlock *ExitingBlock = L->getExitingBlock(); // may be null + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); + + // Create a rewriter object which we'll use to transform the code with. + SCEVExpander Rewriter(*SE); + + // Check to see if this loop has a computable loop-invariant execution count. + // If so, this means that we can compute the final value of any expressions + // that are recurrent in the loop, and substitute the exit values from the + // loop into any instructions outside of the loop that use the final values of + // the current expressions. 
+ // + if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) + RewriteLoopExitValues(L, Rewriter); + + // Simplify ICmp IV users. + EliminateIVComparisons(); + + // Simplify SRem and URem IV users. + EliminateIVRemainders(); + + // Compute the type of the largest recurrence expression, and decide whether + // a canonical induction variable should be inserted. + const Type *LargestType = 0; + bool NeedCannIV = false; + if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { + LargestType = BackedgeTakenCount->getType(); + LargestType = SE->getEffectiveSCEVType(LargestType); + // If we have a known trip count and a single exit block, we'll be + // rewriting the loop exit test condition below, which requires a + // canonical induction variable. + if (ExitingBlock) + NeedCannIV = true; + } + for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) { + const Type *Ty = + SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType()); + if (!LargestType || + SE->getTypeSizeInBits(Ty) > + SE->getTypeSizeInBits(LargestType)) + LargestType = Ty; + NeedCannIV = true; + } + + // Now that we know the largest of the induction variable expressions + // in this loop, insert a canonical induction variable of the largest size. + PHINode *IndVar = 0; + if (NeedCannIV) { + // Check to see if the loop already has any canonical-looking induction + // variables. If any are present and wider than the planned canonical + // induction variable, temporarily remove them, so that the Rewriter + // doesn't attempt to reuse them. 
+ SmallVector<PHINode *, 2> OldCannIVs; + while (PHINode *OldCannIV = L->getCanonicalInductionVariable()) { + if (SE->getTypeSizeInBits(OldCannIV->getType()) > + SE->getTypeSizeInBits(LargestType)) + OldCannIV->removeFromParent(); + else + break; + OldCannIVs.push_back(OldCannIV); + } + + IndVar = Rewriter.getOrInsertCanonicalInductionVariable(L, LargestType); + + ++NumInserted; + Changed = true; + DEBUG(dbgs() << "INDVARS: New CanIV: " << *IndVar << '\n'); + + // Now that the official induction variable is established, reinsert + // any old canonical-looking variables after it so that the IR remains + // consistent. They will be deleted as part of the dead-PHI deletion at + // the end of the pass. + while (!OldCannIVs.empty()) { + PHINode *OldCannIV = OldCannIVs.pop_back_val(); + OldCannIV->insertBefore(L->getHeader()->getFirstNonPHI()); + } + } + + // If we have a trip count expression, rewrite the loop's exit condition + // using it. We can currently only handle loops with a single exit. + ICmpInst *NewICmp = 0; + if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && + !BackedgeTakenCount->isZero() && + ExitingBlock) { + assert(NeedCannIV && + "LinearFunctionTestReplace requires a canonical induction variable"); + // Can't rewrite non-branch yet. + if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator())) + NewICmp = LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, + ExitingBlock, BI, Rewriter); + } + + // Rewrite IV-derived expressions. Clears the rewriter cache. + RewriteIVExpressions(L, Rewriter); + + // The Rewriter may not be used from this point on. + + // Loop-invariant instructions in the preheader that aren't used in the + // loop may be sunk below the loop to reduce register pressure. + SinkUnusedInvariants(L); + + // For completeness, inform IVUsers of the IV use in the newly-created + // loop exit test instruction. 
+  if (NewICmp)
+    IU->AddUsersIfInteresting(cast<Instruction>(NewICmp->getOperand(0)));
+
+  // Clean up dead instructions.
+  Changed |= DeleteDeadPHIs(L->getHeader());
+  // Check a post-condition.
+  assert(L->isLCSSAForm(*DT) && "Indvars did not leave the loop in lcssa form!");
+  return Changed;
+}
+
+// FIXME: It is an extremely bad idea to indvar substitute anything more
+// complex than affine induction variables. Doing so will put expensive
+// polynomial evaluations inside of the loop, and the str reduction pass
+// currently can only reduce affine polynomials. For now just disable
+// indvar subst on anything more complex than an affine addrec, unless
+// it can be expanded to a trivial value.
+static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) {
+  // Loop-invariant values are safe.
+  if (SE->isLoopInvariant(S, L)) return true;
+
+  // Affine addrecs are safe. Non-affine are not, because LSR doesn't know how
+  // to transform them into efficient code.
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
+    return AR->isAffine();
+
+  // A commutative expression (e.g. an add or a mul) is safe if all of its
+  // operands are safe.
+  if (const SCEVCommutativeExpr *Commutative = dyn_cast<SCEVCommutativeExpr>(S)) {
+    for (SCEVCommutativeExpr::op_iterator I = Commutative->op_begin(),
+         E = Commutative->op_end(); I != E; ++I)
+      if (!isSafe(*I, L, SE)) return false;
+    return true;
+  }
+
+  // A cast is safe if its operand is.
+  if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
+    return isSafe(C->getOperand(), L, SE);
+
+  // A udiv is safe if its operands are.
+  if (const SCEVUDivExpr *UD = dyn_cast<SCEVUDivExpr>(S))
+    return isSafe(UD->getLHS(), L, SE) &&
+           isSafe(UD->getRHS(), L, SE);
+
+  // SCEVUnknown is always safe.
+  if (isa<SCEVUnknown>(S))
+    return true;
+
+  // Nothing else is safe.
+ return false; +} + +void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { + SmallVector<WeakVH, 16> DeadInsts; + + // Rewrite all induction variable expressions in terms of the canonical + // induction variable. + // + // If there were induction variables of other sizes or offsets, manually + // add the offsets to the primary induction variable and cast, avoiding + // the need for the code evaluation methods to insert induction variables + // of different sizes. + for (IVUsers::iterator UI = IU->begin(), E = IU->end(); UI != E; ++UI) { + Value *Op = UI->getOperandValToReplace(); + const Type *UseTy = Op->getType(); + Instruction *User = UI->getUser(); + + // Compute the final addrec to expand into code. + const SCEV *AR = IU->getReplacementExpr(*UI); + + // Evaluate the expression out of the loop, if possible. + if (!L->contains(UI->getUser())) { + const SCEV *ExitVal = SE->getSCEVAtScope(AR, L->getParentLoop()); + if (SE->isLoopInvariant(ExitVal, L)) + AR = ExitVal; + } + + // FIXME: It is an extremely bad idea to indvar substitute anything more + // complex than affine induction variables. Doing so will put expensive + // polynomial evaluations inside of the loop, and the str reduction pass + // currently can only reduce affine polynomials. For now just disable + // indvar subst on anything more complex than an affine addrec, unless + // it can be expanded to a trivial value. + if (!isSafe(AR, L, SE)) + continue; + + // Determine the insertion point for this user. By default, insert + // immediately before the user. The SCEVExpander class will automatically + // hoist loop invariants out of the loop. For PHI nodes, there may be + // multiple uses, so compute the nearest common dominator for the + // incoming blocks. 
+ Instruction *InsertPt = User; + if (PHINode *PHI = dyn_cast<PHINode>(InsertPt)) + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) + if (PHI->getIncomingValue(i) == Op) { + if (InsertPt == User) + InsertPt = PHI->getIncomingBlock(i)->getTerminator(); + else + InsertPt = + DT->findNearestCommonDominator(InsertPt->getParent(), + PHI->getIncomingBlock(i)) + ->getTerminator(); + } + + // Now expand it into actual Instructions and patch it into place. + Value *NewVal = Rewriter.expandCodeFor(AR, UseTy, InsertPt); + + // Inform ScalarEvolution that this value is changing. The change doesn't + // affect its value, but it does potentially affect which use lists the + // value will be on after the replacement, which affects ScalarEvolution's + // ability to walk use lists and drop dangling pointers when a value is + // deleted. + SE->forgetValue(User); + + // Patch the new value into place. + if (Op->hasName()) + NewVal->takeName(Op); + User->replaceUsesOfWith(Op, NewVal); + UI->setOperandValToReplace(NewVal); + DEBUG(dbgs() << "INDVARS: Rewrote IV '" << *AR << "' " << *Op << '\n' + << " into = " << *NewVal << "\n"); + ++NumRemoved; + Changed = true; + + // The old value may be dead now. + DeadInsts.push_back(Op); + } + + // Clear the rewriter cache, because values that are in the rewriter's cache + // can be deleted in the loop below, causing the AssertingVH in the cache to + // trigger. + Rewriter.clear(); + // Now that we're done iterating through lists, clean up any instructions + // which are now dead. + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); +} + +/// If there's a single exit block, sink any loop-invariant values that +/// were defined in the preheader but not used inside the loop into the +/// exit block to reduce register pressure in the loop. 
+void IndVarSimplify::SinkUnusedInvariants(Loop *L) { + BasicBlock *ExitBlock = L->getExitBlock(); + if (!ExitBlock) return; + + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) return; + + Instruction *InsertPt = ExitBlock->getFirstNonPHI(); + BasicBlock::iterator I = Preheader->getTerminator(); + while (I != Preheader->begin()) { + --I; + // New instructions were inserted at the end of the preheader. + if (isa<PHINode>(I)) + break; + + // Don't move instructions which might have side effects, since the side + // effects need to complete before instructions inside the loop. Also don't + // move instructions which might read memory, since the loop may modify + // memory. Note that it's okay if the instruction might have undefined + // behavior: LoopSimplify guarantees that the preheader dominates the exit + // block. + if (I->mayHaveSideEffects() || I->mayReadFromMemory()) + continue; + + // Skip debug info intrinsics. + if (isa<DbgInfoIntrinsic>(I)) + continue; + + // Don't sink static AllocaInsts out of the entry block, which would + // turn them into dynamic allocas! + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) + if (AI->isStaticAlloca()) + continue; + + // Determine if there is a use in or before the loop (direct or + // otherwise). + bool UsedInLoop = false; + for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); + UI != UE; ++UI) { + User *U = *UI; + BasicBlock *UseBB = cast<Instruction>(U)->getParent(); + if (PHINode *P = dyn_cast<PHINode>(U)) { + unsigned i = + PHINode::getIncomingValueNumForOperand(UI.getOperandNo()); + UseBB = P->getIncomingBlock(i); + } + if (UseBB == Preheader || L->contains(UseBB)) { + UsedInLoop = true; + break; + } + } + + // If there is, the def must remain in the preheader. + if (UsedInLoop) + continue; + + // Otherwise, sink it to the exit block. + Instruction *ToMove = I; + bool Done = false; + + if (I != Preheader->begin()) { + // Skip debug info intrinsics. 
+      do {
+        --I;
+      } while (isa<DbgInfoIntrinsic>(I) && I != Preheader->begin());
+
+      // If we walked all the way back to a debug intrinsic at the very start
+      // of the preheader, there are no more real candidates; finish after
+      // performing this move.
+      if (isa<DbgInfoIntrinsic>(I) && I == Preheader->begin())
+        Done = true;
+    } else {
+      Done = true;
+    }
+
+    ToMove->moveBefore(InsertPt);
+    if (Done) break;
+    // Insert each subsequently-sunk instruction above the previously moved
+    // one, preserving the instructions' original relative order.
+    InsertPt = ToMove;
+  }
+}
+
+/// ConvertToSInt - Convert APF to an integer, if possible.  Returns true and
+/// stores the result in IntVal if and only if APF is exactly representable
+/// as a signed 64-bit integer; PPC double-double values are rejected
+/// outright.
+static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
+  bool isExact = false;
+  if (&APF.getSemantics() == &APFloat::PPCDoubleDouble)
+    return false;
+  // See if we can convert this to an int64_t: 64 bits, signed, rounding
+  // toward zero.  isExact reports whether the value was integral.
+  uint64_t UIntVal;
+  if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero,
+                           &isExact) != APFloat::opOK || !isExact)
+    return false;
+  // convertToInteger wrote the signed result's bits into UIntVal;
+  // reinterpret them as a signed value.
+  IntVal = UIntVal;
+  return true;
+}
+
+/// HandleFloatingPointIV - If the loop has floating induction variable
+/// then insert corresponding integer induction variable if possible.
+/// For example,
+/// for(double i = 0; i < 10000; ++i)
+///   bar(i)
+/// is converted into
+/// for(int i = 0; i < 10000; ++i)
+///   bar((double)i);
+///
+void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
+  // contains() yields 0 or 1, so IncomingEdge indexes the PHI value coming
+  // from outside the loop and BackEdge the value coming around the backedge.
+  unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));
+  unsigned BackEdge     = IncomingEdge^1;
+
+  // Check incoming value.
+  ConstantFP *InitValueVal =
+    dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge));
+
+  int64_t InitValue;
+  if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue))
+    return;
+
+  // Check IV increment. Reject this PN if increment operation is not
+  // an add or increment value can not be represented by an integer.
+  BinaryOperator *Incr =
+    dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge));
+  if (Incr == 0 || Incr->getOpcode() != Instruction::FAdd) return;
+
+  // If this is not an add of the PHI with a constantfp, or if the constant fp
+  // is not an integer, bail out.
+ ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1)); + int64_t IncValue; + if (IncValueVal == 0 || Incr->getOperand(0) != PN || + !ConvertToSInt(IncValueVal->getValueAPF(), IncValue)) + return; + + // Check Incr uses. One user is PN and the other user is an exit condition + // used by the conditional terminator. + Value::use_iterator IncrUse = Incr->use_begin(); + Instruction *U1 = cast<Instruction>(*IncrUse++); + if (IncrUse == Incr->use_end()) return; + Instruction *U2 = cast<Instruction>(*IncrUse++); + if (IncrUse != Incr->use_end()) return; + + // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't + // only used by a branch, we can't transform it. + FCmpInst *Compare = dyn_cast<FCmpInst>(U1); + if (!Compare) + Compare = dyn_cast<FCmpInst>(U2); + if (Compare == 0 || !Compare->hasOneUse() || + !isa<BranchInst>(Compare->use_back())) + return; + + BranchInst *TheBr = cast<BranchInst>(Compare->use_back()); + + // We need to verify that the branch actually controls the iteration count + // of the loop. If not, the new IV can overflow and no one will notice. + // The branch block must be in the loop and one of the successors must be out + // of the loop. + assert(TheBr->isConditional() && "Can't use fcmp if not conditional"); + if (!L->contains(TheBr->getParent()) || + (L->contains(TheBr->getSuccessor(0)) && + L->contains(TheBr->getSuccessor(1)))) + return; + + + // If it isn't a comparison with an integer-as-fp (the exit value), we can't + // transform it. + ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1)); + int64_t ExitValue; + if (ExitValueVal == 0 || + !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue)) + return; + + // Find new predicate for integer comparison. + CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE; + switch (Compare->getPredicate()) { + default: return; // Unknown comparison. 
+ case CmpInst::FCMP_OEQ: + case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break; + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break; + case CmpInst::FCMP_OGT: + case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break; + case CmpInst::FCMP_OGE: + case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break; + case CmpInst::FCMP_OLT: + case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break; + case CmpInst::FCMP_OLE: + case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break; + } + + // We convert the floating point induction variable to a signed i32 value if + // we can. This is only safe if the comparison will not overflow in a way + // that won't be trapped by the integer equivalent operations. Check for this + // now. + // TODO: We could use i64 if it is native and the range requires it. + + // The start/stride/exit values must all fit in signed i32. + if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue)) + return; + + // If not actually striding (add x, 0.0), avoid touching the code. + if (IncValue == 0) + return; + + // Positive and negative strides have different safety conditions. + if (IncValue > 0) { + // If we have a positive stride, we require the init to be less than the + // exit value and an equality or less than comparison. + if (InitValue >= ExitValue || + NewPred == CmpInst::ICMP_SGT || NewPred == CmpInst::ICMP_SGE) + return; + + uint32_t Range = uint32_t(ExitValue-InitValue); + if (NewPred == CmpInst::ICMP_SLE) { + // Normalize SLE -> SLT, check for infinite loop. + if (++Range == 0) return; // Range overflows. + } + + unsigned Leftover = Range % uint32_t(IncValue); + + // If this is an equality comparison, we require that the strided value + // exactly land on the exit value, otherwise the IV condition will wrap + // around and do things the fp IV wouldn't. 
+ if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) && + Leftover != 0) + return; + + // If the stride would wrap around the i32 before exiting, we can't + // transform the IV. + if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue) + return; + + } else { + // If we have a negative stride, we require the init to be greater than the + // exit value and an equality or greater than comparison. + if (InitValue >= ExitValue || + NewPred == CmpInst::ICMP_SLT || NewPred == CmpInst::ICMP_SLE) + return; + + uint32_t Range = uint32_t(InitValue-ExitValue); + if (NewPred == CmpInst::ICMP_SGE) { + // Normalize SGE -> SGT, check for infinite loop. + if (++Range == 0) return; // Range overflows. + } + + unsigned Leftover = Range % uint32_t(-IncValue); + + // If this is an equality comparison, we require that the strided value + // exactly land on the exit value, otherwise the IV condition will wrap + // around and do things the fp IV wouldn't. + if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) && + Leftover != 0) + return; + + // If the stride would wrap around the i32 before exiting, we can't + // transform the IV. + if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue) + return; + } + + const IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext()); + + // Insert new integer induction variable. + PHINode *NewPHI = PHINode::Create(Int32Ty, PN->getName()+".int", PN); + NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue), + PN->getIncomingBlock(IncomingEdge)); + + Value *NewAdd = + BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue), + Incr->getName()+".int", Incr); + NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge)); + + ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd, + ConstantInt::get(Int32Ty, ExitValue), + Compare->getName()); + + // In the following deletions, PN may become dead and may be deleted. + // Use a WeakVH to observe whether this happens. 
+ WeakVH WeakPH = PN; + + // Delete the old floating point exit comparison. The branch starts using the + // new comparison. + NewCompare->takeName(Compare); + Compare->replaceAllUsesWith(NewCompare); + RecursivelyDeleteTriviallyDeadInstructions(Compare); + + // Delete the old floating point increment. + Incr->replaceAllUsesWith(UndefValue::get(Incr->getType())); + RecursivelyDeleteTriviallyDeadInstructions(Incr); + + // If the FP induction variable still has uses, this is because something else + // in the loop uses its value. In order to canonicalize the induction + // variable, we chose to eliminate the IV and rewrite it in terms of an + // int->fp cast. + // + // We give preference to sitofp over uitofp because it is faster on most + // platforms. + if (WeakPH) { + Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", + PN->getParent()->getFirstNonPHI()); + PN->replaceAllUsesWith(Conv); + RecursivelyDeleteTriviallyDeadInstructions(PN); + } + + // Add a new IVUsers entry for the newly-created integer PHI. + IU->AddUsersIfInteresting(NewPHI); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp new file mode 100644 index 0000000..90094a8 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -0,0 +1,1576 @@ +//===- JumpThreading.cpp - Thread control through conditional blocks ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Jump Threading pass. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "jump-threading" +#include "llvm/Transforms/Scalar.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Target/TargetData.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ValueHandle.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +STATISTIC(NumThreads, "Number of jumps threaded"); +STATISTIC(NumFolds, "Number of terminators folded"); +STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi"); + +static cl::opt<unsigned> +Threshold("jump-threading-threshold", + cl::desc("Max block size to duplicate for jump threading"), + cl::init(6), cl::Hidden); + +namespace { + // These are at global scope so static functions can use them too. + typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo; + typedef SmallVector<std::pair<Constant*, BasicBlock*>, 8> PredValueInfoTy; + + // This is used to keep track of what kind of constant we're currently hoping + // to find. + enum ConstantPreference { + WantInteger, + WantBlockAddress + }; + + /// This pass performs 'jump threading', which looks at blocks that have + /// multiple predecessors and multiple successors. 
If one or more of the + /// predecessors of the block can be proven to always jump to one of the + /// successors, we forward the edge from the predecessor to the successor by + /// duplicating the contents of this block. + /// + /// An example of when this can occur is code like this: + /// + /// if () { ... + /// X = 4; + /// } + /// if (X < 3) { + /// + /// In this case, the unconditional branch at the end of the first if can be + /// revectored to the false side of the second if. + /// + class JumpThreading : public FunctionPass { + TargetData *TD; + LazyValueInfo *LVI; +#ifdef NDEBUG + SmallPtrSet<BasicBlock*, 16> LoopHeaders; +#else + SmallSet<AssertingVH<BasicBlock>, 16> LoopHeaders; +#endif + DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet; + + // RAII helper for updating the recursion stack. + struct RecursionSetRemover { + DenseSet<std::pair<Value*, BasicBlock*> > &TheSet; + std::pair<Value*, BasicBlock*> ThePair; + + RecursionSetRemover(DenseSet<std::pair<Value*, BasicBlock*> > &S, + std::pair<Value*, BasicBlock*> P) + : TheSet(S), ThePair(P) { } + + ~RecursionSetRemover() { + TheSet.erase(ThePair); + } + }; + public: + static char ID; // Pass identification + JumpThreading() : FunctionPass(ID) { + initializeJumpThreadingPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LazyValueInfo>(); + AU.addPreserved<LazyValueInfo>(); + } + + void FindLoopHeaders(Function &F); + bool ProcessBlock(BasicBlock *BB); + bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs, + BasicBlock *SuccBB); + bool DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, + const SmallVectorImpl<BasicBlock *> &PredBBs); + + bool ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, + PredValueInfo &Result, + ConstantPreference Preference); + bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB, + ConstantPreference Preference); + + bool 
ProcessBranchOnPHI(PHINode *PN); + bool ProcessBranchOnXOR(BinaryOperator *BO); + + bool SimplifyPartiallyRedundantLoad(LoadInst *LI); + }; +} + +char JumpThreading::ID = 0; +INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading", + "Jump Threading", false, false) +INITIALIZE_PASS_DEPENDENCY(LazyValueInfo) +INITIALIZE_PASS_END(JumpThreading, "jump-threading", + "Jump Threading", false, false) + +// Public interface to the Jump Threading pass +FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); } + +/// runOnFunction - Top level algorithm. +/// +bool JumpThreading::runOnFunction(Function &F) { + DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); + TD = getAnalysisIfAvailable<TargetData>(); + LVI = &getAnalysis<LazyValueInfo>(); + + FindLoopHeaders(F); + + bool Changed, EverChanged = false; + do { + Changed = false; + for (Function::iterator I = F.begin(), E = F.end(); I != E;) { + BasicBlock *BB = I; + // Thread all of the branches we can over this block. + while (ProcessBlock(BB)) + Changed = true; + + ++I; + + // If the block is trivially dead, zap it. This eliminates the successor + // edges which simplifies the CFG. + if (pred_begin(BB) == pred_end(BB) && + BB != &BB->getParent()->getEntryBlock()) { + DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName() + << "' with terminator: " << *BB->getTerminator() << '\n'); + LoopHeaders.erase(BB); + LVI->eraseBlock(BB); + DeleteDeadBlock(BB); + Changed = true; + continue; + } + + BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); + + // Can't thread an unconditional jump, but if the block is "almost + // empty", we can replace uses of it with uses of the successor and make + // this dead. + if (BI && BI->isUnconditional() && + BB != &BB->getParent()->getEntryBlock() && + // If the terminator is the only non-phi instruction, try to nuke it. 
          BB->getFirstNonPHIOrDbg()->isTerminator()) {
        // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the
        // block, we have to make sure it isn't in the LoopHeaders set.  We
        // reinsert afterward if needed.
        bool ErasedFromLoopHeaders = LoopHeaders.erase(BB);
        BasicBlock *Succ = BI->getSuccessor(0);

        // FIXME: It is always conservatively correct to drop the info
        // for a block even if it doesn't get erased.  This isn't totally
        // awesome, but it allows us to use AssertingVH to prevent nasty
        // dangling pointer issues within LazyValueInfo.
        LVI->eraseBlock(BB);
        if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) {
          Changed = true;
          // If we deleted BB and BB was the header of a loop, then the
          // successor is now the header of the loop.
          BB = Succ;
        }

        if (ErasedFromLoopHeaders)
          LoopHeaders.insert(BB);
      }
    }
    EverChanged |= Changed;
  } while (Changed);

  LoopHeaders.clear();
  return EverChanged;
}

/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
/// thread across it.  The cost is an approximate instruction count, with calls
/// weighted heavier and free-to-duplicate instructions skipped.
static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) {
  /// Ignore PHI nodes, these will be flattened when duplication happens.
  BasicBlock::const_iterator I = BB->getFirstNonPHI();

  // FIXME: THREADING will delete values that are just used to compute the
  // branch, so they shouldn't count against the duplication cost.


  // Sum up the cost of each instruction until we get to the terminator.  Don't
  // include the terminator because the copy won't include it.
  unsigned Size = 0;
  for (; !isa<TerminatorInst>(I); ++I) {
    // Debugger intrinsics don't incur code size.
    if (isa<DbgInfoIntrinsic>(I)) continue;

    // If this is a pointer->pointer bitcast, it is free.
    if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
      continue;

    // All other instructions count for at least one unit.
    ++Size;

    // Calls are more expensive.  If they are non-intrinsic calls, we model them
    // as having cost of 4.  If they are a non-vector intrinsic, we model them
    // as having cost of 2 total, and if they are a vector intrinsic, we model
    // them as having cost 1.
    if (const CallInst *CI = dyn_cast<CallInst>(I)) {
      if (!isa<IntrinsicInst>(CI))
        Size += 3;
      else if (!CI->getType()->isVectorTy())
        Size += 1;
    }
  }

  // Threading through a switch statement is particularly profitable.  If this
  // block ends in a switch, decrease its cost to make it more likely to happen.
  if (isa<SwitchInst>(I))
    Size = Size > 6 ? Size-6 : 0;

  // The same holds for indirect branches, but slightly more so.
  if (isa<IndirectBrInst>(I))
    Size = Size > 8 ? Size-8 : 0;

  return Size;
}

/// FindLoopHeaders - We do not want jump threading to turn proper loop
/// structures into irreducible loops.  Doing this breaks up the loop nesting
/// hierarchy and pessimizes later transformations.  To prevent this from
/// happening, we first have to find the loop headers.  Here we approximate this
/// by finding targets of backedges in the CFG.
///
/// Note that there definitely are cases when we want to allow threading of
/// edges across a loop header.  For example, threading a jump from outside the
/// loop (the preheader) to an exit block of the loop is definitely profitable.
/// It is also almost always profitable to thread backedges from within the loop
/// to exit blocks, and is often profitable to thread backedges to other blocks
/// within the loop (forming a nested loop).  This simple analysis is not rich
/// enough to track all of these properties and keep it up-to-date as the CFG
/// mutates, so we don't allow any of these transformations.
///
void JumpThreading::FindLoopHeaders(Function &F) {
  SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
  FindFunctionBackedges(F, Edges);

  // Each backedge's target is treated as a loop header.
  for (unsigned i = 0, e = Edges.size(); i != e; ++i)
    LoopHeaders.insert(const_cast<BasicBlock*>(Edges[i].second));
}

/// getKnownConstant - Helper method to determine if we can thread over a
/// terminator with the given value as its condition, and if so what value to
/// use for that. What kind of value this is depends on whether we want an
/// integer or a block address, but an undef is always accepted.
/// Returns null if Val is null or not an appropriate constant.
static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
  if (!Val)
    return 0;

  // Undef is "known" enough.
  if (UndefValue *U = dyn_cast<UndefValue>(Val))
    return U;

  if (Preference == WantBlockAddress)
    return dyn_cast<BlockAddress>(Val->stripPointerCasts());

  return dyn_cast<ConstantInt>(Val);
}

/// ComputeValueKnownInPredecessors - Given a basic block BB and a value V, see
/// if we can infer that the value is a known ConstantInt/BlockAddress or undef
/// in any of our predecessors.  If so, return the known list of value and pred
/// BB in the result vector.
///
/// This returns true if there were any known values.
///
bool JumpThreading::
ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
                                ConstantPreference Preference) {
  // This method walks up use-def chains recursively.  Because of this, we could
  // get into an infinite loop going around loops in the use-def chain.  To
  // prevent this, keep track of what (value, block) pairs we've already visited
  // and terminate the search if we loop back to them.
  if (!RecursionSet.insert(std::make_pair(V, BB)).second)
    return false;

  // An RAII helper to remove this pair from the recursion set once the
  // recursion stack pops back out again.
  RecursionSetRemover remover(RecursionSet, std::make_pair(V, BB));

  // If V is a constant, then it is known in all predecessors.
  if (Constant *KC = getKnownConstant(V, Preference)) {
    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
      Result.push_back(std::make_pair(KC, *PI));

    return true;
  }

  // If V is a non-instruction value, or an instruction in a different block,
  // then it can't be derived from a PHI.
  Instruction *I = dyn_cast<Instruction>(V);
  if (I == 0 || I->getParent() != BB) {

    // Okay, if this is a live-in value, see if it has a known value at the end
    // of any of our predecessors.
    //
    // FIXME: This should be an edge property, not a block end property.
    /// TODO: Per PR2563, we could infer value range information about a
    /// predecessor based on its terminator.
    //
    // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if
    // "I" is a non-local compare-with-a-constant instruction.  This would be
    // able to handle value inequalities better, for example if the compare is
    // "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
    // Perhaps getConstantOnEdge should be smart enough to do this?

    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
      BasicBlock *P = *PI;
      // If the value is known by LazyValueInfo to be a constant in a
      // predecessor, use that information to try to thread this block.
      Constant *PredCst = LVI->getConstantOnEdge(V, P, BB);
      if (Constant *KC = getKnownConstant(PredCst, Preference))
        Result.push_back(std::make_pair(KC, P));
    }

    return !Result.empty();
  }

  /// If I is a PHI node, then we know the incoming values for any constants.
  if (PHINode *PN = dyn_cast<PHINode>(I)) {
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
      Value *InVal = PN->getIncomingValue(i);
      if (Constant *KC = getKnownConstant(InVal, Preference)) {
        Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
      } else {
        // Non-constant incoming value: ask LVI whether it is constant on
        // the incoming edge.
        Constant *CI = LVI->getConstantOnEdge(InVal,
                                              PN->getIncomingBlock(i), BB);
        if (Constant *KC = getKnownConstant(CI, Preference))
          Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
      }
    }

    return !Result.empty();
  }

  PredValueInfoTy LHSVals, RHSVals;

  // Handle some boolean conditions.
  if (I->getType()->getPrimitiveSizeInBits() == 1) {
    assert(Preference == WantInteger && "One-bit non-integer type?");
    // X | true -> true
    // X & false -> false
    if (I->getOpcode() == Instruction::Or ||
        I->getOpcode() == Instruction::And) {
      ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
                                      WantInteger);
      ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals,
                                      WantInteger);

      if (LHSVals.empty() && RHSVals.empty())
        return false;

      // The value that short-circuits the operator: true for Or, false for
      // And.
      ConstantInt *InterestingVal;
      if (I->getOpcode() == Instruction::Or)
        InterestingVal = ConstantInt::getTrue(I->getContext());
      else
        InterestingVal = ConstantInt::getFalse(I->getContext());

      SmallPtrSet<BasicBlock*, 4> LHSKnownBBs;

      // Scan for the sentinel.  If we find an undef, force it to the
      // interesting value: x|undef -> true and x&undef -> false.
      for (unsigned i = 0, e = LHSVals.size(); i != e; ++i)
        if (LHSVals[i].first == InterestingVal ||
            isa<UndefValue>(LHSVals[i].first)) {
          Result.push_back(LHSVals[i]);
          Result.back().first = InterestingVal;
          LHSKnownBBs.insert(LHSVals[i].second);
        }
      for (unsigned i = 0, e = RHSVals.size(); i != e; ++i)
        if (RHSVals[i].first == InterestingVal ||
            isa<UndefValue>(RHSVals[i].first)) {
          // If we already inferred a value for this block on the LHS, don't
          // re-add it.
          if (!LHSKnownBBs.count(RHSVals[i].second)) {
            Result.push_back(RHSVals[i]);
            Result.back().first = InterestingVal;
          }
        }

      return !Result.empty();
    }

    // Handle the NOT form of XOR.
    if (I->getOpcode() == Instruction::Xor &&
        isa<ConstantInt>(I->getOperand(1)) &&
        cast<ConstantInt>(I->getOperand(1))->isOne()) {
      ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result,
                                      WantInteger);
      if (Result.empty())
        return false;

      // Invert the known values.
      for (unsigned i = 0, e = Result.size(); i != e; ++i)
        Result[i].first = ConstantExpr::getNot(Result[i].first);

      return true;
    }

  // Try to simplify some other binary operator values.
  } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
    assert(Preference != WantBlockAddress
            && "A binary operator creating a block address?");
    if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
      PredValueInfoTy LHSVals;
      ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals,
                                      WantInteger);

      // Try to use constant folding to simplify the binary operator.
      for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) {
        Constant *V = LHSVals[i].first;
        Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI);

        if (Constant *KC = getKnownConstant(Folded, WantInteger))
          Result.push_back(std::make_pair(KC, LHSVals[i].second));
      }
    }

    return !Result.empty();
  }

  // Handle compare with phi operand, where the PHI is defined in this block.
  if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
    assert(Preference == WantInteger && "Compares only produce integers");
    PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0));
    if (PN && PN->getParent() == BB) {
      // We can do this simplification if any comparisons fold to true or false.
      // See if any do.
      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
        BasicBlock *PredBB = PN->getIncomingBlock(i);
        Value *LHS = PN->getIncomingValue(i);
        Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB);

        Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, TD);
        if (Res == 0) {
          // Instruction simplification failed; fall back to asking LVI about
          // the predicate on the incoming edge.
          if (!isa<Constant>(RHS))
            continue;

          LazyValueInfo::Tristate
            ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS,
                                           cast<Constant>(RHS), PredBB, BB);
          if (ResT == LazyValueInfo::Unknown)
            continue;
          Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
        }

        if (Constant *KC = getKnownConstant(Res, WantInteger))
          Result.push_back(std::make_pair(KC, PredBB));
      }

      return !Result.empty();
    }


    // If comparing a live-in value against a constant, see if we know the
    // live-in value on any predecessors.
    if (isa<Constant>(Cmp->getOperand(1)) && Cmp->getType()->isIntegerTy()) {
      if (!isa<Instruction>(Cmp->getOperand(0)) ||
          cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) {
        Constant *RHSCst = cast<Constant>(Cmp->getOperand(1));

        for (pred_iterator PI = pred_begin(BB), E = pred_end(BB);PI != E; ++PI){
          BasicBlock *P = *PI;
          // If the value is known by LazyValueInfo to be a constant in a
          // predecessor, use that information to try to thread this block.
          LazyValueInfo::Tristate Res =
            LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0),
                                    RHSCst, P, BB);
          if (Res == LazyValueInfo::Unknown)
            continue;

          Constant *ResC = ConstantInt::get(Cmp->getType(), Res);
          Result.push_back(std::make_pair(ResC, P));
        }

        return !Result.empty();
      }

      // Try to find a constant value for the LHS of a comparison,
      // and evaluate it statically if we can.
      if (Constant *CmpConst = dyn_cast<Constant>(Cmp->getOperand(1))) {
        PredValueInfoTy LHSVals;
        ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
                                        WantInteger);

        for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) {
          Constant *V = LHSVals[i].first;
          Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(),
                                                      V, CmpConst);
          if (Constant *KC = getKnownConstant(Folded, WantInteger))
            Result.push_back(std::make_pair(KC, LHSVals[i].second));
        }

        return !Result.empty();
      }
    }
  }

  if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
    // Handle select instructions where at least one operand is a known constant
    // and we can figure out the condition value for any predecessor block.
    Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference);
    Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference);
    PredValueInfoTy Conds;
    if ((TrueVal || FalseVal) &&
        ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds,
                                        WantInteger)) {
      for (unsigned i = 0, e = Conds.size(); i != e; ++i) {
        Constant *Cond = Conds[i].first;

        // Figure out what value to use for the condition.
        bool KnownCond;
        if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond)) {
          // A known boolean.
          KnownCond = CI->isOne();
        } else {
          assert(isa<UndefValue>(Cond) && "Unexpected condition value");
          // Either operand will do, so be sure to pick the one that's a known
          // constant.
          // FIXME: Do this more cleverly if both values are known constants?
          KnownCond = (TrueVal != 0);
        }

        // See if the select has a known constant value for this predecessor.
        if (Constant *Val = KnownCond ? TrueVal : FalseVal)
          Result.push_back(std::make_pair(Val, Conds[i].second));
      }

      return !Result.empty();
    }
  }

  // If all else fails, see if LVI can figure out a constant value for us.
+ Constant *CI = LVI->getConstant(V, BB); + if (Constant *KC = getKnownConstant(CI, Preference)) { + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) + Result.push_back(std::make_pair(KC, *PI)); + } + + return !Result.empty(); +} + + + +/// GetBestDestForBranchOnUndef - If we determine that the specified block ends +/// in an undefined jump, decide which block is best to revector to. +/// +/// Since we can pick an arbitrary destination, we pick the successor with the +/// fewest predecessors. This should reduce the in-degree of the others. +/// +static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) { + TerminatorInst *BBTerm = BB->getTerminator(); + unsigned MinSucc = 0; + BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc); + // Compute the successor with the minimum number of predecessors. + unsigned MinNumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB)); + for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) { + TestBB = BBTerm->getSuccessor(i); + unsigned NumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB)); + if (NumPreds < MinNumPreds) + MinSucc = i; + } + + return MinSucc; +} + +static bool hasAddressTakenAndUsed(BasicBlock *BB) { + if (!BB->hasAddressTaken()) return false; + + // If the block has its address taken, it may be a tree of dead constants + // hanging off of it. These shouldn't keep the block alive. + BlockAddress *BA = BlockAddress::get(BB); + BA->removeDeadConstantUsers(); + return !BA->use_empty(); +} + +/// ProcessBlock - If there are any predecessors whose control can be threaded +/// through to a successor, transform them now. +bool JumpThreading::ProcessBlock(BasicBlock *BB) { + // If the block is trivially dead, just return and let the caller nuke it. + // This simplifies other transformations. 
  if (pred_begin(BB) == pred_end(BB) &&
      BB != &BB->getParent()->getEntryBlock())
    return false;

  // If this block has a single predecessor, and if that pred has a single
  // successor, merge the blocks.  This encourages recursive jump threading
  // because now the condition in this block can be threaded through
  // predecessors of our predecessor block.
  if (BasicBlock *SinglePred = BB->getSinglePredecessor()) {
    if (SinglePred->getTerminator()->getNumSuccessors() == 1 &&
        SinglePred != BB && !hasAddressTakenAndUsed(BB)) {
      // If SinglePred was a loop header, BB becomes one.
      if (LoopHeaders.erase(SinglePred))
        LoopHeaders.insert(BB);

      // Remember if SinglePred was the entry block of the function.  If so, we
      // will need to move BB back to the entry position.
      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
      LVI->eraseBlock(SinglePred);
      MergeBasicBlockIntoOnlyPred(BB);

      if (isEntry && BB != &BB->getParent()->getEntryBlock())
        BB->moveBefore(&BB->getParent()->getEntryBlock());
      return true;
    }
  }

  // What kind of constant we're looking for.
  ConstantPreference Preference = WantInteger;

  // Look to see if the terminator is a conditional branch, switch or indirect
  // branch, if not we can't thread it.
  Value *Condition;
  Instruction *Terminator = BB->getTerminator();
  if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) {
    // Can't thread an unconditional jump.
    if (BI->isUnconditional()) return false;
    Condition = BI->getCondition();
  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
    Condition = SI->getCondition();
  } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) {
    Condition = IB->getAddress()->stripPointerCasts();
    Preference = WantBlockAddress;
  } else {
    return false; // Must be an invoke.
  }

  // If the terminator is branching on an undef, we can pick any of the
  // successors to branch to.  Let GetBestDestForJumpOnUndef decide.
  if (isa<UndefValue>(Condition)) {
    unsigned BestSucc = GetBestDestForJumpOnUndef(BB);

    // Fold the branch/switch: drop BB from the PHIs of every non-chosen
    // successor, then replace the terminator with an unconditional branch.
    TerminatorInst *BBTerm = BB->getTerminator();
    for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
      if (i == BestSucc) continue;
      BBTerm->getSuccessor(i)->removePredecessor(BB, true);
    }

    DEBUG(dbgs() << "  In block '" << BB->getName()
          << "' folding undef terminator: " << *BBTerm << '\n');
    BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
    BBTerm->eraseFromParent();
    return true;
  }

  // If the terminator of this block is branching on a constant, simplify the
  // terminator to an unconditional branch.  This can occur due to threading in
  // other blocks.
  if (getKnownConstant(Condition, Preference)) {
    DEBUG(dbgs() << "  In block '" << BB->getName()
          << "' folding terminator: " << *BB->getTerminator() << '\n');
    ++NumFolds;
    ConstantFoldTerminator(BB);
    return true;
  }

  Instruction *CondInst = dyn_cast<Instruction>(Condition);

  // All the rest of our checks depend on the condition being an instruction.
  if (CondInst == 0) {
    // FIXME: Unify this with code below.
    if (ProcessThreadableEdges(Condition, BB, Preference))
      return true;
    return false;
  }


  if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
    // For a comparison where the LHS is outside this block, it's possible
    // that we've branched on it before.  Use LVI to see if we can simplify
    // the branch based on that.
    BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
    Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
    pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
    if (CondBr && CondConst && CondBr->isConditional() && PI != PE &&
        (!isa<Instruction>(CondCmp->getOperand(0)) ||
         cast<Instruction>(CondCmp->getOperand(0))->getParent() != BB)) {
      // For each predecessor edge, determine if the comparison is true or
      // false on that edge.  If they're all true or all false, we can simplify
      // the branch.
      // FIXME: We could handle mixed true/false by duplicating code.
      LazyValueInfo::Tristate Baseline =
        LVI->getPredicateOnEdge(CondCmp->getPredicate(), CondCmp->getOperand(0),
                                CondConst, *PI, BB);
      if (Baseline != LazyValueInfo::Unknown) {
        // Check that all remaining incoming values match the first one.
        while (++PI != PE) {
          LazyValueInfo::Tristate Ret =
            LVI->getPredicateOnEdge(CondCmp->getPredicate(),
                                    CondCmp->getOperand(0), CondConst, *PI, BB);
          if (Ret != Baseline) break;
        }

        // If we terminated early, then one of the values didn't match.
        if (PI == PE) {
          unsigned ToRemove = Baseline == LazyValueInfo::True ? 1 : 0;
          unsigned ToKeep = Baseline == LazyValueInfo::True ? 0 : 1;
          CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true);
          BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
          CondBr->eraseFromParent();
          return true;
        }
      }
    }
  }

  // Check for some cases that are worth simplifying.  Right now we want to look
  // for loads that are used by a switch or by the condition for the branch.  If
  // we see one, check to see if it's partially redundant.  If so, insert a PHI
  // which can then be used to thread the values.
  //
  Value *SimplifyValue = CondInst;
  if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
    if (isa<Constant>(CondCmp->getOperand(1)))
      SimplifyValue = CondCmp->getOperand(0);

  // TODO: There are other places where load PRE would be profitable, such as
  // more complex comparisons.
  if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
    if (SimplifyPartiallyRedundantLoad(LI))
      return true;


  // Handle a variety of cases where we are branching on something derived from
  // a PHI node in the current block.  If we can prove that any predecessors
  // compute a predictable value based on a PHI node, thread those predecessors.
  //
  if (ProcessThreadableEdges(CondInst, BB, Preference))
    return true;

  // If this is an otherwise-unfoldable branch on a phi node in the current
  // block, see if we can simplify.
  if (PHINode *PN = dyn_cast<PHINode>(CondInst))
    if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
      return ProcessBranchOnPHI(PN);


  // If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
  if (CondInst->getOpcode() == Instruction::Xor &&
      CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
    return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst));


  // TODO: If we have: "br (X > 0)" and we have a predecessor where we know
  // "(X == 4)", thread through this block.

  return false;
}


/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
/// load instruction, eliminate it by replacing it with a PHI node.  This is an
/// important optimization that encourages jump threading, and needs to be run
/// interlaced with other jump threading tasks.
bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
  // Don't hack volatile loads.
  if (LI->isVolatile()) return false;

  // If the load is defined in a block with exactly one predecessor, it can't be
  // partially redundant.
  BasicBlock *LoadBB = LI->getParent();
  if (LoadBB->getSinglePredecessor())
    return false;

  Value *LoadedPtr = LI->getOperand(0);

  // If the loaded operand is defined in the LoadBB, it can't be available.
  // TODO: Could do simple PHI translation, that would be fun :)
  if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
    if (PtrOp->getParent() == LoadBB)
      return false;

  // Scan a few instructions up from the load, to see if it is obviously live at
  // the entry to its block.
  BasicBlock::iterator BBIt = LI;

  if (Value *AvailableVal =
        FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, 6)) {
    // If the value of the load is locally available within the block, just use
    // it.  This frequently occurs for reg2mem'd allocas.
    //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n";

    // If the returned value is the load itself, replace with an undef. This can
    // only happen in dead loops.
    if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
    LI->replaceAllUsesWith(AvailableVal);
    LI->eraseFromParent();
    return true;
  }

  // Otherwise, if we scanned the whole block and got to the top of the block,
  // we know the block is locally transparent to the load.  If not, something
  // might clobber its value.
  if (BBIt != LoadBB->begin())
    return false;


  SmallPtrSet<BasicBlock*, 8> PredsScanned;
  typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
  AvailablePredsTy AvailablePreds;
  BasicBlock *OneUnavailablePred = 0;

  // If we got here, the loaded value is transparent through to the start of the
  // block.  Check to see if it is available in any of the predecessor blocks.
  for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
       PI != PE; ++PI) {
    BasicBlock *PredBB = *PI;

    // If we already scanned this predecessor, skip it.
    if (!PredsScanned.insert(PredBB))
      continue;

    // Scan the predecessor to see if the value is available in the pred.
    BBIt = PredBB->end();
    Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6);
    if (!PredAvailable) {
      OneUnavailablePred = PredBB;
      continue;
    }

    // If so, this load is partially redundant.  Remember this info so that we
    // can create a PHI node.
    AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable));
  }

  // If the loaded value isn't available in any predecessor, it isn't partially
  // redundant.
  if (AvailablePreds.empty()) return false;

  // Okay, the loaded value is available in at least one (and maybe all!)
  // predecessors.  If the value is unavailable in more than one unique
  // predecessor, we want to insert a merge block for those common predecessors.
  // This ensures that we only have to insert one reload, thus not increasing
  // code size.
  BasicBlock *UnavailablePred = 0;

  // If there is exactly one predecessor where the value is unavailable, the
  // already computed 'OneUnavailablePred' block is it.  If it ends in an
  // unconditional branch, we know that it isn't a critical edge.
  if (PredsScanned.size() == AvailablePreds.size()+1 &&
      OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
    UnavailablePred = OneUnavailablePred;
  } else if (PredsScanned.size() != AvailablePreds.size()) {
    // Otherwise, we had multiple unavailable predecessors or we had a critical
    // edge from the one.
    SmallVector<BasicBlock*, 8> PredsToSplit;
    SmallPtrSet<BasicBlock*, 8> AvailablePredSet;

    for (unsigned i = 0, e = AvailablePreds.size(); i != e; ++i)
      AvailablePredSet.insert(AvailablePreds[i].first);

    // Add all the unavailable predecessors to the PredsToSplit list.
    for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
         PI != PE; ++PI) {
      BasicBlock *P = *PI;
      // If the predecessor is an indirect goto, we can't split the edge.
      if (isa<IndirectBrInst>(P->getTerminator()))
        return false;

      if (!AvailablePredSet.count(P))
        PredsToSplit.push_back(P);
    }

    // Split them out to their own block.
    UnavailablePred =
      SplitBlockPredecessors(LoadBB, &PredsToSplit[0], PredsToSplit.size(),
                             "thread-pre-split", this);
  }

  // If the value isn't available in all predecessors, then there will be
  // exactly one where it isn't available.  Insert a load on that edge and add
  // it to the AvailablePreds list.
  if (UnavailablePred) {
    assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
           "Can't handle critical edge here!");
    Value *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false,
                                 LI->getAlignment(),
                                 UnavailablePred->getTerminator());
    AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
  }

  // Now we know that each predecessor of this block has a value in
  // AvailablePreds, sort them for efficient access as we're walking the preds.
  array_pod_sort(AvailablePreds.begin(), AvailablePreds.end());

  // Create a PHI node at the start of the block for the PRE'd load value.
  PHINode *PN = PHINode::Create(LI->getType(), "", LoadBB->begin());
  PN->takeName(LI);

  // Insert new entries into the PHI for each predecessor.  A single block may
  // have multiple entries here.
  for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E;
       ++PI) {
    BasicBlock *P = *PI;
    // AvailablePreds is sorted, so binary-search for this predecessor's value.
    AvailablePredsTy::iterator I =
      std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(),
                       std::make_pair(P, (Value*)0));

    assert(I != AvailablePreds.end() && I->first == P &&
           "Didn't find entry for predecessor!");

    PN->addIncoming(I->second, I->first);
  }

  //cerr << "PRE: " << *LI << *PN << "\n";

  LI->replaceAllUsesWith(PN);
  LI->eraseFromParent();

  return true;
}

/// FindMostPopularDest - The specified list contains multiple possible
/// threadable destinations.  Pick the one that occurs the most frequently in
/// the list.
static BasicBlock *
FindMostPopularDest(BasicBlock *BB,
                    const SmallVectorImpl<std::pair<BasicBlock*,
                                  BasicBlock*> > &PredToDestList) {
  assert(!PredToDestList.empty());

  // Determine popularity.  If there are multiple possible destinations, we
  // explicitly choose to ignore 'undef' destinations.  We prefer to thread
  // blocks with known and real destinations to threading undef.  We'll handle
  // them later if interesting.
+  // Tally, for each non-null destination, how many threadable predecessors
+  // want to jump there.  Null destinations (from undef inputs) are skipped.
+  DenseMap<BasicBlock*, unsigned> DestPopularity;
+  for (unsigned i = 0, e = PredToDestList.size(); i != e; ++i)
+    if (PredToDestList[i].second)
+      DestPopularity[PredToDestList[i].second]++;
+
+  // Find the most popular dest.
+  DenseMap<BasicBlock*, unsigned>::iterator DPI = DestPopularity.begin();
+  BasicBlock *MostPopularDest = DPI->first;
+  unsigned Popularity = DPI->second;
+  SmallVector<BasicBlock*, 4> SamePopularity;
+
+  for (++DPI; DPI != DestPopularity.end(); ++DPI) {
+    // If the popularity of this entry isn't higher than the popularity we've
+    // seen so far, ignore it.
+    if (DPI->second < Popularity)
+      ; // ignore.
+    else if (DPI->second == Popularity) {
+      // If it is the same as what we've seen so far, keep track of it.
+      SamePopularity.push_back(DPI->first);
+    } else {
+      // If it is more popular, remember it.
+      SamePopularity.clear();
+      MostPopularDest = DPI->first;
+      Popularity = DPI->second;
+    }
+  }
+
+  // Okay, now we know the most popular destination.  If there is more than one
+  // destination, we need to determine one.  This is arbitrary, but we need
+  // to make a deterministic decision.  Pick the first one that appears in the
+  // successor list.
+  if (!SamePopularity.empty()) {
+    SamePopularity.push_back(MostPopularDest);
+    TerminatorInst *TI = BB->getTerminator();
+    for (unsigned i = 0; ; ++i) {
+      assert(i != TI->getNumSuccessors() && "Didn't find any successor!");
+
+      if (std::find(SamePopularity.begin(), SamePopularity.end(),
+                    TI->getSuccessor(i)) == SamePopularity.end())
+        continue;
+
+      MostPopularDest = TI->getSuccessor(i);
+      break;
+    }
+  }
+
+  // Okay, we have finally picked the most popular destination.
+  return MostPopularDest;
+}
+
+/// ProcessThreadableEdges - Given a condition Cond known in some predecessors
+/// of BB, pick the most profitable destination and thread the relevant edges
+/// across BB.  Returns true if the IR was changed.
+bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
+                                           ConstantPreference Preference) {
+  // If threading this would thread across a loop header, don't even try to
+  // thread the edge.
+  if (LoopHeaders.count(BB))
+    return false;
+
+  PredValueInfoTy PredValues;
+  if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference))
+    return false;
+
+  assert(!PredValues.empty() &&
+         "ComputeValueKnownInPredecessors returned true with no values");
+
+  DEBUG(dbgs() << "IN BB: " << *BB;
+        for (unsigned i = 0, e = PredValues.size(); i != e; ++i) {
+          dbgs() << " BB '" << BB->getName() << "': FOUND condition = "
+            << *PredValues[i].first
+            << " for pred '" << PredValues[i].second->getName() << "'.\n";
+        });
+
+  // Decide what we want to thread through.  Convert our list of known values to
+  // a list of known destinations for each pred.  This also discards duplicate
+  // predecessors and keeps track of the undefined inputs (which are represented
+  // as a null dest in the PredToDestList).
+  SmallPtrSet<BasicBlock*, 16> SeenPreds;
+  SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList;
+
+  BasicBlock *OnlyDest = 0;
+  // Sentinel pointer value meaning "more than one distinct destination seen".
+  BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
+
+  for (unsigned i = 0, e = PredValues.size(); i != e; ++i) {
+    BasicBlock *Pred = PredValues[i].second;
+    if (!SeenPreds.insert(Pred))
+      continue;  // Duplicate predecessor entry.
+
+    // If the predecessor ends with an indirect goto, we can't change its
+    // destination.
+    if (isa<IndirectBrInst>(Pred->getTerminator()))
+      continue;
+
+    Constant *Val = PredValues[i].first;
+
+    BasicBlock *DestBB;
+    if (isa<UndefValue>(Val))
+      DestBB = 0;
+    else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+      DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
+    else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator()))
+      DestBB = SI->getSuccessor(SI->findCaseValue(cast<ConstantInt>(Val)));
+    else {
+      assert(isa<IndirectBrInst>(BB->getTerminator())
+              && "Unexpected terminator");
+      DestBB = cast<BlockAddress>(Val)->getBasicBlock();
+    }
+
+    // If we have exactly one destination, remember it for efficiency below.
+    if (PredToDestList.empty())
+      OnlyDest = DestBB;
+    else if (OnlyDest != DestBB)
+      OnlyDest = MultipleDestSentinel;
+
+    PredToDestList.push_back(std::make_pair(Pred, DestBB));
+  }
+
+  // If all edges were unthreadable, we fail.
+  if (PredToDestList.empty())
+    return false;
+
+  // Determine which is the most common successor.  If we have many inputs and
+  // this block is a switch, we want to start by threading the batch that goes
+  // to the most popular destination first.  If we only know about one
+  // threadable destination (the common case) we can avoid this.
+  BasicBlock *MostPopularDest = OnlyDest;
+
+  if (MostPopularDest == MultipleDestSentinel)
+    MostPopularDest = FindMostPopularDest(BB, PredToDestList);
+
+  // Now that we know what the most popular destination is, factor all
+  // predecessors that will jump to it into a single predecessor.
+  SmallVector<BasicBlock*, 16> PredsToFactor;
+  for (unsigned i = 0, e = PredToDestList.size(); i != e; ++i)
+    if (PredToDestList[i].second == MostPopularDest) {
+      BasicBlock *Pred = PredToDestList[i].first;
+
+      // This predecessor may be a switch or something else that has multiple
+      // edges to the block.  Factor each of these edges by listing them
+      // according to # occurrences in PredsToFactor.
+      TerminatorInst *PredTI = Pred->getTerminator();
+      for (unsigned i = 0, e = PredTI->getNumSuccessors(); i != e; ++i)
+        if (PredTI->getSuccessor(i) == BB)
+          PredsToFactor.push_back(Pred);
+    }
+
+  // If the threadable edges are branching on an undefined value, we get to pick
+  // the destination that these predecessors should get to.
+  if (MostPopularDest == 0)
+    MostPopularDest = BB->getTerminator()->
+                            getSuccessor(GetBestDestForJumpOnUndef(BB));
+
+  // Ok, try to thread it!
+  return ThreadEdge(BB, PredsToFactor, MostPopularDest);
+}
+
+/// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on
+/// a PHI node in the current block.
+/// See if there are any simplifications we
+/// can do based on inputs to the phi node.
+///
+bool JumpThreading::ProcessBranchOnPHI(PHINode *PN) {
+  BasicBlock *BB = PN->getParent();
+
+  // TODO: We could make use of this to do it once for blocks with common PHI
+  // values.
+  // Reused one-element buffer: each candidate predecessor is tried in turn.
+  SmallVector<BasicBlock*, 1> PredBBs;
+  PredBBs.resize(1);
+
+  // If any of the predecessor blocks end in an unconditional branch, we can
+  // *duplicate* the conditional branch into that block in order to further
+  // encourage jump threading and to eliminate cases where we have branch on a
+  // phi of an icmp (branch on icmp is much better).
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    BasicBlock *PredBB = PN->getIncomingBlock(i);
+    if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()))
+      if (PredBr->isUnconditional()) {
+        PredBBs[0] = PredBB;
+        // Try to duplicate BB into PredBB.
+        if (DuplicateCondBranchOnPHIIntoPred(BB, PredBBs))
+          return true;
+      }
+  }
+
+  return false;
+}
+
+/// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on
+/// a xor instruction in the current block.  See if there are any
+/// simplifications we can do based on inputs to the xor.
+///
+bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) {
+  BasicBlock *BB = BO->getParent();
+
+  // If either the LHS or RHS of the xor is a constant, don't do this
+  // optimization.
+  if (isa<ConstantInt>(BO->getOperand(0)) ||
+      isa<ConstantInt>(BO->getOperand(1)))
+    return false;
+
+  // If the first instruction in BB isn't a phi, we won't be able to infer
+  // anything special about any particular predecessor.
+  if (!isa<PHINode>(BB->front()))
+    return false;
+
+  // If we have a xor as the branch input to this block, and we know that the
+  // LHS or RHS of the xor in any predecessor is true/false, then we can clone
+  // the condition into the predecessor and fix that value to true, saving some
+  // logical ops on that path and encouraging other paths to simplify.
+  //
+  // This copies something like this:
+  //
+  //  BB:
+  //    %X = phi i1 [1],  [%X']
+  //    %Y = icmp eq i32 %A, %B
+  //    %Z = xor i1 %X, %Y
+  //    br i1 %Z, ...
+  //
+  // Into:
+  //  BB':
+  //    %Y = icmp ne i32 %A, %B
+  //    br i1 %Z, ...
+
+  // Try the LHS of the xor first; if nothing is known about it in any
+  // predecessor, fall back to the RHS.  isLHS records which operand the
+  // collected values describe.
+  PredValueInfoTy XorOpValues;
+  bool isLHS = true;
+  if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues,
+                                       WantInteger)) {
+    assert(XorOpValues.empty());
+    if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
+                                         WantInteger))
+      return false;
+    isLHS = false;
+  }
+
+  assert(!XorOpValues.empty() &&
+         "ComputeValueKnownInPredecessors returned true with no values");
+
+  // Scan the information to see which is most popular: true or false.  The
+  // predecessors can be of the set true, false, or undef.
+  unsigned NumTrue = 0, NumFalse = 0;
+  for (unsigned i = 0, e = XorOpValues.size(); i != e; ++i) {
+    if (isa<UndefValue>(XorOpValues[i].first))
+      // Ignore undefs for the count.
+      continue;
+    if (cast<ConstantInt>(XorOpValues[i].first)->isZero())
+      ++NumFalse;
+    else
+      ++NumTrue;
+  }
+
+  // Determine which value to split on, true, false, or undef if neither.
+  ConstantInt *SplitVal = 0;
+  if (NumTrue > NumFalse)
+    SplitVal = ConstantInt::getTrue(BB->getContext());
+  else if (NumTrue != 0 || NumFalse != 0)
+    SplitVal = ConstantInt::getFalse(BB->getContext());
+
+  // Collect all of the blocks that this can be folded into so that we can
+  // factor this once and clone it once.
+  SmallVector<BasicBlock*, 8> BlocksToFoldInto;
+  for (unsigned i = 0, e = XorOpValues.size(); i != e; ++i) {
+    if (XorOpValues[i].first != SplitVal &&
+        !isa<UndefValue>(XorOpValues[i].first))
+      continue;
+
+    BlocksToFoldInto.push_back(XorOpValues[i].second);
+  }
+
+  // If we inferred a value for all of the predecessors, then duplication won't
+  // help us.  However, we can just replace the LHS or RHS with the constant.
+  if (BlocksToFoldInto.size() ==
+      cast<PHINode>(BB->front()).getNumIncomingValues()) {
+    if (SplitVal == 0) {
+      // If all preds provide undef, just nuke the xor, because it is undef too.
+      BO->replaceAllUsesWith(UndefValue::get(BO->getType()));
+      BO->eraseFromParent();
+    } else if (SplitVal->isZero()) {
+      // If all preds provide 0, replace the xor with the other input.
+      // Note: isLHS doubles as an operand index here — when the known operand
+      // is operand 0 (isLHS==true), the "other" input is operand 1.
+      BO->replaceAllUsesWith(BO->getOperand(isLHS));
+      BO->eraseFromParent();
+    } else {
+      // If all preds provide 1, set the computed value to 1.
+      BO->setOperand(!isLHS, SplitVal);
+    }
+
+    return true;
+  }
+
+  // Try to duplicate BB into PredBB.
+  return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
+}
+
+
+/// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new
+/// predecessor to the PHIBB block.  If it has PHI nodes, add entries for
+/// NewPred using the entries from OldPred (suitably mapped).
+static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
+                                            BasicBlock *OldPred,
+                                            BasicBlock *NewPred,
+                                     DenseMap<Instruction*, Value*> &ValueMap) {
+  for (BasicBlock::iterator PNI = PHIBB->begin();
+       PHINode *PN = dyn_cast<PHINode>(PNI); ++PNI) {
+    // Ok, we have a PHI node.  Figure out what the incoming value was for the
+    // DestBlock.
+    Value *IV = PN->getIncomingValueForBlock(OldPred);
+
+    // Remap the value if necessary.
+    if (Instruction *Inst = dyn_cast<Instruction>(IV)) {
+      DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst);
+      if (I != ValueMap.end())
+        IV = I->second;
+    }
+
+    PN->addIncoming(IV, NewPred);
+  }
+}
+
+/// ThreadEdge - We have decided that it is safe and profitable to factor the
+/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
+/// across BB.  Transform the IR to reflect this change.
+bool JumpThreading::ThreadEdge(BasicBlock *BB,
+                               const SmallVectorImpl<BasicBlock*> &PredBBs,
+                               BasicBlock *SuccBB) {
+  // If threading to the same block as we come from, we would infinite loop.
+  if (SuccBB == BB) {
+    DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
+          << "' - would thread to self!\n");
+    return false;
+  }
+
+  // If threading this would thread across a loop header, don't thread the edge.
+  // See the comments above FindLoopHeaders for justifications and caveats.
+  if (LoopHeaders.count(BB)) {
+    DEBUG(dbgs() << " Not threading across loop header BB '" << BB->getName()
+          << "' to dest BB '" << SuccBB->getName()
+          << "' - it might create an irreducible loop!\n");
+    return false;
+  }
+
+  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
+  if (JumpThreadCost > Threshold) {
+    DEBUG(dbgs() << " Not threading BB '" << BB->getName()
+          << "' - Cost is too high: " << JumpThreadCost << "\n");
+    return false;
+  }
+
+  // And finally, do it!  Start by factoring the predecessors is needed.
+  BasicBlock *PredBB;
+  if (PredBBs.size() == 1)
+    PredBB = PredBBs[0];
+  else {
+    DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+          << " common predecessors.\n");
+    PredBB = SplitBlockPredecessors(BB, &PredBBs[0], PredBBs.size(),
+                                    ".thr_comm", this);
+  }
+
+  // And finally, do it!
+  DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '"
+        << SuccBB->getName() << "' with cost: " << JumpThreadCost
+        << ", across block:\n "
+        << *BB << "\n");
+
+  LVI->threadEdge(PredBB, BB, SuccBB);
+
+  // We are going to have to map operands from the original BB block to the new
+  // copy of the block 'NewBB'.  If there are PHI nodes in BB, evaluate them to
+  // account for entry from PredBB.
+  DenseMap<Instruction*, Value*> ValueMapping;
+
+  BasicBlock *NewBB = BasicBlock::Create(BB->getContext(),
+                                         BB->getName()+".thread",
+                                         BB->getParent(), BB);
+  NewBB->moveAfter(PredBB);
+
+  // PHI nodes are not cloned: along the PredBB edge each PHI has a single
+  // known incoming value, so just record it in the mapping.
+  BasicBlock::iterator BI = BB->begin();
+  for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+    ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+  // Clone the non-phi instructions of BB into NewBB, keeping track of the
+  // mapping and using it to remap operands in the cloned instructions.
+  for (; !isa<TerminatorInst>(BI); ++BI) {
+    Instruction *New = BI->clone();
+    New->setName(BI->getName());
+    NewBB->getInstList().push_back(New);
+    ValueMapping[BI] = New;
+
+    // Remap operands to patch up intra-block references.
+    for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+      if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+        DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+        if (I != ValueMapping.end())
+          New->setOperand(i, I->second);
+      }
+  }
+
+  // We didn't copy the terminator from BB over to NewBB, because there is now
+  // an unconditional jump to SuccBB.  Insert the unconditional jump.
+  BranchInst::Create(SuccBB, NewBB);
+
+  // Check to see if SuccBB has PHI nodes.  If so, we need to add entries to the
+  // PHI nodes for NewBB now.
+  AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
+
+  // If there were values defined in BB that are used outside the block, then we
+  // now have to update all uses of the value to use either the original value,
+  // the cloned value, or some PHI derived value.  This can require arbitrary
+  // PHI insertion, of which we are prepared to do, clean these up now.
+  SSAUpdater SSAUpdate;
+  SmallVector<Use*, 16> UsesToRename;
+  for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+    // Scan all uses of this instruction to see if it is used outside of its
+    // block, and if so, record them in UsesToRename.
+    for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E;
+         ++UI) {
+      Instruction *User = cast<Instruction>(*UI);
+      if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+        // A PHI use is "outside" only if the incoming edge is not from BB.
+        if (UserPN->getIncomingBlock(UI) == BB)
+          continue;
+      } else if (User->getParent() == BB)
+        continue;
+
+      UsesToRename.push_back(&UI.getUse());
+    }
+
+    // If there are no uses outside the block, we're done with this instruction.
+    if (UsesToRename.empty())
+      continue;
+
+    DEBUG(dbgs() << "JT: Renaming non-local uses of: " << *I << "\n");
+
+    // We found a use of I outside of BB.  Rename all uses of I that are outside
+    // its block to be uses of the appropriate PHI node etc.  See ValuesInBlocks
+    // with the two values we know.
+    SSAUpdate.Initialize(I->getType(), I->getName());
+    SSAUpdate.AddAvailableValue(BB, I);
+    SSAUpdate.AddAvailableValue(NewBB, ValueMapping[I]);
+
+    while (!UsesToRename.empty())
+      SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
+    DEBUG(dbgs() << "\n");
+  }
+
+
+  // Ok, NewBB is good to go.  Update the terminator of PredBB to jump to
+  // NewBB instead of BB.  This eliminates predecessors from BB, which requires
+  // us to simplify any PHI nodes in BB.
+  TerminatorInst *PredTerm = PredBB->getTerminator();
+  for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
+    if (PredTerm->getSuccessor(i) == BB) {
+      BB->removePredecessor(PredBB, true);
+      PredTerm->setSuccessor(i, NewBB);
+    }
+
+  // At this point, the IR is fully up to date and consistent.  Do a quick scan
+  // over the new instructions and zap any that are constants or dead.  This
+  // frequently happens because of phi translation.
+  SimplifyInstructionsInBlock(NewBB, TD);
+
+  // Threaded an edge!
+  ++NumThreads;
+  return true;
+}
+
+/// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch
+/// to BB which contains an i1 PHI node and a conditional branch on that PHI.
+/// If we can duplicate the contents of BB up into PredBB do so now, this
+/// improves the odds that the branch will be on an analyzable instruction like
+/// a compare.
+bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
+                                 const SmallVectorImpl<BasicBlock *> &PredBBs) {
+  assert(!PredBBs.empty() && "Can't handle an empty set");
+
+  // If BB is a loop header, then duplicating this block outside the loop would
+  // cause us to transform this into an irreducible loop, don't do this.
+  // See the comments above FindLoopHeaders for justifications and caveats.
+  if (LoopHeaders.count(BB)) {
+    DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
+          << "' into predecessor block '" << PredBBs[0]->getName()
+          << "' - it might create an irreducible loop!\n");
+    return false;
+  }
+
+  unsigned DuplicationCost = getJumpThreadDuplicationCost(BB);
+  if (DuplicationCost > Threshold) {
+    DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
+          << "' - Cost is too high: " << DuplicationCost << "\n");
+    return false;
+  }
+
+  // And finally, do it!  Start by factoring the predecessors is needed.
+  BasicBlock *PredBB;
+  if (PredBBs.size() == 1)
+    PredBB = PredBBs[0];
+  else {
+    DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+          << " common predecessors.\n");
+    PredBB = SplitBlockPredecessors(BB, &PredBBs[0], PredBBs.size(),
+                                    ".thr_comm", this);
+  }
+
+  // Okay, we decided to do this!  Clone all the instructions in BB onto the end
+  // of PredBB.
+  DEBUG(dbgs() << " Duplicating block '" << BB->getName() << "' into end of '"
+        << PredBB->getName() << "' to eliminate branch on phi. Cost: "
+        << DuplicationCost << " block is:" << *BB << "\n");
+
+  // Unless PredBB ends with an unconditional branch, split the edge so that we
+  // can just clone the bits from BB into the end of the new PredBB.
+  BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
+
+  if (OldPredBranch == 0 || !OldPredBranch->isUnconditional()) {
+    PredBB = SplitEdge(PredBB, BB, this);
+    OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
+  }
+
+  // We are going to have to map operands from the original BB block into the
+  // PredBB block.  Evaluate PHI nodes in BB.
+  DenseMap<Instruction*, Value*> ValueMapping;
+
+  BasicBlock::iterator BI = BB->begin();
+  for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+    ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+  // Clone the non-phi instructions of BB into PredBB, keeping track of the
+  // mapping and using it to remap operands in the cloned instructions.
+  for (; BI != BB->end(); ++BI) {
+    Instruction *New = BI->clone();
+
+    // Remap operands to patch up intra-block references.
+    for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+      if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+        DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+        if (I != ValueMapping.end())
+          New->setOperand(i, I->second);
+      }
+
+    // If this instruction can be simplified after the operands are updated,
+    // just use the simplified value instead.  This frequently happens due to
+    // phi translation.
+    if (Value *IV = SimplifyInstruction(New, TD)) {
+      // The clone was never inserted into a block, so plain delete is safe.
+      delete New;
+      ValueMapping[BI] = IV;
+    } else {
+      // Otherwise, insert the new instruction into the block.
+      New->setName(BI->getName());
+      PredBB->getInstList().insert(OldPredBranch, New);
+      ValueMapping[BI] = New;
+    }
+  }
+
+  // Check to see if the targets of the branch had PHI nodes.  If so, we need to
+  // add entries to the PHI nodes for branch from PredBB now.
+  BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator());
+  AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB,
+                                  ValueMapping);
+  AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB,
+                                  ValueMapping);
+
+  // If there were values defined in BB that are used outside the block, then we
+  // now have to update all uses of the value to use either the original value,
+  // the cloned value, or some PHI derived value.  This can require arbitrary
+  // PHI insertion, of which we are prepared to do, clean these up now.
+  SSAUpdater SSAUpdate;
+  SmallVector<Use*, 16> UsesToRename;
+  for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+    // Scan all uses of this instruction to see if it is used outside of its
+    // block, and if so, record them in UsesToRename.
+    for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E;
+         ++UI) {
+      Instruction *User = cast<Instruction>(*UI);
+      if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+        if (UserPN->getIncomingBlock(UI) == BB)
+          continue;
+      } else if (User->getParent() == BB)
+        continue;
+
+      UsesToRename.push_back(&UI.getUse());
+    }
+
+    // If there are no uses outside the block, we're done with this instruction.
+    if (UsesToRename.empty())
+      continue;
+
+    DEBUG(dbgs() << "JT: Renaming non-local uses of: " << *I << "\n");
+
+    // We found a use of I outside of BB.  Rename all uses of I that are outside
+    // its block to be uses of the appropriate PHI node etc.  See ValuesInBlocks
+    // with the two values we know.
+    SSAUpdate.Initialize(I->getType(), I->getName());
+    SSAUpdate.AddAvailableValue(BB, I);
+    SSAUpdate.AddAvailableValue(PredBB, ValueMapping[I]);
+
+    while (!UsesToRename.empty())
+      SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
+    DEBUG(dbgs() << "\n");
+  }
+
+  // PredBB no longer jumps to BB, remove entries in the PHI node for the edge
+  // that we nuked.
+  BB->removePredecessor(PredBB, true);
+
+  // Remove the unconditional branch at the end of the PredBB block.
+  OldPredBranch->eraseFromParent();
+
+  ++NumDupes;
+  return true;
+}
+
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
new file mode 100644
index 0000000..0786793
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -0,0 +1,789 @@
+//===-- LICM.cpp - Loop Invariant Code Motion Pass ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion, attempting to remove as much
+// code from the body of a loop as possible.  It does this by either hoisting
+// code into the preheader block, or by sinking code to the exit blocks if it is
+// safe.  This pass also promotes must-aliased memory locations in the loop to
+// live in registers, thus hoisting and sinking "invariant" loads and stores.
+//
+// This pass uses alias analysis for two purposes:
+//
+//  1. Moving loop invariant loads and calls out of loops.  If we can determine
+//     that a load or call inside of a loop never aliases anything stored to,
+//     we can hoist it or sink it like any other instruction.
+//  2. Scalar Promotion of Memory - If there is a store instruction inside of
+//     the loop, we try to move the store to happen AFTER the loop instead of
+//     inside of the loop.  This can only happen if a few conditions are true:
+//       A. The pointer stored through is loop invariant
+//       B. There are no stores or loads in the loop which _may_ alias the
+//          pointer.  There are no calls in the loop which mod/ref the pointer.
+//     If these conditions are true, we can promote the loads and stores in the
+//     loop of the pointer to use a temporary alloca'd variable.
+//     We then use
+//     the SSAUpdater to construct the appropriate SSA form for the value.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "licm"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Instructions.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumSunk      , "Number of instructions sunk out of loop");
+STATISTIC(NumHoisted   , "Number of instructions hoisted out of loop");
+STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
+STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
+STATISTIC(NumPromoted  , "Number of memory locations promoted to registers");
+
+static cl::opt<bool>
+DisablePromotion("disable-licm-promotion", cl::Hidden,
+                 cl::desc("Disable memory promotion in LICM pass"));
+
+namespace {
+  struct LICM : public LoopPass {
+    static char ID; // Pass identification, replacement for typeid
+    LICM() : LoopPass(ID) {
+      initializeLICMPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG...
+    ///
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<LoopInfo>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequired<AliasAnalysis>();
+      AU.addPreserved<AliasAnalysis>();
+      AU.addPreserved("scalar-evolution");
+      AU.addPreservedID(LoopSimplifyID);
+    }
+
+    bool doFinalization() {
+      // Every per-loop AliasSetTracker must have been handed off or deleted
+      // by runOnLoop before the pass is torn down.
+      assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets");
+      return false;
+    }
+
+  private:
+    AliasAnalysis *AA;       // Current AliasAnalysis information
+    LoopInfo      *LI;       // Current LoopInfo
+    DominatorTree *DT;       // Dominator Tree for the current Loop.
+
+    // State that is updated as we process loops.
+    bool Changed;            // Set to true when we change anything.
+    BasicBlock *Preheader;   // The preheader block of the current loop...
+    Loop *CurLoop;           // The current loop we are working on...
+    AliasSetTracker *CurAST; // AliasSet information for the current loop...
+    // Saved alias-set info for already-processed inner loops, consumed when
+    // their parent loop is visited.
+    DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap;
+
+    /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
+    void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L);
+
+    /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
+    /// set.
+    void deleteAnalysisValue(Value *V, Loop *L);
+
+    /// SinkRegion - Walk the specified region of the CFG (defined by all blocks
+    /// dominated by the specified block, and that are in the current loop) in
+    /// reverse depth first order w.r.t the DominatorTree.  This allows us to
+    /// visit uses before definitions, allowing us to sink a loop body in one
+    /// pass without iteration.
+    ///
+    void SinkRegion(DomTreeNode *N);
+
+    /// HoistRegion - Walk the specified region of the CFG (defined by all
+    /// blocks dominated by the specified block, and that are in the current
+    /// loop) in depth first order w.r.t the DominatorTree.  This allows us to
+    /// visit definitions before uses, allowing us to hoist a loop body in one
+    /// pass without iteration.
+    ///
+    void HoistRegion(DomTreeNode *N);
+
+    /// inSubLoop - Little predicate that returns true if the specified basic
+    /// block is in a subloop of the current one, not the current one itself.
+    ///
+    bool inSubLoop(BasicBlock *BB) {
+      assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
+      return LI->getLoopFor(BB) != CurLoop;
+    }
+
+    /// sink - When an instruction is found to only be used outside of the loop,
+    /// this function moves it to the exit blocks and patches up SSA form as
+    /// needed.
+    ///
+    void sink(Instruction &I);
+
+    /// hoist - When an instruction is found to only use loop invariant operands
+    /// that is safe to hoist, this instruction is called to do the dirty work.
+    ///
+    void hoist(Instruction &I);
+
+    /// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it
+    /// is not a trapping instruction or if it is a trapping instruction and is
+    /// guaranteed to execute.
+    ///
+    bool isSafeToExecuteUnconditionally(Instruction &I);
+
+    /// pointerInvalidatedByLoop - Return true if the body of this loop may
+    /// store into the memory location pointed to by V.
+    ///
+    bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
+                                  const MDNode *TBAAInfo) {
+      // Check to see if any of the basic blocks in CurLoop invalidate *V.
+      return CurAST->getAliasSetForPointer(V, Size, TBAAInfo).isMod();
+    }
+
+    bool canSinkOrHoistInst(Instruction &I);
+    bool isNotUsedInLoop(Instruction &I);
+
+    void PromoteAliasSet(AliasSet &AS);
+  };
+}
+
+char LICM::ID = 0;
+INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false)
+
+Pass *llvm::createLICMPass() { return new LICM(); }
+
+/// Hoist expressions out of the specified loop.
+/// Note, alias info for inner
+/// loop is not preserved so it is not a good idea to run LICM multiple
+/// times on one loop.
+///
+bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
+  Changed = false;
+
+  // Get our Loop and Alias Analysis information...
+  LI = &getAnalysis<LoopInfo>();
+  AA = &getAnalysis<AliasAnalysis>();
+  DT = &getAnalysis<DominatorTree>();
+
+  CurAST = new AliasSetTracker(*AA);
+  // Collect Alias info from subloops.
+  for (Loop::iterator LoopItr = L->begin(), LoopItrE = L->end();
+       LoopItr != LoopItrE; ++LoopItr) {
+    Loop *InnerL = *LoopItr;
+    AliasSetTracker *InnerAST = LoopToAliasSetMap[InnerL];
+    assert(InnerAST && "Where is my AST?");
+
+    // What if InnerLoop was modified by other passes ?
+    CurAST->add(*InnerAST);
+
+    // Once we've incorporated the inner loop's AST into ours, we don't need the
+    // subloop's anymore.
+    delete InnerAST;
+    LoopToAliasSetMap.erase(InnerL);
+  }
+
+  CurLoop = L;
+
+  // Get the preheader block to move instructions into...
+  // NOTE: may be null if the loop has no preheader; guarded below.
+  Preheader = L->getLoopPreheader();
+
+  // Loop over the body of this loop, looking for calls, invokes, and stores.
+  // Because subloops have already been incorporated into AST, we skip blocks in
+  // subloops.
+  //
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
+    if (LI->getLoopFor(BB) == L)        // Ignore blocks in subloops.
+      CurAST->add(*BB);                 // Incorporate the specified basic block
+  }
+
+  // We want to visit all of the instructions in this loop... that are not parts
+  // of our subloops (they have already had their invariants hoisted out of
+  // their loop, into this loop, so there is no need to process the BODIES of
+  // the subloops).
+  //
+  // Traverse the body of the loop in depth first order on the dominator tree so
+  // that we are guaranteed to see definitions before we see uses.  This allows
+  // us to sink instructions in one pass, without iteration.  After sinking
+  // instructions, we perform another pass to hoist them out of the loop.
+  //
+  if (L->hasDedicatedExits())
+    SinkRegion(DT->getNode(L->getHeader()));
+  if (Preheader)
+    HoistRegion(DT->getNode(L->getHeader()));
+
+  // Now that all loop invariants have been removed from the loop, promote any
+  // memory references to scalars that we can.
+  if (!DisablePromotion && Preheader && L->hasDedicatedExits()) {
+    // Loop over all of the alias sets in the tracker object.
+    for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
+         I != E; ++I)
+      PromoteAliasSet(*I);
+  }
+
+  // Clear out loops state information for the next iteration
+  CurLoop = 0;
+  Preheader = 0;
+
+  // If this loop is nested inside of another one, save the alias information
+  // for when we process the outer loop.
+  if (L->getParentLoop())
+    LoopToAliasSetMap[L] = CurAST;
+  else
+    delete CurAST;
+  return Changed;
+}
+
+/// SinkRegion - Walk the specified region of the CFG (defined by all blocks
+/// dominated by the specified block, and that are in the current loop) in
+/// reverse depth first order w.r.t the DominatorTree.  This allows us to visit
+/// uses before definitions, allowing us to sink a loop body in one pass without
+/// iteration.
+///
+void LICM::SinkRegion(DomTreeNode *N) {
+  assert(N != 0 && "Null dominator tree node?");
+  BasicBlock *BB = N->getBlock();
+
+  // If this subregion is not in the top level loop at all, exit.
+  if (!CurLoop->contains(BB)) return;
+
+  // We are processing blocks in reverse dfo, so process children first.
+  const std::vector<DomTreeNode*> &Children = N->getChildren();
+  for (unsigned i = 0, e = Children.size(); i != e; ++i)
+    SinkRegion(Children[i]);
+
+  // Only need to process the contents of this block if it is not part of a
+  // subloop (which would already have been processed).
+  if (inSubLoop(BB)) return;
+
+  // Walk the block bottom-up so users are seen before their definitions,
+  // letting a whole chain sink in a single pass.
+  for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) {
+    Instruction &I = *--II;
+
+    // If the instruction is dead, we would try to sink it because it isn't used
+    // in the loop, instead, just delete it.
+    if (isInstructionTriviallyDead(&I)) {
+      DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+      ++II; // Step past I before erasing it so the iterator stays valid.
+      CurAST->deleteValue(&I);
+      I.eraseFromParent();
+      Changed = true;
+      continue;
+    }
+
+    // Check to see if we can sink this instruction to the exit blocks
+    // of the loop.  We can do this if the all users of the instruction are
+    // outside of the loop.  In this case, it doesn't even matter if the
+    // operands of the instruction are loop invariant.
+    //
+    if (isNotUsedInLoop(I) && canSinkOrHoistInst(I)) {
+      ++II; // Step past I: sink() removes it from this block.
+      sink(I);
+    }
+  }
+}
+
+/// HoistRegion - Walk the specified region of the CFG (defined by all blocks
+/// dominated by the specified block, and that are in the current loop) in depth
+/// first order w.r.t the DominatorTree.  This allows us to visit definitions
+/// before uses, allowing us to hoist a loop body in one pass without iteration.
+///
+void LICM::HoistRegion(DomTreeNode *N) {
+  assert(N != 0 && "Null dominator tree node?");
+  BasicBlock *BB = N->getBlock();
+
+  // If this subregion is not in the top level loop at all, exit.
+  if (!CurLoop->contains(BB)) return;
+
+  // Only need to process the contents of this block if it is not part of a
+  // subloop (which would already have been processed).
+  if (!inSubLoop(BB))
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) {
+      Instruction &I = *II++;
+
+      // Try constant folding this instruction.  If all the operands are
+      // constants, it is technically hoistable, but it would be better to just
+      // fold it.
+      if (Constant *C = ConstantFoldInstruction(&I)) {
+        DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n');
+        CurAST->copyValue(&I, C);
+        CurAST->deleteValue(&I);
+        I.replaceAllUsesWith(C);
+        I.eraseFromParent();
+        continue;
+      }
+
+      // Try hoisting the instruction out to the preheader.  We can only do this
+      // if all of the operands of the instruction are loop invariant and if it
+      // is safe to hoist the instruction.
+      //
+      if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I) &&
+          isSafeToExecuteUnconditionally(I))
+        hoist(I);
+    }
+
+  // Recurse after the block body: children are dominated, so definitions are
+  // visited before their uses.
+  const std::vector<DomTreeNode*> &Children = N->getChildren();
+  for (unsigned i = 0, e = Children.size(); i != e; ++i)
+    HoistRegion(Children[i]);
+}
+
+/// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
+/// instruction.
+///
+bool LICM::canSinkOrHoistInst(Instruction &I) {
+  // Loads have extra constraints we have to verify before we can hoist them.
+  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+    if (LI->isVolatile())
+      return false;        // Don't hoist volatile loads!
+
+    // Loads from constant memory are always safe to move, even if they end up
+    // in the same alias set as something that ends up being modified.
+    if (AA->pointsToConstantMemory(LI->getOperand(0)))
+      return true;
+
+    // Don't hoist loads which have may-aliased stores in loop.
+    uint64_t Size = 0;
+    if (LI->getType()->isSized())
+      Size = AA->getTypeStoreSize(LI->getType());
+    return !pointerInvalidatedByLoop(LI->getOperand(0), Size,
+                                     LI->getMetadata(LLVMContext::MD_tbaa));
+  } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+    // Handle obvious cases efficiently.
+    AliasAnalysis::ModRefBehavior Behavior = AA->getModRefBehavior(CI);
+    if (Behavior == AliasAnalysis::DoesNotAccessMemory)
+      return true;
+    if (AliasAnalysis::onlyReadsMemory(Behavior)) {
+      // If this call only reads from memory and there are no writes to memory
+      // in the loop, we can hoist or sink the call as appropriate.
+ bool FoundMod = false; + for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); + I != E; ++I) { + AliasSet &AS = *I; + if (!AS.isForwardingAliasSet() && AS.isMod()) { + FoundMod = true; + break; + } + } + if (!FoundMod) return true; + } + + // FIXME: This should use mod/ref information to see if we can hoist or sink + // the call. + + return false; + } + + // Otherwise these instructions are hoistable/sinkable + return isa<BinaryOperator>(I) || isa<CastInst>(I) || + isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) || + isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || + isa<ShuffleVectorInst>(I); +} + +/// isNotUsedInLoop - Return true if the only users of this instruction are +/// outside of the loop. If this is true, we can sink the instruction to the +/// exit blocks of the loop. +/// +bool LICM::isNotUsedInLoop(Instruction &I) { + for (Value::use_iterator UI = I.use_begin(), E = I.use_end(); UI != E; ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (PHINode *PN = dyn_cast<PHINode>(User)) { + // PHI node uses occur in predecessor blocks! + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (PN->getIncomingValue(i) == &I) + if (CurLoop->contains(PN->getIncomingBlock(i))) + return false; + } else if (CurLoop->contains(User)) { + return false; + } + } + return true; +} + + +/// sink - When an instruction is found to only be used outside of the loop, +/// this function moves it to the exit blocks and patches up SSA form as needed. +/// This method is guaranteed to remove the original instruction from its +/// position, and may either delete it or move it to outside of the loop. 
+/// +void LICM::sink(Instruction &I) { + DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); + + SmallVector<BasicBlock*, 8> ExitBlocks; + CurLoop->getUniqueExitBlocks(ExitBlocks); + + if (isa<LoadInst>(I)) ++NumMovedLoads; + else if (isa<CallInst>(I)) ++NumMovedCalls; + ++NumSunk; + Changed = true; + + // The case where there is only a single exit node of this loop is common + // enough that we handle it as a special (more efficient) case. It is more + // efficient to handle because there are no PHI nodes that need to be placed. + if (ExitBlocks.size() == 1) { + if (!DT->dominates(I.getParent(), ExitBlocks[0])) { + // Instruction is not used, just delete it. + CurAST->deleteValue(&I); + // If I has users in unreachable blocks, eliminate. + // If I is not void type then replaceAllUsesWith undef. + // This allows ValueHandlers and custom metadata to adjust itself. + if (!I.use_empty()) + I.replaceAllUsesWith(UndefValue::get(I.getType())); + I.eraseFromParent(); + } else { + // Move the instruction to the start of the exit block, after any PHI + // nodes in it. + I.moveBefore(ExitBlocks[0]->getFirstNonPHI()); + + // This instruction is no longer in the AST for the current loop, because + // we just sunk it out of the loop. If we just sunk it into an outer + // loop, we will rediscover the operation when we process it. + CurAST->deleteValue(&I); + } + return; + } + + if (ExitBlocks.empty()) { + // The instruction is actually dead if there ARE NO exit blocks. + CurAST->deleteValue(&I); + // If I has users in unreachable blocks, eliminate. + // If I is not void type then replaceAllUsesWith undef. + // This allows ValueHandlers and custom metadata to adjust itself. + if (!I.use_empty()) + I.replaceAllUsesWith(UndefValue::get(I.getType())); + I.eraseFromParent(); + return; + } + + // Otherwise, if we have multiple exits, use the SSAUpdater to do all of the + // hard work of inserting PHI nodes as necessary. 
+ SmallVector<PHINode*, 8> NewPHIs; + SSAUpdater SSA(&NewPHIs); + + if (!I.use_empty()) + SSA.Initialize(I.getType(), I.getName()); + + // Insert a copy of the instruction in each exit block of the loop that is + // dominated by the instruction. Each exit block is known to only be in the + // ExitBlocks list once. + BasicBlock *InstOrigBB = I.getParent(); + unsigned NumInserted = 0; + + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { + BasicBlock *ExitBlock = ExitBlocks[i]; + + if (!DT->dominates(InstOrigBB, ExitBlock)) + continue; + + // Insert the code after the last PHI node. + BasicBlock::iterator InsertPt = ExitBlock->getFirstNonPHI(); + + // If this is the first exit block processed, just move the original + // instruction, otherwise clone the original instruction and insert + // the copy. + Instruction *New; + if (NumInserted++ == 0) { + I.moveBefore(InsertPt); + New = &I; + } else { + New = I.clone(); + if (!I.getName().empty()) + New->setName(I.getName()+".le"); + ExitBlock->getInstList().insert(InsertPt, New); + } + + // Now that we have inserted the instruction, inform SSAUpdater. + if (!I.use_empty()) + SSA.AddAvailableValue(ExitBlock, New); + } + + // If the instruction doesn't dominate any exit blocks, it must be dead. + if (NumInserted == 0) { + CurAST->deleteValue(&I); + if (!I.use_empty()) + I.replaceAllUsesWith(UndefValue::get(I.getType())); + I.eraseFromParent(); + return; + } + + // Next, rewrite uses of the instruction, inserting PHI nodes as needed. + for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; ) { + // Grab the use before incrementing the iterator. + Use &U = UI.getUse(); + // Increment the iterator before removing the use from the list. + ++UI; + SSA.RewriteUseAfterInsertions(U); + } + + // Update CurAST for NewPHIs if I had pointer type. 
+ if (I.getType()->isPointerTy()) + for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) + CurAST->copyValue(&I, NewPHIs[i]); + + // Finally, remove the instruction from CurAST. It is no longer in the loop. + CurAST->deleteValue(&I); +} + +/// hoist - When an instruction is found to only use loop invariant operands +/// that is safe to hoist, this instruction is called to do the dirty work. +/// +void LICM::hoist(Instruction &I) { + DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " + << I << "\n"); + + // Move the new node to the Preheader, before its terminator. + I.moveBefore(Preheader->getTerminator()); + + if (isa<LoadInst>(I)) ++NumMovedLoads; + else if (isa<CallInst>(I)) ++NumMovedCalls; + ++NumHoisted; + Changed = true; +} + +/// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it is +/// not a trapping instruction or if it is a trapping instruction and is +/// guaranteed to execute. +/// +bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { + // If it is not a trapping instruction, it is always safe to hoist. + if (Inst.isSafeToSpeculativelyExecute()) + return true; + + // Otherwise we have to check to make sure that the instruction dominates all + // of the exit blocks. If it doesn't, then there is a path out of the loop + // which does not execute this instruction, so we can't hoist it. + + // If the instruction is in the header block for the loop (which is very + // common), it is always guaranteed to dominate the exit blocks. Since this + // is a common case, and can save some work, check it now. + if (Inst.getParent() == CurLoop->getHeader()) + return true; + + // Get the exit blocks for the current loop. + SmallVector<BasicBlock*, 8> ExitBlocks; + CurLoop->getExitBlocks(ExitBlocks); + + // Verify that the block dominates each of the exit blocks of the loop. 
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) + if (!DT->dominates(Inst.getParent(), ExitBlocks[i])) + return false; + + return true; +} + +namespace { + class LoopPromoter : public LoadAndStorePromoter { + Value *SomePtr; // Designated pointer to store to. + SmallPtrSet<Value*, 4> &PointerMustAliases; + SmallVectorImpl<BasicBlock*> &LoopExitBlocks; + AliasSetTracker &AST; + public: + LoopPromoter(Value *SP, + const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, + SmallPtrSet<Value*, 4> &PMA, + SmallVectorImpl<BasicBlock*> &LEB, AliasSetTracker &ast) + : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), + LoopExitBlocks(LEB), AST(ast) {} + + virtual bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &) const { + Value *Ptr; + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + Ptr = LI->getOperand(0); + else + Ptr = cast<StoreInst>(I)->getPointerOperand(); + return PointerMustAliases.count(Ptr); + } + + virtual void doExtraRewritesBeforeFinalDeletion() const { + // Insert stores after in the loop exit blocks. Each exit block gets a + // store of the live-out values that feed them. Since we've already told + // the SSA updater about the defs in the loop and the preheader + // definition, it is all set and we can start using it. + for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) { + BasicBlock *ExitBlock = LoopExitBlocks[i]; + Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); + Instruction *InsertPos = ExitBlock->getFirstNonPHI(); + new StoreInst(LiveInValue, SomePtr, InsertPos); + } + } + + virtual void replaceLoadWithValue(LoadInst *LI, Value *V) const { + // Update alias analysis. + AST.copyValue(LI, V); + } + virtual void instructionDeleted(Instruction *I) const { + AST.deleteValue(I); + } + }; +} // end anon namespace + +/// PromoteAliasSet - Try to promote memory values to scalars by sinking +/// stores out of the loop and moving loads to before the loop. 
We do this by +/// looping over the stores in the loop, looking for stores to Must pointers +/// which are loop invariant. +/// +void LICM::PromoteAliasSet(AliasSet &AS) { + // We can promote this alias set if it has a store, if it is a "Must" alias + // set, if the pointer is loop invariant, and if we are not eliminating any + // volatile loads or stores. + if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() || + AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue())) + return; + + assert(!AS.empty() && + "Must alias set should have at least one pointer element in it!"); + Value *SomePtr = AS.begin()->getValue(); + + // It isn't safe to promote a load/store from the loop if the load/store is + // conditional. For example, turning: + // + // for () { if (c) *P += 1; } + // + // into: + // + // tmp = *P; for () { if (c) tmp +=1; } *P = tmp; + // + // is not safe, because *P may only be valid to access if 'c' is true. + // + // It is safe to promote P if all uses are direct load/stores and if at + // least one is guaranteed to be executed. + bool GuaranteedToExecute = false; + + SmallVector<Instruction*, 64> LoopUses; + SmallPtrSet<Value*, 4> PointerMustAliases; + + // Check that all of the pointers in the alias set have the same type. We + // cannot (yet) promote a memory location that is loaded and stored in + // different sizes. + for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) { + Value *ASIV = ASI->getValue(); + PointerMustAliases.insert(ASIV); + + // Check that all of the pointers in the alias set have the same type. We + // cannot (yet) promote a memory location that is loaded and stored in + // different sizes. + if (SomePtr->getType() != ASIV->getType()) + return; + + for (Value::use_iterator UI = ASIV->use_begin(), UE = ASIV->use_end(); + UI != UE; ++UI) { + // Ignore instructions that are outside the loop. 
+      Instruction *Use = dyn_cast<Instruction>(*UI);
+      if (!Use || !CurLoop->contains(Use))
+        continue;
+
+      // If there is a non-load/store instruction in the loop, we can't promote
+      // it.
+      if (isa<LoadInst>(Use))
+        assert(!cast<LoadInst>(Use)->isVolatile() && "AST broken");
+      else if (isa<StoreInst>(Use)) {
+        // Stores *of* the pointer are not interesting, only stores *to* the
+        // pointer.
+        if (Use->getOperand(1) != ASIV)
+          continue;
+        assert(!cast<StoreInst>(Use)->isVolatile() && "AST broken");
+      } else
+        return; // Not a load or store.
+
+      if (!GuaranteedToExecute)
+        GuaranteedToExecute = isSafeToExecuteUnconditionally(*Use);
+
+      LoopUses.push_back(Use);
+    }
+  }
+
+  // If there isn't a guaranteed-to-execute instruction, we can't promote.
+  if (!GuaranteedToExecute)
+    return;
+
+  // Otherwise, this is safe to promote, let's do it!
+  DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n');
+  Changed = true;
+  ++NumPromoted;
+
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+
+  // We use the SSAUpdater interface to insert phi nodes as required.
+  SmallVector<PHINode*, 16> NewPHIs;
+  SSAUpdater SSA(&NewPHIs);
+  LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
+                        *CurAST);
+
+  // Set up the preheader to have a definition of the value.  It is the live-out
+  // value from the preheader that uses in the loop will use.
+  LoadInst *PreheaderLoad =
+    new LoadInst(SomePtr, SomePtr->getName()+".promoted",
+                 Preheader->getTerminator());
+  SSA.AddAvailableValue(Preheader, PreheaderLoad);
+
+  // Copy any value stored to or loaded from a must-alias of the pointer.
+  if (PreheaderLoad->getType()->isPointerTy()) {
+    Value *SomeValue;
+    if (LoadInst *LI = dyn_cast<LoadInst>(LoopUses[0]))
+      SomeValue = LI;
+    else
+      SomeValue = cast<StoreInst>(LoopUses[0])->getValueOperand();
+
+    CurAST->copyValue(SomeValue, PreheaderLoad);
+  }
+
+  // Rewrite all the loads in the loop and remember all the definitions from
+  // stores in the loop.
+  Promoter.run(LoopUses);
+
+  // If the preheader load is itself a pointer, we need to tell alias analysis
+  // about the new pointer we created in the preheader block and about any PHI
+  // nodes that just got inserted.
+  if (PreheaderLoad->getType()->isPointerTy()) {
+    for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i)
+      CurAST->copyValue(PreheaderLoad, NewPHIs[i]);
+  }
+
+  // Phew, we're done!
+}
+
+
+/// cloneBasicBlockAnalysis - Simple Analysis hook.  Clone alias set info.
+void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {
+  AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
+  if (!AST)
+    return;
+
+  AST->copyValue(From, To);
+}
+
+/// deleteAnalysisValue - Simple Analysis hook.  Delete value V from alias
+/// set.
+void LICM::deleteAnalysisValue(Value *V, Loop *L) {
+  AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
+  if (!AST)
+    return;
+
+  AST->deleteValue(V);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
new file mode 100644
index 0000000..6d1d344
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -0,0 +1,239 @@
+//===- LoopDeletion.cpp - Dead Loop Deletion Pass ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Dead Loop Deletion Pass.
This pass is responsible +// for eliminating loops with non-infinite computable trip counts that have no +// side effects or volatile instructions, and do not contribute to the +// computation of the function's return value. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-delete" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallVector.h" +using namespace llvm; + +STATISTIC(NumDeleted, "Number of loops deleted"); + +namespace { + class LoopDeletion : public LoopPass { + public: + static char ID; // Pass ID, replacement for typeid + LoopDeletion() : LoopPass(ID) { + initializeLoopDeletionPass(*PassRegistry::getPassRegistry()); + } + + // Possibly eliminate loop L if it is dead. + bool runOnLoop(Loop* L, LPPassManager& LPM); + + bool IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitingBlocks, + SmallVector<BasicBlock*, 4>& exitBlocks, + bool &Changed, BasicBlock *Preheader); + + virtual void getAnalysisUsage(AnalysisUsage& AU) const { + AU.addRequired<DominatorTree>(); + AU.addRequired<LoopInfo>(); + AU.addRequired<ScalarEvolution>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + + AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<DominatorTree>(); + AU.addPreserved<LoopInfo>(); + AU.addPreservedID(LoopSimplifyID); + AU.addPreservedID(LCSSAID); + } + }; +} + +char LoopDeletion::ID = 0; +INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", + "Delete dead loops", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(LoopDeletion, "loop-deletion", + "Delete dead loops", false, false) + +Pass* llvm::createLoopDeletionPass() { + return 
new LoopDeletion();
+}
+
+/// IsLoopDead - Determine if a loop is dead.  This assumes that we've already
+/// checked for unique exit and exiting blocks, and that the code is in LCSSA
+/// form.
+bool LoopDeletion::IsLoopDead(Loop* L,
+                              SmallVector<BasicBlock*, 4>& exitingBlocks,
+                              SmallVector<BasicBlock*, 4>& exitBlocks,
+                              bool &Changed, BasicBlock *Preheader) {
+  BasicBlock* exitingBlock = exitingBlocks[0];
+  BasicBlock* exitBlock = exitBlocks[0];
+
+  // Make sure that all PHI entries coming from the loop are loop invariant.
+  // Because the code is in LCSSA form, any values used outside of the loop
+  // must pass through a PHI in the exit block, meaning that this check is
+  // sufficient to guarantee that no loop-variant values are used outside
+  // of the loop.
+  BasicBlock::iterator BI = exitBlock->begin();
+  while (PHINode* P = dyn_cast<PHINode>(BI)) {
+    Value* incoming = P->getIncomingValueForBlock(exitingBlock);
+    if (Instruction* I = dyn_cast<Instruction>(incoming))
+      if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator()))
+        return false;
+
+    ++BI;
+  }
+
+  // Make sure that no instructions in the block have potential side-effects.
+  // This includes instructions that could write to memory, and loads that are
+  // marked volatile.  This could be made more aggressive by using aliasing
+  // information to identify readonly and readnone calls.
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI) {
+    for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end();
+         BI != BE; ++BI) {
+      if (BI->mayHaveSideEffects())
+        return false;
+    }
+  }
+
+  return true;
+}
+
+/// runOnLoop - Remove dead loops, by which we mean loops that do not impact the
+/// observable behavior of the program other than finite running time.  Note
+/// we do ensure that this never removes a loop that might be infinite, as doing
+/// so could change the halting/non-halting nature of a program.
+/// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA +/// in order to make various safety checks work. +bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { + // We can only remove the loop if there is a preheader that we can + // branch from after removing it. + BasicBlock* preheader = L->getLoopPreheader(); + if (!preheader) + return false; + + // If LoopSimplify form is not available, stay out of trouble. + if (!L->hasDedicatedExits()) + return false; + + // We can't remove loops that contain subloops. If the subloops were dead, + // they would already have been removed in earlier executions of this pass. + if (L->begin() != L->end()) + return false; + + SmallVector<BasicBlock*, 4> exitingBlocks; + L->getExitingBlocks(exitingBlocks); + + SmallVector<BasicBlock*, 4> exitBlocks; + L->getUniqueExitBlocks(exitBlocks); + + // We require that the loop only have a single exit block. Otherwise, we'd + // be in the situation of needing to be able to solve statically which exit + // block will be branched to, or trying to preserve the branching logic in + // a loop invariant manner. + if (exitBlocks.size() != 1) + return false; + + // Loops with multiple exits are too complicated to handle correctly. + if (exitingBlocks.size() != 1) + return false; + + // Finally, we have to check that the loop really is dead. + bool Changed = false; + if (!IsLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader)) + return Changed; + + // Don't remove loops for which we can't solve the trip count. + // They could be infinite, in which case we'd be changing program behavior. + ScalarEvolution& SE = getAnalysis<ScalarEvolution>(); + const SCEV *S = SE.getMaxBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(S)) + return Changed; + + // Now that we know the removal is safe, remove the loop by changing the + // branch from the preheader to go to the single exit block. 
+ BasicBlock* exitBlock = exitBlocks[0]; + BasicBlock* exitingBlock = exitingBlocks[0]; + + // Because we're deleting a large chunk of code at once, the sequence in which + // we remove things is very important to avoid invalidation issues. Don't + // mess with this unless you have good reason and know what you're doing. + + // Tell ScalarEvolution that the loop is deleted. Do this before + // deleting the loop so that ScalarEvolution can look at the loop + // to determine what it needs to clean up. + SE.forgetLoop(L); + + // Connect the preheader directly to the exit block. + TerminatorInst* TI = preheader->getTerminator(); + TI->replaceUsesOfWith(L->getHeader(), exitBlock); + + // Rewrite phis in the exit block to get their inputs from + // the preheader instead of the exiting block. + BasicBlock::iterator BI = exitBlock->begin(); + while (PHINode* P = dyn_cast<PHINode>(BI)) { + P->replaceUsesOfWith(exitingBlock, preheader); + ++BI; + } + + // Update the dominator tree and remove the instructions and blocks that will + // be deleted from the reference counting scheme. + DominatorTree& DT = getAnalysis<DominatorTree>(); + SmallVector<DomTreeNode*, 8> ChildNodes; + for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); + LI != LE; ++LI) { + // Move all of the block's children to be children of the preheader, which + // allows us to remove the domtree entry for the block. + ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end()); + for (SmallVector<DomTreeNode*, 8>::iterator DI = ChildNodes.begin(), + DE = ChildNodes.end(); DI != DE; ++DI) { + DT.changeImmediateDominator(*DI, DT[preheader]); + } + + ChildNodes.clear(); + DT.eraseNode(*LI); + + // Remove the block from the reference counting scheme, so that we can + // delete it freely later. + (*LI)->dropAllReferences(); + } + + // Erase the instructions and the blocks without having to worry + // about ordering because we already dropped the references. 
+ // NOTE: This iteration is safe because erasing the block does not remove its + // entry from the loop's block list. We do that in the next section. + for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); + LI != LE; ++LI) + (*LI)->eraseFromParent(); + + // Finally, the blocks from loopinfo. This has to happen late because + // otherwise our loop iterators won't work. + LoopInfo& loopInfo = getAnalysis<LoopInfo>(); + SmallPtrSet<BasicBlock*, 8> blocks; + blocks.insert(L->block_begin(), L->block_end()); + for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(), + E = blocks.end(); I != E; ++I) + loopInfo.removeBlock(*I); + + // The last step is to inform the loop pass manager that we've + // eliminated this loop. + LPM.deleteLoopFromQueue(L); + Changed = true; + + ++NumDeleted; + + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp new file mode 100644 index 0000000..d7fa149 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -0,0 +1,594 @@ +//===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements an idiom recognizer that transforms simple loops into a +// non-loop form. In cases that this kicks in, it can be a significant +// performance win. +// +//===----------------------------------------------------------------------===// +// +// TODO List: +// +// Future loop memory idioms to recognize: +// memcmp, memmove, strlen, etc. 
+// Future floating point idioms to recognize in -ffast-math mode: +// fpowi +// Future integer operation idioms to recognize: +// ctpop, ctlz, cttz +// +// Beware that isel's default lowering for ctpop is highly inefficient for +// i64 and larger types when i64 is legal and the value has few bits set. It +// would be good to enhance isel to emit a loop for ctpop in this case. +// +// We should enhance the memset/memcpy recognition to handle multiple stores in +// the loop. This would handle things like: +// void foo(_Complex float *P) +// for (i) { __real__(*P) = 0; __imag__(*P) = 0; } +// +// This could recognize common matrix multiplies and dot product idioms and +// replace them with calls to BLAS (if linked in??). +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-idiom" +#include "llvm/Transforms/Scalar.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Module.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); +STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); + +namespace { + class LoopIdiomRecognize : public LoopPass { + Loop *CurLoop; + const TargetData *TD; + DominatorTree *DT; + ScalarEvolution *SE; + TargetLibraryInfo *TLI; + public: + static char ID; + explicit LoopIdiomRecognize() : LoopPass(ID) { + initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM); + bool 
runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock*> &ExitBlocks); + + bool processLoopStore(StoreInst *SI, const SCEV *BECount); + bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); + + bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, + unsigned StoreAlignment, + Value *SplatValue, Instruction *TheStore, + const SCEVAddRecExpr *Ev, + const SCEV *BECount); + bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, + const SCEVAddRecExpr *StoreEv, + const SCEVAddRecExpr *LoadEv, + const SCEV *BECount); + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG. + /// + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addRequired<AliasAnalysis>(); + AU.addPreserved<AliasAnalysis>(); + AU.addRequired<ScalarEvolution>(); + AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<DominatorTree>(); + AU.addRequired<DominatorTree>(); + AU.addRequired<TargetLibraryInfo>(); + } + }; +} + +char LoopIdiomRecognize::ID = 0; +INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", + false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", + false, false) + +Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); } + +/// DeleteDeadInstruction - Delete this instruction. Before we do, go through +/// and zero out all the operands of this instruction. 
If any of them become +/// dead, delete them and the computation tree that feeds them. +/// +static void DeleteDeadInstruction(Instruction *I, ScalarEvolution &SE) { + SmallVector<Instruction*, 32> NowDeadInsts; + + NowDeadInsts.push_back(I); + + // Before we touch this instruction, remove it from SE! + do { + Instruction *DeadInst = NowDeadInsts.pop_back_val(); + + // This instruction is dead, zap it, in stages. Start by removing it from + // SCEV. + SE.forgetValue(DeadInst); + + for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { + Value *Op = DeadInst->getOperand(op); + DeadInst->setOperand(op, 0); + + // If this operand just became dead, add it to the NowDeadInsts list. + if (!Op->use_empty()) continue; + + if (Instruction *OpI = dyn_cast<Instruction>(Op)) + if (isInstructionTriviallyDead(OpI)) + NowDeadInsts.push_back(OpI); + } + + DeadInst->eraseFromParent(); + + } while (!NowDeadInsts.empty()); +} + +bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { + CurLoop = L; + + // The trip count of the loop must be analyzable. + SE = &getAnalysis<ScalarEvolution>(); + if (!SE->hasLoopInvariantBackedgeTakenCount(L)) + return false; + const SCEV *BECount = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BECount)) return false; + + // If this loop executes exactly one time, then it should be peeled, not + // optimized by this pass. + if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) + if (BECst->getValue()->getValue() == 0) + return false; + + // We require target data for now. 
+ TD = getAnalysisIfAvailable<TargetData>(); + if (TD == 0) return false; + + DT = &getAnalysis<DominatorTree>(); + LoopInfo &LI = getAnalysis<LoopInfo>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + + SmallVector<BasicBlock*, 8> ExitBlocks; + CurLoop->getUniqueExitBlocks(ExitBlocks); + + DEBUG(dbgs() << "loop-idiom Scanning: F[" + << L->getHeader()->getParent()->getName() + << "] Loop %" << L->getHeader()->getName() << "\n"); + + bool MadeChange = false; + // Scan all the blocks in the loop that are not in subloops. + for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; + ++BI) { + // Ignore blocks in subloops. + if (LI.getLoopFor(*BI) != CurLoop) + continue; + + MadeChange |= runOnLoopBlock(*BI, BECount, ExitBlocks); + } + return MadeChange; +} + +/// runOnLoopBlock - Process the specified block, which lives in a counted loop +/// with the specified backedge count. This block is known to be in the current +/// loop and not in any subloops. +bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock*> &ExitBlocks) { + // We can only promote stores in this block if they are unconditionally + // executed in the loop. For a block to be unconditionally executed, it has + // to dominate all the exit blocks of the loop. Verify this now. + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) + if (!DT->dominates(BB, ExitBlocks[i])) + return false; + + bool MadeChange = false; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { + Instruction *Inst = I++; + // Look for store instructions, which may be optimized to memset/memcpy. + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + WeakVH InstPtr(I); + if (!processLoopStore(SI, BECount)) continue; + MadeChange = true; + + // If processing the store invalidated our iterator, start over from the + // top of the block. 
+ if (InstPtr == 0) + I = BB->begin(); + continue; + } + + // Look for memset instructions, which may be optimized to a larger memset. + if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { + WeakVH InstPtr(I); + if (!processLoopMemSet(MSI, BECount)) continue; + MadeChange = true; + + // If processing the memset invalidated our iterator, start over from the + // top of the block. + if (InstPtr == 0) + I = BB->begin(); + continue; + } + } + + return MadeChange; +} + + +/// processLoopStore - See if this store can be promoted to a memset or memcpy. +bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { + if (SI->isVolatile()) return false; + + Value *StoredVal = SI->getValueOperand(); + Value *StorePtr = SI->getPointerOperand(); + + // Reject stores that are so large that they overflow an unsigned. + uint64_t SizeInBits = TD->getTypeSizeInBits(StoredVal->getType()); + if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) + return false; + + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided store. If we have something else, it's a + // random store we can't handle. + const SCEVAddRecExpr *StoreEv = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); + if (StoreEv == 0 || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) + return false; + + // Check to see if the stride matches the size of the store. If so, then we + // know that every byte is touched in the loop. + unsigned StoreSize = (unsigned)SizeInBits >> 3; + const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1)); + + // TODO: Could also handle negative stride here someday, that will require the + // validity check in mayLoopAccessLocation to be updated though. + if (Stride == 0 || StoreSize != Stride->getValue()->getValue()) + return false; + + // See if we can optimize just this store in isolation. 
+  if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(),
+                              StoredVal, SI, StoreEv, BECount))
+    return true;
+
+  // If the stored value is a strided load in the same loop with the same stride
+  // then this may be transformable into a memcpy.  This kicks in for stuff like
+  //   for (i) A[i] = B[i];
+  if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
+    const SCEVAddRecExpr *LoadEv =
+      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0)));
+    // Operand 1 of an affine AddRec is the stride; the load must walk memory
+    // with exactly the same stride as the store for a memcpy to be valid.
+    if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() &&
+        StoreEv->getOperand(1) == LoadEv->getOperand(1) && !LI->isVolatile())
+      if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount))
+        return true;
+  }
+  // (Disabled) debugging output for strided stores we failed to transform.
+  //errs() << "UNHANDLED strided store: " << *StoreEv << " - " << *SI << "\n";
+
+  return false;
+}
+
+/// processLoopMemSet - See if this memset can be promoted to a large memset.
+bool LoopIdiomRecognize::
+processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) {
+  // We can only handle non-volatile memsets with a constant size.
+  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) return false;
+
+  // If we're not allowed to hack on memset, we fail.
+  if (!TLI->has(LibFunc::memset))
+    return false;
+
+  Value *Pointer = MSI->getDest();
+
+  // See if the pointer expression is an AddRec like {base,+,1} on the current
+  // loop, which indicates a strided store.  If we have something else, it's a
+  // random store we can't handle.
+  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
+  if (Ev == 0 || Ev->getLoop() != CurLoop || !Ev->isAffine())
+    return false;
+
+  // Reject memsets that are so large that they overflow an unsigned.
+  uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+  if ((SizeInBytes >> 32) != 0)
+    return false;
+
+  // Check to see if the stride matches the size of the memset.  If so, then we
+  // know that every byte is touched in the loop.
+ const SCEVConstant *Stride = dyn_cast<SCEVConstant>(Ev->getOperand(1)); + + // TODO: Could also handle negative stride here someday, that will require the + // validity check in mayLoopAccessLocation to be updated though. + if (Stride == 0 || MSI->getLength() != Stride->getValue()) + return false; + + return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, + MSI->getAlignment(), MSI->getValue(), + MSI, Ev, BECount); +} + + +/// mayLoopAccessLocation - Return true if the specified loop might access the +/// specified pointer location, which is a loop-strided access. The 'Access' +/// argument specifies what the verboten forms of access are (read or write). +static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, + Loop *L, const SCEV *BECount, + unsigned StoreSize, AliasAnalysis &AA, + Instruction *IgnoredStore) { + // Get the location that may be stored across the loop. Since the access is + // strided positively through memory, we say that the modified location starts + // at the pointer and has infinite size. + uint64_t AccessSize = AliasAnalysis::UnknownSize; + + // If the loop iterates a fixed number of times, we can refine the access size + // to be exactly the size of the memset, which is (BECount+1)*StoreSize + if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) + AccessSize = (BECst->getValue()->getZExtValue()+1)*StoreSize; + + // TODO: For this to be really effective, we have to dive into the pointer + // operand in the store. Store to &A[i] of 100 will always return may alias + // with store of &A[100], we need to StoreLoc to be "A" with size of 100, + // which will then no-alias a store to &A[100]. 
+  AliasAnalysis::Location StoreLoc(Ptr, AccessSize);
+
+  // Walk every instruction of every block in the loop, asking alias analysis
+  // whether it may touch StoreLoc in one of the forbidden ways (the mod/ref
+  // bits returned by getModRefInfo are masked against 'Access').
+  for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
+       ++BI)
+    for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I)
+      if (&*I != IgnoredStore &&
+          (AA.getModRefInfo(I, StoreLoc) & Access))
+        return true;
+
+  return false;
+}
+
+/// getMemSetPatternValue - If a strided store of the specified value is safe to
+/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
+/// be passed in.  Otherwise, return null.
+///
+/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
+/// just replicate their input array and then pass on to memset_pattern16.
+static Constant *getMemSetPatternValue(Value *V, const TargetData &TD) {
+  // If the value isn't a constant, we can't promote it to being in a constant
+  // array.  We could theoretically do a store to an alloca or something, but
+  // that doesn't seem worthwhile.
+  Constant *C = dyn_cast<Constant>(V);
+  if (C == 0) return 0;
+
+  // Only handle simple values that are a power of two bytes in size.
+  // Combined with the checks below, this restricts the value to exactly
+  // 1, 2, 4, 8 or 16 bytes.
+  uint64_t Size = TD.getTypeSizeInBits(V->getType());
+  if (Size == 0 || (Size & 7) || (Size & (Size-1)))
+    return 0;
+
+  // Don't care enough about darwin/ppc to implement this.
+  if (TD.isBigEndian())
+    return 0;
+
+  // Convert to size in bytes.
+  Size /= 8;
+
+  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
+  // if the top and bottom are the same (e.g. for vectors and large integers).
+  if (Size > 16) return 0;
+
+  // If the constant is exactly 16 bytes, just use it.
+  if (Size == 16) return C;
+
+  // Otherwise, we'll use an array of the constants, replicated to fill out
+  // the full 16-byte pattern that memset_pattern16 expects.
+  unsigned ArraySize = 16/Size;
+  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
+  return ConstantArray::get(AT, std::vector<Constant*>(ArraySize, C));
+}
+
+
+/// processLoopStridedStore - We see a strided store of some value.
If we can +/// transform this into a memset or memset_pattern in the loop preheader, do so. +bool LoopIdiomRecognize:: +processLoopStridedStore(Value *DestPtr, unsigned StoreSize, + unsigned StoreAlignment, Value *StoredVal, + Instruction *TheStore, const SCEVAddRecExpr *Ev, + const SCEV *BECount) { + + // If the stored value is a byte-wise value (like i32 -1), then it may be + // turned into a memset of i8 -1, assuming that all the consecutive bytes + // are stored. A store of i32 0x01020304 can never be turned into a memset, + // but it can be turned into memset_pattern if the target supports it. + Value *SplatValue = isBytewiseValue(StoredVal); + Constant *PatternValue = 0; + + // If we're allowed to form a memset, and the stored value would be acceptable + // for memset, use it. + if (SplatValue && TLI->has(LibFunc::memset) && + // Verify that the stored value is loop invariant. If not, we can't + // promote the memset. + CurLoop->isLoopInvariant(SplatValue)) { + // Keep and use SplatValue. + PatternValue = 0; + } else if (TLI->has(LibFunc::memset_pattern16) && + (PatternValue = getMemSetPatternValue(StoredVal, *TD))) { + // It looks like we can use PatternValue! + SplatValue = 0; + } else { + // Otherwise, this isn't an idiom we can transform. For example, we can't + // do anything with a 3-byte store, for example. + return false; + } + + + // Okay, we have a strided store "p[i]" of a splattable value. We can turn + // this into a memset in the loop preheader now if we want. However, this + // would be unsafe to do if there is anything else in the loop that may read + // or write to the aliased location. Check for an alias. + if (mayLoopAccessLocation(DestPtr, AliasAnalysis::ModRef, + CurLoop, BECount, + StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) + return false; + + // Okay, everything looks good, insert the memset. 
+ BasicBlock *Preheader = CurLoop->getLoopPreheader(); + + IRBuilder<> Builder(Preheader->getTerminator()); + + // The trip count of the loop and the base pointer of the addrec SCEV is + // guaranteed to be loop invariant, which means that it should dominate the + // header. Just insert code for it in the preheader. + SCEVExpander Expander(*SE); + + unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace(); + Value *BasePtr = + Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace), + Preheader->getTerminator()); + + // The # stored bytes is (BECount+1)*Size. Expand the trip count out to + // pointer size if it isn't already. + const Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); + BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); + + const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), + true /*no unsigned overflow*/); + if (StoreSize != 1) + NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), + true /*no unsigned overflow*/); + + Value *NumBytes = + Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + + Value *NewCall; + if (SplatValue) + NewCall = Builder.CreateMemSet(BasePtr, SplatValue,NumBytes,StoreAlignment); + else { + Module *M = TheStore->getParent()->getParent()->getParent(); + Value *MSP = M->getOrInsertFunction("memset_pattern16", + Builder.getVoidTy(), + Builder.getInt8PtrTy(), + Builder.getInt8PtrTy(), IntPtr, + (void*)0); + + // Otherwise we should form a memset_pattern16. PatternValue is known to be + // an constant array of 16-bytes. Plop the value into a mergable global. + GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, + GlobalValue::InternalLinkage, + PatternValue, ".memset_pattern"); + GV->setUnnamedAddr(true); // Ok to merge these. 
+ GV->setAlignment(16); + Value *PatternPtr = ConstantExpr::getBitCast(GV, Builder.getInt8PtrTy()); + NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes); + } + + DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n" + << " from store to: " << *Ev << " at: " << *TheStore << "\n"); + (void)NewCall; + + // Okay, the memset has been formed. Zap the original store and anything that + // feeds into it. + DeleteDeadInstruction(TheStore, *SE); + ++NumMemSet; + return true; +} + +/// processLoopStoreOfLoopLoad - We see a strided store whose value is a +/// same-strided load. +bool LoopIdiomRecognize:: +processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, + const SCEVAddRecExpr *StoreEv, + const SCEVAddRecExpr *LoadEv, + const SCEV *BECount) { + // If we're not allowed to form memcpy, we fail. + if (!TLI->has(LibFunc::memcpy)) + return false; + + LoadInst *LI = cast<LoadInst>(SI->getValueOperand()); + + // Okay, we have a strided store "p[i]" of a loaded value. We can turn + // this into a memcpy in the loop preheader now if we want. However, this + // would be unsafe to do if there is anything else in the loop that may read + // or write to the stored location (including the load feeding the stores). + // Check for an alias. + if (mayLoopAccessLocation(SI->getPointerOperand(), AliasAnalysis::ModRef, + CurLoop, BECount, StoreSize, + getAnalysis<AliasAnalysis>(), SI)) + return false; + + // For a memcpy, we have to make sure that the input array is not being + // mutated by the loop. + if (mayLoopAccessLocation(LI->getPointerOperand(), AliasAnalysis::Mod, + CurLoop, BECount, StoreSize, + getAnalysis<AliasAnalysis>(), SI)) + return false; + + // Okay, everything looks good, insert the memcpy. 
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+
+  IRBuilder<> Builder(Preheader->getTerminator());
+
+  // The trip count of the loop and the base pointer of the addrec SCEV is
+  // guaranteed to be loop invariant, which means that it should dominate the
+  // header.  Just insert code for it in the preheader.
+  SCEVExpander Expander(*SE);
+
+  Value *LoadBasePtr =
+    Expander.expandCodeFor(LoadEv->getStart(),
+                           Builder.getInt8PtrTy(LI->getPointerAddressSpace()),
+                           Preheader->getTerminator());
+  Value *StoreBasePtr =
+    Expander.expandCodeFor(StoreEv->getStart(),
+                           Builder.getInt8PtrTy(SI->getPointerAddressSpace()),
+                           Preheader->getTerminator());
+
+  // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
+  // pointer size if it isn't already.
+  const Type *IntPtr = TD->getIntPtrType(SI->getContext());
+  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+
+  const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
+                                         true /*no unsigned overflow*/);
+  if (StoreSize != 1)
+    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+                               true /*no unsigned overflow*/);
+
+  Value *NumBytes =
+    Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
+
+  // Conservatively use the smaller of the load and store alignments for the
+  // resulting memcpy.
+  Value *NewCall =
+    Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
+                         std::min(SI->getAlignment(), LI->getAlignment()));
+
+  DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
+               << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+               << "    from store ptr=" << *StoreEv << " at: " << *SI << "\n");
+  (void)NewCall;
+
+  // Okay, the memcpy has been formed.  Zap the original store and anything that
+  // feeds into it.
+ DeleteDeadInstruction(SI, *SE); + ++NumMemCpy; + return true; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp new file mode 100644 index 0000000..af25c5c --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -0,0 +1,170 @@ +//===- LoopInstSimplify.cpp - Loop Instruction Simplification Pass --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass performs lightweight instruction simplification on loop bodies. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-instsimplify" +#include "llvm/Instructions.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumSimplified, "Number of redundant instructions simplified"); + +namespace { + class LoopInstSimplify : public LoopPass { + public: + static char ID; // Pass ID, replacement for typeid + LoopInstSimplify() : LoopPass(ID) { + initializeLoopInstSimplifyPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop*, LPPassManager&); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<LoopInfo>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addPreservedID(LCSSAID); + AU.addPreserved("scalar-evolution"); + } + }; +} + +char LoopInstSimplify::ID = 0; +INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify", + "Simplify 
instructions in loops", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify", + "Simplify instructions in loops", false, false) + +Pass *llvm::createLoopInstSimplifyPass() { + return new LoopInstSimplify(); +} + +bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { + DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>(); + LoopInfo *LI = &getAnalysis<LoopInfo>(); + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + + SmallVector<BasicBlock*, 8> ExitBlocks; + L->getUniqueExitBlocks(ExitBlocks); + array_pod_sort(ExitBlocks.begin(), ExitBlocks.end()); + + SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2; + + // The bit we are stealing from the pointer represents whether this basic + // block is the header of a subloop, in which case we only process its phis. + typedef PointerIntPair<BasicBlock*, 1> WorklistItem; + SmallVector<WorklistItem, 16> VisitStack; + SmallPtrSet<BasicBlock*, 32> Visited; + + bool Changed = false; + bool LocalChanged; + do { + LocalChanged = false; + + VisitStack.clear(); + Visited.clear(); + + VisitStack.push_back(WorklistItem(L->getHeader(), false)); + + while (!VisitStack.empty()) { + WorklistItem Item = VisitStack.pop_back_val(); + BasicBlock *BB = Item.getPointer(); + bool IsSubloopHeader = Item.getInt(); + + // Simplify instructions in the current basic block. + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { + Instruction *I = BI++; + + // The first time through the loop ToSimplify is empty and we try to + // simplify all instructions. On later iterations ToSimplify is not + // empty and we only bother simplifying instructions that are in it. + if (!ToSimplify->empty() && !ToSimplify->count(I)) + continue; + + // Don't bother simplifying unused instructions. 
+ if (!I->use_empty()) { + Value *V = SimplifyInstruction(I, TD, DT); + if (V && LI->replacementPreservesLCSSAForm(I, V)) { + // Mark all uses for resimplification next time round the loop. + for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); + UI != UE; ++UI) + Next->insert(cast<Instruction>(*UI)); + + I->replaceAllUsesWith(V); + LocalChanged = true; + ++NumSimplified; + } + } + LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I); + + if (IsSubloopHeader && !isa<PHINode>(I)) + break; + } + + // Add all successors to the worklist, except for loop exit blocks and the + // bodies of subloops. We visit the headers of loops so that we can process + // their phis, but we contract the rest of the subloop body and only follow + // edges leading back to the original loop. + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; + ++SI) { + BasicBlock *SuccBB = *SI; + if (!Visited.insert(SuccBB)) + continue; + + const Loop *SuccLoop = LI->getLoopFor(SuccBB); + if (SuccLoop && SuccLoop->getHeader() == SuccBB + && L->contains(SuccLoop)) { + VisitStack.push_back(WorklistItem(SuccBB, true)); + + SmallVector<BasicBlock*, 8> SubLoopExitBlocks; + SuccLoop->getExitBlocks(SubLoopExitBlocks); + + for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) { + BasicBlock *ExitBB = SubLoopExitBlocks[i]; + if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB)) + VisitStack.push_back(WorklistItem(ExitBB, false)); + } + + continue; + } + + bool IsExitBlock = std::binary_search(ExitBlocks.begin(), + ExitBlocks.end(), SuccBB); + if (IsExitBlock) + continue; + + VisitStack.push_back(WorklistItem(SuccBB, false)); + } + } + + // Place the list of instructions to simplify on the next loop iteration + // into ToSimplify. 
+ std::swap(ToSimplify, Next); + Next->clear(); + + Changed |= LocalChanged; + } while (LocalChanged); + + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp new file mode 100644 index 0000000..95e1578 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -0,0 +1,348 @@ +//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements Loop Rotation Pass. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-rotate" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Function.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +#define MAX_HEADER_SIZE 16 + +STATISTIC(NumRotated, "Number of loops rotated"); +namespace { + + class LoopRotate : public LoopPass { + public: + static char ID; // Pass ID, replacement for typeid + LoopRotate() : LoopPass(ID) { + initializeLoopRotatePass(*PassRegistry::getPassRegistry()); + } + + // LCSSA form makes instruction renaming easier. 
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<DominatorTree>(); + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addPreserved<ScalarEvolution>(); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM); + bool rotateLoop(Loop *L); + + private: + LoopInfo *LI; + }; +} + +char LoopRotate::ID = 0; +INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) + +Pass *llvm::createLoopRotatePass() { return new LoopRotate(); } + +/// Rotate Loop L as many times as possible. Return true if +/// the loop is rotated at least once. +bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { + LI = &getAnalysis<LoopInfo>(); + + // One loop can be rotated multiple times. + bool MadeChange = false; + while (rotateLoop(L)) + MadeChange = true; + + return MadeChange; +} + +/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the +/// old header into the preheader. If there were uses of the values produced by +/// these instruction that were outside of the loop, we have to insert PHI nodes +/// to merge the two values. Do this now. +static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, + BasicBlock *OrigPreheader, + ValueToValueMapTy &ValueMap) { + // Remove PHI node entries that are no longer live. + BasicBlock::iterator I, E = OrigHeader->end(); + for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) + PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader)); + + // Now fix up users of the instructions in OrigHeader, inserting PHI nodes + // as necessary. 
+  // Note: I and E (== OrigHeader->end()) were declared above, where the dead
+  // PHI-node entries were removed.
+  SSAUpdater SSA;
+  for (I = OrigHeader->begin(); I != E; ++I) {
+    Value *OrigHeaderVal = I;
+
+    // If there are no uses of the value (e.g. because it returns void), there
+    // is nothing to rewrite.
+    if (OrigHeaderVal->use_empty())
+      continue;
+
+    Value *OrigPreHeaderVal = ValueMap[OrigHeaderVal];
+
+    // The value now exists in two versions: the initial value in the preheader
+    // and the loop "next" value in the original header.
+    SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
+    SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
+    SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
+
+    // Visit each use of the OrigHeader instruction.
+    for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
+         UE = OrigHeaderVal->use_end(); UI != UE; ) {
+      // Grab the use before incrementing the iterator.
+      Use &U = UI.getUse();
+
+      // Increment the iterator before removing the use from the list.
+      ++UI;
+
+      // SSAUpdater can't handle a non-PHI use in the same block as an
+      // earlier def.  We can easily handle those cases manually.
+      Instruction *UserInst = cast<Instruction>(U.getUser());
+      if (!isa<PHINode>(UserInst)) {
+        BasicBlock *UserBB = UserInst->getParent();
+
+        // The original users in the OrigHeader are already using the
+        // original definitions.
+        if (UserBB == OrigHeader)
+          continue;
+
+        // Users in the OrigPreHeader need to use the value to which the
+        // original definitions are mapped.
+        if (UserBB == OrigPreheader) {
+          U = OrigPreHeaderVal;
+          continue;
+        }
+      }
+
+      // Anything else can be handled by SSAUpdater.
+      SSA.RewriteUse(U);
+    }
+  }
+}
+
+/// Rotate loop L. Return true if the loop is rotated.
+bool LoopRotate::rotateLoop(Loop *L) {
+  // If the loop has only one block then there is not much to rotate.
+ if (L->getBlocks().size() == 1) + return false; + + BasicBlock *OrigHeader = L->getHeader(); + + BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); + if (BI == 0 || BI->isUnconditional()) + return false; + + // If the loop header is not one of the loop exiting blocks then + // either this loop is already rotated or it is not + // suitable for loop rotation transformations. + if (!L->isLoopExiting(OrigHeader)) + return false; + + // Updating PHInodes in loops with multiple exits adds complexity. + // Keep it simple, and restrict loop rotation to loops with one exit only. + // In future, lift this restriction and support for multiple exits if + // required. + SmallVector<BasicBlock*, 8> ExitBlocks; + L->getExitBlocks(ExitBlocks); + if (ExitBlocks.size() > 1) + return false; + + // Check size of original header and reject loop if it is very big. + { + CodeMetrics Metrics; + Metrics.analyzeBasicBlock(OrigHeader); + if (Metrics.NumInsts > MAX_HEADER_SIZE) + return false; + } + + // Now, this loop is suitable for rotation. + BasicBlock *OrigPreheader = L->getLoopPreheader(); + BasicBlock *OrigLatch = L->getLoopLatch(); + assert(OrigPreheader && OrigLatch && "Loop not in canonical form?"); + + // Anything ScalarEvolution may know about this loop or the PHI nodes + // in its header will soon be invalidated. + if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) + SE->forgetLoop(L); + + // Find new Loop header. NewHeader is a Header's one and only successor + // that is inside loop. Header's other successor is outside the + // loop. Otherwise loop is not suitable for rotation. 
+ BasicBlock *Exit = BI->getSuccessor(0); + BasicBlock *NewHeader = BI->getSuccessor(1); + if (L->contains(Exit)) + std::swap(Exit, NewHeader); + assert(NewHeader && "Unable to determine new loop header"); + assert(L->contains(NewHeader) && !L->contains(Exit) && + "Unable to determine loop header and exit blocks"); + + // This code assumes that the new header has exactly one predecessor. + // Remove any single-entry PHI nodes in it. + assert(NewHeader->getSinglePredecessor() && + "New header doesn't have one pred!"); + FoldSingleEntryPHINodes(NewHeader); + + // Begin by walking OrigHeader and populating ValueMap with an entry for + // each Instruction. + BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); + ValueToValueMapTy ValueMap; + + // For PHI nodes, the value available in OldPreHeader is just the + // incoming value from OldPreHeader. + for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) + ValueMap[PN] = PN->getIncomingValue(PN->getBasicBlockIndex(OrigPreheader)); + + // For the rest of the instructions, either hoist to the OrigPreheader if + // possible or create a clone in the OldPreHeader if not. + TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); + while (I != E) { + Instruction *Inst = I++; + + // If the instruction's operands are invariant and it doesn't read or write + // memory, then it is safe to hoist. Doing this doesn't change the order of + // execution in the preheader, but does prevent the instruction from + // executing in each iteration of the loop. This means it is safe to hoist + // something that might trap, but isn't safe to hoist something that reads + // memory (without proving that the loop doesn't write). + if (L->hasLoopInvariantOperands(Inst) && + !Inst->mayReadFromMemory() && !Inst->mayWriteToMemory() && + !isa<TerminatorInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst)) { + Inst->moveBefore(LoopEntryBranch); + continue; + } + + // Otherwise, create a duplicate of the instruction. 
+ Instruction *C = Inst->clone(); + + // Eagerly remap the operands of the instruction. + RemapInstruction(C, ValueMap, + RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); + + // With the operands remapped, see if the instruction constant folds or is + // otherwise simplifyable. This commonly occurs because the entry from PHI + // nodes allows icmps and other instructions to fold. + Value *V = SimplifyInstruction(C); + if (V && LI->replacementPreservesLCSSAForm(C, V)) { + // If so, then delete the temporary instruction and stick the folded value + // in the map. + delete C; + ValueMap[Inst] = V; + } else { + // Otherwise, stick the new instruction into the new block! + C->setName(Inst->getName()); + C->insertBefore(LoopEntryBranch); + ValueMap[Inst] = C; + } + } + + // Along with all the other instructions, we just cloned OrigHeader's + // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's + // successors by duplicating their incoming values for OrigHeader. + TerminatorInst *TI = OrigHeader->getTerminator(); + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) + for (BasicBlock::iterator BI = TI->getSuccessor(i)->begin(); + PHINode *PN = dyn_cast<PHINode>(BI); ++BI) + PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); + + // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove + // OrigPreHeader's old terminator (the original branch into the loop), and + // remove the corresponding incoming values from the PHI nodes in OrigHeader. + LoopEntryBranch->eraseFromParent(); + + // If there were any uses of instructions in the duplicated block outside the + // loop, update them, inserting PHI nodes as required + RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap); + + // NewHeader is now the header of the loop. + L->moveToHeader(NewHeader); + assert(L->getHeader() == NewHeader && "Latch block is our new header"); + + + // At this point, we've finished our major CFG changes. 
As part of cloning + // the loop into the preheader we've simplified instructions and the + // duplicated conditional branch may now be branching on a constant. If it is + // branching on a constant and if that constant means that we enter the loop, + // then we fold away the cond branch to an uncond branch. This simplifies the + // loop in cases important for nested loops, and it also means we don't have + // to split as many edges. + BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); + assert(PHBI->isConditional() && "Should be clone of BI condbr!"); + if (!isa<ConstantInt>(PHBI->getCondition()) || + PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) + != NewHeader) { + // The conditional branch can't be folded, handle the general case. + // Update DominatorTree to reflect the CFG change we just made. Then split + // edges as necessary to preserve LoopSimplify form. + if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) { + // Since OrigPreheader now has the conditional branch to Exit block, it is + // the dominator of Exit. + DT->changeImmediateDominator(Exit, OrigPreheader); + DT->changeImmediateDominator(NewHeader, OrigPreheader); + + // Update OrigHeader to be dominated by the new header block. + DT->changeImmediateDominator(OrigHeader, OrigLatch); + } + + // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and + // thus is not a preheader anymore. Split the edge to form a real preheader. + BasicBlock *NewPH = SplitCriticalEdge(OrigPreheader, NewHeader, this); + NewPH->setName(NewHeader->getName() + ".lr.ph"); + + // Preserve canonical loop form, which means that 'Exit' should have only one + // predecessor. + BasicBlock *ExitSplit = SplitCriticalEdge(L->getLoopLatch(), Exit, this); + ExitSplit->moveBefore(Exit); + } else { + // We can fold the conditional branch in the preheader, this makes things + // simpler. The first step is to remove the extra edge to the Exit block. 
+ Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); + BranchInst::Create(NewHeader, PHBI); + PHBI->eraseFromParent(); + + // With our CFG finalized, update DomTree if it is available. + if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) { + // Update OrigHeader to be dominated by the new header block. + DT->changeImmediateDominator(NewHeader, OrigPreheader); + DT->changeImmediateDominator(OrigHeader, OrigLatch); + } + } + + assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); + assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); + + // Now that the CFG and DomTree are in a consistent state again, try to merge + // the OrigHeader block into OrigLatch. This will succeed if they are + // connected by an unconditional branch. This is just a cleanup so the + // emitted code isn't too gross in this common case. + MergeBlockIntoPredecessor(OrigHeader, this); + + ++NumRotated; + return true; +} + diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp new file mode 100644 index 0000000..ac4aea2 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -0,0 +1,3845 @@ +//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This transformation analyzes and transforms the induction variables (and +// computations derived from them) into forms suitable for efficient execution +// on the target. 
+// +// This pass performs a strength reduction on array references inside loops that +// have as one or more of their components the loop induction variable, it +// rewrites expressions to take advantage of scaled-index addressing modes +// available on the target, and it performs a variety of other optimizations +// related to loop induction variables. +// +// Terminology note: this code has a lot of handling for "post-increment" or +// "post-inc" users. This is not talking about post-increment addressing modes; +// it is instead talking about code like this: +// +// %i = phi [ 0, %entry ], [ %i.next, %latch ] +// ... +// %i.next = add %i, 1 +// %c = icmp eq %i.next, %n +// +// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however +// it's useful to think about these as the same register, with some uses using +// the value of the register before the add and some using // it after. In this +// example, the icmp is a post-increment user, since it uses %i.next, which is +// the value of the induction variable after the increment. The other common +// case of post-increment users is users outside the loop. +// +// TODO: More sophistication in the way Formulae are generated and filtered. +// +// TODO: Handle multiple loops at a time. +// +// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr +// instead of a GlobalValue? +// +// TODO: When truncation is free, truncate ICmp users' operands to make it a +// smaller encoding (on x86 at least). +// +// TODO: When a negated register is used by an add (such as in a list of +// multiple base registers, or as the increment expression in an addrec), +// we may not actually need both reg and (-1 * reg) in registers; the +// negation can be implemented by using a sub instead of an add. The +// lack of support for taking this into consideration when making +// register pressure decisions is partly worked around by the "Special" +// use kind. 
// 
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "loop-reduce"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Constants.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include <algorithm>
using namespace llvm;

namespace {

/// RegSortData - This class holds data which is used to order reuse candidates.
class RegSortData {
public:
  /// UsedByIndices - This represents the set of LSRUse indices which reference
  /// a particular register.
  SmallBitVector UsedByIndices;

  RegSortData() {}

  void print(raw_ostream &OS) const;
  void dump() const;
};

}

// Print a short summary of this entry: how many uses reference the register.
void RegSortData::print(raw_ostream &OS) const {
  OS << "[NumUses=" << UsedByIndices.count() << ']';
}

// Debugging helper; writes to stderr.
void RegSortData::dump() const {
  print(errs()); errs() << '\n';
}

namespace {

/// RegUseTracker - Map register candidates to information about how they are
/// used.
class RegUseTracker {
  typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;

  // RegUsesMap holds the per-register data; RegSequence remembers the order
  // in which registers were first seen, so iteration is deterministic.
  RegUsesTy RegUsesMap;
  SmallVector<const SCEV *, 16> RegSequence;

public:
  void CountRegister(const SCEV *Reg, size_t LUIdx);
  void DropRegister(const SCEV *Reg, size_t LUIdx);
  void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx);

  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;

  const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;

  void clear();

  typedef SmallVectorImpl<const SCEV *>::iterator iterator;
  typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
  iterator begin() { return RegSequence.begin(); }
  iterator end() { return RegSequence.end(); }
  const_iterator begin() const { return RegSequence.begin(); }
  const_iterator end() const { return RegSequence.end(); }
};

}

/// CountRegister - Record that the use with index LUIdx references Reg,
/// creating a new entry (and appending Reg to RegSequence) on first sight.
void
RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) {
  std::pair<RegUsesTy::iterator, bool> Pair =
    RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
  RegSortData &RSD = Pair.first->second;
  if (Pair.second)
    RegSequence.push_back(Reg);
  // Grow the bit vector as needed so that bit LUIdx exists, then set it.
  RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
  RSD.UsedByIndices.set(LUIdx);
}

/// DropRegister - Record that the use with index LUIdx no longer references
/// Reg. The register must already be known.
void
RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
  RegUsesTy::iterator It = RegUsesMap.find(Reg);
  assert(It != RegUsesMap.end());
  RegSortData &RSD = It->second;
  assert(RSD.UsedByIndices.size() > LUIdx);
  RSD.UsedByIndices.reset(LUIdx);
}

/// SwapAndDropUse - Use LastLUIdx is being folded into (moved onto) LUIdx;
/// rewrite every register's use-index bit vector accordingly and drop the
/// last slot.
void
RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
  assert(LUIdx <= LastLUIdx);

  // Update RegUses. The data structure is not optimized for this purpose;
  // we must iterate through it and update each of the bit vectors.
  for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
       I != E; ++I) {
    SmallBitVector &UsedByIndices = I->second.UsedByIndices;
    if (LUIdx < UsedByIndices.size())
      UsedByIndices[LUIdx] =
        LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
    UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
  }
}

/// isRegUsedByUsesOtherThan - Return true if some use other than LUIdx
/// references Reg.
bool
RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
  if (I == RegUsesMap.end())
    return false;
  const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
  int i = UsedByIndices.find_first();
  if (i == -1) return false;
  if ((size_t)i != LUIdx) return true;
  // The first set bit is LUIdx itself; any further bit is another use.
  return UsedByIndices.find_next(i) != -1;
}

const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
  assert(I != RegUsesMap.end() && "Unknown register!");
  return I->second.UsedByIndices;
}

void RegUseTracker::clear() {
  RegUsesMap.clear();
  RegSequence.clear();
}

namespace {

/// Formula - This class holds information that describes a formula for
/// computing a value that satisfies a use. It may include broken-out
/// immediates and scaled registers.
struct Formula {
  /// AM - This is used to represent complex addressing, as well as other kinds
  /// of interesting uses.
  TargetLowering::AddrMode AM;

  /// BaseRegs - The list of "base" registers for this use. When this is
  /// non-empty, AM.HasBaseReg should be set to true.
  SmallVector<const SCEV *, 2> BaseRegs;

  /// ScaledReg - The 'scaled' register for this use. This should be non-null
  /// when AM.Scale is not zero.
  const SCEV *ScaledReg;

  Formula() : ScaledReg(0) {}

  void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);

  unsigned getNumRegs() const;
  const Type *getType() const;

  void DeleteBaseReg(const SCEV *&S);

  bool referencesReg(const SCEV *S) const;
  bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
                                  const RegUseTracker &RegUses) const;

  void print(raw_ostream &OS) const;
  void dump() const;
};

}

/// DoInitialMatch - Recursion helper for InitialMatch.
/// Splits S into loop-invariant parts (Good) and everything else (Bad),
/// recursing through adds, addrecs, and negations.
static void DoInitialMatch(const SCEV *S, Loop *L,
                           SmallVectorImpl<const SCEV *> &Good,
                           SmallVectorImpl<const SCEV *> &Bad,
                           ScalarEvolution &SE) {
  // Collect expressions which properly dominate the loop header.
  if (SE.properlyDominates(S, L->getHeader())) {
    Good.push_back(S);
    return;
  }

  // Look at add operands.
  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
         I != E; ++I)
      DoInitialMatch(*I, L, Good, Bad, SE);
    return;
  }

  // Look at addrec operands. Split {Start,+,Step} into Start plus
  // {0,+,Step} and classify each piece separately.
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
    if (!AR->getStart()->isZero()) {
      DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
      DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
                                      AR->getStepRecurrence(SE),
                                      AR->getLoop()),
                     L, Good, Bad, SE);
      return;
    }

  // Handle a multiplication by -1 (negation) if it didn't fold.
  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
    if (Mul->getOperand(0)->isAllOnesValue()) {
      // Recurse on the multiplication with the -1 factor stripped, then
      // re-apply the negation to each resulting piece.
      SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
      const SCEV *NewMul = SE.getMulExpr(Ops);

      SmallVector<const SCEV *, 4> MyGood;
      SmallVector<const SCEV *, 4> MyBad;
      DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
      const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
        SE.getEffectiveSCEVType(NewMul->getType())));
      for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
           E = MyGood.end(); I != E; ++I)
        Good.push_back(SE.getMulExpr(NegOne, *I));
      for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
           E = MyBad.end(); I != E; ++I)
        Bad.push_back(SE.getMulExpr(NegOne, *I));
      return;
    }

  // Ok, we can't do anything interesting. Just stuff the whole thing into a
  // register and hope for the best.
  Bad.push_back(S);
}

/// InitialMatch - Incorporate loop-variant parts of S into this Formula,
/// attempting to keep all loop-invariant and loop-computable values in a
/// single base register.
void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
  SmallVector<const SCEV *, 4> Good;
  SmallVector<const SCEV *, 4> Bad;
  DoInitialMatch(S, L, Good, Bad, SE);
  if (!Good.empty()) {
    const SCEV *Sum = SE.getAddExpr(Good);
    if (!Sum->isZero())
      BaseRegs.push_back(Sum);
    AM.HasBaseReg = true;
  }
  if (!Bad.empty()) {
    const SCEV *Sum = SE.getAddExpr(Bad);
    if (!Sum->isZero())
      BaseRegs.push_back(Sum);
    AM.HasBaseReg = true;
  }
}

/// getNumRegs - Return the total number of register operands used by this
/// formula. This does not include register uses implied by non-constant
/// addrec strides.
unsigned Formula::getNumRegs() const {
  return !!ScaledReg + BaseRegs.size();
}

/// getType - Return the type of this formula, if it has one, or null
/// otherwise. This type is meaningless except for the bit size.
const Type *Formula::getType() const {
  return !BaseRegs.empty() ? BaseRegs.front()->getType() :
         ScaledReg ? ScaledReg->getType() :
         AM.BaseGV ? AM.BaseGV->getType() :
         0;
}

/// DeleteBaseReg - Delete the given base reg from the BaseRegs list.
/// Swap-and-pop: order of BaseRegs is not preserved.
void Formula::DeleteBaseReg(const SCEV *&S) {
  if (&S != &BaseRegs.back())
    std::swap(S, BaseRegs.back());
  BaseRegs.pop_back();
}

/// referencesReg - Test if this formula references the given register.
bool Formula::referencesReg(const SCEV *S) const {
  return S == ScaledReg ||
         std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
}

/// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers
/// which are used by uses other than the use with the given index.
bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
                                         const RegUseTracker &RegUses) const {
  if (ScaledReg)
    if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
      return true;
  for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
       E = BaseRegs.end(); I != E; ++I)
    if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx))
      return true;
  return false;
}

// Print the formula as a sum of its symbolic base, immediate offset, base
// registers, and scaled register; emits **error** markers when the AM flags
// are inconsistent with the register lists.
void Formula::print(raw_ostream &OS) const {
  bool First = true;
  if (AM.BaseGV) {
    if (!First) OS << " + "; else First = false;
    WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false);
  }
  if (AM.BaseOffs != 0) {
    if (!First) OS << " + "; else First = false;
    OS << AM.BaseOffs;
  }
  for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
       E = BaseRegs.end(); I != E; ++I) {
    if (!First) OS << " + "; else First = false;
    OS << "reg(" << **I << ')';
  }
  if (AM.HasBaseReg && BaseRegs.empty()) {
    if (!First) OS << " + "; else First = false;
    OS << "**error: HasBaseReg**";
  } else if (!AM.HasBaseReg && !BaseRegs.empty()) {
    if (!First) OS << " + "; else First = false;
    OS << "**error: !HasBaseReg**";
  }
  if (AM.Scale != 0) {
    if (!First) OS << " + "; else First = false;
    OS << AM.Scale << "*reg(";
    if (ScaledReg)
      OS << *ScaledReg;
    else
      OS << "<unknown>";
    OS << ')';
  }
}

void Formula::dump() const {
  print(errs()); errs() << '\n';
}

/// isAddRecSExtable - Return true if the given addrec can be sign-extended
/// without changing its value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
  // Extend by one bit; if the extension folds back into an addrec, no bits
  // were lost.
  const Type *WideTy =
    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
}

/// isAddSExtable - Return true if the given add can be sign-extended
/// without changing its value.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
  const Type *WideTy =
    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
  return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
}

/// isMulSExtable - Return true if the given mul can be sign-extended
/// without changing its value.
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
  // A product of k n-bit values fits in k*n bits.
  const Type *WideTy =
    IntegerType::get(SE.getContext(),
                     SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
  return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
}

/// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
/// and if the remainder is known to be zero, or null otherwise. If
/// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
/// to Y, ignoring that the multiplication may overflow, which is useful when
/// the result will be used in a context where the most significant bits are
/// ignored.
static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
                                ScalarEvolution &SE,
                                bool IgnoreSignificantBits = false) {
  // Handle the trivial case, which works for any SCEV type.
  if (LHS == RHS)
    return SE.getConstant(LHS->getType(), 1);

  // Handle a few RHS special cases.
  const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
  if (RC) {
    const APInt &RA = RC->getValue()->getValue();
    // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
    // some folding.
    if (RA.isAllOnesValue())
      return SE.getMulExpr(LHS, RC);
    // Handle x /s 1 as x.
    if (RA == 1)
      return LHS;
  }

  // Check for a division of a constant by a constant.
  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
    if (!RC)
      return 0;
    const APInt &LA = C->getValue()->getValue();
    const APInt &RA = RC->getValue()->getValue();
    if (LA.srem(RA) != 0)
      return 0;
    return SE.getConstant(LA.sdiv(RA));
  }

  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
    if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
      const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
                                      IgnoreSignificantBits);
      if (!Step) return 0;
      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
                                       IgnoreSignificantBits);
      if (!Start) return 0;
      return SE.getAddRecExpr(Start, Step, AR->getLoop());
    }
    return 0;
  }

  // Distribute the sdiv over add operands, if the add doesn't overflow.
  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
    if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
      SmallVector<const SCEV *, 8> Ops;
      for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
           I != E; ++I) {
        const SCEV *Op = getExactSDiv(*I, RHS, SE,
                                      IgnoreSignificantBits);
        if (!Op) return 0;
        Ops.push_back(Op);
      }
      return SE.getAddExpr(Ops);
    }
    return 0;
  }

  // Check for a multiply operand that we can pull RHS out of.
  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
    if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
      SmallVector<const SCEV *, 4> Ops;
      bool Found = false;
      // Divide exactly one factor by RHS; the remaining factors are kept.
      for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
           I != E; ++I) {
        const SCEV *S = *I;
        if (!Found)
          if (const SCEV *Q = getExactSDiv(S, RHS, SE,
                                           IgnoreSignificantBits)) {
            S = Q;
            Found = true;
          }
        Ops.push_back(S);
      }
      return Found ? SE.getMulExpr(Ops) : 0;
    }
    return 0;
  }

  // Otherwise we don't know.
  return 0;
}

/// ExtractImmediate - If S involves the addition of a constant integer value,
/// return that integer value, and mutate S to point to a new SCEV with that
/// value excluded.
static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
    if (C->getValue()->getValue().getMinSignedBits() <= 64) {
      S = SE.getConstant(C->getType(), 0);
      return C->getValue()->getSExtValue();
    }
  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    // Constants sort first in SCEV add expressions, so look at the front.
    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
    int64_t Result = ExtractImmediate(NewOps.front(), SE);
    if (Result != 0)
      S = SE.getAddExpr(NewOps);
    return Result;
  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    // Recurse into the addrec's start value.
    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
    int64_t Result = ExtractImmediate(NewOps.front(), SE);
    if (Result != 0)
      S = SE.getAddRecExpr(NewOps, AR->getLoop());
    return Result;
  }
  return 0;
}

/// ExtractSymbol - If S involves the addition of a GlobalValue address,
/// return that symbol, and mutate S to point to a new SCEV with that
/// value excluded.
+static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { + if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { + if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) { + S = SE.getConstant(GV->getType(), 0); + return GV; + } + } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end()); + GlobalValue *Result = ExtractSymbol(NewOps.back(), SE); + if (Result) + S = SE.getAddExpr(NewOps); + return Result; + } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end()); + GlobalValue *Result = ExtractSymbol(NewOps.front(), SE); + if (Result) + S = SE.getAddRecExpr(NewOps, AR->getLoop()); + return Result; + } + return 0; +} + +/// isAddressUse - Returns true if the specified instruction is using the +/// specified value as an address. +static bool isAddressUse(Instruction *Inst, Value *OperandVal) { + bool isAddress = isa<LoadInst>(Inst); + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (SI->getOperand(1) == OperandVal) + isAddress = true; + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { + // Addressing modes can also be folded into prefetches and a variety + // of intrinsics. + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::prefetch: + case Intrinsic::x86_sse2_loadu_dq: + case Intrinsic::x86_sse2_loadu_pd: + case Intrinsic::x86_sse_loadu_ps: + case Intrinsic::x86_sse_storeu_ps: + case Intrinsic::x86_sse2_storeu_pd: + case Intrinsic::x86_sse2_storeu_dq: + case Intrinsic::x86_sse2_storel_dq: + if (II->getArgOperand(0) == OperandVal) + isAddress = true; + break; + } + } + return isAddress; +} + +/// getAccessType - Return the type of the memory being accessed. 
static const Type *getAccessType(const Instruction *Inst) {
  const Type *AccessTy = Inst->getType();
  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
    // For a store, the accessed type is the type of the stored value.
    AccessTy = SI->getOperand(0)->getType();
  else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    // Addressing modes can also be folded into prefetches and a variety
    // of intrinsics.
    switch (II->getIntrinsicID()) {
    default: break;
    case Intrinsic::x86_sse_storeu_ps:
    case Intrinsic::x86_sse2_storeu_pd:
    case Intrinsic::x86_sse2_storeu_dq:
    case Intrinsic::x86_sse2_storel_dq:
      AccessTy = II->getArgOperand(0)->getType();
      break;
    }
  }

  // All pointers have the same requirements, so canonicalize them to an
  // arbitrary pointer type to minimize variation.
  if (const PointerType *PTy = dyn_cast<PointerType>(AccessTy))
    AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
                                PTy->getAddressSpace());

  return AccessTy;
}

/// DeleteTriviallyDeadInstructions - If any of the instructions in the
/// specified set are trivially dead, delete them and see if this makes any of
/// their operands subsequently dead.
static bool
DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
  bool Changed = false;

  while (!DeadInsts.empty()) {
    // WeakVH may have been nulled out if the instruction was already deleted.
    Instruction *I = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val());

    if (I == 0 || !isInstructionTriviallyDead(I))
      continue;

    // Drop operands first so that newly-dead operands get queued too.
    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
        *OI = 0;
        if (U->use_empty())
          DeadInsts.push_back(U);
      }

    I->eraseFromParent();
    Changed = true;
  }

  return Changed;
}

namespace {

/// Cost - This class is used to measure and compare candidate formulae.
class Cost {
  /// TODO: Some of these could be merged. Also, a lexical ordering
  /// isn't always optimal.
+ unsigned NumRegs; + unsigned AddRecCost; + unsigned NumIVMuls; + unsigned NumBaseAdds; + unsigned ImmCost; + unsigned SetupCost; + +public: + Cost() + : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0), + SetupCost(0) {} + + bool operator<(const Cost &Other) const; + + void Loose(); + + void RateFormula(const Formula &F, + SmallPtrSet<const SCEV *, 16> &Regs, + const DenseSet<const SCEV *> &VisitedRegs, + const Loop *L, + const SmallVectorImpl<int64_t> &Offsets, + ScalarEvolution &SE, DominatorTree &DT); + + void print(raw_ostream &OS) const; + void dump() const; + +private: + void RateRegister(const SCEV *Reg, + SmallPtrSet<const SCEV *, 16> &Regs, + const Loop *L, + ScalarEvolution &SE, DominatorTree &DT); + void RatePrimaryRegister(const SCEV *Reg, + SmallPtrSet<const SCEV *, 16> &Regs, + const Loop *L, + ScalarEvolution &SE, DominatorTree &DT); +}; + +} + +/// RateRegister - Tally up interesting quantities from the given register. +void Cost::RateRegister(const SCEV *Reg, + SmallPtrSet<const SCEV *, 16> &Regs, + const Loop *L, + ScalarEvolution &SE, DominatorTree &DT) { + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) { + if (AR->getLoop() == L) + AddRecCost += 1; /// TODO: This should be a function of the stride. + + // If this is an addrec for a loop that's already been visited by LSR, + // don't second-guess its addrec phi nodes. LSR isn't currently smart + // enough to reason about more than one loop at a time. Consider these + // registers free and leave them alone. 
+ else if (L->contains(AR->getLoop()) || + (!AR->getLoop()->contains(L) && + DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) { + for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); + PHINode *PN = dyn_cast<PHINode>(I); ++I) + if (SE.isSCEVable(PN->getType()) && + (SE.getEffectiveSCEVType(PN->getType()) == + SE.getEffectiveSCEVType(AR->getType())) && + SE.getSCEV(PN) == AR) + return; + + // If this isn't one of the addrecs that the loop already has, it + // would require a costly new phi and add. TODO: This isn't + // precisely modeled right now. + ++NumBaseAdds; + if (!Regs.count(AR->getStart())) + RateRegister(AR->getStart(), Regs, L, SE, DT); + } + + // Add the step value register, if it needs one. + // TODO: The non-affine case isn't precisely modeled here. + if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) + if (!Regs.count(AR->getStart())) + RateRegister(AR->getOperand(1), Regs, L, SE, DT); + } + ++NumRegs; + + // Rough heuristic; favor registers which don't require extra setup + // instructions in the preheader. + if (!isa<SCEVUnknown>(Reg) && + !isa<SCEVConstant>(Reg) && + !(isa<SCEVAddRecExpr>(Reg) && + (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) || + isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart())))) + ++SetupCost; + + NumIVMuls += isa<SCEVMulExpr>(Reg) && + SE.hasComputableLoopEvolution(Reg, L); +} + +/// RatePrimaryRegister - Record this register in the set. If we haven't seen it +/// before, rate it. +void Cost::RatePrimaryRegister(const SCEV *Reg, + SmallPtrSet<const SCEV *, 16> &Regs, + const Loop *L, + ScalarEvolution &SE, DominatorTree &DT) { + if (Regs.insert(Reg)) + RateRegister(Reg, Regs, L, SE, DT); +} + +void Cost::RateFormula(const Formula &F, + SmallPtrSet<const SCEV *, 16> &Regs, + const DenseSet<const SCEV *> &VisitedRegs, + const Loop *L, + const SmallVectorImpl<int64_t> &Offsets, + ScalarEvolution &SE, DominatorTree &DT) { + // Tally up the registers. 
+ if (const SCEV *ScaledReg = F.ScaledReg) { + if (VisitedRegs.count(ScaledReg)) { + Loose(); + return; + } + RatePrimaryRegister(ScaledReg, Regs, L, SE, DT); + } + for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), + E = F.BaseRegs.end(); I != E; ++I) { + const SCEV *BaseReg = *I; + if (VisitedRegs.count(BaseReg)) { + Loose(); + return; + } + RatePrimaryRegister(BaseReg, Regs, L, SE, DT); + } + + if (F.BaseRegs.size() > 1) + NumBaseAdds += F.BaseRegs.size() - 1; + + // Tally up the non-zero immediates. + for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), + E = Offsets.end(); I != E; ++I) { + int64_t Offset = (uint64_t)*I + F.AM.BaseOffs; + if (F.AM.BaseGV) + ImmCost += 64; // Handle symbolic values conservatively. + // TODO: This should probably be the pointer size. + else if (Offset != 0) + ImmCost += APInt(64, Offset, true).getMinSignedBits(); + } +} + +/// Loose - Set this cost to a loosing value. +void Cost::Loose() { + NumRegs = ~0u; + AddRecCost = ~0u; + NumIVMuls = ~0u; + NumBaseAdds = ~0u; + ImmCost = ~0u; + SetupCost = ~0u; +} + +/// operator< - Choose the lower cost. +bool Cost::operator<(const Cost &Other) const { + if (NumRegs != Other.NumRegs) + return NumRegs < Other.NumRegs; + if (AddRecCost != Other.AddRecCost) + return AddRecCost < Other.AddRecCost; + if (NumIVMuls != Other.NumIVMuls) + return NumIVMuls < Other.NumIVMuls; + if (NumBaseAdds != Other.NumBaseAdds) + return NumBaseAdds < Other.NumBaseAdds; + if (ImmCost != Other.ImmCost) + return ImmCost < Other.ImmCost; + if (SetupCost != Other.SetupCost) + return SetupCost < Other.SetupCost; + return false; +} + +void Cost::print(raw_ostream &OS) const { + OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s"); + if (AddRecCost != 0) + OS << ", with addrec cost " << AddRecCost; + if (NumIVMuls != 0) + OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? 
"" : "s"); + if (NumBaseAdds != 0) + OS << ", plus " << NumBaseAdds << " base add" + << (NumBaseAdds == 1 ? "" : "s"); + if (ImmCost != 0) + OS << ", plus " << ImmCost << " imm cost"; + if (SetupCost != 0) + OS << ", plus " << SetupCost << " setup cost"; +} + +void Cost::dump() const { + print(errs()); errs() << '\n'; +} + +namespace { + +/// LSRFixup - An operand value in an instruction which is to be replaced +/// with some equivalent, possibly strength-reduced, replacement. +struct LSRFixup { + /// UserInst - The instruction which will be updated. + Instruction *UserInst; + + /// OperandValToReplace - The operand of the instruction which will + /// be replaced. The operand may be used more than once; every instance + /// will be replaced. + Value *OperandValToReplace; + + /// PostIncLoops - If this user is to use the post-incremented value of an + /// induction variable, this variable is non-null and holds the loop + /// associated with the induction variable. + PostIncLoopSet PostIncLoops; + + /// LUIdx - The index of the LSRUse describing the expression which + /// this fixup needs, minus an offset (below). + size_t LUIdx; + + /// Offset - A constant offset to be added to the LSRUse expression. + /// This allows multiple fixups to share the same LSRUse with different + /// offsets, for example in an unrolled loop. + int64_t Offset; + + bool isUseFullyOutsideLoop(const Loop *L) const; + + LSRFixup(); + + void print(raw_ostream &OS) const; + void dump() const; +}; + +} + +LSRFixup::LSRFixup() + : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {} + +/// isUseFullyOutsideLoop - Test whether this fixup always uses its +/// value outside of the given loop. +bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { + // PHI nodes use their value in their incoming blocks. 
  // A PHI's use of a value occurs in the incoming block, not the PHI's own
  // block; check each incoming edge that carries this value.
  if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
      if (PN->getIncomingValue(i) == OperandValToReplace &&
          L->contains(PN->getIncomingBlock(i)))
        return false;
    return true;
  }

  return !L->contains(UserInst);
}

void LSRFixup::print(raw_ostream &OS) const {
  OS << "UserInst=";
  // Store is common and interesting enough to be worth special-casing.
  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
    OS << "store ";
    WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
  } else if (UserInst->getType()->isVoidTy())
    OS << UserInst->getOpcodeName();
  else
    WriteAsOperand(OS, UserInst, /*PrintType=*/false);

  OS << ", OperandValToReplace=";
  WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);

  for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(),
       E = PostIncLoops.end(); I != E; ++I) {
    OS << ", PostIncLoop=";
    WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false);
  }

  if (LUIdx != ~size_t(0))
    OS << ", LUIdx=" << LUIdx;

  if (Offset != 0)
    OS << ", Offset=" << Offset;
}

void LSRFixup::dump() const {
  print(errs()); errs() << '\n';
}

namespace {

/// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding
/// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*.
+struct UniquifierDenseMapInfo { + static SmallVector<const SCEV *, 2> getEmptyKey() { + SmallVector<const SCEV *, 2> V; + V.push_back(reinterpret_cast<const SCEV *>(-1)); + return V; + } + + static SmallVector<const SCEV *, 2> getTombstoneKey() { + SmallVector<const SCEV *, 2> V; + V.push_back(reinterpret_cast<const SCEV *>(-2)); + return V; + } + + static unsigned getHashValue(const SmallVector<const SCEV *, 2> &V) { + unsigned Result = 0; + for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(), + E = V.end(); I != E; ++I) + Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I); + return Result; + } + + static bool isEqual(const SmallVector<const SCEV *, 2> &LHS, + const SmallVector<const SCEV *, 2> &RHS) { + return LHS == RHS; + } +}; + +/// LSRUse - This class holds the state that LSR keeps for each use in +/// IVUsers, as well as uses invented by LSR itself. It includes information +/// about what kinds of things can be folded into the user, information about +/// the user itself, and information about how the use may be satisfied. +/// TODO: Represent multiple users of the same expression in common? +class LSRUse { + DenseSet<SmallVector<const SCEV *, 2>, UniquifierDenseMapInfo> Uniquifier; + +public: + /// KindType - An enum for a kind of use, indicating what types of + /// scaled and immediate operands it might support. + enum KindType { + Basic, ///< A normal use, with no folding. + Special, ///< A special case of basic, allowing -1 scales. + Address, ///< An address use; folding according to TargetLowering + ICmpZero ///< An equality icmp with both operands folded into one. + // TODO: Add a generic icmp too? + }; + + KindType Kind; + const Type *AccessTy; + + SmallVector<int64_t, 8> Offsets; + int64_t MinOffset; + int64_t MaxOffset; + + /// AllFixupsOutsideLoop - This records whether all of the fixups using this + /// LSRUse are outside of the loop, in which case some special-case heuristics + /// may be used. 
+ bool AllFixupsOutsideLoop; + + /// WidestFixupType - This records the widest use type for any fixup using + /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different + /// max fixup widths to be equivalent, because the narrower one may be relying + /// on the implicit truncation to truncate away bogus bits. + const Type *WidestFixupType; + + /// Formulae - A list of ways to build a value that can satisfy this user. + /// After the list is populated, one of these is selected heuristically and + /// used to formulate a replacement for OperandValToReplace in UserInst. + SmallVector<Formula, 12> Formulae; + + /// Regs - The set of register candidates used by all formulae in this LSRUse. + SmallPtrSet<const SCEV *, 4> Regs; + + LSRUse(KindType K, const Type *T) : Kind(K), AccessTy(T), + MinOffset(INT64_MAX), + MaxOffset(INT64_MIN), + AllFixupsOutsideLoop(true), + WidestFixupType(0) {} + + bool HasFormulaWithSameRegs(const Formula &F) const; + bool InsertFormula(const Formula &F); + void DeleteFormula(Formula &F); + void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses); + + void print(raw_ostream &OS) const; + void dump() const; +}; + +} + +/// HasFormula - Test whether this use as a formula which has the same +/// registers as the given formula. +bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { + SmallVector<const SCEV *, 2> Key = F.BaseRegs; + if (F.ScaledReg) Key.push_back(F.ScaledReg); + // Unstable sort by host order ok, because this is only used for uniquifying. + std::sort(Key.begin(), Key.end()); + return Uniquifier.count(Key); +} + +/// InsertFormula - If the given formula has not yet been inserted, add it to +/// the list, and return true. Return false otherwise. +bool LSRUse::InsertFormula(const Formula &F) { + SmallVector<const SCEV *, 2> Key = F.BaseRegs; + if (F.ScaledReg) Key.push_back(F.ScaledReg); + // Unstable sort by host order ok, because this is only used for uniquifying. 
  std::sort(Key.begin(), Key.end());

  // Already have an equivalent formula; reject the duplicate.
  if (!Uniquifier.insert(Key).second)
    return false;

  // Using a register to hold the value of 0 is not profitable.
  assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
         "Zero allocated in a scaled register!");
#ifndef NDEBUG
  for (SmallVectorImpl<const SCEV *>::const_iterator I =
       F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
    assert(!(*I)->isZero() && "Zero allocated in a base register!");
#endif

  // Add the formula to the list.
  Formulae.push_back(F);

  // Record registers now being used by this use.
  if (F.ScaledReg) Regs.insert(F.ScaledReg);
  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());

  return true;
}

/// DeleteFormula - Remove the given formula from this use's list.
/// Uses swap-with-back + pop_back, so formula order is not preserved.
void LSRUse::DeleteFormula(Formula &F) {
  if (&F != &Formulae.back())
    std::swap(F, Formulae.back());
  Formulae.pop_back();
  // Every use must retain at least one way of being satisfied.
  assert(!Formulae.empty() && "LSRUse has no formulae left!");
}

/// RecomputeRegs - Recompute the Regs field, and update RegUses.
void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
  // Now that we've filtered out some formulae, recompute the Regs set.
  SmallPtrSet<const SCEV *, 4> OldRegs = Regs;
  Regs.clear();
  for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(),
       E = Formulae.end(); I != E; ++I) {
    const Formula &F = *I;
    if (F.ScaledReg) Regs.insert(F.ScaledReg);
    Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
  }

  // Update the RegTracker: drop any register no longer used by this use.
  for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(),
       E = OldRegs.end(); I != E; ++I)
    if (!Regs.count(*I))
      RegUses.DropRegister(*I, LUIdx);
}

void LSRUse::print(raw_ostream &OS) const {
  OS << "LSR Use: Kind=";
  switch (Kind) {
  case Basic:    OS << "Basic"; break;
  case Special:  OS << "Special"; break;
  case ICmpZero: OS << "ICmpZero"; break;
  case Address:
    OS << "Address of ";
    if (AccessTy->isPointerTy())
      OS << "pointer"; // the full pointer type could be really verbose
    else
      OS << *AccessTy;
  }

  OS << ", Offsets={";
  for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
       E = Offsets.end(); I != E; ++I) {
    OS << *I;
    if (llvm::next(I) != E)
      OS << ',';
  }
  OS << '}';

  if (AllFixupsOutsideLoop)
    OS << ", all-fixups-outside-loop";

  if (WidestFixupType)
    OS << ", widest fixup type: " << *WidestFixupType;
}

void LSRUse::dump() const {
  print(errs()); errs() << '\n';
}

/// isLegalUse - Test whether the use described by AM is "legal", meaning it can
/// be completely folded into the user instruction at isel time. This includes
/// address-mode folding and special icmp tricks.
static bool isLegalUse(const TargetLowering::AddrMode &AM,
                       LSRUse::KindType Kind, const Type *AccessTy,
                       const TargetLowering *TLI) {
  switch (Kind) {
  case LSRUse::Address:
    // If we have low-level target information, ask the target if it can
    // completely fold this address.
    if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy);

    // Otherwise, just guess that reg+reg addressing is legal.
    return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1;

  case LSRUse::ICmpZero:
    // There's not even a target hook for querying whether it would be legal to
    // fold a GV into an ICmp.
    if (AM.BaseGV)
      return false;

    // ICmp only has two operands; don't allow more than two non-trivial parts.
    if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0)
      return false;

    // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
    // putting the scaled register in the other operand of the icmp.
    if (AM.Scale != 0 && AM.Scale != -1)
      return false;

    // If we have low-level target information, ask the target if it can fold an
    // integer immediate on an icmp.
    if (AM.BaseOffs != 0) {
      // The offset moves to the other side of the compare, hence the negation.
      if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs);
      return false;
    }

    return true;

  case LSRUse::Basic:
    // Only handle single-register values.
    return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;

  case LSRUse::Special:
    // Only handle -1 scales, or no scale.
    return AM.Scale == 0 || AM.Scale == -1;
  }

  return false;
}

/// isLegalUse - Overload which checks legality across a whole range of fixup
/// offsets: the addressing mode must remain legal with both the minimum and
/// maximum offset added to its immediate field.
static bool isLegalUse(TargetLowering::AddrMode AM,
                       int64_t MinOffset, int64_t MaxOffset,
                       LSRUse::KindType Kind, const Type *AccessTy,
                       const TargetLowering *TLI) {
  // Check for overflow. The addition is done in unsigned arithmetic, then the
  // sign of the change is compared against the sign of the offset.
  if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) !=
      (MinOffset > 0))
    return false;
  AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset;
  if (isLegalUse(AM, Kind, AccessTy, TLI)) {
    // Undo the MinOffset adjustment before trying MaxOffset.
    AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset;
    // Check for overflow.
    if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) !=
        (MaxOffset > 0))
      return false;
    AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset;
    return isLegalUse(AM, Kind, AccessTy, TLI);
  }
  return false;
}

/// isAlwaysFoldable - Test whether an offset/global pair can be folded into
/// any user of the given kind, by conservatively assuming a base register and
/// a scale are also present.
static bool isAlwaysFoldable(int64_t BaseOffs,
                             GlobalValue *BaseGV,
                             bool HasBaseReg,
                             LSRUse::KindType Kind, const Type *AccessTy,
                             const TargetLowering *TLI) {
  // Fast-path: zero is always foldable.
  if (BaseOffs == 0 && !BaseGV) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  TargetLowering::AddrMode AM;
  AM.BaseOffs = BaseOffs;
  AM.BaseGV = BaseGV;
  AM.HasBaseReg = HasBaseReg;
  // ICmpZero folds its scaled register with a -1 multiplier (moved to the
  // other side of the compare); everything else uses a positive scale.
  AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;

  // Canonicalize a scale of 1 to a base register if the formula doesn't
  // already have a base register.
  if (!AM.HasBaseReg && AM.Scale == 1) {
    AM.Scale = 0;
    AM.HasBaseReg = true;
  }

  return isLegalUse(AM, Kind, AccessTy, TLI);
}

/// isAlwaysFoldable - Overload for a SCEV expression: foldable only if the
/// expression decomposes completely into an immediate and/or a symbol.
static bool isAlwaysFoldable(const SCEV *S,
                             int64_t MinOffset, int64_t MaxOffset,
                             bool HasBaseReg,
                             LSRUse::KindType Kind, const Type *AccessTy,
                             const TargetLowering *TLI,
                             ScalarEvolution &SE) {
  // Fast-path: zero is always foldable.
  if (S->isZero()) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale. Note: Extract* peel their components off S in place.
  int64_t BaseOffs = ExtractImmediate(S, SE);
  GlobalValue *BaseGV = ExtractSymbol(S, SE);

  // If there's anything else involved, it's not foldable.
  if (!S->isZero()) return false;

  // Fast-path: zero is always foldable.
  if (BaseOffs == 0 && !BaseGV) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  TargetLowering::AddrMode AM;
  AM.BaseOffs = BaseOffs;
  AM.BaseGV = BaseGV;
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;

  return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI);
}

namespace {

/// UseMapDenseMapInfo - A DenseMapInfo implementation for holding
/// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind.
struct UseMapDenseMapInfo {
  static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() {
    return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic);
  }

  static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() {
    return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic);
  }

  static unsigned
  getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) {
    unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first);
    Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second));
    return Result;
  }

  static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS,
                      const std::pair<const SCEV *, LSRUse::KindType> &RHS) {
    return LHS == RHS;
  }
};

/// LSRInstance - This class holds state for the main loop strength reduction
/// logic.
class LSRInstance {
  IVUsers &IU;
  ScalarEvolution &SE;
  DominatorTree &DT;
  LoopInfo &LI;
  const TargetLowering *const TLI;
  Loop *const L;
  bool Changed;

  /// IVIncInsertPos - This is the insert position that the current loop's
  /// induction variable increment should be placed. In simple loops, this is
  /// the latch block's terminator. But in more complicated cases, this is a
  /// position which will dominate all the in-loop post-increment users.
  Instruction *IVIncInsertPos;

  /// Factors - Interesting factors between use strides.
  SmallSetVector<int64_t, 8> Factors;

  /// Types - Interesting use types, to facilitate truncation reuse.
  SmallSetVector<const Type *, 4> Types;

  /// Fixups - The list of operands which are to be replaced.
  SmallVector<LSRFixup, 16> Fixups;

  /// Uses - The list of interesting uses.
  SmallVector<LSRUse, 16> Uses;

  /// RegUses - Track which uses use which register candidates.
  RegUseTracker RegUses;

  void OptimizeShadowIV();
  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
  void OptimizeLoopTermCond();

  void CollectInterestingTypesAndFactors();
  void CollectFixupsAndInitialFormulae();

  // getNewFixup - Append a fresh, default-constructed fixup and return it.
  LSRFixup &getNewFixup() {
    Fixups.push_back(LSRFixup());
    return Fixups.back();
  }

  // Support for sharing of LSRUses between LSRFixups.
  typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>,
                   size_t,
                   UseMapDenseMapInfo> UseMapTy;
  UseMapTy UseMap;

  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
                          LSRUse::KindType Kind, const Type *AccessTy);

  std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
                                    LSRUse::KindType Kind,
                                    const Type *AccessTy);

  void DeleteUse(LSRUse &LU, size_t LUIdx);

  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);

public:
  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void CountRegisters(const Formula &F, size_t LUIdx);
  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);

  void CollectLoopInvariantFixupsAndFormulae();

  // Formula-generation passes; each takes a base formula and adds variants.
  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
                              unsigned Depth = 0);
  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateCrossUseConstantOffsets();
  void GenerateAllReuseFormulae();

  void FilterOutUndesirableDedicatedRegisters();

  size_t EstimateSearchSpaceComplexity() const;
  void NarrowSearchSpaceByDetectingSupersets();
  void NarrowSearchSpaceByCollapsingUnrolledCode();
  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
  void NarrowSearchSpaceByPickingWinnerRegs();
  void NarrowSearchSpaceUsingHeuristics();

  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                    Cost &SolutionCost,
                    SmallVectorImpl<const Formula *> &Workspace,
                    const Cost &CurCost,
                    const SmallPtrSet<const SCEV *, 16> &CurRegs,
                    DenseSet<const SCEV *> &VisitedRegs) const;
  void Solve(SmallVectorImpl<const Formula *> &Solution) const;

  BasicBlock::iterator
    HoistInsertPosition(BasicBlock::iterator IP,
                        const SmallVectorImpl<Instruction *> &Inputs) const;
  BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
                                                     const LSRFixup &LF,
                                                     const LSRUse &LU) const;

  Value *Expand(const LSRFixup &LF,
                const Formula &F,
                BasicBlock::iterator IP,
                SCEVExpander &Rewriter,
                SmallVectorImpl<WeakVH> &DeadInsts) const;
  void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
                     const Formula &F,
                     SCEVExpander &Rewriter,
                     SmallVectorImpl<WeakVH> &DeadInsts,
                     Pass *P) const;
  void Rewrite(const LSRFixup &LF,
               const Formula &F,
               SCEVExpander &Rewriter,
               SmallVectorImpl<WeakVH> &DeadInsts,
               Pass *P) const;
  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
                         Pass *P);

  LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);

  bool getChanged() const { return Changed; }

  void print_factors_and_types(raw_ostream &OS) const;
  void print_fixups(raw_ostream &OS) const;
  void print_uses(raw_ostream &OS) const;
  void print(raw_ostream &OS) const;
  void dump() const;
};

}

/// OptimizeShadowIV - If IV is used in a int-to-float cast
/// inside the loop then try to eliminate the cast operation.
void LSRInstance::OptimizeShadowIV() {
  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  // Only loops with a computable trip count are handled.
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return;

  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
       UI != E; /* empty */) {
    IVUsers::const_iterator CandidateUI = UI;
    // Advance before any mutation below can disturb the iterator.
    ++UI;
    Instruction *ShadowUse = CandidateUI->getUser();
    const Type *DestTy = NULL;

    /* If shadow use is a int->float cast then insert a second IV
       to eliminate this cast.

         for (unsigned i = 0; i < n; ++i)
           foo((double)i);

       is transformed into

         double d = 0.0;
         for (unsigned i = 0; i < n; ++i, ++d)
           foo(d);
    */
    if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
      DestTy = UCast->getDestTy();
    else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
      DestTy = SCast->getDestTy();
    if (!DestTy) continue;

    if (TLI) {
      // If target does not support DestTy natively then do not apply
      // this transformation.
      EVT DVT = TLI->getValueType(DestTy);
      if (!TLI->isTypeLegal(DVT)) continue;
    }

    PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
    if (!PH) continue;
    if (PH->getNumIncomingValues() != 2) continue;

    // The integer IV must fit losslessly in the FP type's mantissa.
    const Type *SrcTy = PH->getType();
    int Mantissa = DestTy->getFPMantissaWidth();
    if (Mantissa == -1) continue;
    if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
      continue;

    unsigned Entry, Latch;
    if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
      Entry = 0;
      Latch = 1;
    } else {
      Entry = 1;
      Latch = 0;
    }

    ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
    if (!Init) continue;
    Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());

    BinaryOperator *Incr =
      dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
    if (!Incr) continue;
    if (Incr->getOpcode() != Instruction::Add
        && Incr->getOpcode() != Instruction::Sub)
      continue;

    /* Initialize new IV, double d = 0.0 in above example. */
    ConstantInt *C = NULL;
    if (Incr->getOperand(0) == PH)
      C = dyn_cast<ConstantInt>(Incr->getOperand(1));
    else if (Incr->getOperand(1) == PH)
      C = dyn_cast<ConstantInt>(Incr->getOperand(0));
    else
      continue;

    if (!C) continue;

    // Ignore negative constants, as the code below doesn't handle them
    // correctly. TODO: Remove this restriction.
    if (!C->getValue().isStrictlyPositive()) continue;

    /* Add new PHINode. */
    PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH);

    /* create new increment. '++d' in above example. */
    Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
    BinaryOperator *NewIncr =
      BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
                               Instruction::FAdd : Instruction::FSub,
                             NewPH, CFP, "IV.S.next.", Incr);

    NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
    NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));

    /* Remove cast operation */
    ShadowUse->replaceAllUsesWith(NewPH);
    ShadowUse->eraseFromParent();
    Changed = true;
    // Only one shadow IV is created per invocation.
    break;
  }
}

/// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
/// set the IV user and stride information and return true, otherwise return
/// false.
bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
  for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
    if (UI->getUser() == Cond) {
      // NOTE: we could handle setcc instructions with multiple uses here, but
      // InstCombine does it as well for simple uses, it's not clear that it
      // occurs enough in real life to handle.
      CondUse = UI;
      return true;
    }
  return false;
}

/// OptimizeMax - Rewrite the loop's terminating condition if it uses
/// a max computation.
///
/// This is a narrow solution to a specific, but acute, problem. For loops
/// like this:
///
///   i = 0;
///   do {
///     p[i] = 0.0;
///   } while (++i < n);
///
/// the trip count isn't just 'n', because 'n' might not be positive.
/// And
/// unfortunately this can come up even for loops where the user didn't use
/// a C do-while loop. For example, seemingly well-behaved top-test loops
/// will commonly be lowered like this:
//
///   if (n > 0) {
///     i = 0;
///     do {
///       p[i] = 0.0;
///     } while (++i < n);
///   }
///
/// and then it's possible for subsequent optimization to obscure the if
/// test in such a way that indvars can't find it.
///
/// When indvars can't find the if test in loops like this, it creates a
/// max expression, which allows it to give the loop a canonical
/// induction variable:
///
///   i = 0;
///   max = n < 1 ? 1 : n;
///   do {
///     p[i] = 0.0;
///   } while (++i != max);
///
/// Canonical induction variables are necessary because the loop passes
/// are designed around them. The most obvious example of this is the
/// LoopInfo analysis, which doesn't remember trip count values. It
/// expects to be able to rediscover the trip count each time it is
/// needed, and it does this using a simple analysis that only succeeds if
/// the loop has a canonical induction variable.
///
/// However, when it comes time to generate code, the maximum operation
/// can be quite costly, especially if it's inside of an outer loop.
///
/// This function solves this problem by detecting this type of loop and
/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
/// the instructions for the maximum computation.
///
ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
  // Check that the loop matches the pattern we're looking for.
  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
      Cond->getPredicate() != CmpInst::ICMP_NE)
    return Cond;

  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
  if (!Sel || !Sel->hasOneUse()) return Cond;

  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return Cond;
  const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);

  // Add one to the backedge-taken count to get the trip count.
  const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
  if (IterationCount != SE.getSCEV(Sel)) return Cond;

  // Check for a max calculation that matches the pattern. There's no check
  // for ICMP_ULE here because the comparison would be with zero, which
  // isn't interesting.
  CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
  const SCEVNAryExpr *Max = 0;
  if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
    Pred = ICmpInst::ICMP_SLE;
    Max = S;
  } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
    Pred = ICmpInst::ICMP_SLT;
    Max = S;
  } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
    Pred = ICmpInst::ICMP_ULT;
    Max = U;
  } else {
    // No match; bail.
    return Cond;
  }

  // To handle a max with more than two operands, this optimization would
  // require additional checking and setup.
  if (Max->getNumOperands() != 2)
    return Cond;

  const SCEV *MaxLHS = Max->getOperand(0);
  const SCEV *MaxRHS = Max->getOperand(1);

  // ScalarEvolution canonicalizes constants to the left. For < and >, look
  // for a comparison with 1. For <= and >=, a comparison with zero.
  if (!MaxLHS ||
      (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
    return Cond;

  // Check the relevant induction variable for conformance to
  // the pattern.
  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
  if (!AR || !AR->isAffine() ||
      AR->getStart() != One ||
      AR->getStepRecurrence(SE) != One)
    return Cond;

  assert(AR->getLoop() == L &&
         "Loop condition operand is an addrec in a different loop!");

  // Check the right operand of the select, and remember it, as it will
  // be used in the new comparison instruction.
  Value *NewRHS = 0;
  if (ICmpInst::isTrueWhenEqual(Pred)) {
    // Look for n+1, and grab n.
    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
      if (isa<ConstantInt>(BO->getOperand(1)) &&
          cast<ConstantInt>(BO->getOperand(1))->isOne() &&
          SE.getSCEV(BO->getOperand(0)) == MaxRHS)
        NewRHS = BO->getOperand(0);
    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
      if (isa<ConstantInt>(BO->getOperand(1)) &&
          cast<ConstantInt>(BO->getOperand(1))->isOne() &&
          SE.getSCEV(BO->getOperand(0)) == MaxRHS)
        NewRHS = BO->getOperand(0);
    if (!NewRHS)
      return Cond;
  } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
    NewRHS = Sel->getOperand(1);
  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
    NewRHS = Sel->getOperand(2);
  else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
    NewRHS = SU->getValue();
  else
    // Max doesn't match expected pattern.
    return Cond;

  // Determine the new comparison opcode. It may be signed or unsigned,
  // and the original comparison may be either equality or inequality.
  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
    Pred = CmpInst::getInversePredicate(Pred);

  // Ok, everything looks ok to change the condition into an SLT or SGE and
  // delete the max calculation.
  ICmpInst *NewCond =
    new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");

  // Delete the max calculation instructions.
  Cond->replaceAllUsesWith(NewCond);
  CondUse->setUser(NewCond);
  Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
  Cond->eraseFromParent();
  Sel->eraseFromParent();
  if (Cmp->use_empty())
    Cmp->eraseFromParent();
  return NewCond;
}

/// OptimizeLoopTermCond - Change loop terminating condition to use the
/// postinc iv when possible.
void
LSRInstance::OptimizeLoopTermCond() {
  SmallPtrSet<Instruction *, 4> PostIncs;

  BasicBlock *LatchBlock = L->getLoopLatch();
  SmallVector<BasicBlock*, 8> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
    BasicBlock *ExitingBlock = ExitingBlocks[i];

    // Get the terminating condition for the loop if possible. If we
    // can, we want to change it to use a post-incremented version of its
    // induction variable, to allow coalescing the live ranges for the IV into
    // one register value.

    BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
    if (!TermBr)
      continue;
    // FIXME: Overly conservative, termination condition could be an 'or' etc..
    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
      continue;

    // Search IVUsesByStride to find Cond's IVUse if there is one.
    IVStrideUse *CondUse = 0;
    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
    if (!FindIVUserForCond(Cond, CondUse))
      continue;

    // If the trip count is computed in terms of a max (due to ScalarEvolution
    // being unable to find a sufficient guard, for example), change the loop
    // comparison to use SLT or ULT instead of NE.
    // One consequence of doing this now is that it disrupts the count-down
    // optimization. That's not always a bad thing though, because in such
    // cases it may still be worthwhile to avoid a max.
    Cond = OptimizeMax(Cond, CondUse);

    // If this exiting block dominates the latch block, it may also use
    // the post-inc value if it won't be shared with other uses.
    // Check for dominance.
    if (!DT.dominates(ExitingBlock, LatchBlock))
      continue;

    // Conservatively avoid trying to use the post-inc value in non-latch
    // exits if there may be pre-inc users in intervening blocks.
    if (LatchBlock != ExitingBlock)
      for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
        // Test if the use is reachable from the exiting block. This dominator
        // query is a conservative approximation of reachability.
        if (&*UI != CondUse &&
            !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
          // Conservatively assume there may be reuse if the quotient of their
          // strides could be a legal scale.
          const SCEV *A = IU.getStride(*CondUse, L);
          const SCEV *B = IU.getStride(*UI, L);
          if (!A || !B) continue;
          // Widen the narrower stride so the division below is well-typed.
          if (SE.getTypeSizeInBits(A->getType()) !=
              SE.getTypeSizeInBits(B->getType())) {
            if (SE.getTypeSizeInBits(A->getType()) >
                SE.getTypeSizeInBits(B->getType()))
              B = SE.getSignExtendExpr(B, A->getType());
            else
              A = SE.getSignExtendExpr(A, B->getType());
          }
          if (const SCEVConstant *D =
                dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
            const ConstantInt *C = D->getValue();
            // Stride of one or negative one can have reuse with non-addresses.
            if (C->isOne() || C->isAllOnesValue())
              goto decline_post_inc;
            // Avoid weird situations.
            if (C->getValue().getMinSignedBits() >= 64 ||
                C->getValue().isMinSignedValue())
              goto decline_post_inc;
            // Without TLI, assume that any stride might be valid, and so any
            // use might be shared.
            if (!TLI)
              goto decline_post_inc;
            // Check for possible scaled-address reuse.
            const Type *AccessTy = getAccessType(UI->getUser());
            TargetLowering::AddrMode AM;
            AM.Scale = C->getSExtValue();
            if (TLI->isLegalAddressingMode(AM, AccessTy))
              goto decline_post_inc;
            AM.Scale = -AM.Scale;
            if (TLI->isLegalAddressingMode(AM, AccessTy))
              goto decline_post_inc;
          }
        }

    DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
                 << *Cond << '\n');

    // It's possible for the setcc instruction to be anywhere in the loop, and
    // possible for it to have multiple users. If it is not immediately before
    // the exiting block branch, move it.
    if (&*++BasicBlock::iterator(Cond) != TermBr) {
      if (Cond->hasOneUse()) {
        Cond->moveBefore(TermBr);
      } else {
        // Clone the terminating condition and insert into the loopend.
        ICmpInst *OldCond = Cond;
        Cond = cast<ICmpInst>(Cond->clone());
        Cond->setName(L->getHeader()->getName() + ".termcond");
        ExitingBlock->getInstList().insert(TermBr, Cond);

        // Clone the IVUse, as the old use still exists!
        CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
        TermBr->replaceUsesOfWith(OldCond, Cond);
      }
    }

    // If we get to here, we know that we can transform the setcc instruction to
    // use the post-incremented version of the IV, allowing us to coalesce the
    // live ranges for the IV correctly.
    CondUse->transformToPostInc(L);
    Changed = true;

    PostIncs.insert(Cond);
  decline_post_inc:;
  }

  // Determine an insertion point for the loop induction variable increment. It
  // must dominate all the post-inc comparisons we just set up, and it must
  // dominate the loop latch edge.
+ IVIncInsertPos = L->getLoopLatch()->getTerminator(); + for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(), + E = PostIncs.end(); I != E; ++I) { + BasicBlock *BB = + DT.findNearestCommonDominator(IVIncInsertPos->getParent(), + (*I)->getParent()); + if (BB == (*I)->getParent()) + IVIncInsertPos = *I; + else if (BB != IVIncInsertPos->getParent()) + IVIncInsertPos = BB->getTerminator(); + } +} + +/// reconcileNewOffset - Determine if the given use can accomodate a fixup +/// at the given offset and other details. If so, update the use and +/// return true. +bool +LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, + LSRUse::KindType Kind, const Type *AccessTy) { + int64_t NewMinOffset = LU.MinOffset; + int64_t NewMaxOffset = LU.MaxOffset; + const Type *NewAccessTy = AccessTy; + + // Check for a mismatched kind. It's tempting to collapse mismatched kinds to + // something conservative, however this can pessimize in the case that one of + // the uses will have all its uses outside the loop, for example. + if (LU.Kind != Kind) + return false; + // Conservatively assume HasBaseReg is true for now. + if (NewOffset < LU.MinOffset) { + if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg, + Kind, AccessTy, TLI)) + return false; + NewMinOffset = NewOffset; + } else if (NewOffset > LU.MaxOffset) { + if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg, + Kind, AccessTy, TLI)) + return false; + NewMaxOffset = NewOffset; + } + // Check for a mismatched access type, and fall back conservatively as needed. + // TODO: Be less conservative when the type is similar and can use the same + // addressing modes. + if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) + NewAccessTy = Type::getVoidTy(AccessTy->getContext()); + + // Update the use. 
+ LU.MinOffset = NewMinOffset; + LU.MaxOffset = NewMaxOffset; + LU.AccessTy = NewAccessTy; + if (NewOffset != LU.Offsets.back()) + LU.Offsets.push_back(NewOffset); + return true; +} + +/// getUse - Return an LSRUse index and an offset value for a fixup which +/// needs the given expression, with the given kind and optional access type. +/// Either reuse an existing use or create a new one, as needed. +std::pair<size_t, int64_t> +LSRInstance::getUse(const SCEV *&Expr, + LSRUse::KindType Kind, const Type *AccessTy) { + const SCEV *Copy = Expr; + int64_t Offset = ExtractImmediate(Expr, SE); + + // Basic uses can't accept any offset, for example. + if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) { + Expr = Copy; + Offset = 0; + } + + std::pair<UseMapTy::iterator, bool> P = + UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0)); + if (!P.second) { + // A use already existed with this base. + size_t LUIdx = P.first->second; + LSRUse &LU = Uses[LUIdx]; + if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy)) + // Reuse this use. + return std::make_pair(LUIdx, Offset); + } + + // Create a new use. + size_t LUIdx = Uses.size(); + P.first->second = LUIdx; + Uses.push_back(LSRUse(Kind, AccessTy)); + LSRUse &LU = Uses[LUIdx]; + + // We don't need to track redundant offsets, but we don't need to go out + // of our way here to avoid them. + if (LU.Offsets.empty() || Offset != LU.Offsets.back()) + LU.Offsets.push_back(Offset); + + LU.MinOffset = Offset; + LU.MaxOffset = Offset; + return std::make_pair(LUIdx, Offset); +} + +/// DeleteUse - Delete the given use from the Uses list. +void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) { + if (&LU != &Uses.back()) + std::swap(LU, Uses.back()); + Uses.pop_back(); + + // Update RegUses. + RegUses.SwapAndDropUse(LUIdx, Uses.size()); +} + +/// FindUseWithFormula - Look for a use distinct from OrigLU which is has +/// a formula that has the same registers as the given formula. 
LSRUse *
LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
                                       const LSRUse &OrigLU) {
  // Search all uses for the formula. This could be more clever.
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    // Check whether this use is close enough to OrigLU, to see whether it's
    // worthwhile looking through its formulae.
    // Ignore ICmpZero uses because they may contain formulae generated by
    // GenerateICmpZeroScales, in which case adding fixup offsets may
    // be invalid.
    if (&LU != &OrigLU &&
        LU.Kind != LSRUse::ICmpZero &&
        LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
        LU.WidestFixupType == OrigLU.WidestFixupType &&
        LU.HasFormulaWithSameRegs(OrigF)) {
      // Scan through this use's formulae.
      for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
           E = LU.Formulae.end(); I != E; ++I) {
        const Formula &F = *I;
        // Check to see if this formula has the same registers and symbols
        // as OrigF.
        if (F.BaseRegs == OrigF.BaseRegs &&
            F.ScaledReg == OrigF.ScaledReg &&
            F.AM.BaseGV == OrigF.AM.BaseGV &&
            F.AM.Scale == OrigF.AM.Scale) {
          if (F.AM.BaseOffs == 0)
            return &LU;
          // This is the formula where all the registers and symbols matched;
          // there aren't going to be any others. Since we declined it, we
          // can skip the rest of the formulae and proceed to the next LSRUse.
          break;
        }
      }
    }
  }

  // Nothing looked good.
  return 0;
}

void LSRInstance::CollectInterestingTypesAndFactors() {
  SmallSetVector<const SCEV *, 4> Strides;

  // Collect interesting types and strides.
  SmallVector<const SCEV *, 4> Worklist;
  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
    const SCEV *Expr = IU.getExpr(*UI);

    // Collect interesting types.
    Types.insert(SE.getEffectiveSCEVType(Expr->getType()));

    // Add strides for mentioned loops. Walk nested addrecs via the worklist.
    Worklist.push_back(Expr);
    do {
      const SCEV *S = Worklist.pop_back_val();
      if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
        Strides.insert(AR->getStepRecurrence(SE));
        Worklist.push_back(AR->getStart());
      } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
        Worklist.append(Add->op_begin(), Add->op_end());
      }
    } while (!Worklist.empty());
  }

  // Compute interesting factors from the set of interesting strides:
  // for each unordered pair, record any exact constant quotient.
  for (SmallSetVector<const SCEV *, 4>::const_iterator
       I = Strides.begin(), E = Strides.end(); I != E; ++I)
    for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
         llvm::next(I); NewStrideIter != E; ++NewStrideIter) {
      const SCEV *OldStride = *I;
      const SCEV *NewStride = *NewStrideIter;

      if (SE.getTypeSizeInBits(OldStride->getType()) !=
          SE.getTypeSizeInBits(NewStride->getType())) {
        if (SE.getTypeSizeInBits(OldStride->getType()) >
            SE.getTypeSizeInBits(NewStride->getType()))
          NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
        else
          OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
      }
      if (const SCEVConstant *Factor =
            dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
                                                        SE, true))) {
        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
          Factors.insert(Factor->getValue()->getValue().getSExtValue());
      } else if (const SCEVConstant *Factor =
                   dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
                                                               NewStride,
                                                               SE, true))) {
        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
          Factors.insert(Factor->getValue()->getValue().getSExtValue());
      }
    }

  // If all uses use the same type, don't bother looking for truncation-based
  // reuse.
  if (Types.size() == 1)
    Types.clear();

  DEBUG(print_factors_and_types(dbgs()));
}

void LSRInstance::CollectFixupsAndInitialFormulae() {
  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
    // Record the uses.
+ LSRFixup &LF = getNewFixup(); + LF.UserInst = UI->getUser(); + LF.OperandValToReplace = UI->getOperandValToReplace(); + LF.PostIncLoops = UI->getPostIncLoops(); + + LSRUse::KindType Kind = LSRUse::Basic; + const Type *AccessTy = 0; + if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { + Kind = LSRUse::Address; + AccessTy = getAccessType(LF.UserInst); + } + + const SCEV *S = IU.getExpr(*UI); + + // Equality (== and !=) ICmps are special. We can rewrite (i == N) as + // (N - i == 0), and this allows (N - i) to be the expression that we work + // with rather than just N or i, so we can consider the register + // requirements for both N and i at the same time. Limiting this code to + // equality icmps is not a problem because all interesting loops use + // equality icmps, thanks to IndVarSimplify. + if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst)) + if (CI->isEquality()) { + // Swap the operands if needed to put the OperandValToReplace on the + // left, for consistency. + Value *NV = CI->getOperand(1); + if (NV == LF.OperandValToReplace) { + CI->setOperand(1, CI->getOperand(0)); + CI->setOperand(0, NV); + NV = CI->getOperand(1); + Changed = true; + } + + // x == y --> x - y == 0 + const SCEV *N = SE.getSCEV(NV); + if (SE.isLoopInvariant(N, L)) { + Kind = LSRUse::ICmpZero; + S = SE.getMinusSCEV(N, S); + } + + // -1 and the negations of all interesting strides (except the negation + // of -1) are now also interesting. + for (size_t i = 0, e = Factors.size(); i != e; ++i) + if (Factors[i] != -1) + Factors.insert(-(uint64_t)Factors[i]); + Factors.insert(-1); + } + + // Set up the initial formula for this use. 
+ std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy); + LF.LUIdx = P.first; + LF.Offset = P.second; + LSRUse &LU = Uses[LF.LUIdx]; + LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + if (!LU.WidestFixupType || + SE.getTypeSizeInBits(LU.WidestFixupType) < + SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) + LU.WidestFixupType = LF.OperandValToReplace->getType(); + + // If this is the first use of this LSRUse, give it a formula. + if (LU.Formulae.empty()) { + InsertInitialFormula(S, LU, LF.LUIdx); + CountRegisters(LU.Formulae.back(), LF.LUIdx); + } + } + + DEBUG(print_fixups(dbgs())); +} + +/// InsertInitialFormula - Insert a formula for the given expression into +/// the given use, separating out loop-variant portions from loop-invariant +/// and loop-computable portions. +void +LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { + Formula F; + F.InitialMatch(S, L, SE); + bool Inserted = InsertFormula(LU, LUIdx, F); + assert(Inserted && "Initial formula already exists!"); (void)Inserted; +} + +/// InsertSupplementalFormula - Insert a simple single-register formula for +/// the given expression into the given use. +void +LSRInstance::InsertSupplementalFormula(const SCEV *S, + LSRUse &LU, size_t LUIdx) { + Formula F; + F.BaseRegs.push_back(S); + F.AM.HasBaseReg = true; + bool Inserted = InsertFormula(LU, LUIdx, F); + assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; +} + +/// CountRegisters - Note which registers are used by the given formula, +/// updating RegUses. +void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { + if (F.ScaledReg) + RegUses.CountRegister(F.ScaledReg, LUIdx); + for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), + E = F.BaseRegs.end(); I != E; ++I) + RegUses.CountRegister(*I, LUIdx); +} + +/// InsertFormula - If the given formula has not yet been inserted, add it to +/// the list, and return true. Return false otherwise. 
+bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { + if (!LU.InsertFormula(F)) + return false; + + CountRegisters(F, LUIdx); + return true; +} + +/// CollectLoopInvariantFixupsAndFormulae - Check for other uses of +/// loop-invariant values which we're tracking. These other uses will pin these +/// values in registers, making them less profitable for elimination. +/// TODO: This currently misses non-constant addrec step registers. +/// TODO: Should this give more weight to users inside the loop? +void +LSRInstance::CollectLoopInvariantFixupsAndFormulae() { + SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end()); + SmallPtrSet<const SCEV *, 8> Inserted; + + while (!Worklist.empty()) { + const SCEV *S = Worklist.pop_back_val(); + + if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) + Worklist.append(N->op_begin(), N->op_end()); + else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) + Worklist.push_back(C->getOperand()); + else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) { + Worklist.push_back(D->getLHS()); + Worklist.push_back(D->getRHS()); + } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { + if (!Inserted.insert(U)) continue; + const Value *V = U->getValue(); + if (const Instruction *Inst = dyn_cast<Instruction>(V)) { + // Look for instructions defined outside the loop. + if (L->contains(Inst)) continue; + } else if (isa<UndefValue>(V)) + // Undef doesn't have a live range, so it doesn't matter. + continue; + for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); + UI != UE; ++UI) { + const Instruction *UserInst = dyn_cast<Instruction>(*UI); + // Ignore non-instructions. + if (!UserInst) + continue; + // Ignore instructions in other functions (as can happen with + // Constants). + if (UserInst->getParent()->getParent() != L->getHeader()->getParent()) + continue; + // Ignore instructions not dominated by the loop. + const BasicBlock *UseBB = !isa<PHINode>(UserInst) ? 
+ UserInst->getParent() : + cast<PHINode>(UserInst)->getIncomingBlock( + PHINode::getIncomingValueNumForOperand(UI.getOperandNo())); + if (!DT.dominates(L->getHeader(), UseBB)) + continue; + // Ignore uses which are part of other SCEV expressions, to avoid + // analyzing them multiple times. + if (SE.isSCEVable(UserInst->getType())) { + const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst)); + // If the user is a no-op, look through to its uses. + if (!isa<SCEVUnknown>(UserS)) + continue; + if (UserS == U) { + Worklist.push_back( + SE.getUnknown(const_cast<Instruction *>(UserInst))); + continue; + } + } + // Ignore icmp instructions which are already being analyzed. + if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) { + unsigned OtherIdx = !UI.getOperandNo(); + Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx)); + if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L)) + continue; + } + + LSRFixup &LF = getNewFixup(); + LF.UserInst = const_cast<Instruction *>(UserInst); + LF.OperandValToReplace = UI.getUse(); + std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0); + LF.LUIdx = P.first; + LF.Offset = P.second; + LSRUse &LU = Uses[LF.LUIdx]; + LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + if (!LU.WidestFixupType || + SE.getTypeSizeInBits(LU.WidestFixupType) < + SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) + LU.WidestFixupType = LF.OperandValToReplace->getType(); + InsertSupplementalFormula(U, LU, LF.LUIdx); + CountRegisters(LU.Formulae.back(), Uses.size() - 1); + break; + } + } + } +} + +/// CollectSubexprs - Split S into subexpressions which can be pulled out into +/// separate registers. If C is non-null, multiply each subexpression by C. +static void CollectSubexprs(const SCEV *S, const SCEVConstant *C, + SmallVectorImpl<const SCEV *> &Ops, + const Loop *L, + ScalarEvolution &SE) { + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + // Break out add operands. 
+ for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); + I != E; ++I) + CollectSubexprs(*I, C, Ops, L, SE); + return; + } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + // Split a non-zero base out of an addrec. + if (!AR->getStart()->isZero()) { + CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), + AR->getStepRecurrence(SE), + AR->getLoop()), + C, Ops, L, SE); + CollectSubexprs(AR->getStart(), C, Ops, L, SE); + return; + } + } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { + // Break (C * (a + b + c)) into C*a + C*b + C*c. + if (Mul->getNumOperands() == 2) + if (const SCEVConstant *Op0 = + dyn_cast<SCEVConstant>(Mul->getOperand(0))) { + CollectSubexprs(Mul->getOperand(1), + C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0, + Ops, L, SE); + return; + } + } + + // Otherwise use the value itself, optionally with a scale applied. + Ops.push_back(C ? SE.getMulExpr(C, S) : S); +} + +/// GenerateReassociations - Split out subexpressions from adds and the bases of +/// addrecs. +void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, + Formula Base, + unsigned Depth) { + // Arbitrarily cap recursion to protect compile time. + if (Depth >= 3) return; + + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { + const SCEV *BaseReg = Base.BaseRegs[i]; + + SmallVector<const SCEV *, 8> AddOps; + CollectSubexprs(BaseReg, 0, AddOps, L, SE); + + if (AddOps.size() == 1) continue; + + for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(), + JE = AddOps.end(); J != JE; ++J) { + + // Loop-variant "unknown" values are uninteresting; we won't be able to + // do anything meaningful with them. + if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L)) + continue; + + // Don't pull a constant into a register if the constant could be folded + // into an immediate field. 
+ if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset, + Base.getNumRegs() > 1, + LU.Kind, LU.AccessTy, TLI, SE)) + continue; + + // Collect all operands except *J. + SmallVector<const SCEV *, 8> InnerAddOps + (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J); + InnerAddOps.append + (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end()); + + // Don't leave just a constant behind in a register if the constant could + // be folded into an immediate field. + if (InnerAddOps.size() == 1 && + isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset, + Base.getNumRegs() > 1, + LU.Kind, LU.AccessTy, TLI, SE)) + continue; + + const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); + if (InnerSum->isZero()) + continue; + Formula F = Base; + F.BaseRegs[i] = InnerSum; + F.BaseRegs.push_back(*J); + if (InsertFormula(LU, LUIdx, F)) + // If that formula hadn't been seen before, recurse to find more like + // it. + GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1); + } + } +} + +/// GenerateCombinations - Generate a formula consisting of all of the +/// loop-dominating registers added into a single register. +void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, + Formula Base) { + // This method is only interesting on a plurality of registers. + if (Base.BaseRegs.size() <= 1) return; + + Formula F = Base; + F.BaseRegs.clear(); + SmallVector<const SCEV *, 4> Ops; + for (SmallVectorImpl<const SCEV *>::const_iterator + I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) { + const SCEV *BaseReg = *I; + if (SE.properlyDominates(BaseReg, L->getHeader()) && + !SE.hasComputableLoopEvolution(BaseReg, L)) + Ops.push_back(BaseReg); + else + F.BaseRegs.push_back(BaseReg); + } + if (Ops.size() > 1) { + const SCEV *Sum = SE.getAddExpr(Ops); + // TODO: If Sum is zero, it probably means ScalarEvolution missed an + // opportunity to fold something. For now, just ignore such cases + // rather than proceed with zero in a register. 
+ if (!Sum->isZero()) { + F.BaseRegs.push_back(Sum); + (void)InsertFormula(LU, LUIdx, F); + } + } +} + +/// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets. +void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, + Formula Base) { + // We can't add a symbolic offset if the address already contains one. + if (Base.AM.BaseGV) return; + + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { + const SCEV *G = Base.BaseRegs[i]; + GlobalValue *GV = ExtractSymbol(G, SE); + if (G->isZero() || !GV) + continue; + Formula F = Base; + F.AM.BaseGV = GV; + if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset, + LU.Kind, LU.AccessTy, TLI)) + continue; + F.BaseRegs[i] = G; + (void)InsertFormula(LU, LUIdx, F); + } +} + +/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets. +void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, + Formula Base) { + // TODO: For now, just add the min and max offset, because it usually isn't + // worthwhile looking at everything inbetween. + SmallVector<int64_t, 2> Worklist; + Worklist.push_back(LU.MinOffset); + if (LU.MaxOffset != LU.MinOffset) + Worklist.push_back(LU.MaxOffset); + + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { + const SCEV *G = Base.BaseRegs[i]; + + for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(), + E = Worklist.end(); I != E; ++I) { + Formula F = Base; + F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I; + if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I, + LU.Kind, LU.AccessTy, TLI)) { + // Add the offset to the base register. + const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); + // If it cancelled out, drop the base register, otherwise update it. 
+ if (NewG->isZero()) { + std::swap(F.BaseRegs[i], F.BaseRegs.back()); + F.BaseRegs.pop_back(); + } else + F.BaseRegs[i] = NewG; + + (void)InsertFormula(LU, LUIdx, F); + } + } + + int64_t Imm = ExtractImmediate(G, SE); + if (G->isZero() || Imm == 0) + continue; + Formula F = Base; + F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm; + if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset, + LU.Kind, LU.AccessTy, TLI)) + continue; + F.BaseRegs[i] = G; + (void)InsertFormula(LU, LUIdx, F); + } +} + +/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up +/// the comparison. For example, x == y -> x*c == y*c. +void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, + Formula Base) { + if (LU.Kind != LSRUse::ICmpZero) return; + + // Determine the integer type for the base formula. + const Type *IntTy = Base.getType(); + if (!IntTy) return; + if (SE.getTypeSizeInBits(IntTy) > 64) return; + + // Don't do this if there is more than one offset. + if (LU.MinOffset != LU.MaxOffset) return; + + assert(!Base.AM.BaseGV && "ICmpZero use is not legal!"); + + // Check each interesting stride. + for (SmallSetVector<int64_t, 8>::const_iterator + I = Factors.begin(), E = Factors.end(); I != E; ++I) { + int64_t Factor = *I; + + // Check that the multiplication doesn't overflow. + if (Base.AM.BaseOffs == INT64_MIN && Factor == -1) + continue; + int64_t NewBaseOffs = (uint64_t)Base.AM.BaseOffs * Factor; + if (NewBaseOffs / Factor != Base.AM.BaseOffs) + continue; + + // Check that multiplying with the use offset doesn't overflow. + int64_t Offset = LU.MinOffset; + if (Offset == INT64_MIN && Factor == -1) + continue; + Offset = (uint64_t)Offset * Factor; + if (Offset / Factor != LU.MinOffset) + continue; + + Formula F = Base; + F.AM.BaseOffs = NewBaseOffs; + + // Check that this scale is legal. + if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI)) + continue; + + // Compensate for the use having MinOffset built into it. 
+ F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset; + + const SCEV *FactorS = SE.getConstant(IntTy, Factor); + + // Check that multiplying with each base register doesn't overflow. + for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) { + F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS); + if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i]) + goto next; + } + + // Check that multiplying with the scaled register doesn't overflow. + if (F.ScaledReg) { + F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS); + if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg) + continue; + } + + // If we make it here and it's legal, add it. + (void)InsertFormula(LU, LUIdx, F); + next:; + } +} + +/// GenerateScales - Generate stride factor reuse formulae by making use of +/// scaled-offset address modes, for example. +void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { + // Determine the integer type for the base formula. + const Type *IntTy = Base.getType(); + if (!IntTy) return; + + // If this Formula already has a scaled register, we can't add another one. + if (Base.AM.Scale != 0) return; + + // Check each interesting stride. + for (SmallSetVector<int64_t, 8>::const_iterator + I = Factors.begin(), E = Factors.end(); I != E; ++I) { + int64_t Factor = *I; + + Base.AM.Scale = Factor; + Base.AM.HasBaseReg = Base.BaseRegs.size() > 1; + // Check whether this scale is going to be legal. + if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset, + LU.Kind, LU.AccessTy, TLI)) { + // As a special-case, handle special out-of-loop Basic users specially. + // TODO: Reconsider this special case. + if (LU.Kind == LSRUse::Basic && + isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset, + LSRUse::Special, LU.AccessTy, TLI) && + LU.AllFixupsOutsideLoop) + LU.Kind = LSRUse::Special; + else + continue; + } + // For an ICmpZero, negating a solitary base register won't lead to + // new solutions. 
+ if (LU.Kind == LSRUse::ICmpZero && + !Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV) + continue; + // For each addrec base reg, apply the scale, if possible. + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + if (const SCEVAddRecExpr *AR = + dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) { + const SCEV *FactorS = SE.getConstant(IntTy, Factor); + if (FactorS->isZero()) + continue; + // Divide out the factor, ignoring high bits, since we'll be + // scaling the value back up in the end. + if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) { + // TODO: This could be optimized to avoid all the copying. + Formula F = Base; + F.ScaledReg = Quotient; + F.DeleteBaseReg(F.BaseRegs[i]); + (void)InsertFormula(LU, LUIdx, F); + } + } + } +} + +/// GenerateTruncates - Generate reuse formulae from different IV types. +void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { + // This requires TargetLowering to tell us which truncates are free. + if (!TLI) return; + + // Don't bother truncating symbolic values. + if (Base.AM.BaseGV) return; + + // Determine the integer type for the base formula. + const Type *DstTy = Base.getType(); + if (!DstTy) return; + DstTy = SE.getEffectiveSCEVType(DstTy); + + for (SmallSetVector<const Type *, 4>::const_iterator + I = Types.begin(), E = Types.end(); I != E; ++I) { + const Type *SrcTy = *I; + if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) { + Formula F = Base; + + if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I); + for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(), + JE = F.BaseRegs.end(); J != JE; ++J) + *J = SE.getAnyExtendExpr(*J, SrcTy); + + // TODO: This assumes we've done basic processing on all uses and + // have an idea what the register usage is. 
+ if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses)) + continue; + + (void)InsertFormula(LU, LUIdx, F); + } + } +} + +namespace { + +/// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to +/// defer modifications so that the search phase doesn't have to worry about +/// the data structures moving underneath it. +struct WorkItem { + size_t LUIdx; + int64_t Imm; + const SCEV *OrigReg; + + WorkItem(size_t LI, int64_t I, const SCEV *R) + : LUIdx(LI), Imm(I), OrigReg(R) {} + + void print(raw_ostream &OS) const; + void dump() const; +}; + +} + +void WorkItem::print(raw_ostream &OS) const { + OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx + << " , add offset " << Imm; +} + +void WorkItem::dump() const { + print(errs()); errs() << '\n'; +} + +/// GenerateCrossUseConstantOffsets - Look for registers which are a constant +/// distance apart and try to form reuse opportunities between them. +void LSRInstance::GenerateCrossUseConstantOffsets() { + // Group the registers by their value without any added constant offset. + typedef std::map<int64_t, const SCEV *> ImmMapTy; + typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy; + RegMapTy Map; + DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap; + SmallVector<const SCEV *, 8> Sequence; + for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end(); + I != E; ++I) { + const SCEV *Reg = *I; + int64_t Imm = ExtractImmediate(Reg, SE); + std::pair<RegMapTy::iterator, bool> Pair = + Map.insert(std::make_pair(Reg, ImmMapTy())); + if (Pair.second) + Sequence.push_back(Reg); + Pair.first->second.insert(std::make_pair(Imm, *I)); + UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I); + } + + // Now examine each set of registers with the same base value. Build up + // a list of work to do and do the work in a separate step so that we're + // not adding formulae and register counts while we're searching. 
+ SmallVector<WorkItem, 32> WorkItems; + SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems; + for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(), + E = Sequence.end(); I != E; ++I) { + const SCEV *Reg = *I; + const ImmMapTy &Imms = Map.find(Reg)->second; + + // It's not worthwhile looking for reuse if there's only one offset. + if (Imms.size() == 1) + continue; + + DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':'; + for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end(); + J != JE; ++J) + dbgs() << ' ' << J->first; + dbgs() << '\n'); + + // Examine each offset. + for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end(); + J != JE; ++J) { + const SCEV *OrigReg = J->second; + + int64_t JImm = J->first; + const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg); + + if (!isa<SCEVConstant>(OrigReg) && + UsedByIndicesMap[Reg].count() == 1) { + DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n'); + continue; + } + + // Conservatively examine offsets between this orig reg a few selected + // other orig regs. + ImmMapTy::const_iterator OtherImms[] = { + Imms.begin(), prior(Imms.end()), + Imms.upper_bound((Imms.begin()->first + prior(Imms.end())->first) / 2) + }; + for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) { + ImmMapTy::const_iterator M = OtherImms[i]; + if (M == J || M == JE) continue; + + // Compute the difference between the two. + int64_t Imm = (uint64_t)JImm - M->first; + for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1; + LUIdx = UsedByIndices.find_next(LUIdx)) + // Make a memo of this use, offset, and register tuple. + if (UniqueItems.insert(std::make_pair(LUIdx, Imm))) + WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg)); + } + } + } + + Map.clear(); + Sequence.clear(); + UsedByIndicesMap.clear(); + UniqueItems.clear(); + + // Now iterate through the worklist and add new formulae. 
+ for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(), + E = WorkItems.end(); I != E; ++I) { + const WorkItem &WI = *I; + size_t LUIdx = WI.LUIdx; + LSRUse &LU = Uses[LUIdx]; + int64_t Imm = WI.Imm; + const SCEV *OrigReg = WI.OrigReg; + + const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); + const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm)); + unsigned BitWidth = SE.getTypeSizeInBits(IntTy); + + // TODO: Use a more targeted data structure. + for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) { + const Formula &F = LU.Formulae[L]; + // Use the immediate in the scaled register. + if (F.ScaledReg == OrigReg) { + int64_t Offs = (uint64_t)F.AM.BaseOffs + + Imm * (uint64_t)F.AM.Scale; + // Don't create 50 + reg(-50). + if (F.referencesReg(SE.getSCEV( + ConstantInt::get(IntTy, -(uint64_t)Offs)))) + continue; + Formula NewF = F; + NewF.AM.BaseOffs = Offs; + if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset, + LU.Kind, LU.AccessTy, TLI)) + continue; + NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg); + + // If the new scale is a constant in a register, and adding the constant + // value to the immediate would produce a value closer to zero than the + // immediate itself, then the formula isn't worthwhile. + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) + if (C->getValue()->getValue().isNegative() != + (NewF.AM.BaseOffs < 0) && + (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale)) + .ule(abs64(NewF.AM.BaseOffs))) + continue; + + // OK, looks good. + (void)InsertFormula(LU, LUIdx, NewF); + } else { + // Use the immediate in a base register. 
+ for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) { + const SCEV *BaseReg = F.BaseRegs[N]; + if (BaseReg != OrigReg) + continue; + Formula NewF = F; + NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm; + if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset, + LU.Kind, LU.AccessTy, TLI)) + continue; + NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg); + + // If the new formula has a constant in a register, and adding the + // constant value to the immediate would produce a value closer to + // zero than the immediate itself, then the formula isn't worthwhile. + for (SmallVectorImpl<const SCEV *>::const_iterator + J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end(); + J != JE; ++J) + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J)) + if ((C->getValue()->getValue() + NewF.AM.BaseOffs).abs().slt( + abs64(NewF.AM.BaseOffs)) && + (C->getValue()->getValue() + + NewF.AM.BaseOffs).countTrailingZeros() >= + CountTrailingZeros_64(NewF.AM.BaseOffs)) + goto skip_formula; + + // Ok, looks good. + (void)InsertFormula(LU, LUIdx, NewF); + break; + skip_formula:; + } + } + } + } +} + +/// GenerateAllReuseFormulae - Generate formulae for each use. +void +LSRInstance::GenerateAllReuseFormulae() { + // This is split into multiple loops so that hasRegsUsedByUsesOtherThan + // queries are more precise. 
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateReassociations(LU, LUIdx, LU.Formulae[i]); + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateCombinations(LU, LUIdx, LU.Formulae[i]); + } + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]); + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]); + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]); + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateScales(LU, LUIdx, LU.Formulae[i]); + } + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateTruncates(LU, LUIdx, LU.Formulae[i]); + } + + GenerateCrossUseConstantOffsets(); + + DEBUG(dbgs() << "\n" + "After generating reuse formulae:\n"; + print_uses(dbgs())); +} + +/// If there are multiple formulae with the same set of registers used +/// by other uses, pick the best one and delete the others. +void LSRInstance::FilterOutUndesirableDedicatedRegisters() { + DenseSet<const SCEV *> VisitedRegs; + SmallPtrSet<const SCEV *, 16> Regs; +#ifndef NDEBUG + bool ChangedFormulae = false; +#endif + + // Collect the best formula for each unique set of shared registers. This + // is reset for each use. 
+ typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo> + BestFormulaeTy; + BestFormulaeTy BestFormulae; + + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n'); + + bool Any = false; + for (size_t FIdx = 0, NumForms = LU.Formulae.size(); + FIdx != NumForms; ++FIdx) { + Formula &F = LU.Formulae[FIdx]; + + SmallVector<const SCEV *, 2> Key; + for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(), + JE = F.BaseRegs.end(); J != JE; ++J) { + const SCEV *Reg = *J; + if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx)) + Key.push_back(Reg); + } + if (F.ScaledReg && + RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx)) + Key.push_back(F.ScaledReg); + // Unstable sort by host order ok, because this is only used for + // uniquifying. + std::sort(Key.begin(), Key.end()); + + std::pair<BestFormulaeTy::const_iterator, bool> P = + BestFormulae.insert(std::make_pair(Key, FIdx)); + if (!P.second) { + Formula &Best = LU.Formulae[P.first->second]; + + Cost CostF; + CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT); + Regs.clear(); + Cost CostBest; + CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); + Regs.clear(); + if (CostF < CostBest) + std::swap(F, Best); + DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); + dbgs() << "\n" + " in favor of formula "; Best.print(dbgs()); + dbgs() << '\n'); +#ifndef NDEBUG + ChangedFormulae = true; +#endif + LU.DeleteFormula(F); + --FIdx; + --NumForms; + Any = true; + continue; + } + } + + // Now that we've filtered out some formulae, recompute the Regs set. + if (Any) + LU.RecomputeRegs(LUIdx, RegUses); + + // Reset this to prepare for the next use. 
+ BestFormulae.clear(); + } + + DEBUG(if (ChangedFormulae) { + dbgs() << "\n" + "After filtering out undesirable candidates:\n"; + print_uses(dbgs()); + }); +} + +// This is a rough guess that seems to work fairly well. +static const size_t ComplexityLimit = UINT16_MAX; + +/// EstimateSearchSpaceComplexity - Estimate the worst-case number of +/// solutions the solver might have to consider. It almost never considers +/// this many solutions because it prune the search space, but the pruning +/// isn't always sufficient. +size_t LSRInstance::EstimateSearchSpaceComplexity() const { + size_t Power = 1; + for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), + E = Uses.end(); I != E; ++I) { + size_t FSize = I->Formulae.size(); + if (FSize >= ComplexityLimit) { + Power = ComplexityLimit; + break; + } + Power *= FSize; + if (Power >= ComplexityLimit) + break; + } + return Power; +} + +/// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset +/// of the registers of another formula, it won't help reduce register +/// pressure (though it may not necessarily hurt register pressure); remove +/// it to simplify the system. +void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { + if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { + DEBUG(dbgs() << "The search space is too complex.\n"); + + DEBUG(dbgs() << "Narrowing the search space by eliminating formulae " + "which use a superset of registers used by other " + "formulae.\n"); + + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + bool Any = false; + for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { + Formula &F = LU.Formulae[i]; + // Look for a formula with a constant or GV in a register. If the use + // also has a formula with that same value in an immediate field, + // delete the one that uses a register. 
+ for (SmallVectorImpl<const SCEV *>::const_iterator + I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) { + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) { + Formula NewF = F; + NewF.AM.BaseOffs += C->getValue()->getSExtValue(); + NewF.BaseRegs.erase(NewF.BaseRegs.begin() + + (I - F.BaseRegs.begin())); + if (LU.HasFormulaWithSameRegs(NewF)) { + DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); + LU.DeleteFormula(F); + --i; + --e; + Any = true; + break; + } + } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) { + if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) + if (!F.AM.BaseGV) { + Formula NewF = F; + NewF.AM.BaseGV = GV; + NewF.BaseRegs.erase(NewF.BaseRegs.begin() + + (I - F.BaseRegs.begin())); + if (LU.HasFormulaWithSameRegs(NewF)) { + DEBUG(dbgs() << " Deleting "; F.print(dbgs()); + dbgs() << '\n'); + LU.DeleteFormula(F); + --i; + --e; + Any = true; + break; + } + } + } + } + } + if (Any) + LU.RecomputeRegs(LUIdx, RegUses); + } + + DEBUG(dbgs() << "After pre-selection:\n"; + print_uses(dbgs())); + } +} + +/// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers +/// for expressions like A, A+1, A+2, etc., allocate a single register for +/// them. +void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { + if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { + DEBUG(dbgs() << "The search space is too complex.\n"); + + DEBUG(dbgs() << "Narrowing the search space by assuming that uses " + "separated by a constant offset will use the same " + "registers.\n"); + + // This is especially useful for unrolled loops. 
+ + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), + E = LU.Formulae.end(); I != E; ++I) { + const Formula &F = *I; + if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) { + if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) { + if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs, + /*HasBaseReg=*/false, + LU.Kind, LU.AccessTy)) { + DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); + dbgs() << '\n'); + + LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; + + // Update the relocs to reference the new use. + for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(), + E = Fixups.end(); I != E; ++I) { + LSRFixup &Fixup = *I; + if (Fixup.LUIdx == LUIdx) { + Fixup.LUIdx = LUThatHas - &Uses.front(); + Fixup.Offset += F.AM.BaseOffs; + // Add the new offset to LUThatHas' offset list. + if (LUThatHas->Offsets.back() != Fixup.Offset) { + LUThatHas->Offsets.push_back(Fixup.Offset); + if (Fixup.Offset > LUThatHas->MaxOffset) + LUThatHas->MaxOffset = Fixup.Offset; + if (Fixup.Offset < LUThatHas->MinOffset) + LUThatHas->MinOffset = Fixup.Offset; + } + DEBUG(dbgs() << "New fixup has offset " + << Fixup.Offset << '\n'); + } + if (Fixup.LUIdx == NumUses-1) + Fixup.LUIdx = LUIdx; + } + + // Delete formulae from the new use which are no longer legal. + bool Any = false; + for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { + Formula &F = LUThatHas->Formulae[i]; + if (!isLegalUse(F.AM, + LUThatHas->MinOffset, LUThatHas->MaxOffset, + LUThatHas->Kind, LUThatHas->AccessTy, TLI)) { + DEBUG(dbgs() << " Deleting "; F.print(dbgs()); + dbgs() << '\n'); + LUThatHas->DeleteFormula(F); + --i; + --e; + Any = true; + } + } + if (Any) + LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses); + + // Delete the old use. 
+              DeleteUse(LU, LUIdx);
+              --LUIdx;
+              --NumUses;
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    DEBUG(dbgs() << "After pre-selection:\n";
+          print_uses(dbgs()));
+  }
+}
+
+/// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call
+/// FilterOutUndesirableDedicatedRegisters again, if necessary, now that
+/// we've done more filtering, as it may be able to find more formulae to
+/// eliminate.
+void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
+  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+    DEBUG(dbgs() << "The search space is too complex.\n");
+
+    DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
+                    "undesirable dedicated registers.\n");
+
+    FilterOutUndesirableDedicatedRegisters();
+
+    DEBUG(dbgs() << "After pre-selection:\n";
+          print_uses(dbgs()));
+  }
+}
+
+/// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely
+/// to be profitable, and then in any use which has any reference to that
+/// register, delete all formulae which do not reference that register.
+void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
+  // With all other options exhausted, loop until the system is simple
+  // enough to handle.
+  SmallPtrSet<const SCEV *, 4> Taken;
+  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+    // Ok, we have too many formulae on our hands to conveniently handle.
+    // Use a rough heuristic to thin out the list.
+    DEBUG(dbgs() << "The search space is too complex.\n");
+
+    // Pick the register which is used by the most LSRUses, which is likely
+    // to be a good reuse register candidate.
+ const SCEV *Best = 0; + unsigned BestNum = 0; + for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end(); + I != E; ++I) { + const SCEV *Reg = *I; + if (Taken.count(Reg)) + continue; + if (!Best) + Best = Reg; + else { + unsigned Count = RegUses.getUsedByIndices(Reg).count(); + if (Count > BestNum) { + Best = Reg; + BestNum = Count; + } + } + } + + DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best + << " will yield profitable reuse.\n"); + Taken.insert(Best); + + // In any use with formulae which references this register, delete formulae + // which don't reference it. + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + if (!LU.Regs.count(Best)) continue; + + bool Any = false; + for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { + Formula &F = LU.Formulae[i]; + if (!F.referencesReg(Best)) { + DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); + LU.DeleteFormula(F); + --e; + --i; + Any = true; + assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?"); + continue; + } + } + + if (Any) + LU.RecomputeRegs(LUIdx, RegUses); + } + + DEBUG(dbgs() << "After pre-selection:\n"; + print_uses(dbgs())); + } +} + +/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of +/// formulae to choose from, use some rough heuristics to prune down the number +/// of formulae. This keeps the main solver from taking an extraordinary amount +/// of time in some worst-case scenarios. +void LSRInstance::NarrowSearchSpaceUsingHeuristics() { + NarrowSearchSpaceByDetectingSupersets(); + NarrowSearchSpaceByCollapsingUnrolledCode(); + NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); + NarrowSearchSpaceByPickingWinnerRegs(); +} + +/// SolveRecurse - This is the recursive solver. 
+void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, + Cost &SolutionCost, + SmallVectorImpl<const Formula *> &Workspace, + const Cost &CurCost, + const SmallPtrSet<const SCEV *, 16> &CurRegs, + DenseSet<const SCEV *> &VisitedRegs) const { + // Some ideas: + // - prune more: + // - use more aggressive filtering + // - sort the formula so that the most profitable solutions are found first + // - sort the uses too + // - search faster: + // - don't compute a cost, and then compare. compare while computing a cost + // and bail early. + // - track register sets with SmallBitVector + + const LSRUse &LU = Uses[Workspace.size()]; + + // If this use references any register that's already a part of the + // in-progress solution, consider it a requirement that a formula must + // reference that register in order to be considered. This prunes out + // unprofitable searching. + SmallSetVector<const SCEV *, 4> ReqRegs; + for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(), + E = CurRegs.end(); I != E; ++I) + if (LU.Regs.count(*I)) + ReqRegs.insert(*I); + + bool AnySatisfiedReqRegs = false; + SmallPtrSet<const SCEV *, 16> NewRegs; + Cost NewCost; +retry: + for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), + E = LU.Formulae.end(); I != E; ++I) { + const Formula &F = *I; + + // Ignore formulae which do not use any of the required registers. + for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(), + JE = ReqRegs.end(); J != JE; ++J) { + const SCEV *Reg = *J; + if ((!F.ScaledReg || F.ScaledReg != Reg) && + std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) == + F.BaseRegs.end()) + goto skip; + } + AnySatisfiedReqRegs = true; + + // Evaluate the cost of the current formula. If it's already worse than + // the current best, prune the search at that point. 
+ NewCost = CurCost; + NewRegs = CurRegs; + NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT); + if (NewCost < SolutionCost) { + Workspace.push_back(&F); + if (Workspace.size() != Uses.size()) { + SolveRecurse(Solution, SolutionCost, Workspace, NewCost, + NewRegs, VisitedRegs); + if (F.getNumRegs() == 1 && Workspace.size() == 1) + VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]); + } else { + DEBUG(dbgs() << "New best at "; NewCost.print(dbgs()); + dbgs() << ". Regs:"; + for (SmallPtrSet<const SCEV *, 16>::const_iterator + I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I) + dbgs() << ' ' << **I; + dbgs() << '\n'); + + SolutionCost = NewCost; + Solution = Workspace; + } + Workspace.pop_back(); + } + skip:; + } + + // If none of the formulae had all of the required registers, relax the + // constraint so that we don't exclude all formulae. + if (!AnySatisfiedReqRegs) { + assert(!ReqRegs.empty() && "Solver failed even without required registers"); + ReqRegs.clear(); + goto retry; + } +} + +/// Solve - Choose one formula from each use. Return the results in the given +/// Solution vector. +void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { + SmallVector<const Formula *, 8> Workspace; + Cost SolutionCost; + SolutionCost.Loose(); + Cost CurCost; + SmallPtrSet<const SCEV *, 16> CurRegs; + DenseSet<const SCEV *> VisitedRegs; + Workspace.reserve(Uses.size()); + + // SolveRecurse does all the work. + SolveRecurse(Solution, SolutionCost, Workspace, CurCost, + CurRegs, VisitedRegs); + + // Ok, we've now made all our decisions. 
+ DEBUG(dbgs() << "\n" + "The chosen solution requires "; SolutionCost.print(dbgs()); + dbgs() << ":\n"; + for (size_t i = 0, e = Uses.size(); i != e; ++i) { + dbgs() << " "; + Uses[i].print(dbgs()); + dbgs() << "\n" + " "; + Solution[i]->print(dbgs()); + dbgs() << '\n'; + }); + + assert(Solution.size() == Uses.size() && "Malformed solution!"); +} + +/// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up +/// the dominator tree far as we can go while still being dominated by the +/// input positions. This helps canonicalize the insert position, which +/// encourages sharing. +BasicBlock::iterator +LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, + const SmallVectorImpl<Instruction *> &Inputs) + const { + for (;;) { + const Loop *IPLoop = LI.getLoopFor(IP->getParent()); + unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0; + + BasicBlock *IDom; + for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) { + if (!Rung) return IP; + Rung = Rung->getIDom(); + if (!Rung) return IP; + IDom = Rung->getBlock(); + + // Don't climb into a loop though. + const Loop *IDomLoop = LI.getLoopFor(IDom); + unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0; + if (IDomDepth <= IPLoopDepth && + (IDomDepth != IPLoopDepth || IDomLoop == IPLoop)) + break; + } + + bool AllDominate = true; + Instruction *BetterPos = 0; + Instruction *Tentative = IDom->getTerminator(); + for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(), + E = Inputs.end(); I != E; ++I) { + Instruction *Inst = *I; + if (Inst == Tentative || !DT.dominates(Inst, Tentative)) { + AllDominate = false; + break; + } + // Attempt to find an insert position in the middle of the block, + // instead of at the end, so that it can be used for other expansions. 
+      if (IDom == Inst->getParent() &&
+          (!BetterPos || DT.dominates(BetterPos, Inst)))
+        BetterPos = llvm::next(BasicBlock::iterator(Inst));
+    }
+    if (!AllDominate)
+      break;
+    if (BetterPos)
+      IP = BetterPos;
+    else
+      IP = Tentative;
+  }
+
+  return IP;
+}
+
+/// AdjustInsertPositionForExpand - Determine an input position which will be
+/// dominated by the operands and which will dominate the result.
+BasicBlock::iterator
+LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP,
+                                           const LSRFixup &LF,
+                                           const LSRUse &LU) const {
+  // Collect some instructions which must be dominated by the
+  // expanding replacement. These must be dominated by any operands that
+  // will be required in the expansion.
+  SmallVector<Instruction *, 4> Inputs;
+  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
+    Inputs.push_back(I);
+  if (LU.Kind == LSRUse::ICmpZero)
+    if (Instruction *I =
+          dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
+      Inputs.push_back(I);
+  if (LF.PostIncLoops.count(L)) {
+    if (LF.isUseFullyOutsideLoop(L))
+      Inputs.push_back(L->getLoopLatch()->getTerminator());
+    else
+      Inputs.push_back(IVIncInsertPos);
+  }
+  // The expansion must also be dominated by the increment positions of any
+  // loops for which it is using post-inc mode.
+  for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(),
+       E = LF.PostIncLoops.end(); I != E; ++I) {
+    const Loop *PIL = *I;
+    if (PIL == L) continue;
+
+    // Be dominated by the loop exit.
+    SmallVector<BasicBlock *, 4> ExitingBlocks;
+    PIL->getExitingBlocks(ExitingBlocks);
+    if (!ExitingBlocks.empty()) {
+      BasicBlock *BB = ExitingBlocks[0];
+      for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
+        BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
+      Inputs.push_back(BB->getTerminator());
+    }
+  }
+
+  // Then, climb up the immediate dominator tree as far as we can go while
+  // still being dominated by the input positions.
+ IP = HoistInsertPosition(IP, Inputs); + + // Don't insert instructions before PHI nodes. + while (isa<PHINode>(IP)) ++IP; + + // Ignore debug intrinsics. + while (isa<DbgInfoIntrinsic>(IP)) ++IP; + + return IP; +} + +/// Expand - Emit instructions for the leading candidate expression for this +/// LSRUse (this is called "expanding"). +Value *LSRInstance::Expand(const LSRFixup &LF, + const Formula &F, + BasicBlock::iterator IP, + SCEVExpander &Rewriter, + SmallVectorImpl<WeakVH> &DeadInsts) const { + const LSRUse &LU = Uses[LF.LUIdx]; + + // Determine an input position which will be dominated by the operands and + // which will dominate the result. + IP = AdjustInsertPositionForExpand(IP, LF, LU); + + // Inform the Rewriter if we have a post-increment use, so that it can + // perform an advantageous expansion. + Rewriter.setPostInc(LF.PostIncLoops); + + // This is the type that the user actually needs. + const Type *OpTy = LF.OperandValToReplace->getType(); + // This will be the type that we'll initially expand to. + const Type *Ty = F.getType(); + if (!Ty) + // No type known; just expand directly to the ultimate type. + Ty = OpTy; + else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy)) + // Expand directly to the ultimate type if it's the right size. + Ty = OpTy; + // This is the type to do integer arithmetic in. + const Type *IntTy = SE.getEffectiveSCEVType(Ty); + + // Build up a list of operands to add together to form the full base. + SmallVector<const SCEV *, 8> Ops; + + // Expand the BaseRegs portion. + for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), + E = F.BaseRegs.end(); I != E; ++I) { + const SCEV *Reg = *I; + assert(!Reg->isZero() && "Zero allocated in a base register!"); + + // If we're expanding for a post-inc user, make the post-inc adjustment. 
+ PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); + Reg = TransformForPostIncUse(Denormalize, Reg, + LF.UserInst, LF.OperandValToReplace, + Loops, SE, DT); + + Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP))); + } + + // Flush the operand list to suppress SCEVExpander hoisting. + if (!Ops.empty()) { + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Ops.clear(); + Ops.push_back(SE.getUnknown(FullV)); + } + + // Expand the ScaledReg portion. + Value *ICmpScaledV = 0; + if (F.AM.Scale != 0) { + const SCEV *ScaledS = F.ScaledReg; + + // If we're expanding for a post-inc user, make the post-inc adjustment. + PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); + ScaledS = TransformForPostIncUse(Denormalize, ScaledS, + LF.UserInst, LF.OperandValToReplace, + Loops, SE, DT); + + if (LU.Kind == LSRUse::ICmpZero) { + // An interesting way of "folding" with an icmp is to use a negated + // scale, which we'll implement by inserting it into the other operand + // of the icmp. + assert(F.AM.Scale == -1 && + "The only scale supported by ICmpZero uses is -1!"); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP); + } else { + // Otherwise just expand the scaled register and an explicit scale, + // which is expected to be matched as part of the address. + ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); + ScaledS = SE.getMulExpr(ScaledS, + SE.getConstant(ScaledS->getType(), F.AM.Scale)); + Ops.push_back(ScaledS); + + // Flush the operand list to suppress SCEVExpander hoisting. + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Ops.clear(); + Ops.push_back(SE.getUnknown(FullV)); + } + } + + // Expand the GV portion. + if (F.AM.BaseGV) { + Ops.push_back(SE.getUnknown(F.AM.BaseGV)); + + // Flush the operand list to suppress SCEVExpander hoisting. 
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Ops.clear(); + Ops.push_back(SE.getUnknown(FullV)); + } + + // Expand the immediate portion. + int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset; + if (Offset != 0) { + if (LU.Kind == LSRUse::ICmpZero) { + // The other interesting way of "folding" with an ICmpZero is to use a + // negated immediate. + if (!ICmpScaledV) + ICmpScaledV = ConstantInt::get(IntTy, -Offset); + else { + Ops.push_back(SE.getUnknown(ICmpScaledV)); + ICmpScaledV = ConstantInt::get(IntTy, Offset); + } + } else { + // Just add the immediate values. These again are expected to be matched + // as part of the address. + Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset))); + } + } + + // Emit instructions summing all the operands. + const SCEV *FullS = Ops.empty() ? + SE.getConstant(IntTy, 0) : + SE.getAddExpr(Ops); + Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP); + + // We're done expanding now, so reset the rewriter. + Rewriter.clearPostInc(); + + // An ICmpZero Formula represents an ICmp which we're handling as a + // comparison against zero. Now that we've expanded an expression for that + // form, update the ICmp's other operand. 
+ if (LU.Kind == LSRUse::ICmpZero) { + ICmpInst *CI = cast<ICmpInst>(LF.UserInst); + DeadInsts.push_back(CI->getOperand(1)); + assert(!F.AM.BaseGV && "ICmp does not support folding a global value and " + "a scale at the same time!"); + if (F.AM.Scale == -1) { + if (ICmpScaledV->getType() != OpTy) { + Instruction *Cast = + CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false, + OpTy, false), + ICmpScaledV, OpTy, "tmp", CI); + ICmpScaledV = Cast; + } + CI->setOperand(1, ICmpScaledV); + } else { + assert(F.AM.Scale == 0 && + "ICmp does not support folding a global value and " + "a scale at the same time!"); + Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), + -(uint64_t)Offset); + if (C->getType() != OpTy) + C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, + OpTy, false), + C, OpTy); + + CI->setOperand(1, C); + } + } + + return FullV; +} + +/// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use +/// of their operands effectively happens in their predecessor blocks, so the +/// expression may need to be expanded in multiple places. +void LSRInstance::RewriteForPHI(PHINode *PN, + const LSRFixup &LF, + const Formula &F, + SCEVExpander &Rewriter, + SmallVectorImpl<WeakVH> &DeadInsts, + Pass *P) const { + DenseMap<BasicBlock *, Value *> Inserted; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (PN->getIncomingValue(i) == LF.OperandValToReplace) { + BasicBlock *BB = PN->getIncomingBlock(i); + + // If this is a critical edge, split the edge so that we do not insert + // the code on all predecessor/successor paths. We do this unless this + // is the canonical backedge for this loop, which complicates post-inc + // users. + if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 && + !isa<IndirectBrInst>(BB->getTerminator())) { + Loop *PNLoop = LI.getLoopFor(PN->getParent()); + if (!PNLoop || PN->getParent() != PNLoop->getHeader()) { + // Split the critical edge. 
+ BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P); + + // If PN is outside of the loop and BB is in the loop, we want to + // move the block to be immediately before the PHI block, not + // immediately after BB. + if (L->contains(BB) && !L->contains(PN)) + NewBB->moveBefore(PN->getParent()); + + // Splitting the edge can reduce the number of PHI entries we have. + e = PN->getNumIncomingValues(); + BB = NewBB; + i = PN->getBasicBlockIndex(BB); + } + } + + std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair = + Inserted.insert(std::make_pair(BB, static_cast<Value *>(0))); + if (!Pair.second) + PN->setIncomingValue(i, Pair.first->second); + else { + Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts); + + // If this is reuse-by-noop-cast, insert the noop cast. + const Type *OpTy = LF.OperandValToReplace->getType(); + if (FullV->getType() != OpTy) + FullV = + CastInst::Create(CastInst::getCastOpcode(FullV, false, + OpTy, false), + FullV, LF.OperandValToReplace->getType(), + "tmp", BB->getTerminator()); + + PN->setIncomingValue(i, FullV); + Pair.first->second = FullV; + } + } +} + +/// Rewrite - Emit instructions for the leading candidate expression for this +/// LSRUse (this is called "expanding"), and update the UserInst to reference +/// the newly expanded value. +void LSRInstance::Rewrite(const LSRFixup &LF, + const Formula &F, + SCEVExpander &Rewriter, + SmallVectorImpl<WeakVH> &DeadInsts, + Pass *P) const { + // First, find an insertion point that dominates UserInst. For PHI nodes, + // find the nearest block which dominates all the relevant uses. + if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) { + RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P); + } else { + Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts); + + // If this is reuse-by-noop-cast, insert the noop cast. 
+ const Type *OpTy = LF.OperandValToReplace->getType(); + if (FullV->getType() != OpTy) { + Instruction *Cast = + CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false), + FullV, OpTy, "tmp", LF.UserInst); + FullV = Cast; + } + + // Update the user. ICmpZero is handled specially here (for now) because + // Expand may have updated one of the operands of the icmp already, and + // its new value may happen to be equal to LF.OperandValToReplace, in + // which case doing replaceUsesOfWith leads to replacing both operands + // with the same value. TODO: Reorganize this. + if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero) + LF.UserInst->setOperand(0, FullV); + else + LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV); + } + + DeadInsts.push_back(LF.OperandValToReplace); +} + +/// ImplementSolution - Rewrite all the fixup locations with new values, +/// following the chosen solution. +void +LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, + Pass *P) { + // Keep track of instructions we may have made dead, so that + // we can remove them after we are done working. + SmallVector<WeakVH, 16> DeadInsts; + + SCEVExpander Rewriter(SE); + Rewriter.disableCanonicalMode(); + Rewriter.setIVIncInsertPos(L, IVIncInsertPos); + + // Expand the new value definitions and update the users. + for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(), + E = Fixups.end(); I != E; ++I) { + const LSRFixup &Fixup = *I; + + Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P); + + Changed = true; + } + + // Clean up after ourselves. This must be done before deleting any + // instructions. 
+ Rewriter.clear(); + + Changed |= DeleteTriviallyDeadInstructions(DeadInsts); +} + +LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) + : IU(P->getAnalysis<IVUsers>()), + SE(P->getAnalysis<ScalarEvolution>()), + DT(P->getAnalysis<DominatorTree>()), + LI(P->getAnalysis<LoopInfo>()), + TLI(tli), L(l), Changed(false), IVIncInsertPos(0) { + + // If LoopSimplify form is not available, stay out of trouble. + if (!L->isLoopSimplifyForm()) return; + + // If there's no interesting work to be done, bail early. + if (IU.empty()) return; + + DEBUG(dbgs() << "\nLSR on loop "; + WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false); + dbgs() << ":\n"); + + // First, perform some low-level loop optimizations. + OptimizeShadowIV(); + OptimizeLoopTermCond(); + + // Start collecting data and preparing for the solver. + CollectInterestingTypesAndFactors(); + CollectFixupsAndInitialFormulae(); + CollectLoopInvariantFixupsAndFormulae(); + + DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n"; + print_uses(dbgs())); + + // Now use the reuse data to generate a bunch of interesting ways + // to formulate the values needed for the uses. + GenerateAllReuseFormulae(); + + FilterOutUndesirableDedicatedRegisters(); + NarrowSearchSpaceUsingHeuristics(); + + SmallVector<const Formula *, 8> Solution; + Solve(Solution); + + // Release memory that is no longer needed. + Factors.clear(); + Types.clear(); + RegUses.clear(); + +#ifndef NDEBUG + // Formulae should be legal. + for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), + E = Uses.end(); I != E; ++I) { + const LSRUse &LU = *I; + for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), + JE = LU.Formulae.end(); J != JE; ++J) + assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset, + LU.Kind, LU.AccessTy, TLI) && + "Illegal formula generated!"); + }; +#endif + + // Now that we've decided what we want, make it so. 
+ ImplementSolution(Solution, P); +} + +void LSRInstance::print_factors_and_types(raw_ostream &OS) const { + if (Factors.empty() && Types.empty()) return; + + OS << "LSR has identified the following interesting factors and types: "; + bool First = true; + + for (SmallSetVector<int64_t, 8>::const_iterator + I = Factors.begin(), E = Factors.end(); I != E; ++I) { + if (!First) OS << ", "; + First = false; + OS << '*' << *I; + } + + for (SmallSetVector<const Type *, 4>::const_iterator + I = Types.begin(), E = Types.end(); I != E; ++I) { + if (!First) OS << ", "; + First = false; + OS << '(' << **I << ')'; + } + OS << '\n'; +} + +void LSRInstance::print_fixups(raw_ostream &OS) const { + OS << "LSR is examining the following fixup sites:\n"; + for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(), + E = Fixups.end(); I != E; ++I) { + dbgs() << " "; + I->print(OS); + OS << '\n'; + } +} + +void LSRInstance::print_uses(raw_ostream &OS) const { + OS << "LSR is examining the following uses:\n"; + for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), + E = Uses.end(); I != E; ++I) { + const LSRUse &LU = *I; + dbgs() << " "; + LU.print(OS); + OS << '\n'; + for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), + JE = LU.Formulae.end(); J != JE; ++J) { + OS << " "; + J->print(OS); + OS << '\n'; + } + } +} + +void LSRInstance::print(raw_ostream &OS) const { + print_factors_and_types(OS); + print_fixups(OS); + print_uses(OS); +} + +void LSRInstance::dump() const { + print(errs()); errs() << '\n'; +} + +namespace { + +class LoopStrengthReduce : public LoopPass { + /// TLI - Keep a pointer of a TargetLowering to consult for determining + /// transformation profitability. 
+ const TargetLowering *const TLI; + +public: + static char ID; // Pass ID, replacement for typeid + explicit LoopStrengthReduce(const TargetLowering *tli = 0); + +private: + bool runOnLoop(Loop *L, LPPassManager &LPM); + void getAnalysisUsage(AnalysisUsage &AU) const; +}; + +} + +char LoopStrengthReduce::ID = 0; +INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", + "Loop Strength Reduction", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(IVUsers) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce", + "Loop Strength Reduction", false, false) + + +Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { + return new LoopStrengthReduce(TLI); +} + +LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli) + : LoopPass(ID), TLI(tli) { + initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry()); + } + +void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { + // We split critical edges, so we change the CFG. However, we do update + // many analyses if they are around. + AU.addPreservedID(LoopSimplifyID); + + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + AU.addRequired<ScalarEvolution>(); + AU.addPreserved<ScalarEvolution>(); + // Requiring LoopSimplify a second time here prevents IVUsers from running + // twice, since LoopSimplify was invalidated by running ScalarEvolution. + AU.addRequiredID(LoopSimplifyID); + AU.addRequired<IVUsers>(); + AU.addPreserved<IVUsers>(); +} + +bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { + bool Changed = false; + + // Run the main LSR transformation. 
+ Changed |= LSRInstance(TLI, L, this).getChanged(); + + // At this point, it is worth checking to see if any recurrence PHIs are also + // dead, so that we can remove them as well. + Changed |= DeleteDeadPHIs(L->getHeader()); + + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp new file mode 100644 index 0000000..80b263a --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -0,0 +1,182 @@ +//===-- LoopUnroll.cpp - Loop unroller pass -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements a simple loop unroller. It works best when loops have +// been canonicalized by the -indvars pass, allowing it to determine the trip +// counts of loops easily. 
+//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-unroll" +#include "llvm/IntrinsicInst.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/UnrollLoop.h" +#include <climits> + +using namespace llvm; + +static cl::opt<unsigned> +UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden, + cl::desc("The cut-off point for automatic loop unrolling")); + +static cl::opt<unsigned> +UnrollCount("unroll-count", cl::init(0), cl::Hidden, + cl::desc("Use this unroll count for all loops, for testing purposes")); + +static cl::opt<bool> +UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden, + cl::desc("Allows loops to be partially unrolled until " + "-unroll-threshold loop size is reached.")); + +namespace { + class LoopUnroll : public LoopPass { + public: + static char ID; // Pass ID, replacement for typeid + LoopUnroll() : LoopPass(ID) { + initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); + } + + /// A magic value for use with the Threshold parameter to indicate + /// that the loop unroll should be performed regardless of how much + /// code expansion would result. + static const unsigned NoThreshold = UINT_MAX; + + // Threshold to use when optsize is specified (and there is no + // explicit -unroll-threshold). + static const unsigned OptSizeUnrollThreshold = 50; + + unsigned CurrentThreshold; + + bool runOnLoop(Loop *L, LPPassManager &LPM); + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG... 
+ /// + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addPreserved<ScalarEvolution>(); + // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. + // If loop unroll does not preserve dom info then LCSSA pass on next + // loop will receive invalid dom info. + // For now, recreate dom info, if loop is unrolled. + AU.addPreserved<DominatorTree>(); + } + }; +} + +char LoopUnroll::ID = 0; +INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) + +Pass *llvm::createLoopUnrollPass() { return new LoopUnroll(); } + +/// ApproximateLoopSize - Approximate the size of the loop. +static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls) { + CodeMetrics Metrics; + for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); + I != E; ++I) + Metrics.analyzeBasicBlock(*I); + NumCalls = Metrics.NumInlineCandidates; + + unsigned LoopSize = Metrics.NumInsts; + + // Don't allow an estimate of size zero. This would allow unrolling of loops + // with huge iteration counts, which is a compile time problem even if it's + // not a problem for code quality. + if (LoopSize == 0) LoopSize = 1; + + return LoopSize; +} + +bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { + LoopInfo *LI = &getAnalysis<LoopInfo>(); + + BasicBlock *Header = L->getHeader(); + DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() + << "] Loop %" << Header->getName() << "\n"); + (void)Header; + + // Determine the current unrolling threshold. 
While this is normally set + // from UnrollThreshold, it is overridden to a smaller value if the current + // function is marked as optimize-for-size, and the unroll threshold was + // not user specified. + CurrentThreshold = UnrollThreshold; + if (Header->getParent()->hasFnAttr(Attribute::OptimizeForSize) && + UnrollThreshold.getNumOccurrences() == 0) + CurrentThreshold = OptSizeUnrollThreshold; + + // Find trip count + unsigned TripCount = L->getSmallConstantTripCount(); + unsigned Count = UnrollCount; + + // Automatically select an unroll count. + if (Count == 0) { + // Conservative heuristic: if we know the trip count, see if we can + // completely unroll (subject to the threshold, checked below); otherwise + // try to find greatest modulo of the trip count which is still under + // threshold value. + if (TripCount == 0) + return false; + Count = TripCount; + } + + // Enforce the threshold. + if (CurrentThreshold != NoThreshold) { + unsigned NumInlineCandidates; + unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates); + DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); + if (NumInlineCandidates != 0) { + DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); + return false; + } + uint64_t Size = (uint64_t)LoopSize*Count; + if (TripCount != 1 && Size > CurrentThreshold) { + DEBUG(dbgs() << " Too large to fully unroll with count: " << Count + << " because size: " << Size << ">" << CurrentThreshold << "\n"); + if (!UnrollAllowPartial) { + DEBUG(dbgs() << " will not try to unroll partially because " + << "-unroll-allow-partial not given\n"); + return false; + } + // Reduce unroll count to be modulo of TripCount for partial unrolling + Count = CurrentThreshold / LoopSize; + while (Count != 0 && TripCount%Count != 0) { + Count--; + } + if (Count < 2) { + DEBUG(dbgs() << " could not unroll partially\n"); + return false; + } + DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n"); + } + } + + // Unroll the loop. 
+ Function *F = L->getHeader()->getParent(); + if (!UnrollLoop(L, Count, LI, &LPM)) + return false; + + // FIXME: Reconstruct dom info, because it is not preserved properly. + if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) + DT->runOnFunction(*F); + return true; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp new file mode 100644 index 0000000..b4e3d31 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -0,0 +1,1045 @@ +//===-- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass transforms loops that contain branches on loop-invariant conditions +// to have multiple loops. For example, it turns the left into the right code: +// +// for (...) if (lic) +// A for (...) +// if (lic) A; B; C +// B else +// C for (...) +// A; C +// +// This can increase the size of the code exponentially (doubling it every time +// a loop is unswitched) so we only unswitch if the resultant code will be +// smaller than a threshold. +// +// This pass expects LICM to be run before it to hoist invariant conditions out +// of the loop, to make the unswitching opportunity obvious. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-unswitch" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <set> +using namespace llvm; + +STATISTIC(NumBranches, "Number of branches unswitched"); +STATISTIC(NumSwitches, "Number of switches unswitched"); +STATISTIC(NumSelects , "Number of selects unswitched"); +STATISTIC(NumTrivial , "Number of unswitches that are trivial"); +STATISTIC(NumSimplify, "Number of simplifications of unswitched code"); + +// The specific value of 50 here was chosen based only on intuition and a +// few specific examples. +static cl::opt<unsigned> +Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), + cl::init(50), cl::Hidden); + +namespace { + class LoopUnswitch : public LoopPass { + LoopInfo *LI; // Loop information + LPPassManager *LPM; + + // LoopProcessWorklist - Used to check if second loop needs processing + // after RewriteLoopBodyWithConditionConstant rewrites first loop. 
+ std::vector<Loop*> LoopProcessWorklist; + SmallPtrSet<Value *,8> UnswitchedVals; + + bool OptimizeForSize; + bool redoLoop; + + Loop *currentLoop; + DominatorTree *DT; + BasicBlock *loopHeader; + BasicBlock *loopPreheader; + + // LoopBlocks contains all of the basic blocks of the loop, including the + // preheader of the loop, the body of the loop, and the exit blocks of the + // loop, in that order. + std::vector<BasicBlock*> LoopBlocks; + // NewBlocks contained cloned copy of basic blocks from LoopBlocks. + std::vector<BasicBlock*> NewBlocks; + + public: + static char ID; // Pass ID, replacement for typeid + explicit LoopUnswitch(bool Os = false) : + LoopPass(ID), OptimizeForSize(Os), redoLoop(false), + currentLoop(NULL), DT(NULL), loopHeader(NULL), + loopPreheader(NULL) { + initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM); + bool processCurrentLoop(); + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG. + /// + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addPreserved<DominatorTree>(); + AU.addPreserved<ScalarEvolution>(); + } + + private: + + virtual void releaseMemory() { + UnswitchedVals.clear(); + } + + /// RemoveLoopFromWorklist - If the specified loop is on the loop worklist, + /// remove it. + void RemoveLoopFromWorklist(Loop *L) { + std::vector<Loop*>::iterator I = std::find(LoopProcessWorklist.begin(), + LoopProcessWorklist.end(), L); + if (I != LoopProcessWorklist.end()) + LoopProcessWorklist.erase(I); + } + + void initLoopData() { + loopHeader = currentLoop->getHeader(); + loopPreheader = currentLoop->getLoopPreheader(); + } + + /// Split all of the edges from inside the loop to their exit blocks. 
+ /// Update the appropriate Phi nodes as we do so. + void SplitExitEdges(Loop *L, const SmallVector<BasicBlock *, 8> &ExitBlocks); + + bool UnswitchIfProfitable(Value *LoopCond, Constant *Val); + void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, + BasicBlock *ExitBlock); + void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L); + + void RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, + Constant *Val, bool isEqual); + + void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, + BasicBlock *TrueDest, + BasicBlock *FalseDest, + Instruction *InsertPt); + + void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L); + void RemoveBlockIfDead(BasicBlock *BB, + std::vector<Instruction*> &Worklist, Loop *l); + void RemoveLoopFromHierarchy(Loop *L); + bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0, + BasicBlock **LoopExit = 0); + + }; +} +char LoopUnswitch::ID = 0; +INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops", + false, false) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops", + false, false) + +Pass *llvm::createLoopUnswitchPass(bool Os) { + return new LoopUnswitch(Os); +} + +/// FindLIVLoopCondition - Cond is a condition that occurs in L. If it is +/// invariant in the loop, or has an invariant piece, return the invariant. +/// Otherwise, return null. +static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { + // We can never unswitch on vector conditions. + if (Cond->getType()->isVectorTy()) + return 0; + + // Constants should be folded, not unswitched on! + if (isa<Constant>(Cond)) return 0; + + // TODO: Handle: br (VARIANT|INVARIANT). + + // Hoist simple values out. 
+ if (L->makeLoopInvariant(Cond, Changed)) + return Cond; + + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond)) + if (BO->getOpcode() == Instruction::And || + BO->getOpcode() == Instruction::Or) { + // If either the left or right side is invariant, we can unswitch on this, + // which will cause the branch to go away in one loop and the condition to + // simplify in the other one. + if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed)) + return LHS; + if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed)) + return RHS; + } + + return 0; +} + +bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { + LI = &getAnalysis<LoopInfo>(); + LPM = &LPM_Ref; + DT = getAnalysisIfAvailable<DominatorTree>(); + currentLoop = L; + Function *F = currentLoop->getHeader()->getParent(); + bool Changed = false; + do { + assert(currentLoop->isLCSSAForm(*DT)); + redoLoop = false; + Changed |= processCurrentLoop(); + } while(redoLoop); + + if (Changed) { + // FIXME: Reconstruct dom info, because it is not preserved properly. + if (DT) + DT->runOnFunction(*F); + } + return Changed; +} + +/// processCurrentLoop - Do actual work and unswitch loop if possible +/// and profitable. +bool LoopUnswitch::processCurrentLoop() { + bool Changed = false; + LLVMContext &Context = currentLoop->getHeader()->getContext(); + + // Loop over all of the basic blocks in the loop. If we find an interior + // block that is branching on a loop-invariant condition, we can unswitch this + // loop. + for (Loop::block_iterator I = currentLoop->block_begin(), + E = currentLoop->block_end(); I != E; ++I) { + TerminatorInst *TI = (*I)->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + // If this isn't branching on an invariant condition, we can't unswitch + // it. + if (BI->isConditional()) { + // See if this, or some part of it, is loop invariant. If so, we can + // unswitch on it if we desire. 
+ Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), + currentLoop, Changed); + if (LoopCond && UnswitchIfProfitable(LoopCond, + ConstantInt::getTrue(Context))) { + ++NumBranches; + return true; + } + } + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), + currentLoop, Changed); + if (LoopCond && SI->getNumCases() > 1) { + // Find a value to unswitch on: + // FIXME: this should choose the most expensive case! + Constant *UnswitchVal = SI->getCaseValue(1); + // Do not process same value again and again. + if (!UnswitchedVals.insert(UnswitchVal)) + continue; + + if (UnswitchIfProfitable(LoopCond, UnswitchVal)) { + ++NumSwitches; + return true; + } + } + } + + // Scan the instructions to check for unswitchable values. + for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); + BBI != E; ++BBI) + if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) { + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), + currentLoop, Changed); + if (LoopCond && UnswitchIfProfitable(LoopCond, + ConstantInt::getTrue(Context))) { + ++NumSelects; + return true; + } + } + } + return Changed; +} + +/// isTrivialLoopExitBlock - Check to see if all paths from BB exit the +/// loop with no side effects (including infinite loops). +/// +/// If true, we return true and set ExitBB to the block we +/// exit through. +/// +static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, + BasicBlock *&ExitBB, + std::set<BasicBlock*> &Visited) { + if (!Visited.insert(BB).second) { + // Already visited. Without more analysis, this could indicate an infinite loop. + return false; + } else if (!L->contains(BB)) { + // Otherwise, this is a loop exit, this is fine so long as this is the + // first exit. + if (ExitBB != 0) return false; + ExitBB = BB; + return true; + } + + // Otherwise, this is an unvisited intra-loop node. Check all successors. 
+ for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) { + // Check to see if the successor is a trivial loop exit. + if (!isTrivialLoopExitBlockHelper(L, *SI, ExitBB, Visited)) + return false; + } + + // Okay, everything after this looks good, check to make sure that this block + // doesn't include any side effects. + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + if (I->mayHaveSideEffects()) + return false; + + return true; +} + +/// isTrivialLoopExitBlock - Return true if the specified block unconditionally +/// leads to an exit from the specified loop, and has no side-effects in the +/// process. If so, return the block that is exited to, otherwise return null. +static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { + std::set<BasicBlock*> Visited; + Visited.insert(L->getHeader()); // Branches to header make infinite loops. + BasicBlock *ExitBB = 0; + if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited)) + return ExitBB; + return 0; +} + +/// IsTrivialUnswitchCondition - Check to see if this unswitch condition is +/// trivial: that is, that the condition controls whether or not the loop does +/// anything at all. If this is a trivial condition, unswitching produces no +/// code duplications (equivalently, it produces a simpler loop and a new empty +/// loop, which gets deleted). +/// +/// If this is a trivial condition, return true, otherwise return false. When +/// returning true, this sets Cond and Val to the condition that controls the +/// trivial condition: when Cond dynamically equals Val, the loop is known to +/// exit. Finally, this sets LoopExit to the BB that the loop exits to when +/// Cond == Val. 
+/// +bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, + BasicBlock **LoopExit) { + BasicBlock *Header = currentLoop->getHeader(); + TerminatorInst *HeaderTerm = Header->getTerminator(); + LLVMContext &Context = Header->getContext(); + + BasicBlock *LoopExitBB = 0; + if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) { + // If the header block doesn't end with a conditional branch on Cond, we + // can't handle it. + if (!BI->isConditional() || BI->getCondition() != Cond) + return false; + + // Check to see if a successor of the branch is guaranteed to + // exit through a unique exit block without having any + // side-effects. If so, determine the value of Cond that causes it to do + // this. + if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + BI->getSuccessor(0)))) { + if (Val) *Val = ConstantInt::getTrue(Context); + } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + BI->getSuccessor(1)))) { + if (Val) *Val = ConstantInt::getFalse(Context); + } + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) { + // If this isn't a switch on Cond, we can't handle it. + if (SI->getCondition() != Cond) return false; + + // Check to see if a successor of the switch is guaranteed to go to the + // latch block or exit through a one exit block without having any + // side-effects. If so, determine the value of Cond that causes it to do + // this. Note that we can't trivially unswitch on the default case. + for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i) + if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + SI->getSuccessor(i)))) { + // Okay, we found a trivial case, remember the value that is trivial. + if (Val) *Val = SI->getCaseValue(i); + break; + } + } + + // If we didn't find a single unique LoopExit block, or if the loop exit block + // contains phi nodes, this isn't trivial. + if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) + return false; // Can't handle this. 
+ + if (LoopExit) *LoopExit = LoopExitBB; + + // We already know that nothing uses any scalar values defined inside of this + // loop. As such, we just have to check to see if this loop will execute any + // side-effecting instructions (e.g. stores, calls, volatile loads) in the + // part of the loop that the code *would* execute. We already checked the + // tail, check the header now. + for (BasicBlock::iterator I = Header->begin(), E = Header->end(); I != E; ++I) + if (I->mayHaveSideEffects()) + return false; + return true; +} + +/// UnswitchIfProfitable - We have found that we can unswitch currentLoop when +/// LoopCond == Val to simplify the loop. If we decide that this is profitable, +/// unswitch the loop, reprocess the pieces, then return true. +bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { + + initLoopData(); + + // If LoopSimplify was unable to form a preheader, don't do any unswitching. + if (!loopPreheader) + return false; + + Function *F = loopHeader->getParent(); + + Constant *CondVal = 0; + BasicBlock *ExitBlock = 0; + if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) { + // If the condition is trivial, always unswitch. There is no code growth + // for this case. + UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, ExitBlock); + return true; + } + + // Check to see if it would be profitable to unswitch current loop. + + // Do not do non-trivial unswitch while optimizing for size. + if (OptimizeForSize || F->hasFnAttr(Attribute::OptimizeForSize)) + return false; + + // FIXME: This is overly conservative because it does not take into + // consideration code simplification opportunities and code that can + // be shared by the resultant unswitched loops. 
+ CodeMetrics Metrics; + for (Loop::block_iterator I = currentLoop->block_begin(), + E = currentLoop->block_end(); + I != E; ++I) + Metrics.analyzeBasicBlock(*I); + + // Limit the number of instructions to avoid causing significant code + // expansion, and the number of basic blocks, to avoid loops with + // large numbers of branches which cause loop unswitching to go crazy. + // This is a very ad-hoc heuristic. + if (Metrics.NumInsts > Threshold || + Metrics.NumBlocks * 5 > Threshold || + Metrics.containsIndirectBr || Metrics.isRecursive) { + DEBUG(dbgs() << "NOT unswitching loop %" + << currentLoop->getHeader()->getName() << ", cost too high: " + << currentLoop->getBlocks().size() << "\n"); + return false; + } + + UnswitchNontrivialCondition(LoopCond, Val, currentLoop); + return true; +} + +/// CloneLoop - Recursively clone the specified loop and all of its children, +/// mapping the blocks with the specified map. +static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, + LoopInfo *LI, LPPassManager *LPM) { + Loop *New = new Loop(); + LPM->insertLoop(New, PL); + + // Add all of the blocks in L to the new loop. + for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); + I != E; ++I) + if (LI->getLoopFor(*I) == L) + New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), LI->getBase()); + + // Add all of the subloops to the new loop. + for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) + CloneLoop(*I, New, VM, LI, LPM); + + return New; +} + +/// EmitPreheaderBranchOnCondition - Emit a conditional branch on two values +/// if LIC == Val, branch to TrueDst, otherwise branch to FalseDest. Insert the +/// code immediately before InsertPt. +void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, + BasicBlock *TrueDest, + BasicBlock *FalseDest, + Instruction *InsertPt) { + // Insert a conditional branch on LIC to the two preheaders. The original + // code is the true version and the new code is the false version. 
+ Value *BranchVal = LIC; + if (!isa<ConstantInt>(Val) || + Val->getType() != Type::getInt1Ty(LIC->getContext())) + BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val, "tmp"); + else if (Val != ConstantInt::getTrue(Val->getContext())) + // We want to enter the new loop when the condition is true. + std::swap(TrueDest, FalseDest); + + // Insert the new branch. + BranchInst *BI = BranchInst::Create(TrueDest, FalseDest, BranchVal, InsertPt); + + // If either edge is critical, split it. This helps preserve LoopSimplify + // form for enclosing loops. + SplitCriticalEdge(BI, 0, this); + SplitCriticalEdge(BI, 1, this); +} + +/// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable +/// condition in it (a cond branch from its header block to its latch block, +/// where the path through the loop that doesn't execute its body has no +/// side-effects), unswitch it. This doesn't involve any code duplication, just +/// moving the conditional branch outside of the loop and updating loop info. +void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, + Constant *Val, + BasicBlock *ExitBlock) { + DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %" + << loopHeader->getName() << " [" << L->getBlocks().size() + << " blocks] in Function " << L->getHeader()->getParent()->getName() + << " on cond: " << *Val << " == " << *Cond << "\n"); + + // First step, split the preheader, so that we know that there is a safe place + // to insert the conditional branch. We will change loopPreheader to have a + // conditional branch on Cond. + BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, this); + + // Now that we have a place to insert the conditional branch, create a place + // to branch to: this is the exit block out of the loop that we should + // short-circuit to. 
+ + // Split this block now, so that the loop maintains its exit block, and so + // that the jump from the preheader can execute the contents of the exit block + // without actually branching to it (the exit block should be dominated by the + // loop header, not the preheader). + assert(!L->contains(ExitBlock) && "Exit block is in the loop?"); + BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), this); + + // Okay, now we have a position to branch from and a position to branch to, + // insert the new conditional branch. + EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, + loopPreheader->getTerminator()); + LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L); + loopPreheader->getTerminator()->eraseFromParent(); + + // We need to reprocess this loop, it could be unswitched again. + redoLoop = true; + + // Now that we know that the loop is never entered when this condition is a + // particular value, rewrite the loop with this info. We know that this will + // at least eliminate the old branch. + RewriteLoopBodyWithConditionConstant(L, Cond, Val, false); + ++NumTrivial; +} + +/// SplitExitEdges - Split all of the edges from inside the loop to their exit +/// blocks. Update the appropriate Phi nodes as we do so. +void LoopUnswitch::SplitExitEdges(Loop *L, + const SmallVector<BasicBlock *, 8> &ExitBlocks){ + + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { + BasicBlock *ExitBlock = ExitBlocks[i]; + SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock), + pred_end(ExitBlock)); + SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(), + ".us-lcssa", this); + } +} + +/// UnswitchNontrivialCondition - We determined that the loop is profitable +/// to unswitch when LIC equal Val. Split it into loop versions and test the +/// condition outside of either loop. Return the loops created as Out1/Out2. 
+void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, + Loop *L) { + Function *F = loopHeader->getParent(); + DEBUG(dbgs() << "loop-unswitch: Unswitching loop %" + << loopHeader->getName() << " [" << L->getBlocks().size() + << " blocks] in Function " << F->getName() + << " when '" << *Val << "' == " << *LIC << "\n"); + + if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) + SE->forgetLoop(L); + + LoopBlocks.clear(); + NewBlocks.clear(); + + // First step, split the preheader and exit blocks, and add these blocks to + // the LoopBlocks list. + BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, this); + LoopBlocks.push_back(NewPreheader); + + // We want the loop to come after the preheader, but before the exit blocks. + LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end()); + + SmallVector<BasicBlock*, 8> ExitBlocks; + L->getUniqueExitBlocks(ExitBlocks); + + // Split all of the edges from inside the loop to their exit blocks. Update + // the appropriate Phi nodes as we do so. + SplitExitEdges(L, ExitBlocks); + + // The exit blocks may have been changed due to edge splitting, recompute. + ExitBlocks.clear(); + L->getUniqueExitBlocks(ExitBlocks); + + // Add exit blocks to the loop blocks. + LoopBlocks.insert(LoopBlocks.end(), ExitBlocks.begin(), ExitBlocks.end()); + + // Next step, clone all of the basic blocks that make up the loop (including + // the loop preheader and exit blocks), keeping track of the mapping between + // the instructions and blocks. + NewBlocks.reserve(LoopBlocks.size()); + ValueToValueMapTy VMap; + for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) { + BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F); + NewBlocks.push_back(NewBB); + VMap[LoopBlocks[i]] = NewBB; // Keep the BB mapping. + LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L); + } + + // Splice the newly inserted blocks into the function right before the + // original preheader. 
+ F->getBasicBlockList().splice(NewPreheader, F->getBasicBlockList(), + NewBlocks[0], F->end()); + + // Now we create the new Loop object for the versioned loop. + Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM); + Loop *ParentLoop = L->getParentLoop(); + if (ParentLoop) { + // Make sure to add the cloned preheader and exit blocks to the parent loop + // as well. + ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase()); + } + + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { + BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]); + // The new exit block should be in the same loop as the old one. + if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i])) + ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase()); + + assert(NewExit->getTerminator()->getNumSuccessors() == 1 && + "Exit block should have been split to have one successor!"); + BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0); + + // If the successor of the exit block had PHI nodes, add an entry for + // NewExit. + PHINode *PN; + for (BasicBlock::iterator I = ExitSucc->begin(); isa<PHINode>(I); ++I) { + PN = cast<PHINode>(I); + Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]); + ValueToValueMapTy::iterator It = VMap.find(V); + if (It != VMap.end()) V = It->second; + PN->addIncoming(V, NewExit); + } + } + + // Rewrite the code to refer to itself. + for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) + for (BasicBlock::iterator I = NewBlocks[i]->begin(), + E = NewBlocks[i]->end(); I != E; ++I) + RemapInstruction(I, VMap,RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); + + // Rewrite the original preheader to select between versions of the loop. + BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator()); + assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] && + "Preheader splitting did not work correctly!"); + + // Emit the new branch that selects between the two versions of this loop. 
+ EmitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR); + LPM->deleteSimpleAnalysisValue(OldBR, L); + OldBR->eraseFromParent(); + + LoopProcessWorklist.push_back(NewLoop); + redoLoop = true; + + // Keep a WeakVH holding onto LIC. If the first call to RewriteLoopBody + // deletes the instruction (for example by simplifying a PHI that feeds into + // the condition that we're unswitching on), we don't rewrite the second + // iteration. + WeakVH LICHandle(LIC); + + // Now we rewrite the original code to know that the condition is true and the + // new code to know that the condition is false. + RewriteLoopBodyWithConditionConstant(L, LIC, Val, false); + + // It's possible that simplifying one loop could cause the other to be + // changed to another value or a constant. If its a constant, don't simplify + // it. + if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop && + LICHandle && !isa<Constant>(LICHandle)) + RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true); +} + +/// RemoveFromWorklist - Remove all instances of I from the worklist vector +/// specified. +static void RemoveFromWorklist(Instruction *I, + std::vector<Instruction*> &Worklist) { + std::vector<Instruction*>::iterator WI = std::find(Worklist.begin(), + Worklist.end(), I); + while (WI != Worklist.end()) { + unsigned Offset = WI-Worklist.begin(); + Worklist.erase(WI); + WI = std::find(Worklist.begin()+Offset, Worklist.end(), I); + } +} + +/// ReplaceUsesOfWith - When we find that I really equals V, remove I from the +/// program, replacing all uses with V and update the worklist. +static void ReplaceUsesOfWith(Instruction *I, Value *V, + std::vector<Instruction*> &Worklist, + Loop *L, LPPassManager *LPM) { + DEBUG(dbgs() << "Replace with '" << *V << "': " << *I); + + // Add uses to the worklist, which may be dead now. 
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) + if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i))) + Worklist.push_back(Use); + + // Add users to the worklist which may be simplified now. + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); + UI != E; ++UI) + Worklist.push_back(cast<Instruction>(*UI)); + LPM->deleteSimpleAnalysisValue(I, L); + RemoveFromWorklist(I, Worklist); + I->replaceAllUsesWith(V); + I->eraseFromParent(); + ++NumSimplify; +} + +/// RemoveBlockIfDead - If the specified block is dead, remove it, update loop +/// information, and remove any dead successors it has. +/// +void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, + std::vector<Instruction*> &Worklist, + Loop *L) { + if (pred_begin(BB) != pred_end(BB)) { + // This block isn't dead, since an edge to BB was just removed, see if there + // are any easy simplifications we can do now. + if (BasicBlock *Pred = BB->getSinglePredecessor()) { + // If it has one pred, fold phi nodes in BB. + while (isa<PHINode>(BB->begin())) + ReplaceUsesOfWith(BB->begin(), + cast<PHINode>(BB->begin())->getIncomingValue(0), + Worklist, L, LPM); + + // If this is the header of a loop and the only pred is the latch, we now + // have an unreachable loop. + if (Loop *L = LI->getLoopFor(BB)) + if (loopHeader == BB && L->contains(Pred)) { + // Remove the branch from the latch to the header block, this makes + // the header dead, which will make the latch dead (because the header + // dominates the latch). + LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L); + Pred->getTerminator()->eraseFromParent(); + new UnreachableInst(BB->getContext(), Pred); + + // The loop is now broken, remove it from LI. + RemoveLoopFromHierarchy(L); + + // Reprocess the header, which now IS dead. + RemoveBlockIfDead(BB, Worklist, L); + return; + } + + // If pred ends in a uncond branch, add uncond branch to worklist so that + // the two blocks will get merged. 
+ if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator())) + if (BI->isUnconditional()) + Worklist.push_back(BI); + } + return; + } + + DEBUG(dbgs() << "Nuking dead block: " << *BB); + + // Remove the instructions in the basic block from the worklist. + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + RemoveFromWorklist(I, Worklist); + + // Anything that uses the instructions in this basic block should have their + // uses replaced with undefs. + // If I is not void type then replaceAllUsesWith undef. + // This allows ValueHandlers and custom metadata to adjust itself. + if (!I->getType()->isVoidTy()) + I->replaceAllUsesWith(UndefValue::get(I->getType())); + } + + // If this is the edge to the header block for a loop, remove the loop and + // promote all subloops. + if (Loop *BBLoop = LI->getLoopFor(BB)) { + if (BBLoop->getLoopLatch() == BB) + RemoveLoopFromHierarchy(BBLoop); + } + + // Remove the block from the loop info, which removes it from any loops it + // was in. + LI->removeBlock(BB); + + + // Remove phi node entries in successors for this block. + TerminatorInst *TI = BB->getTerminator(); + SmallVector<BasicBlock*, 4> Succs; + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { + Succs.push_back(TI->getSuccessor(i)); + TI->getSuccessor(i)->removePredecessor(BB); + } + + // Unique the successors, remove anything with multiple uses. + array_pod_sort(Succs.begin(), Succs.end()); + Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end()); + + // Remove the basic block, including all of the instructions contained in it. + LPM->deleteSimpleAnalysisValue(BB, L); + BB->eraseFromParent(); + // Remove successor blocks here that are not dead, so that we know we only + // have dead blocks in this list. Nondead blocks have a way of becoming dead, + // then getting removed before we revisit them, which is badness. 
+ // + for (unsigned i = 0; i != Succs.size(); ++i) + if (pred_begin(Succs[i]) != pred_end(Succs[i])) { + // One exception is loop headers. If this block was the preheader for a + // loop, then we DO want to visit the loop so the loop gets deleted. + // We know that if the successor is a loop header, that this loop had to + // be the preheader: the case where this was the latch block was handled + // above and headers can only have two predecessors. + if (!LI->isLoopHeader(Succs[i])) { + Succs.erase(Succs.begin()+i); + --i; + } + } + + for (unsigned i = 0, e = Succs.size(); i != e; ++i) + RemoveBlockIfDead(Succs[i], Worklist, L); +} + +/// RemoveLoopFromHierarchy - We have discovered that the specified loop has +/// become unwrapped, either because the backedge was deleted, or because the +/// edge into the header was removed. If the edge into the header from the +/// latch block was removed, the loop is unwrapped but subloops are still alive, +/// so they just reparent loops. If the loops are actually dead, they will be +/// removed later. +void LoopUnswitch::RemoveLoopFromHierarchy(Loop *L) { + LPM->deleteLoopFromQueue(L); + RemoveLoopFromWorklist(L); +} + +// RewriteLoopBodyWithConditionConstant - We know either that the value LIC has +// the value specified by Val in the specified loop, or we know it does NOT have +// that value. Rewrite any uses of LIC or of properties correlated to it. +void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, + Constant *Val, + bool IsEqual) { + assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?"); + + // FIXME: Support correlated properties, like: + // for (...) + // if (li1 < li2) + // ... + // if (li1 > li2) + // ... + + // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches, + // selects, switches. 
+ std::vector<User*> Users(LIC->use_begin(), LIC->use_end()); + std::vector<Instruction*> Worklist; + LLVMContext &Context = Val->getContext(); + + + // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC + // in the loop with the appropriate one directly. + if (IsEqual || (isa<ConstantInt>(Val) && + Val->getType()->isIntegerTy(1))) { + Value *Replacement; + if (IsEqual) + Replacement = Val; + else + Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()), + !cast<ConstantInt>(Val)->getZExtValue()); + + for (unsigned i = 0, e = Users.size(); i != e; ++i) + if (Instruction *U = cast<Instruction>(Users[i])) { + if (!L->contains(U)) + continue; + U->replaceUsesOfWith(LIC, Replacement); + Worklist.push_back(U); + } + SimplifyCode(Worklist, L); + return; + } + + // Otherwise, we don't know the precise value of LIC, but we do know that it + // is certainly NOT "Val". As such, simplify any uses in the loop that we + // can. This case occurs when we unswitch switch statements. + for (unsigned i = 0, e = Users.size(); i != e; ++i) { + Instruction *U = cast<Instruction>(Users[i]); + if (!L->contains(U)) + continue; + + Worklist.push_back(U); + + // TODO: We could do other simplifications, for example, turning + // 'icmp eq LIC, Val' -> false. + + // If we know that LIC is not Val, use this info to simplify code. + SwitchInst *SI = dyn_cast<SwitchInst>(U); + if (SI == 0 || !isa<ConstantInt>(Val)) continue; + + unsigned DeadCase = SI->findCaseValue(cast<ConstantInt>(Val)); + if (DeadCase == 0) continue; // Default case is live for multiple values. + + // Found a dead case value. Don't remove PHI nodes in the + // successor if they become single-entry, those PHI nodes may + // be in the Users list. + + // FIXME: This is a hack. We need to keep the successor around + // and hooked up so as to preserve the loop structure, because + // trying to update it is complicated. 
So instead we preserve the + // loop structure and put the block on a dead code path. + BasicBlock *Switch = SI->getParent(); + SplitEdge(Switch, SI->getSuccessor(DeadCase), this); + // Compute the successors instead of relying on the return value + // of SplitEdge, since it may have split the switch successor + // after PHI nodes. + BasicBlock *NewSISucc = SI->getSuccessor(DeadCase); + BasicBlock *OldSISucc = *succ_begin(NewSISucc); + // Create an "unreachable" destination. + BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable", + Switch->getParent(), + OldSISucc); + new UnreachableInst(Context, Abort); + // Force the new case destination to branch to the "unreachable" + // block while maintaining a (dead) CFG edge to the old block. + NewSISucc->getTerminator()->eraseFromParent(); + BranchInst::Create(Abort, OldSISucc, + ConstantInt::getTrue(Context), NewSISucc); + // Release the PHI operands for this edge. + for (BasicBlock::iterator II = NewSISucc->begin(); + PHINode *PN = dyn_cast<PHINode>(II); ++II) + PN->setIncomingValue(PN->getBasicBlockIndex(Switch), + UndefValue::get(PN->getType())); + // Tell the domtree about the new block. We don't fully update the + // domtree here -- instead we force it to do a full recomputation + // after the pass is complete -- but we do need to inform it of + // new blocks. + if (DT) + DT->addNewBlock(Abort, NewSISucc); + } + + SimplifyCode(Worklist, L); +} + +/// SimplifyCode - Okay, now that we have simplified some instructions in the +/// loop, walk over it and constant prop, dce, and fold control flow where +/// possible. Note that this is effectively a very simple loop-structure-aware +/// optimizer. During processing of this loop, L could very well be deleted, so +/// it must not be used. +/// +/// FIXME: When the loop optimizer is more mature, separate this out to a new +/// pass. 
+/// +void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { + while (!Worklist.empty()) { + Instruction *I = Worklist.back(); + Worklist.pop_back(); + + // Simple DCE. + if (isInstructionTriviallyDead(I)) { + DEBUG(dbgs() << "Remove dead instruction '" << *I); + + // Add uses to the worklist, which may be dead now. + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) + if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i))) + Worklist.push_back(Use); + LPM->deleteSimpleAnalysisValue(I, L); + RemoveFromWorklist(I, Worklist); + I->eraseFromParent(); + ++NumSimplify; + continue; + } + + // See if instruction simplification can hack this up. This is common for + // things like "select false, X, Y" after unswitching made the condition be + // 'false'. + if (Value *V = SimplifyInstruction(I, 0, DT)) + if (LI->replacementPreservesLCSSAForm(I, V)) { + ReplaceUsesOfWith(I, V, Worklist, L, LPM); + continue; + } + + // Special case hacks that appear commonly in unswitched code. + if (BranchInst *BI = dyn_cast<BranchInst>(I)) { + if (BI->isUnconditional()) { + // If BI's parent is the only pred of the successor, fold the two blocks + // together. + BasicBlock *Pred = BI->getParent(); + BasicBlock *Succ = BI->getSuccessor(0); + BasicBlock *SinglePred = Succ->getSinglePredecessor(); + if (!SinglePred) continue; // Nothing to do. + assert(SinglePred == Pred && "CFG broken"); + + DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- " + << Succ->getName() << "\n"); + + // Resolve any single entry PHI nodes in Succ. + while (PHINode *PN = dyn_cast<PHINode>(Succ->begin())) + ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM); + + // Move all of the successor contents from Succ to Pred. 
+ Pred->getInstList().splice(BI, Succ->getInstList(), Succ->begin(), + Succ->end()); + LPM->deleteSimpleAnalysisValue(BI, L); + BI->eraseFromParent(); + RemoveFromWorklist(BI, Worklist); + + // If Succ has any successors with PHI nodes, update them to have + // entries coming from Pred instead of Succ. + Succ->replaceAllUsesWith(Pred); + + // Remove Succ from the loop tree. + LI->removeBlock(Succ); + LPM->deleteSimpleAnalysisValue(Succ, L); + Succ->eraseFromParent(); + ++NumSimplify; + continue; + } + + if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){ + // Conditional branch. Turn it into an unconditional branch, then + // remove dead blocks. + continue; // FIXME: Enable. + + DEBUG(dbgs() << "Folded branch: " << *BI); + BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue()); + BasicBlock *LiveSucc = BI->getSuccessor(!CB->getZExtValue()); + DeadSucc->removePredecessor(BI->getParent(), true); + Worklist.push_back(BranchInst::Create(LiveSucc, BI)); + LPM->deleteSimpleAnalysisValue(BI, L); + BI->eraseFromParent(); + RemoveFromWorklist(BI, Worklist); + ++NumSimplify; + + RemoveBlockIfDead(DeadSucc, Worklist, L); + } + continue; + } + } +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp new file mode 100644 index 0000000..9087b46 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp @@ -0,0 +1,139 @@ +//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass lowers atomic intrinsics to non-atomic form for use in a known +// non-preemptible environment. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loweratomic" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Function.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Support/IRBuilder.h" +using namespace llvm; + +static bool LowerAtomicIntrinsic(IntrinsicInst *II) { + IRBuilder<> Builder(II->getParent(), II); + unsigned IID = II->getIntrinsicID(); + switch (IID) { + case Intrinsic::memory_barrier: + break; + + case Intrinsic::atomic_load_add: + case Intrinsic::atomic_load_sub: + case Intrinsic::atomic_load_and: + case Intrinsic::atomic_load_nand: + case Intrinsic::atomic_load_or: + case Intrinsic::atomic_load_xor: + case Intrinsic::atomic_load_max: + case Intrinsic::atomic_load_min: + case Intrinsic::atomic_load_umax: + case Intrinsic::atomic_load_umin: { + Value *Ptr = II->getArgOperand(0), *Delta = II->getArgOperand(1); + + LoadInst *Orig = Builder.CreateLoad(Ptr); + Value *Res = NULL; + switch (IID) { + default: assert(0 && "Unrecognized atomic modify operation"); + case Intrinsic::atomic_load_add: + Res = Builder.CreateAdd(Orig, Delta); + break; + case Intrinsic::atomic_load_sub: + Res = Builder.CreateSub(Orig, Delta); + break; + case Intrinsic::atomic_load_and: + Res = Builder.CreateAnd(Orig, Delta); + break; + case Intrinsic::atomic_load_nand: + Res = Builder.CreateNot(Builder.CreateAnd(Orig, Delta)); + break; + case Intrinsic::atomic_load_or: + Res = Builder.CreateOr(Orig, Delta); + break; + case Intrinsic::atomic_load_xor: + Res = Builder.CreateXor(Orig, Delta); + break; + case Intrinsic::atomic_load_max: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), + Delta, Orig); + break; + case Intrinsic::atomic_load_min: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), + Orig, Delta); + break; + case Intrinsic::atomic_load_umax: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), + Delta, Orig); + break; + case 
Intrinsic::atomic_load_umin: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), + Orig, Delta); + break; + } + Builder.CreateStore(Res, Ptr); + + II->replaceAllUsesWith(Orig); + break; + } + + case Intrinsic::atomic_swap: { + Value *Ptr = II->getArgOperand(0), *Val = II->getArgOperand(1); + LoadInst *Orig = Builder.CreateLoad(Ptr); + Builder.CreateStore(Val, Ptr); + II->replaceAllUsesWith(Orig); + break; + } + + case Intrinsic::atomic_cmp_swap: { + Value *Ptr = II->getArgOperand(0), *Cmp = II->getArgOperand(1); + Value *Val = II->getArgOperand(2); + + LoadInst *Orig = Builder.CreateLoad(Ptr); + Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); + Value *Res = Builder.CreateSelect(Equal, Val, Orig); + Builder.CreateStore(Res, Ptr); + II->replaceAllUsesWith(Orig); + break; + } + + default: + return false; + } + + assert(II->use_empty() && + "Lowering should have eliminated any uses of the intrinsic call!"); + II->eraseFromParent(); + + return true; +} + +namespace { + struct LowerAtomic : public BasicBlockPass { + static char ID; + LowerAtomic() : BasicBlockPass(ID) { + initializeLowerAtomicPass(*PassRegistry::getPassRegistry()); + } + bool runOnBasicBlock(BasicBlock &BB) { + bool Changed = false; + for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DI++)) + Changed |= LowerAtomicIntrinsic(II); + return Changed; + } + }; +} + +char LowerAtomic::ID = 0; +INITIALIZE_PASS(LowerAtomic, "loweratomic", + "Lower atomic intrinsics to non-atomic form", + false, false) + +Pass *llvm::createLowerAtomicPass() { return new LowerAtomic(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp new file mode 100644 index 0000000..bde0e53 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -0,0 +1,946 @@ +//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===// +// +// The LLVM 
Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass performs various transformations related to eliminating memcpy +// calls, or transforming sets of stores into memset's. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "memcpyopt" +#include "llvm/Transforms/Scalar.h" +#include "llvm/GlobalVariable.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Instructions.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include <list> +using namespace llvm; + +STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted"); +STATISTIC(NumMemSetInfer, "Number of memsets inferred"); +STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); +STATISTIC(NumCpyToSet, "Number of memcpys converted to memset"); + +static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, + bool &VariableIdxFound, const TargetData &TD){ + // Skip over the first indices. + gep_type_iterator GTI = gep_type_begin(GEP); + for (unsigned i = 1; i != Idx; ++i, ++GTI) + /*skip along*/; + + // Compute the offset implied by the rest of the indices. + int64_t Offset = 0; + for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { + ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i)); + if (OpC == 0) + return VariableIdxFound = true; + if (OpC->isZero()) continue; // No offset. 
+ + // Handle struct indices, which add their field offset to the pointer. + if (const StructType *STy = dyn_cast<StructType>(*GTI)) { + Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); + continue; + } + + // Otherwise, we have a sequential type like an array or vector. Multiply + // the index by the ElementSize. + uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()); + Offset += Size*OpC->getSExtValue(); + } + + return Offset; +} + +/// IsPointerOffset - Return true if Ptr1 is provably equal to Ptr2 plus a +/// constant offset, and return that constant offset. For example, Ptr1 might +/// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8. +static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, + const TargetData &TD) { + Ptr1 = Ptr1->stripPointerCasts(); + Ptr2 = Ptr2->stripPointerCasts(); + GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1); + GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2); + + bool VariableIdxFound = false; + + // If one pointer is a GEP and the other isn't, then see if the GEP is a + // constant offset from the base, as in "P" and "gep P, 1". + if (GEP1 && GEP2 == 0 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { + Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, TD); + return !VariableIdxFound; + } + + if (GEP2 && GEP1 == 0 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { + Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD); + return !VariableIdxFound; + } + + // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical + // base. After that base, they may have some number of common (and + // potentially variable) indices. After that they handle some constant + // offset, which determines their offset from each other. At this point, we + // handle no other case. + if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0)) + return false; + + // Skip any common indices and track the GEP types. 
+ unsigned Idx = 1; + for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx) + if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx)) + break; + + int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD); + int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD); + if (VariableIdxFound) return false; + + Offset = Offset2-Offset1; + return true; +} + + +/// MemsetRange - Represents a range of memset'd bytes with the ByteVal value. +/// This allows us to analyze stores like: +/// store 0 -> P+1 +/// store 0 -> P+0 +/// store 0 -> P+3 +/// store 0 -> P+2 +/// which sometimes happens with stores to arrays of structs etc. When we see +/// the first store, we make a range [1, 2). The second store extends the range +/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the +/// two ranges into [0, 3) which is memset'able. +namespace { +struct MemsetRange { + // Start/End - A semi range that describes the span that this range covers. + // The range is closed at the start and open at the end: [Start, End). + int64_t Start, End; + + /// StartPtr - The getelementptr instruction that points to the start of the + /// range. + Value *StartPtr; + + /// Alignment - The known alignment of the first store. + unsigned Alignment; + + /// TheStores - The actual stores that make up this range. + SmallVector<Instruction*, 16> TheStores; + + bool isProfitableToUseMemset(const TargetData &TD) const; + +}; +} // end anon namespace + +bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { + // If we found more than 8 stores to merge or 64 bytes, use memset. + if (TheStores.size() >= 8 || End-Start >= 64) return true; + + // If there is nothing to merge, don't do anything. + if (TheStores.size() < 2) return false; + + // If any of the stores are a memset, then it is always good to extend the + // memset. 
+ for (unsigned i = 0, e = TheStores.size(); i != e; ++i) + if (!isa<StoreInst>(TheStores[i])) + return true; + + // Assume that the code generator is capable of merging pairs of stores + // together if it wants to. + if (TheStores.size() == 2) return false; + + // If we have fewer than 8 stores, it can still be worthwhile to do this. + // For example, merging 4 i8 stores into an i32 store is useful almost always. + // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the + // memset will be split into 2 32-bit stores anyway) and doing so can + // pessimize the llvm optimizer. + // + // Since we don't have perfect knowledge here, make some assumptions: assume + // the maximum GPR width is the same size as the pointer size and assume that + // this width can be stored. If so, check to see whether we will end up + // actually reducing the number of stores used. + unsigned Bytes = unsigned(End-Start); + unsigned NumPointerStores = Bytes/TD.getPointerSize(); + + // Assume the remaining bytes if any are done a byte at a time. + unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize(); + + // If we will reduce the # stores (according to this heuristic), do the + // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 + // etc. + return TheStores.size() > NumPointerStores+NumByteStores; +} + + +namespace { +class MemsetRanges { + /// Ranges - A sorted list of the memset ranges. We use std::list here + /// because each element is relatively large and expensive to copy. 
+ std::list<MemsetRange> Ranges; + typedef std::list<MemsetRange>::iterator range_iterator; + const TargetData &TD; +public: + MemsetRanges(const TargetData &td) : TD(td) {} + + typedef std::list<MemsetRange>::const_iterator const_iterator; + const_iterator begin() const { return Ranges.begin(); } + const_iterator end() const { return Ranges.end(); } + bool empty() const { return Ranges.empty(); } + + void addInst(int64_t OffsetFromFirst, Instruction *Inst) { + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + addStore(OffsetFromFirst, SI); + else + addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst)); + } + + void addStore(int64_t OffsetFromFirst, StoreInst *SI) { + int64_t StoreSize = TD.getTypeStoreSize(SI->getOperand(0)->getType()); + + addRange(OffsetFromFirst, StoreSize, + SI->getPointerOperand(), SI->getAlignment(), SI); + } + + void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) { + int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue(); + addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI); + } + + void addRange(int64_t Start, int64_t Size, Value *Ptr, + unsigned Alignment, Instruction *Inst); + +}; + +} // end anon namespace + + +/// addRange - Add a new store to the MemsetRanges data structure. This adds a +/// new range for the specified store at the specified offset, merging into +/// existing ranges as appropriate. +/// +/// Do a linear search of the ranges to see if this can be joined and/or to +/// find the insertion point in the list. We keep the ranges sorted for +/// simplicity here. This is a linear search of a linked list, which is ugly, +/// however the number of ranges is limited, so this won't get crazy slow. 
+void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, + unsigned Alignment, Instruction *Inst) { + int64_t End = Start+Size; + range_iterator I = Ranges.begin(), E = Ranges.end(); + + while (I != E && Start > I->End) + ++I; + + // We now know that I == E, in which case we didn't find anything to merge + // with, or that Start <= I->End. If End < I->Start or I == E, then we need + // to insert a new range. Handle this now. + if (I == E || End < I->Start) { + MemsetRange &R = *Ranges.insert(I, MemsetRange()); + R.Start = Start; + R.End = End; + R.StartPtr = Ptr; + R.Alignment = Alignment; + R.TheStores.push_back(Inst); + return; + } + + // This store overlaps with I, add it. + I->TheStores.push_back(Inst); + + // At this point, we may have an interval that completely contains our store. + // If so, just add it to the interval and return. + if (I->Start <= Start && I->End >= End) + return; + + // Now we know that Start <= I->End and End >= I->Start so the range overlaps + // but is not entirely contained within the range. + + // See if the range extends the start of the range. In this case, it couldn't + // possibly cause it to join the prior range, because otherwise we would have + // stopped on *it*. + if (Start < I->Start) { + I->Start = Start; + I->StartPtr = Ptr; + I->Alignment = Alignment; + } + + // Now we know that Start <= I->End and Start >= I->Start (so the startpoint + // is in or right at the end of I), and that End >= I->Start. Extend I out to + // End. + if (End > I->End) { + I->End = End; + range_iterator NextI = I; + while (++NextI != E && End >= NextI->Start) { + // Merge the range in. 
+ I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end()); + if (NextI->End > I->End) + I->End = NextI->End; + Ranges.erase(NextI); + NextI = I; + } + } +} + +//===----------------------------------------------------------------------===// +// MemCpyOpt Pass +//===----------------------------------------------------------------------===// + +namespace { + class MemCpyOpt : public FunctionPass { + MemoryDependenceAnalysis *MD; + const TargetData *TD; + public: + static char ID; // Pass identification, replacement for typeid + MemCpyOpt() : FunctionPass(ID) { + initializeMemCpyOptPass(*PassRegistry::getPassRegistry()); + MD = 0; + } + + bool runOnFunction(Function &F); + + private: + // This transformation requires dominator postdominator info + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<DominatorTree>(); + AU.addRequired<MemoryDependenceAnalysis>(); + AU.addRequired<AliasAnalysis>(); + AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<MemoryDependenceAnalysis>(); + } + + // Helper functions + bool processStore(StoreInst *SI, BasicBlock::iterator &BBI); + bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI); + bool processMemCpy(MemCpyInst *M); + bool processMemMove(MemMoveInst *M); + bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc, + uint64_t cpyLen, CallInst *C); + bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, + uint64_t MSize); + bool processByValArgument(CallSite CS, unsigned ArgNo); + Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr, + Value *ByteVal); + + bool iterateOnFunction(Function &F); + }; + + char MemCpyOpt::ID = 0; +} + +// createMemCpyOptPass - The public interface to this file... 
+FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } + +INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", + false, false) + +/// tryMergingIntoMemset - When scanning forward over instructions, we look for +/// some other patterns to fold away. In particular, this looks for stores to +/// neighboring locations of memory. If it sees enough consecutive ones, it +/// attempts to merge them together into a memcpy/memset. +Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, + Value *StartPtr, Value *ByteVal) { + if (TD == 0) return 0; + + // Okay, so we now have a single store that can be splatable. Scan to find + // all subsequent stores of the same value to offset from the same pointer. + // Join these together into ranges, so we can decide whether contiguous blocks + // are stored. + MemsetRanges Ranges(*TD); + + BasicBlock::iterator BI = StartInst; + for (++BI; !isa<TerminatorInst>(BI); ++BI) { + if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) { + // If the instruction is readnone, ignore it, otherwise bail out. We + // don't even allow readonly here because we don't want something like: + // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A). + if (BI->mayWriteToMemory() || BI->mayReadFromMemory()) + break; + continue; + } + + if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) { + // If this is a store, see if we can merge it in. + if (NextStore->isVolatile()) break; + + // Check to see if this stored value is of the same byte-splattable value. + if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) + break; + + // Check to see if this store is to a constant offset from the start ptr. 
+ int64_t Offset; + if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), + Offset, *TD)) + break; + + Ranges.addStore(Offset, NextStore); + } else { + MemSetInst *MSI = cast<MemSetInst>(BI); + + if (MSI->isVolatile() || ByteVal != MSI->getValue() || + !isa<ConstantInt>(MSI->getLength())) + break; + + // Check to see if this store is to a constant offset from the start ptr. + int64_t Offset; + if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *TD)) + break; + + Ranges.addMemSet(Offset, MSI); + } + } + + // If we have no ranges, then we just had a single store with nothing that + // could be merged in. This is a very common case of course. + if (Ranges.empty()) + return 0; + + // If we had at least one store that could be merged in, add the starting + // store as well. We try to avoid this unless there is at least something + // interesting as a small compile-time optimization. + Ranges.addInst(0, StartInst); + + // If we create any memsets, we put it right before the first instruction that + // isn't part of the memset block. This ensures that the memset is dominated + // by any addressing instruction needed by the start of the block. + IRBuilder<> Builder(BI); + + // Now that we have full information about ranges, loop over the ranges and + // emit memset's for anything big enough to be worthwhile. + Instruction *AMemSet = 0; + for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); + I != E; ++I) { + const MemsetRange &Range = *I; + + if (Range.TheStores.size() == 1) continue; + + // If it is profitable to lower this range to memset, do so now. + if (!Range.isProfitableToUseMemset(*TD)) + continue; + + // Otherwise, we do want to transform this! Create a new memset. + // Get the starting pointer of the block. 
+ StartPtr = Range.StartPtr; + + // Determine alignment + unsigned Alignment = Range.Alignment; + if (Alignment == 0) { + const Type *EltType = + cast<PointerType>(StartPtr->getType())->getElementType(); + Alignment = TD->getABITypeAlignment(EltType); + } + + AMemSet = + Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); + + DEBUG(dbgs() << "Replace stores:\n"; + for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) + dbgs() << *Range.TheStores[i] << '\n'; + dbgs() << "With: " << *AMemSet << '\n'); + + // Zap all the stores. + for (SmallVector<Instruction*, 16>::const_iterator + SI = Range.TheStores.begin(), + SE = Range.TheStores.end(); SI != SE; ++SI) { + MD->removeInstruction(*SI); + (*SI)->eraseFromParent(); + } + ++NumMemSetInfer; + } + + return AMemSet; +} + + +bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { + if (SI->isVolatile()) return false; + + if (TD == 0) return false; + + // Detect cases where we're performing call slot forwarding, but + // happen to be using a load-store pair to implement it, rather than + // a memcpy. + if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { + if (!LI->isVolatile() && LI->hasOneUse()) { + MemDepResult dep = MD->getDependency(LI); + CallInst *C = 0; + if (dep.isClobber() && !isa<MemCpyInst>(dep.getInst())) + C = dyn_cast<CallInst>(dep.getInst()); + + if (C) { + bool changed = performCallSlotOptzn(LI, + SI->getPointerOperand()->stripPointerCasts(), + LI->getPointerOperand()->stripPointerCasts(), + TD->getTypeStoreSize(SI->getOperand(0)->getType()), C); + if (changed) { + MD->removeInstruction(SI); + SI->eraseFromParent(); + MD->removeInstruction(LI); + LI->eraseFromParent(); + ++NumMemCpyInstr; + return true; + } + } + } + } + + // There are two cases that are interesting for this code to handle: memcpy + // and memset. Right now we only handle memset. 
+ + // Ensure that the value being stored is something that can be memset'able a + // byte at a time like "0" or "-1" or any width, as well as things like + // 0xA0A0A0A0 and 0.0. + if (Value *ByteVal = isBytewiseValue(SI->getOperand(0))) + if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), + ByteVal)) { + BBI = I; // Don't invalidate iterator. + return true; + } + + return false; +} + +bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { + // See if there is another memset or store neighboring this memset which + // allows us to widen out the memset to do a single larger store. + if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile()) + if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(), + MSI->getValue())) { + BBI = I; // Don't invalidate iterator. + return true; + } + return false; +} + + +/// performCallSlotOptzn - takes a memcpy and a call that it depends on, +/// and checks for the possibility of a call slot optimization by having +/// the call write its result directly into the destination of the memcpy. +bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, + Value *cpyDest, Value *cpySrc, + uint64_t cpyLen, CallInst *C) { + // The general transformation to keep in mind is + // + // call @func(..., src, ...) + // memcpy(dest, src, ...) + // + // -> + // + // memcpy(dest, src, ...) + // call @func(..., dest, ...) + // + // Since moving the memcpy is technically awkward, we additionally check that + // src only holds uninitialized values at the moment of the call, meaning that + // the memcpy can be discarded rather than moved. + + // Deliberately get the source and destination with bitcasts stripped away, + // because we'll need to do type comparisons based on the underlying type. + CallSite CS(C); + + // Require that src be an alloca. This simplifies the reasoning considerably. 
+ AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc); + if (!srcAlloca) + return false; + + // Check that all of src is copied to dest. + if (TD == 0) return false; + + ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize()); + if (!srcArraySize) + return false; + + uint64_t srcSize = TD->getTypeAllocSize(srcAlloca->getAllocatedType()) * + srcArraySize->getZExtValue(); + + if (cpyLen < srcSize) + return false; + + // Check that accessing the first srcSize bytes of dest will not cause a + // trap. Otherwise the transform is invalid since it might cause a trap + // to occur earlier than it otherwise would. + if (AllocaInst *A = dyn_cast<AllocaInst>(cpyDest)) { + // The destination is an alloca. Check it is larger than srcSize. + ConstantInt *destArraySize = dyn_cast<ConstantInt>(A->getArraySize()); + if (!destArraySize) + return false; + + uint64_t destSize = TD->getTypeAllocSize(A->getAllocatedType()) * + destArraySize->getZExtValue(); + + if (destSize < srcSize) + return false; + } else if (Argument *A = dyn_cast<Argument>(cpyDest)) { + // If the destination is an sret parameter then only accesses that are + // outside of the returned struct type can trap. + if (!A->hasStructRetAttr()) + return false; + + const Type *StructTy = cast<PointerType>(A->getType())->getElementType(); + uint64_t destSize = TD->getTypeAllocSize(StructTy); + + if (destSize < srcSize) + return false; + } else { + return false; + } + + // Check that src is not accessed except via the call and the memcpy. This + // guarantees that it holds only undefined values when passed in (so the final + // memcpy can be dropped), that it is not read or written between the call and + // the memcpy, and that writing beyond the end of it is undefined. 
+ SmallVector<User*, 8> srcUseList(srcAlloca->use_begin(), + srcAlloca->use_end()); + while (!srcUseList.empty()) { + User *UI = srcUseList.pop_back_val(); + + if (isa<BitCastInst>(UI)) { + for (User::use_iterator I = UI->use_begin(), E = UI->use_end(); + I != E; ++I) + srcUseList.push_back(*I); + } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(UI)) { + if (G->hasAllZeroIndices()) + for (User::use_iterator I = UI->use_begin(), E = UI->use_end(); + I != E; ++I) + srcUseList.push_back(*I); + else + return false; + } else if (UI != C && UI != cpy) { + return false; + } + } + + // Since we're changing the parameter to the callsite, we need to make sure + // that what would be the new parameter dominates the callsite. + DominatorTree &DT = getAnalysis<DominatorTree>(); + if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest)) + if (!DT.dominates(cpyDestInst, C)) + return false; + + // In addition to knowing that the call does not access src in some + // unexpected manner, for example via a global, which we deduce from + // the use analysis, we also need to know that it does not sneakily + // access dest. We rely on AA to figure this out for us. + AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + if (AA.getModRefInfo(C, cpyDest, srcSize) != AliasAnalysis::NoModRef) + return false; + + // All the checks have passed, so do the transformation. 
+  bool changedArgument = false;
+  for (unsigned i = 0; i < CS.arg_size(); ++i)
+    if (CS.getArgument(i)->stripPointerCasts() == cpySrc) {
+      // Rewrite this use of cpySrc to refer to cpyDest instead, inserting
+      // pointer casts before the call when the pointer types differ.
+      if (cpySrc->getType() != cpyDest->getType())
+        cpyDest = CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
+                                              cpyDest->getName(), C);
+      changedArgument = true;
+      if (CS.getArgument(i)->getType() == cpyDest->getType())
+        CS.setArgument(i, cpyDest);
+      else
+        CS.setArgument(i, CastInst::CreatePointerCast(cpyDest,
+                          CS.getArgument(i)->getType(), cpyDest->getName(), C));
+    }
+
+  if (!changedArgument)
+    return false;
+
+  // Drop any cached information about the call, because we may have changed
+  // its dependence information by changing its parameter.
+  MD->removeInstruction(C);
+
+  // Remove the memcpy.
+  // NOTE(review): only the memdep cache entry is dropped here; the caller is
+  // responsible for erasing 'cpy' itself — confirm against the call sites.
+  MD->removeInstruction(cpy);
+  ++NumMemCpyInstr;
+
+  return true;
+}
+
+/// processMemCpyMemCpyDependence - We've found that the (upward scanning)
+/// memory dependence of memcpy 'M' is the memcpy 'MDep'.  Try to simplify M to
+/// copy from MDep's input if we can.  MSize is the size of M's copy.
+///
+bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
+                                              uint64_t MSize) {
+  // We can only transform memcpy's where the dest of one is the source of the
+  // other.
+  if (M->getSource() != MDep->getDest() || MDep->isVolatile())
+    return false;
+
+  // If dep instruction is reading from our current input, then it is a noop
+  // transfer and substituting the input won't change this instruction.  Just
+  // ignore the input and let someone else zap MDep.  This handles cases like:
+  //    memcpy(a <- a)
+  //    memcpy(b <- a)
+  if (M->getSource() == MDep->getSource())
+    return false;
+
+  // Second, the length of the memcpy's must be the same, or the preceding one
+  // must be larger than the following one.
+ ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength()); + ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength()); + if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) + return false; + + AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + + // Verify that the copied-from memory doesn't change in between the two + // transfers. For example, in: + // memcpy(a <- b) + // *b = 42; + // memcpy(c <- a) + // It would be invalid to transform the second memcpy into memcpy(c <- b). + // + // TODO: If the code between M and MDep is transparent to the destination "c", + // then we could still perform the xform by moving M up to the first memcpy. + // + // NOTE: This is conservative, it will stop on any read from the source loc, + // not just the defining memcpy. + MemDepResult SourceDep = + MD->getPointerDependencyFrom(AA.getLocationForSource(MDep), + false, M, M->getParent()); + if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) + return false; + + // If the dest of the second might alias the source of the first, then the + // source and dest might overlap. We still want to eliminate the intermediate + // value, but we have to generate a memmove instead of memcpy. + bool UseMemMove = false; + if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(MDep))) + UseMemMove = true; + + // If all checks passed, then we can transform M. + + // Make sure to use the lesser of the alignment of the source and the dest + // since we're changing where we're reading from, but don't want to increase + // the alignment past what can be read from or written to. + // TODO: Is this worth it if we're creating a less aligned memcpy? For + // example we could be moving from movaps -> movq on x86. 
+  unsigned Align = std::min(MDep->getAlignment(), M->getAlignment());
+
+  IRBuilder<> Builder(M);
+  if (UseMemMove)
+    Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(),
+                          Align, M->isVolatile());
+  else
+    Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(),
+                         Align, M->isVolatile());
+
+  // Remove the instruction we're replacing.
+  MD->removeInstruction(M);
+  M->eraseFromParent();
+  ++NumMemCpyInstr;
+  return true;
+}
+
+
+/// processMemCpy - perform simplification of memcpy's.  If we have memcpy A
+/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
+/// B to be a memcpy from X to Z (or potentially a memmove, depending on
+/// circumstances). This allows later passes to remove the first memcpy
+/// altogether.
+bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
+  // We can only optimize statically-sized memcpy's that are non-volatile.
+  ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
+  if (CopySize == 0 || M->isVolatile()) return false;
+
+  // If the source and destination of the memcpy are the same, then zap it.
+  if (M->getSource() == M->getDest()) {
+    MD->removeInstruction(M);
+    M->eraseFromParent();
+    // Return false: the instruction is gone, so there is nothing to revisit.
+    return false;
+  }
+
+  // If copying from a constant, try to turn the memcpy into a memset.  This
+  // only works for a constant global whose initializer is a repeated byte.
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
+    if (GV->isConstant() && GV->hasDefinitiveInitializer())
+      if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
+        IRBuilder<> Builder(M);
+        Builder.CreateMemSet(M->getRawDest(), ByteVal, CopySize,
+                             M->getAlignment(), false);
+        MD->removeInstruction(M);
+        M->eraseFromParent();
+        ++NumCpyToSet;
+        return true;
+      }
+
+  // There are two possible optimizations we can do for memcpy:
+  //   a) memcpy-memcpy xform which exposes redundancy for DSE.
+  //   b) call-memcpy xform for return slot optimization.
+ MemDepResult DepInfo = MD->getDependency(M); + if (!DepInfo.isClobber()) + return false; + + if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst())) + return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); + + if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { + if (performCallSlotOptzn(M, M->getDest(), M->getSource(), + CopySize->getZExtValue(), C)) { + MD->removeInstruction(M); + M->eraseFromParent(); + return true; + } + } + + return false; +} + +/// processMemMove - Transforms memmove calls to memcpy calls when the src/dst +/// are guaranteed not to alias. +bool MemCpyOpt::processMemMove(MemMoveInst *M) { + AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + + // See if the pointers alias. + if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(M))) + return false; + + DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n"); + + // If not, then we know we can transform this. + Module *Mod = M->getParent()->getParent()->getParent(); + const Type *ArgTys[3] = { M->getRawDest()->getType(), + M->getRawSource()->getType(), + M->getLength()->getType() }; + M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, + ArgTys, 3)); + + // MemDep may have over conservative information about this instruction, just + // conservatively flush it from the cache. + MD->removeInstruction(M); + + ++NumMoveToCpy; + return true; +} + +/// processByValArgument - This is called on every byval argument in call sites. +bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { + if (TD == 0) return false; + + // Find out what feeds this byval argument. 
+ Value *ByValArg = CS.getArgument(ArgNo); + const Type *ByValTy =cast<PointerType>(ByValArg->getType())->getElementType(); + uint64_t ByValSize = TD->getTypeAllocSize(ByValTy); + MemDepResult DepInfo = + MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize), + true, CS.getInstruction(), + CS.getInstruction()->getParent()); + if (!DepInfo.isClobber()) + return false; + + // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by + // a memcpy, see if we can byval from the source of the memcpy instead of the + // result. + MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst()); + if (MDep == 0 || MDep->isVolatile() || + ByValArg->stripPointerCasts() != MDep->getDest()) + return false; + + // The length of the memcpy must be larger or equal to the size of the byval. + ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); + if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize) + return false; + + // Get the alignment of the byval. If it is greater than the memcpy, then we + // can't do the substitution. If the call doesn't specify the alignment, then + // it is some target specific value that we can't know. + unsigned ByValAlign = CS.getParamAlignment(ArgNo+1); + if (ByValAlign == 0 || MDep->getAlignment() < ByValAlign) + return false; + + // Verify that the copied-from memory doesn't change in between the memcpy and + // the byval call. + // memcpy(a <- b) + // *b = 42; + // foo(*a) + // It would be invalid to transform the second memcpy into foo(*b). + // + // NOTE: This is conservative, it will stop on any read from the source loc, + // not just the defining memcpy. 
+ MemDepResult SourceDep = + MD->getPointerDependencyFrom(AliasAnalysis::getLocationForSource(MDep), + false, CS.getInstruction(), MDep->getParent()); + if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) + return false; + + Value *TmpCast = MDep->getSource(); + if (MDep->getSource()->getType() != ByValArg->getType()) + TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(), + "tmpcast", CS.getInstruction()); + + DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n" + << " " << *MDep << "\n" + << " " << *CS.getInstruction() << "\n"); + + // Otherwise we're good! Update the byval argument. + CS.setArgument(ArgNo, TmpCast); + ++NumMemCpyInstr; + return true; +} + +/// iterateOnFunction - Executes one iteration of MemCpyOpt. +bool MemCpyOpt::iterateOnFunction(Function &F) { + bool MadeChange = false; + + // Walk all instruction in the function. + for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) { + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { + // Avoid invalidating the iterator. + Instruction *I = BI++; + + bool RepeatInstruction = false; + + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + MadeChange |= processStore(SI, BI); + else if (MemSetInst *M = dyn_cast<MemSetInst>(I)) + RepeatInstruction = processMemSet(M, BI); + else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I)) + RepeatInstruction = processMemCpy(M); + else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I)) + RepeatInstruction = processMemMove(M); + else if (CallSite CS = (Value*)I) { + for (unsigned i = 0, e = CS.arg_size(); i != e; ++i) + if (CS.paramHasAttr(i+1, Attribute::ByVal)) + MadeChange |= processByValArgument(CS, i); + } + + // Reprocess the instruction if desired. + if (RepeatInstruction) { + if (BI != BB->begin()) --BI; + MadeChange = true; + } + } + } + + return MadeChange; +} + +// MemCpyOpt::runOnFunction - This is the main transformation entry point for a +// function. 
+// +bool MemCpyOpt::runOnFunction(Function &F) { + bool MadeChange = false; + MD = &getAnalysis<MemoryDependenceAnalysis>(); + TD = getAnalysisIfAvailable<TargetData>(); + while (1) { + if (!iterateOnFunction(F)) + break; + MadeChange = true; + } + + MD = 0; + return MadeChange; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp new file mode 100644 index 0000000..e093b52 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -0,0 +1,1101 @@ +//===- Reassociate.cpp - Reassociate binary expressions -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass reassociates commutative expressions in an order that is designed +// to promote better constant propagation, GCSE, LICM, PRE, etc. +// +// For example: 4 + (x + 5) -> x + (4 + 5) +// +// In the implementation of this algorithm, constants are assigned rank = 0, +// function arguments are rank = 1, and other values are assigned ranks +// corresponding to the reverse post order traversal of current function +// (starting at 2), which effectively gives values in deep loops higher rank +// than values not in loops. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "reassociate" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ValueHandle.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/DenseMap.h" +#include <algorithm> +using namespace llvm; + +STATISTIC(NumLinear , "Number of insts linearized"); +STATISTIC(NumChanged, "Number of insts reassociated"); +STATISTIC(NumAnnihil, "Number of expr tree annihilated"); +STATISTIC(NumFactor , "Number of multiplies factored"); + +namespace { + struct ValueEntry { + unsigned Rank; + Value *Op; + ValueEntry(unsigned R, Value *O) : Rank(R), Op(O) {} + }; + inline bool operator<(const ValueEntry &LHS, const ValueEntry &RHS) { + return LHS.Rank > RHS.Rank; // Sort so that highest rank goes to start. + } +} + +#ifndef NDEBUG +/// PrintOps - Print out the expression identified in the Ops list. 
+/// +static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { + Module *M = I->getParent()->getParent()->getParent(); + dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " " + << *Ops[0].Op->getType() << '\t'; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + dbgs() << "[ "; + WriteAsOperand(dbgs(), Ops[i].Op, false, M); + dbgs() << ", #" << Ops[i].Rank << "] "; + } +} +#endif + +namespace { + class Reassociate : public FunctionPass { + DenseMap<BasicBlock*, unsigned> RankMap; + DenseMap<AssertingVH<>, unsigned> ValueRankMap; + bool MadeChange; + public: + static char ID; // Pass identification, replacement for typeid + Reassociate() : FunctionPass(ID) { + initializeReassociatePass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + } + private: + void BuildRankMap(Function &F); + unsigned getRank(Value *V); + Value *ReassociateExpression(BinaryOperator *I); + void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops, + unsigned Idx = 0); + Value *OptimizeExpression(BinaryOperator *I, + SmallVectorImpl<ValueEntry> &Ops); + Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops); + void LinearizeExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); + void LinearizeExpr(BinaryOperator *I); + Value *RemoveFactorFromExpression(Value *V, Value *Factor); + void ReassociateBB(BasicBlock *BB); + + void RemoveDeadBinaryOp(Value *V); + }; +} + +char Reassociate::ID = 0; +INITIALIZE_PASS(Reassociate, "reassociate", + "Reassociate expressions", false, false) + +// Public interface to the Reassociate pass +FunctionPass *llvm::createReassociatePass() { return new Reassociate(); } + +void Reassociate::RemoveDeadBinaryOp(Value *V) { + Instruction *Op = dyn_cast<Instruction>(V); + if (!Op || !isa<BinaryOperator>(Op) || !Op->use_empty()) + return; + + Value *LHS = Op->getOperand(0), *RHS = 
Op->getOperand(1); + + ValueRankMap.erase(Op); + Op->eraseFromParent(); + RemoveDeadBinaryOp(LHS); + RemoveDeadBinaryOp(RHS); +} + + +static bool isUnmovableInstruction(Instruction *I) { + if (I->getOpcode() == Instruction::PHI || + I->getOpcode() == Instruction::Alloca || + I->getOpcode() == Instruction::Load || + I->getOpcode() == Instruction::Invoke || + (I->getOpcode() == Instruction::Call && + !isa<DbgInfoIntrinsic>(I)) || + I->getOpcode() == Instruction::UDiv || + I->getOpcode() == Instruction::SDiv || + I->getOpcode() == Instruction::FDiv || + I->getOpcode() == Instruction::URem || + I->getOpcode() == Instruction::SRem || + I->getOpcode() == Instruction::FRem) + return true; + return false; +} + +void Reassociate::BuildRankMap(Function &F) { + unsigned i = 2; + + // Assign distinct ranks to function arguments + for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) + ValueRankMap[&*I] = ++i; + + ReversePostOrderTraversal<Function*> RPOT(&F); + for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(), + E = RPOT.end(); I != E; ++I) { + BasicBlock *BB = *I; + unsigned BBRank = RankMap[BB] = ++i << 16; + + // Walk the basic block, adding precomputed ranks for any instructions that + // we cannot move. This ensures that the ranks for these instructions are + // all different in the block. + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + if (isUnmovableInstruction(I)) + ValueRankMap[&*I] = ++BBRank; + } +} + +unsigned Reassociate::getRank(Value *V) { + Instruction *I = dyn_cast<Instruction>(V); + if (I == 0) { + if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument. + return 0; // Otherwise it's a global or constant, rank 0. + } + + if (unsigned Rank = ValueRankMap[I]) + return Rank; // Rank already known? + + // If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that + // we can reassociate expressions for code motion! 
Since we do not recurse + // for PHI nodes, we cannot have infinite recursion here, because there + // cannot be loops in the value graph that do not go through PHI nodes. + unsigned Rank = 0, MaxRank = RankMap[I->getParent()]; + for (unsigned i = 0, e = I->getNumOperands(); + i != e && Rank != MaxRank; ++i) + Rank = std::max(Rank, getRank(I->getOperand(i))); + + // If this is a not or neg instruction, do not count it for rank. This + // assures us that X and ~X will have the same rank. + if (!I->getType()->isIntegerTy() || + (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I))) + ++Rank; + + //DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " + // << Rank << "\n"); + + return ValueRankMap[I] = Rank; +} + +/// isReassociableOp - Return true if V is an instruction of the specified +/// opcode and if it only has one use. +static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { + if ((V->hasOneUse() || V->use_empty()) && isa<Instruction>(V) && + cast<Instruction>(V)->getOpcode() == Opcode) + return cast<BinaryOperator>(V); + return 0; +} + +/// LowerNegateToMultiply - Replace 0-X with X*-1. +/// +static Instruction *LowerNegateToMultiply(Instruction *Neg, + DenseMap<AssertingVH<>, unsigned> &ValueRankMap) { + Constant *Cst = Constant::getAllOnesValue(Neg->getType()); + + Instruction *Res = BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg); + ValueRankMap.erase(Neg); + Res->takeName(Neg); + Neg->replaceAllUsesWith(Res); + Neg->eraseFromParent(); + return Res; +} + +// Given an expression of the form '(A+B)+(D+C)', turn it into '(((A+B)+C)+D)'. +// Note that if D is also part of the expression tree that we recurse to +// linearize it as well. Besides that case, this does not recurse into A,B, or +// C. 
+void Reassociate::LinearizeExpr(BinaryOperator *I) {
+  BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0));
+  BinaryOperator *RHS = cast<BinaryOperator>(I->getOperand(1));
+  assert(isReassociableOp(LHS, I->getOpcode()) &&
+         isReassociableOp(RHS, I->getOpcode()) &&
+         "Not an expression that needs linearization?");
+
+  DEBUG(dbgs() << "Linear" << *LHS << '\n' << *RHS << '\n' << *I << '\n');
+
+  // Move the RHS instruction to live immediately before I, avoiding breaking
+  // dominator properties.
+  RHS->moveBefore(I);
+
+  // Move operands around to do the linearization:
+  //   I = (LHS op (C op D))  becomes  I = ((LHS op C) op D).
+  I->setOperand(1, RHS->getOperand(0));
+  RHS->setOperand(0, LHS);
+  I->setOperand(0, RHS);
+
+  // Conservatively clear all the optional flags, which may not hold
+  // after the reassociation.
+  I->clearSubclassOptionalData();
+  LHS->clearSubclassOptionalData();
+  RHS->clearSubclassOptionalData();
+
+  ++NumLinear;
+  MadeChange = true;
+  DEBUG(dbgs() << "Linearized: " << *I << '\n');
+
+  // If D is part of this expression tree, tail recurse.
+  if (isReassociableOp(I->getOperand(1), I->getOpcode()))
+    LinearizeExpr(I);
+}
+
+
+/// LinearizeExprTree - Given an associative binary expression tree, traverse
+/// all of the uses putting it into canonical form.  This forces a left-linear
+/// form of the expression (((a+b)+c)+d), and collects information about the
+/// rank of the non-tree operands.
+///
+/// NOTE: This intentionally destroys the expression tree operands (turning
+/// them into undef values) to reduce #uses of the values.  This means that the
+/// caller MUST use something like RewriteExprTree to put the values back in.
+///
+void Reassociate::LinearizeExprTree(BinaryOperator *I,
+                                    SmallVectorImpl<ValueEntry> &Ops) {
+  Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+  unsigned Opcode = I->getOpcode();
+
+  // First step, linearize the expression if it is in ((A+B)+(C+D)) form.
+  BinaryOperator *LHSBO = isReassociableOp(LHS, Opcode);
+  BinaryOperator *RHSBO = isReassociableOp(RHS, Opcode);
+
+  // If this is a multiply expression tree and it contains internal negations,
+  // transform them into multiplies by -1 so they can be reassociated.
+  if (I->getOpcode() == Instruction::Mul) {
+    if (!LHSBO && LHS->hasOneUse() && BinaryOperator::isNeg(LHS)) {
+      LHS = LowerNegateToMultiply(cast<Instruction>(LHS), ValueRankMap);
+      LHSBO = isReassociableOp(LHS, Opcode);
+    }
+    if (!RHSBO && RHS->hasOneUse() && BinaryOperator::isNeg(RHS)) {
+      RHS = LowerNegateToMultiply(cast<Instruction>(RHS), ValueRankMap);
+      RHSBO = isReassociableOp(RHS, Opcode);
+    }
+  }
+
+  if (!LHSBO) {
+    if (!RHSBO) {
+      // Neither the LHS nor the RHS is part of the tree, thus this is a leaf.
+      // As such, just remember these operands and their rank.
+      Ops.push_back(ValueEntry(getRank(LHS), LHS));
+      Ops.push_back(ValueEntry(getRank(RHS), RHS));
+
+      // Clear the leaves out.
+      I->setOperand(0, UndefValue::get(I->getType()));
+      I->setOperand(1, UndefValue::get(I->getType()));
+      return;
+    }
+
+    // Turn X+(Y+Z) -> (Y+Z)+X
+    std::swap(LHSBO, RHSBO);
+    std::swap(LHS, RHS);
+    bool Success = !I->swapOperands();
+    assert(Success && "swapOperands failed");
+    // NOTE(review): 'Success' is only read by the assert above; this dead
+    // store presumably silences unused-variable warnings in NDEBUG builds —
+    // confirm (upstream uses '(void)Success;' for this).
+    Success = false;
+    MadeChange = true;
+  } else if (RHSBO) {
+    // Turn (A+B)+(C+D) -> (((A+B)+C)+D).  This guarantees the RHS is not
+    // part of the expression tree.
+    LinearizeExpr(I);
+    LHS = LHSBO = cast<BinaryOperator>(I->getOperand(0));
+    RHS = I->getOperand(1);
+    RHSBO = 0;
+  }
+
+  // Okay, now we know that the LHS is a nested expression and that the RHS is
+  // not.  Perform reassociation.
+  assert(!isReassociableOp(RHS, Opcode) && "LinearizeExpr failed!");
+
+  // Move LHS right before I to make sure that the tree expression dominates all
+  // values.
+  LHSBO->moveBefore(I);
+
+  // Linearize the expression tree on the LHS.
+  LinearizeExprTree(LHSBO, Ops);
+
+  // Remember the RHS operand and its rank.
+ Ops.push_back(ValueEntry(getRank(RHS), RHS)); + + // Clear the RHS leaf out. + I->setOperand(1, UndefValue::get(I->getType())); +} + +// RewriteExprTree - Now that the operands for this expression tree are +// linearized and optimized, emit them in-order. This function is written to be +// tail recursive. +void Reassociate::RewriteExprTree(BinaryOperator *I, + SmallVectorImpl<ValueEntry> &Ops, + unsigned i) { + if (i+2 == Ops.size()) { + if (I->getOperand(0) != Ops[i].Op || + I->getOperand(1) != Ops[i+1].Op) { + Value *OldLHS = I->getOperand(0); + DEBUG(dbgs() << "RA: " << *I << '\n'); + I->setOperand(0, Ops[i].Op); + I->setOperand(1, Ops[i+1].Op); + + // Clear all the optional flags, which may not hold after the + // reassociation if the expression involved more than just this operation. + if (Ops.size() != 2) + I->clearSubclassOptionalData(); + + DEBUG(dbgs() << "TO: " << *I << '\n'); + MadeChange = true; + ++NumChanged; + + // If we reassociated a tree to fewer operands (e.g. (1+a+2) -> (a+3) + // delete the extra, now dead, nodes. + RemoveDeadBinaryOp(OldLHS); + } + return; + } + assert(i+2 < Ops.size() && "Ops index out of range!"); + + if (I->getOperand(1) != Ops[i].Op) { + DEBUG(dbgs() << "RA: " << *I << '\n'); + I->setOperand(1, Ops[i].Op); + + // Conservatively clear all the optional flags, which may not hold + // after the reassociation. + I->clearSubclassOptionalData(); + + DEBUG(dbgs() << "TO: " << *I << '\n'); + MadeChange = true; + ++NumChanged; + } + + BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0)); + assert(LHS->getOpcode() == I->getOpcode() && + "Improper expression tree!"); + + // Compactify the tree instructions together with each other to guarantee + // that the expression tree is dominated by all of Ops. + LHS->moveBefore(I); + RewriteExprTree(LHS, Ops, i+1); +} + + + +// NegateValue - Insert instructions before the instruction pointed to by BI, +// that computes the negative version of the value specified. 
The negative
+// version of the value is returned, and BI is left pointing at the instruction
+// that should be processed next by the reassociation pass.
+//
+static Value *NegateValue(Value *V, Instruction *BI) {
+  if (Constant *C = dyn_cast<Constant>(V)) // Constants fold at compile time.
+    return ConstantExpr::getNeg(C);
+
+  // We are trying to expose opportunity for reassociation. One of the things
+  // that we want to do to achieve this is to push a negation as deep into an
+  // expression chain as possible, to expose the add instructions. In practice,
+  // this means that we turn this:
+  // X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D
+  // so that later, a: Y = 12+X could get reassociated with the -12 to eliminate
+  // the constants. We assume that instcombine will clean up the mess later if
+  // we introduce tons of unnecessary negation instructions.
+  //
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    if (I->getOpcode() == Instruction::Add && I->hasOneUse()) {
+      // Push the negates through the add.
+      I->setOperand(0, NegateValue(I->getOperand(0), BI));
+      I->setOperand(1, NegateValue(I->getOperand(1), BI));
+
+      // We must move the add instruction here, because the neg instructions do
+      // not dominate the old add instruction in general. By moving it, we are
+      // assured that the neg instructions we just inserted dominate the
+      // instruction we are about to insert after them.
+      //
+      I->moveBefore(BI);
+      I->setName(I->getName()+".neg");
+      return I;
+    }
+
+  // Okay, we need to materialize a negated version of V with an instruction.
+  // Scan the use lists of V to see if we have one already.
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){
+    User *U = *UI;
+    if (!BinaryOperator::isNeg(U)) continue;
+
+    // We found one! Now we have to make sure that the definition dominates
+    // this use. We do this by moving it to the entry block (if it is a
+    // non-instruction value) or right after the definition. These negates will
+    // be zapped by reassociate later, so we don't need much finesse here.
+    BinaryOperator *TheNeg = cast<BinaryOperator>(U);
+
+    // Verify that the negate is in this function, V might be a constant expr.
+    if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
+      continue;
+
+    BasicBlock::iterator InsertPt;
+    if (Instruction *InstInput = dyn_cast<Instruction>(V)) {
+      if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {
+        InsertPt = II->getNormalDest()->begin(); // Invoke result is only live in the normal dest.
+      } else {
+        InsertPt = InstInput;
+        ++InsertPt;
+      }
+      while (isa<PHINode>(InsertPt)) ++InsertPt; // Never insert among PHI nodes.
+    } else {
+      InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin();
+    }
+    TheNeg->moveBefore(InsertPt);
+    return TheNeg;
+  }
+
+  // Insert a 'neg' instruction that subtracts the value from zero to get the
+  // negation.
+  return BinaryOperator::CreateNeg(V, V->getName() + ".neg", BI);
+}
+
+/// ShouldBreakUpSubtract - Return true if we should break up this subtract of
+/// X-Y into (X + -Y).
+static bool ShouldBreakUpSubtract(Instruction *Sub) {
+  // If this is a negation, we can't split it up!
+  if (BinaryOperator::isNeg(Sub))
+    return false;
+
+  // Don't bother to break this up unless either the LHS is an associable add or
+  // subtract, or this is only used by an add or subtract.
+  if (isReassociableOp(Sub->getOperand(0), Instruction::Add) ||
+      isReassociableOp(Sub->getOperand(0), Instruction::Sub))
+    return true;
+  if (isReassociableOp(Sub->getOperand(1), Instruction::Add) ||
+      isReassociableOp(Sub->getOperand(1), Instruction::Sub))
+    return true;
+  if (Sub->hasOneUse() &&
+      (isReassociableOp(Sub->use_back(), Instruction::Add) ||
+       isReassociableOp(Sub->use_back(), Instruction::Sub)))
+    return true;
+
+  return false;
+}
+
+/// BreakUpSubtract - If we have (X-Y), and if either X is an add, or if this is
+/// only used by an add, transform this into (X+(0-Y)) to promote better
+/// reassociation.
+static Instruction *BreakUpSubtract(Instruction *Sub,
+                              DenseMap<AssertingVH<>, unsigned> &ValueRankMap) {
+  // Convert a subtract into an add and a neg instruction. This allows sub
+  // instructions to be commuted with other add instructions.
+  //
+  // Calculate the negative value of Operand 1 of the sub instruction,
+  // and set it as the RHS of the add instruction we just made.
+  //
+  Value *NegVal = NegateValue(Sub->getOperand(1), Sub);
+  Instruction *New =
+    BinaryOperator::CreateAdd(Sub->getOperand(0), NegVal, "", Sub);
+  New->takeName(Sub);
+
+  // Everyone now refers to the add instruction.
+  ValueRankMap.erase(Sub);
+  Sub->replaceAllUsesWith(New);
+  Sub->eraseFromParent();
+
+  DEBUG(dbgs() << "Negated: " << *New << '\n');
+  return New;
+}
+
+/// ConvertShiftToMul - If this is a shift of a reassociable multiply or is used
+/// by one, change this into a multiply by a constant to assist with further
+/// reassociation.
+static Instruction *ConvertShiftToMul(Instruction *Shl,
+                              DenseMap<AssertingVH<>, unsigned> &ValueRankMap) {
+  // If an operand of this shift is a reassociable multiply, or if the shift
+  // is used by a reassociable multiply or add, turn into a multiply.
+  if (isReassociableOp(Shl->getOperand(0), Instruction::Mul) ||
+      (Shl->hasOneUse() &&
+       (isReassociableOp(Shl->use_back(), Instruction::Mul) ||
+        isReassociableOp(Shl->use_back(), Instruction::Add)))) {
+    Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
+    MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1))); // MulCst = 1 << ShAmt
+
+    Instruction *Mul =
+      BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
+    ValueRankMap.erase(Shl);
+    Mul->takeName(Shl);
+    Shl->replaceAllUsesWith(Mul);
+    Shl->eraseFromParent();
+    return Mul;
+  }
+  return 0;
+}
+
+// Scan backwards and forwards among values with the same rank as element i to
+// see if X exists. If X does not exist, return i. This is useful when
+// scanning for 'x' when we see '-x' because they both get the same rank.
+static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i,
+                                  Value *X) {
+  unsigned XRank = Ops[i].Rank;
+  unsigned e = Ops.size();
+  for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j)
+    if (Ops[j].Op == X)
+      return j;
+  // Scan backwards.
+  for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j)
+    if (Ops[j].Op == X)
+      return j;
+  return i;
+}
+
+/// EmitAddTreeOfValues - Emit a tree of add instructions, summing Ops together
+/// and returning the result. Insert the tree before I.
+static Value *EmitAddTreeOfValues(Instruction *I, SmallVectorImpl<Value*> &Ops){
+  if (Ops.size() == 1) return Ops.back();
+
+  Value *V1 = Ops.back();
+  Ops.pop_back();
+  Value *V2 = EmitAddTreeOfValues(I, Ops); // Recursively sum the remaining operands.
+  return BinaryOperator::CreateAdd(V2, V1, "tmp", I);
+}
+
+/// RemoveFactorFromExpression - If V is an expression tree that is a
+/// multiplication sequence, and if this sequence contains a multiply by Factor,
+/// remove Factor from the tree and return the new tree.
+Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
+  BinaryOperator *BO = isReassociableOp(V, Instruction::Mul);
+  if (!BO) return 0;
+
+  SmallVector<ValueEntry, 8> Factors;
+  LinearizeExprTree(BO, Factors);
+
+  bool FoundFactor = false;
+  bool NeedsNegate = false;
+  for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+    if (Factors[i].Op == Factor) {
+      FoundFactor = true;
+      Factors.erase(Factors.begin()+i);
+      break;
+    }
+
+    // If this is a negative version of this factor, remove it.
+    if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor))
+      if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
+        if (FC1->getValue() == -FC2->getValue()) {
+          FoundFactor = NeedsNegate = true;
+          Factors.erase(Factors.begin()+i);
+          break;
+        }
+  }
+
+  if (!FoundFactor) {
+    // Make sure to restore the operands to the expression tree.
+    RewriteExprTree(BO, Factors);
+    return 0;
+  }
+
+  BasicBlock::iterator InsertPt = BO; ++InsertPt;
+
+  // If this was just a single multiply, remove the multiply and return the only
+  // remaining operand.
+  if (Factors.size() == 1) {
+    ValueRankMap.erase(BO);
+    BO->eraseFromParent();
+    V = Factors[0].Op;
+  } else {
+    RewriteExprTree(BO, Factors);
+    V = BO;
+  }
+
+  if (NeedsNegate)
+    V = BinaryOperator::CreateNeg(V, "neg", InsertPt);
+
+  return V;
+}
+
+/// FindSingleUseMultiplyFactors - If V is a single-use multiply, recursively
+/// add its operands as factors, otherwise add V to the list of factors.
+///
+/// Ops is the top-level list of add operands we're trying to factor.
+static void FindSingleUseMultiplyFactors(Value *V,
+                                         SmallVectorImpl<Value*> &Factors,
+                                       const SmallVectorImpl<ValueEntry> &Ops,
+                                         bool IsRoot) {
+  BinaryOperator *BO;
+  if (!(V->hasOneUse() || V->use_empty()) || // More than one use.
+      !(BO = dyn_cast<BinaryOperator>(V)) ||
+      BO->getOpcode() != Instruction::Mul) {
+    Factors.push_back(V);
+    return;
+  }
+
+  // If this value has a single use because it is another input to the add
+  // tree we're reassociating and we dropped its use, it actually has two
+  // uses and we can't factor it.
+  if (!IsRoot) {
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+      if (Ops[i].Op == V) {
+        Factors.push_back(V);
+        return;
+      }
+  }
+
+
+  // Otherwise, add the LHS and RHS to the list of factors.
+  FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops, false);
+  FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops, false);
+}
+
+/// OptimizeAndOrXor - Optimize a series of operands to an 'and', 'or', or 'xor'
+/// instruction. This optimizes based on identities. If it can be reduced to
+/// a single Value, it is returned, otherwise the Ops list is mutated as
+/// necessary.
+static Value *OptimizeAndOrXor(unsigned Opcode,
+                               SmallVectorImpl<ValueEntry> &Ops) {
+  // Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
+  // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    // First, check for X and ~X in the operand list.
+    assert(i < Ops.size());
+    if (BinaryOperator::isNot(Ops[i].Op)) {    // Cannot occur for ^.
+      Value *X = BinaryOperator::getNotArgument(Ops[i].Op);
+      unsigned FoundX = FindInOperandList(Ops, i, X);
+      if (FoundX != i) {
+        if (Opcode == Instruction::And)   // ...&X&~X = 0
+          return Constant::getNullValue(X->getType());
+
+        if (Opcode == Instruction::Or)    // ...|X|~X = -1
+          return Constant::getAllOnesValue(X->getType());
+      }
+    }
+
+    // Next, check for duplicate pairs of values, which we assume are next to
+    // each other, due to our sorting criteria.
+    assert(i < Ops.size());
+    if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
+      if (Opcode == Instruction::And || Opcode == Instruction::Or) {
+        // Drop duplicate values for And and Or.
+        Ops.erase(Ops.begin()+i);
+        --i; --e;
+        ++NumAnnihil;
+        continue;
+      }
+
+      // Drop pairs of values for Xor.
+      assert(Opcode == Instruction::Xor);
+      if (e == 2)
+        return Constant::getNullValue(Ops[0].Op->getType());
+
+      // Y ^ X^X -> Y
+      Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
+      i -= 1; e -= 2;
+      ++NumAnnihil;
+    }
+  }
+  return 0;
+}
+
+/// OptimizeAdd - Optimize a series of operands to an 'add' instruction. This
+/// optimizes based on identities. If it can be reduced to a single Value, it
+/// is returned, otherwise the Ops list is mutated as necessary.
+Value *Reassociate::OptimizeAdd(Instruction *I,
+                                SmallVectorImpl<ValueEntry> &Ops) {
+  // Scan the operand lists looking for X and -X pairs. If we find any, we
+  // can simplify the expression. X+-X == 0. While we're at it, scan for any
+  // duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z.
+  //
+  // TODO: We could handle "X + ~X" -> "-1" if we wanted, since "-X = ~X+1".
+  //
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    Value *TheOp = Ops[i].Op;
+    // Check to see if we've seen this operand before. If so, we factor all
+    // instances of the operand together. Due to our sorting criteria, we know
+    // that these need to be next to each other in the vector.
+    if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) {
+      // Rescan the list, remove all instances of this operand from the expr.
+      unsigned NumFound = 0;
+      do {
+        Ops.erase(Ops.begin()+i);
+        ++NumFound;
+      } while (i != Ops.size() && Ops[i].Op == TheOp);
+
+      DEBUG(errs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n'); // NOTE(review): errs() here, dbgs() elsewhere in this file.
+      ++NumFactor;
+
+      // Insert a new multiply.
+      Value *Mul = ConstantInt::get(cast<IntegerType>(I->getType()), NumFound);
+      Mul = BinaryOperator::CreateMul(TheOp, Mul, "factor", I);
+
+      // Now that we have inserted a multiply, optimize it. This allows us to
+      // handle cases that require multiple factoring steps, such as this:
+      // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
+      Mul = ReassociateExpression(cast<BinaryOperator>(Mul));
+
+      // If every add operand was a duplicate, return the multiply.
+      if (Ops.empty())
+        return Mul;
+
+      // Otherwise, we had some input that didn't have the dupe, such as
+      // "A + A + B" -> "A*2 + B". Add the new multiply to the list of
+      // things being added by this operation.
+      Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul));
+
+      --i;
+      e = Ops.size();
+      continue;
+    }
+
+    // Check for X and -X in the operand list.
+    if (!BinaryOperator::isNeg(TheOp))
+      continue;
+
+    Value *X = BinaryOperator::getNegArgument(TheOp);
+    unsigned FoundX = FindInOperandList(Ops, i, X);
+    if (FoundX == i)
+      continue;
+
+    // Remove X and -X from the operand list.
+    if (Ops.size() == 2)
+      return Constant::getNullValue(X->getType());
+
+    Ops.erase(Ops.begin()+i);
+    if (i < FoundX)
+      --FoundX;
+    else
+      --i;   // Need to back up an extra one.
+    Ops.erase(Ops.begin()+FoundX);
+    ++NumAnnihil;
+    --i;     // Revisit element.
+    e -= 2;  // Removed two elements.
+  }
+
+  // Scan the operand list, checking to see if there are any common factors
+  // between operands. Consider something like A*A+A*B*C+D. We would like to
+  // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
+  // To efficiently find this, we count the number of times a factor occurs
+  // for any ADD operands that are MULs.
+  DenseMap<Value*, unsigned> FactorOccurrences;
+
+  // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
+  // where they are actually the same multiply.
+  unsigned MaxOcc = 0;
+  Value *MaxOccVal = 0;
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op);
+    if (BOp == 0 || BOp->getOpcode() != Instruction::Mul || !BOp->use_empty())
+      continue;
+
+    // Compute all of the factors of this added value.
+    SmallVector<Value*, 8> Factors;
+    FindSingleUseMultiplyFactors(BOp, Factors, Ops, true);
+    assert(Factors.size() > 1 && "Bad linearize!");
+
+    // Add one to FactorOccurrences for each unique factor in this op.
+    SmallPtrSet<Value*, 8> Duplicates;
+    for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+      Value *Factor = Factors[i];
+      if (!Duplicates.insert(Factor)) continue;
+
+      unsigned Occ = ++FactorOccurrences[Factor];
+      if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; }
+
+      // If Factor is a negative constant, add the negated value as a factor
+      // because we can percolate the negate out. Watch for minint, which
+      // cannot be negated without overflow.
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor))
+        if (CI->getValue().isNegative() && !CI->getValue().isMinSignedValue()) {
+          Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
+          assert(!Duplicates.count(Factor) &&
+                 "Shouldn't have two constant factors, missed a canonicalize");
+
+          unsigned Occ = ++FactorOccurrences[Factor];
+          if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; }
+        }
+    }
+  }
+
+  // If any factor occurred more than one time, we can pull it out.
+  if (MaxOcc > 1) {
+    DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n'); // NOTE(review): errs() here, dbgs() elsewhere in this file.
+    ++NumFactor;
+
+    // Create a new instruction that uses the MaxOccVal twice. If we don't do
+    // this, we could otherwise run into situations where removing a factor
+    // from an expression will drop a use of maxocc, and this can cause
+    // RemoveFactorFromExpression on successive values to behave differently.
+    Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal);
+    SmallVector<Value*, 4> NewMulOps;
+    for (unsigned i = 0; i != Ops.size(); ++i) {
+      // Only try to remove factors from expressions we're allowed to.
+      BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op);
+      if (BOp == 0 || BOp->getOpcode() != Instruction::Mul || !BOp->use_empty())
+        continue;
+
+      if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
+        // The factorized operand may occur several times. Convert them all in
+        // one fell swoop.
+        for (unsigned j = Ops.size(); j != i;) {
+          --j;
+          if (Ops[j].Op == Ops[i].Op) {
+            NewMulOps.push_back(V);
+            Ops.erase(Ops.begin()+j);
+          }
+        }
+        --i;
+      }
+    }
+
+    // No need for extra uses anymore.
+    delete DummyInst;
+
+    unsigned NumAddedValues = NewMulOps.size();
+    Value *V = EmitAddTreeOfValues(I, NewMulOps);
+
+    // Now that we have inserted the add tree, optimize it. This allows us to
+    // handle cases that require multiple factoring steps, such as this:
+    // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C))
+    assert(NumAddedValues > 1 && "Each occurrence should contribute a value");
+    (void)NumAddedValues;
+    V = ReassociateExpression(cast<BinaryOperator>(V));
+
+    // Create the multiply.
+    Value *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I);
+
+    // Rerun associate on the multiply in case the inner expression turned into
+    // a multiply. We want to make sure that we keep things in canonical form.
+    V2 = ReassociateExpression(cast<BinaryOperator>(V2));
+
+    // If every add operand included the factor (e.g. "A*B + A*C"), then the
+    // entire result expression is just the multiply "A*(B+C)".
+    if (Ops.empty())
+      return V2;
+
+    // Otherwise, we had some input that didn't have the factor, such as
+    // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of
+    // things being added by this operation.
+    Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
+  }
+
+  return 0;
+}
+
+Value *Reassociate::OptimizeExpression(BinaryOperator *I,
+                                       SmallVectorImpl<ValueEntry> &Ops) {
+  // Now that we have the linearized expression tree, try to optimize it.
+  // Start by folding any constants that we found.
+  bool IterateOptimization = false;
+  if (Ops.size() == 1) return Ops[0].Op;
+
+  unsigned Opcode = I->getOpcode();
+
+  if (Constant *V1 = dyn_cast<Constant>(Ops[Ops.size()-2].Op))
+    if (Constant *V2 = dyn_cast<Constant>(Ops.back().Op)) {
+      Ops.pop_back();
+      Ops.back().Op = ConstantExpr::get(Opcode, V1, V2);
+      return OptimizeExpression(I, Ops);
+    }
+
+  // Check for destructive annihilation due to a constant being used.
+  if (ConstantInt *CstVal = dyn_cast<ConstantInt>(Ops.back().Op))
+    switch (Opcode) {
+    default: break;
+    case Instruction::And:
+      if (CstVal->isZero())                  // X & 0 -> 0
+        return CstVal;
+      if (CstVal->isAllOnesValue())          // X & -1 -> X
+        Ops.pop_back();
+      break;
+    case Instruction::Mul:
+      if (CstVal->isZero()) {                // X * 0 -> 0
+        ++NumAnnihil;
+        return CstVal;
+      }
+
+      if (cast<ConstantInt>(CstVal)->isOne())
+        Ops.pop_back();                      // X * 1 -> X
+      break;
+    case Instruction::Or:
+      if (CstVal->isAllOnesValue())          // X | -1 -> -1
+        return CstVal;
+      // FALLTHROUGH!
+    case Instruction::Add:
+    case Instruction::Xor:
+      if (CstVal->isZero())                  // X [|^+] 0 -> X
+        Ops.pop_back();
+      break;
+    }
+  if (Ops.size() == 1) return Ops[0].Op;
+
+  // Handle destructive annihilation due to identities between elements in the
+  // argument list here.
+  switch (Opcode) {
+  default: break;
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    unsigned NumOps = Ops.size();
+    if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
+      return Result;
+    IterateOptimization |= Ops.size() != NumOps;
+    break;
+  }
+
+  case Instruction::Add: {
+    unsigned NumOps = Ops.size();
+    if (Value *Result = OptimizeAdd(I, Ops))
+      return Result;
+    IterateOptimization |= Ops.size() != NumOps;
+  }
+
+    break;
+  //case Instruction::Mul:
+  }
+
+  if (IterateOptimization)
+    return OptimizeExpression(I, Ops);
+  return 0;
+}
+
+
+/// ReassociateBB - Inspect all of the instructions in this basic block,
+/// reassociating them as we go.
+void Reassociate::ReassociateBB(BasicBlock *BB) {
+  for (BasicBlock::iterator BBI = BB->begin(); BBI != BB->end(); ) {
+    Instruction *BI = BBI++;
+    if (BI->getOpcode() == Instruction::Shl &&
+        isa<ConstantInt>(BI->getOperand(1)))
+      if (Instruction *NI = ConvertShiftToMul(BI, ValueRankMap)) {
+        MadeChange = true;
+        BI = NI;
+      }
+
+    // Reject cases where it is pointless to do this.
+    if (!isa<BinaryOperator>(BI) || BI->getType()->isFloatingPointTy() ||
+        BI->getType()->isVectorTy())
+      continue;  // Floating point ops are not associative.
+
+    // Do not reassociate boolean (i1) expressions. We want to preserve the
+    // original order of evaluation for short-circuited comparisons that
+    // SimplifyCFG has folded to AND/OR expressions. If the expression
+    // is not further optimized, it is likely to be transformed back to a
+    // short-circuited form for code gen, and the source order may have been
+    // optimized for the most likely conditions.
+    if (BI->getType()->isIntegerTy(1))
+      continue;
+
+    // If this is a subtract instruction which is not already in negate form,
+    // see if we can convert it to X+-Y.
+    if (BI->getOpcode() == Instruction::Sub) {
+      if (ShouldBreakUpSubtract(BI)) {
+        BI = BreakUpSubtract(BI, ValueRankMap);
+        // Reset the BBI iterator in case BreakUpSubtract changed the
+        // instruction it points to.
+        BBI = BI;
+        ++BBI;
+        MadeChange = true;
+      } else if (BinaryOperator::isNeg(BI)) {
+        // Otherwise, this is a negation. See if the operand is a multiply tree
+        // and if this is not an inner node of a multiply tree.
+        if (isReassociableOp(BI->getOperand(1), Instruction::Mul) &&
+            (!BI->hasOneUse() ||
+             !isReassociableOp(BI->use_back(), Instruction::Mul))) {
+          BI = LowerNegateToMultiply(BI, ValueRankMap);
+          MadeChange = true;
+        }
+      }
+    }
+
+    // If this instruction is a commutative binary operator, process it.
+    if (!BI->isAssociative()) continue;
+    BinaryOperator *I = cast<BinaryOperator>(BI);
+
+    // If this is an interior node of a reassociable tree, ignore it until we
+    // get to the root of the tree, to avoid N^2 analysis.
+    if (I->hasOneUse() && isReassociableOp(I->use_back(), I->getOpcode()))
+      continue;
+
+    // If this is an add tree that is used by a sub instruction, ignore it
+    // until we process the subtract.
+    if (I->hasOneUse() && I->getOpcode() == Instruction::Add &&
+        cast<Instruction>(I->use_back())->getOpcode() == Instruction::Sub)
+      continue;
+
+    ReassociateExpression(I);
+  }
+}
+
+Value *Reassociate::ReassociateExpression(BinaryOperator *I) {
+
+  // First, walk the expression tree, linearizing the tree, collecting the
+  // operand information.
+  SmallVector<ValueEntry, 8> Ops;
+  LinearizeExprTree(I, Ops);
+
+  DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+  // Now that we have linearized the tree to a list and have gathered all of
+  // the operands and their ranks, sort the operands by their rank. Use a
+  // stable_sort so that values with equal ranks will have their relative
+  // positions maintained (and so the compiler is deterministic). Note that
+  // this sorts so that the highest ranking values end up at the beginning of
+  // the vector.
+  std::stable_sort(Ops.begin(), Ops.end());
+
+  // OptimizeExpression - Now that we have the expression tree in a convenient
+  // sorted form, optimize it globally if possible.
+  if (Value *V = OptimizeExpression(I, Ops)) {
+    // This expression tree simplified to something that isn't a tree,
+    // eliminate it.
+    DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
+    I->replaceAllUsesWith(V);
+    RemoveDeadBinaryOp(I);
+    ++NumAnnihil;
+    return V;
+  }
+
+  // We want to sink immediates as deeply as possible except in the case where
+  // this is a multiply tree used only by an add, and the immediate is a -1.
+  // In this case we reassociate to put the negation on the outside so that we
+  // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
+  if (I->getOpcode() == Instruction::Mul && I->hasOneUse() &&
+      cast<Instruction>(I->use_back())->getOpcode() == Instruction::Add &&
+      isa<ConstantInt>(Ops.back().Op) &&
+      cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
+    ValueEntry Tmp = Ops.pop_back_val();
+    Ops.insert(Ops.begin(), Tmp);
+  }
+
+  DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+  if (Ops.size() == 1) {
+    // This expression tree simplified to something that isn't a tree,
+    // eliminate it.
+    I->replaceAllUsesWith(Ops[0].Op);
+    RemoveDeadBinaryOp(I);
+    return Ops[0].Op;
+  }
+
+  // Now that we ordered and optimized the expressions, splat them back into
+  // the expression tree, removing any unneeded nodes.
+  RewriteExprTree(I, Ops);
+  return I;
+}
+
+
+bool Reassociate::runOnFunction(Function &F) {
+  // Recalculate the rank map for F
+  BuildRankMap(F);
+
+  MadeChange = false;
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
+    ReassociateBB(FI);
+
+  // We are done with the rank map.
+  RankMap.clear();
+  ValueRankMap.clear();
+  return MadeChange;
+}
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
new file mode 100644
index 0000000..459bb06
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -0,0 +1,134 @@
+//===- Reg2Mem.cpp - Convert registers to allocas -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file demotes all registers to memory references. It is intended to be
+// the inverse of PromoteMemoryToRegister.
By converting to loads, the only
+// values live across basic blocks are allocas and loads before phi nodes.
+// It is intended that this should make CFG hacking much easier.
+// To make later hacking easier, the entry block is split into two, such that
+// all introduced allocas and nothing else are in the entry block.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reg2mem"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Pass.h"
+#include "llvm/Function.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Instructions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CFG.h"
+#include <list>
+using namespace llvm;
+
+STATISTIC(NumRegsDemoted, "Number of registers demoted");
+STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted");
+
+namespace {
+  struct RegToMem : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    RegToMem() : FunctionPass(ID) {
+      initializeRegToMemPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(BreakCriticalEdgesID);
+      AU.addPreservedID(BreakCriticalEdgesID);
+    }
+
+    // valueEscapes - Return true if Inst is used outside its defining block,
+    // or by a phi node (phi uses are live across an edge), so it needs a slot.
+    bool valueEscapes(const Instruction *Inst) const {
+      const BasicBlock *BB = Inst->getParent();
+      for (Value::const_use_iterator UI = Inst->use_begin(),E = Inst->use_end();
+           UI != E; ++UI) {
+        const Instruction *I = cast<Instruction>(*UI);
+        if (I->getParent() != BB || isa<PHINode>(I))
+          return true;
+      }
+      return false;
+    }
+
+    virtual bool runOnFunction(Function &F);
+  };
+}
+
+char RegToMem::ID = 0;
+INITIALIZE_PASS_BEGIN(RegToMem, "reg2mem", "Demote all values to stack slots",
+                false, false)
+INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
+INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots",
+                false, false)
+
+bool RegToMem::runOnFunction(Function &F) {
+  if (F.isDeclaration())
+    return false;
+
+  // Insert all new allocas into entry block.
+  BasicBlock *BBEntry = &F.getEntryBlock();
+  assert(pred_begin(BBEntry) == pred_end(BBEntry) &&
+         "Entry block to function must not have predecessors!");
+
+  // Find first non-alloca instruction and create insertion point. This is
+  // safe if block is well-formed: it always has a terminator, otherwise
+  // we'll get an assertion.
+  BasicBlock::iterator I = BBEntry->begin();
+  while (isa<AllocaInst>(I)) ++I;
+
+  CastInst *AllocaInsertionPoint =
+    new BitCastInst(Constant::getNullValue(Type::getInt32Ty(F.getContext())),
+                    Type::getInt32Ty(F.getContext()),
+                    "reg2mem alloca point", I); // No-op cast used only as a stable insertion marker.
+
+  // Find the escaped instructions. But don't create stack slots for
+  // allocas in entry block.
+  std::list<Instruction*> WorkList;
+  for (Function::iterator ibb = F.begin(), ibe = F.end();
+       ibb != ibe; ++ibb)
+    for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end();
+         iib != iie; ++iib) {
+      if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) &&
+          valueEscapes(iib)) {
+        WorkList.push_front(&*iib);
+      }
+    }
+
+  // Demote escaped instructions
+  NumRegsDemoted += WorkList.size();
+  for (std::list<Instruction*>::iterator ilb = WorkList.begin(),
+       ile = WorkList.end(); ilb != ile; ++ilb)
+    DemoteRegToStack(**ilb, false, AllocaInsertionPoint);
+
+  WorkList.clear();
+
+  // Find all phi nodes.
+  for (Function::iterator ibb = F.begin(), ibe = F.end();
+       ibb != ibe; ++ibb)
+    for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end();
+         iib != iie; ++iib)
+      if (isa<PHINode>(iib))
+        WorkList.push_front(&*iib);
+
+  // Demote phi nodes
+  NumPhisDemoted += WorkList.size();
+  for (std::list<Instruction*>::iterator ilb = WorkList.begin(),
+       ile = WorkList.end(); ilb != ile; ++ilb)
+    DemotePHIToStack(cast<PHINode>(*ilb), AllocaInsertionPoint);
+
+  return true;
+}
+
+
+// createDemoteRegisterToMemoryPass - Provide an entry point to create this pass.
+// +char &llvm::DemoteRegisterToMemoryID = RegToMem::ID; +FunctionPass *llvm::createDemoteRegisterToMemoryPass() { + return new RegToMem(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp new file mode 100644 index 0000000..c82e929 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -0,0 +1,2010 @@ +//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements sparse conditional constant propagation and merging: +// +// Specifically, this: +// * Assumes values are constant unless proven otherwise +// * Assumes BasicBlocks are dead unless proven otherwise +// * Proves values to be constant, and replaces them with constants +// * Proves conditional branches to be unconditional +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sccp" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/InstVisitor.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include <algorithm> +#include <map> +using 
namespace llvm; + +STATISTIC(NumInstRemoved, "Number of instructions removed"); +STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable"); + +STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP"); +STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP"); +STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP"); + +namespace { +/// LatticeVal class - This class represents the different lattice values that +/// an LLVM value may occupy. It is a simple class with value semantics. +/// +class LatticeVal { + enum LatticeValueTy { + /// undefined - This LLVM Value has no known value yet. + undefined, + + /// constant - This LLVM Value has a specific constant value. + constant, + + /// forcedconstant - This LLVM Value was thought to be undef until + /// ResolvedUndefsIn. This is treated just like 'constant', but if merged + /// with another (different) constant, it goes to overdefined, instead of + /// asserting. + forcedconstant, + + /// overdefined - This instruction is not known to be constant, and we know + /// it has a value. + overdefined + }; + + /// Val: This stores the current lattice value along with the Constant* for + /// the constant if this is a 'constant' or 'forcedconstant' value. + PointerIntPair<Constant *, 2, LatticeValueTy> Val; + + LatticeValueTy getLatticeValue() const { + return Val.getInt(); + } + +public: + LatticeVal() : Val(0, undefined) {} + + bool isUndefined() const { return getLatticeValue() == undefined; } + bool isConstant() const { + return getLatticeValue() == constant || getLatticeValue() == forcedconstant; + } + bool isOverdefined() const { return getLatticeValue() == overdefined; } + + Constant *getConstant() const { + assert(isConstant() && "Cannot get the constant of a non-constant!"); + return Val.getPointer(); + } + + /// markOverdefined - Return true if this is a change in status. 
+ bool markOverdefined() { + if (isOverdefined()) + return false; + + Val.setInt(overdefined); + return true; + } + + /// markConstant - Return true if this is a change in status. + bool markConstant(Constant *V) { + if (getLatticeValue() == constant) { // Constant but not forcedconstant. + assert(getConstant() == V && "Marking constant with different value"); + return false; + } + + if (isUndefined()) { + Val.setInt(constant); + assert(V && "Marking constant with NULL"); + Val.setPointer(V); + } else { + assert(getLatticeValue() == forcedconstant && + "Cannot move from overdefined to constant!"); + // Stay at forcedconstant if the constant is the same. + if (V == getConstant()) return false; + + // Otherwise, we go to overdefined. Assumptions made based on the + // forced value are possibly wrong. Assuming this is another constant + // could expose a contradiction. + Val.setInt(overdefined); + } + return true; + } + + /// getConstantInt - If this is a constant with a ConstantInt value, return it + /// otherwise return null. + ConstantInt *getConstantInt() const { + if (isConstant()) + return dyn_cast<ConstantInt>(getConstant()); + return 0; + } + + void markForcedConstant(Constant *V) { + assert(isUndefined() && "Can't force a defined value!"); + Val.setInt(forcedconstant); + Val.setPointer(V); + } +}; +} // end anonymous namespace. + + +namespace { + +//===----------------------------------------------------------------------===// +// +/// SCCPSolver - This class is a general purpose solver for Sparse Conditional +/// Constant Propagation. +/// +class SCCPSolver : public InstVisitor<SCCPSolver> { + const TargetData *TD; + SmallPtrSet<BasicBlock*, 8> BBExecutable;// The BBs that are executable. + DenseMap<Value*, LatticeVal> ValueState; // The state each value is in. + + /// StructValueState - This maintains ValueState for values that have + /// StructType, for example for formal arguments, calls, insertelement, etc. 
+ /// + DenseMap<std::pair<Value*, unsigned>, LatticeVal> StructValueState; + + /// GlobalValue - If we are tracking any values for the contents of a global + /// variable, we keep a mapping from the constant accessor to the element of + /// the global, to the currently known value. If the value becomes + /// overdefined, it's entry is simply removed from this map. + DenseMap<GlobalVariable*, LatticeVal> TrackedGlobals; + + /// TrackedRetVals - If we are tracking arguments into and the return + /// value out of a function, it will have an entry in this map, indicating + /// what the known return value for the function is. + DenseMap<Function*, LatticeVal> TrackedRetVals; + + /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions + /// that return multiple values. + DenseMap<std::pair<Function*, unsigned>, LatticeVal> TrackedMultipleRetVals; + + /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is + /// represented here for efficient lookup. + SmallPtrSet<Function*, 16> MRVFunctionsTracked; + + /// TrackingIncomingArguments - This is the set of functions for whose + /// arguments we make optimistic assumptions about and try to prove as + /// constants. + SmallPtrSet<Function*, 16> TrackingIncomingArguments; + + /// The reason for two worklists is that overdefined is the lowest state + /// on the lattice, and moving things to overdefined as fast as possible + /// makes SCCP converge much faster. + /// + /// By having a separate worklist, we accomplish this because everything + /// possibly overdefined will become overdefined at the soonest possible + /// point. + SmallVector<Value*, 64> OverdefinedInstWorkList; + SmallVector<Value*, 64> InstWorkList; + + + SmallVector<BasicBlock*, 64> BBWorkList; // The BasicBlock work list + + /// UsersOfOverdefinedPHIs - Keep track of any users of PHI nodes that are not + /// overdefined, despite the fact that the PHI node is overdefined. 
+ std::multimap<PHINode*, Instruction*> UsersOfOverdefinedPHIs; + + /// KnownFeasibleEdges - Entries in this set are edges which have already had + /// PHI nodes retriggered. + typedef std::pair<BasicBlock*, BasicBlock*> Edge; + DenseSet<Edge> KnownFeasibleEdges; +public: + SCCPSolver(const TargetData *td) : TD(td) {} + + /// MarkBlockExecutable - This method can be used by clients to mark all of + /// the blocks that are known to be intrinsically live in the processed unit. + /// + /// This returns true if the block was not considered live before. + bool MarkBlockExecutable(BasicBlock *BB) { + if (!BBExecutable.insert(BB)) return false; + DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << "\n"); + BBWorkList.push_back(BB); // Add the block to the work list! + return true; + } + + /// TrackValueOfGlobalVariable - Clients can use this method to + /// inform the SCCPSolver that it should track loads and stores to the + /// specified global variable if it can. This is only legal to call if + /// performing Interprocedural SCCP. + void TrackValueOfGlobalVariable(GlobalVariable *GV) { + // We only track the contents of scalar globals. + if (GV->getType()->getElementType()->isSingleValueType()) { + LatticeVal &IV = TrackedGlobals[GV]; + if (!isa<UndefValue>(GV->getInitializer())) + IV.markConstant(GV->getInitializer()); + } + } + + /// AddTrackedFunction - If the SCCP solver is supposed to track calls into + /// and out of the specified function (which cannot have its address taken), + /// this method must be called. + void AddTrackedFunction(Function *F) { + // Add an entry, F -> undef. 
+  if (const StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
+      MRVFunctionsTracked.insert(F);
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+        TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i),
+                                                     LatticeVal()));
+    } else
+      TrackedRetVals.insert(std::make_pair(F, LatticeVal()));
+  }
+
+  void AddArgumentTrackedFunction(Function *F) {
+    TrackingIncomingArguments.insert(F);
+  }
+
+  /// Solve - Solve for constants and executable blocks.
+  ///
+  void Solve();
+
+  /// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+  /// that branches on undef values cannot reach any of their successors.
+  /// However, this is not a safe assumption.  After we solve dataflow, this
+  /// method should be used to handle this.  If this returns true, the solver
+  /// should be rerun.
+  bool ResolvedUndefsIn(Function &F);
+
+  bool isBlockExecutable(BasicBlock *BB) const {
+    return BBExecutable.count(BB);
+  }
+
+  LatticeVal getLatticeValueFor(Value *V) const {
+    DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V);
+    assert(I != ValueState.end() && "V is not in valuemap!");
+    return I->second;
+  }
+
+  /*LatticeVal getStructLatticeValueFor(Value *V, unsigned i) const {
+    DenseMap<std::pair<Value*, unsigned>, LatticeVal>::const_iterator I =
+      StructValueState.find(std::make_pair(V, i));
+    assert(I != StructValueState.end() && "V is not in valuemap!");
+    return I->second;
+  }*/
+
+  /// getTrackedRetVals - Get the inferred return value map.
+  ///
+  const DenseMap<Function*, LatticeVal> &getTrackedRetVals() {
+    return TrackedRetVals;
+  }
+
+  /// getTrackedGlobals - Get and return the set of inferred initializers for
+  /// global variables.
+ const DenseMap<GlobalVariable*, LatticeVal> &getTrackedGlobals() { + return TrackedGlobals; + } + + void markOverdefined(Value *V) { + assert(!V->getType()->isStructTy() && "Should use other method"); + markOverdefined(ValueState[V], V); + } + + /// markAnythingOverdefined - Mark the specified value overdefined. This + /// works with both scalars and structs. + void markAnythingOverdefined(Value *V) { + if (const StructType *STy = dyn_cast<StructType>(V->getType())) + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) + markOverdefined(getStructValueState(V, i), V); + else + markOverdefined(V); + } + +private: + // markConstant - Make a value be marked as "constant". If the value + // is not already a constant, add it to the instruction work list so that + // the users of the instruction are updated later. + // + void markConstant(LatticeVal &IV, Value *V, Constant *C) { + if (!IV.markConstant(C)) return; + DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n'); + if (IV.isOverdefined()) + OverdefinedInstWorkList.push_back(V); + else + InstWorkList.push_back(V); + } + + void markConstant(Value *V, Constant *C) { + assert(!V->getType()->isStructTy() && "Should use other method"); + markConstant(ValueState[V], V, C); + } + + void markForcedConstant(Value *V, Constant *C) { + assert(!V->getType()->isStructTy() && "Should use other method"); + LatticeVal &IV = ValueState[V]; + IV.markForcedConstant(C); + DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n'); + if (IV.isOverdefined()) + OverdefinedInstWorkList.push_back(V); + else + InstWorkList.push_back(V); + } + + + // markOverdefined - Make a value be marked as "overdefined". If the + // value is not already overdefined, add it to the overdefined instruction + // work list so that the users of the instruction are updated later. 
+ void markOverdefined(LatticeVal &IV, Value *V) { + if (!IV.markOverdefined()) return; + + DEBUG(dbgs() << "markOverdefined: "; + if (Function *F = dyn_cast<Function>(V)) + dbgs() << "Function '" << F->getName() << "'\n"; + else + dbgs() << *V << '\n'); + // Only instructions go on the work list + OverdefinedInstWorkList.push_back(V); + } + + void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) { + if (IV.isOverdefined() || MergeWithV.isUndefined()) + return; // Noop. + if (MergeWithV.isOverdefined()) + markOverdefined(IV, V); + else if (IV.isUndefined()) + markConstant(IV, V, MergeWithV.getConstant()); + else if (IV.getConstant() != MergeWithV.getConstant()) + markOverdefined(IV, V); + } + + void mergeInValue(Value *V, LatticeVal MergeWithV) { + assert(!V->getType()->isStructTy() && "Should use other method"); + mergeInValue(ValueState[V], V, MergeWithV); + } + + + /// getValueState - Return the LatticeVal object that corresponds to the + /// value. This function handles the case when the value hasn't been seen yet + /// by properly seeding constants etc. + LatticeVal &getValueState(Value *V) { + assert(!V->getType()->isStructTy() && "Should use getStructValueState"); + + std::pair<DenseMap<Value*, LatticeVal>::iterator, bool> I = + ValueState.insert(std::make_pair(V, LatticeVal())); + LatticeVal &LV = I.first->second; + + if (!I.second) + return LV; // Common case, already in the map. + + if (Constant *C = dyn_cast<Constant>(V)) { + // Undef values remain undefined. + if (!isa<UndefValue>(V)) + LV.markConstant(C); // Constants are constant + } + + // All others are underdefined by default. + return LV; + } + + /// getStructValueState - Return the LatticeVal object that corresponds to the + /// value/field pair. This function handles the case when the value hasn't + /// been seen yet by properly seeding constants etc. 
+ LatticeVal &getStructValueState(Value *V, unsigned i) { + assert(V->getType()->isStructTy() && "Should use getValueState"); + assert(i < cast<StructType>(V->getType())->getNumElements() && + "Invalid element #"); + + std::pair<DenseMap<std::pair<Value*, unsigned>, LatticeVal>::iterator, + bool> I = StructValueState.insert( + std::make_pair(std::make_pair(V, i), LatticeVal())); + LatticeVal &LV = I.first->second; + + if (!I.second) + return LV; // Common case, already in the map. + + if (Constant *C = dyn_cast<Constant>(V)) { + if (isa<UndefValue>(C)) + ; // Undef values remain undefined. + else if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C)) + LV.markConstant(CS->getOperand(i)); // Constants are constant. + else if (isa<ConstantAggregateZero>(C)) { + const Type *FieldTy = cast<StructType>(V->getType())->getElementType(i); + LV.markConstant(Constant::getNullValue(FieldTy)); + } else + LV.markOverdefined(); // Unknown sort of constant. + } + + // All others are underdefined by default. + return LV; + } + + + /// markEdgeExecutable - Mark a basic block as executable, adding it to the BB + /// work list if it is not already executable. + void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) { + if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second) + return; // This edge is already known to be executable! + + if (!MarkBlockExecutable(Dest)) { + // If the destination is already executable, we just made an *edge* + // feasible that wasn't before. Revisit the PHI nodes in the block + // because they have potentially new operands. + DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName() + << " -> " << Dest->getName() << "\n"); + + PHINode *PN; + for (BasicBlock::iterator I = Dest->begin(); + (PN = dyn_cast<PHINode>(I)); ++I) + visitPHINode(*PN); + } + } + + // getFeasibleSuccessors - Return a vector of booleans to indicate which + // successors are reachable from a given terminator instruction. 
+ // + void getFeasibleSuccessors(TerminatorInst &TI, SmallVector<bool, 16> &Succs); + + // isEdgeFeasible - Return true if the control flow edge from the 'From' basic + // block to the 'To' basic block is currently feasible. + // + bool isEdgeFeasible(BasicBlock *From, BasicBlock *To); + + // OperandChangedState - This method is invoked on all of the users of an + // instruction that was just changed state somehow. Based on this + // information, we need to update the specified user of this instruction. + // + void OperandChangedState(Instruction *I) { + if (BBExecutable.count(I->getParent())) // Inst is executable? + visit(*I); + } + + /// RemoveFromOverdefinedPHIs - If I has any entries in the + /// UsersOfOverdefinedPHIs map for PN, remove them now. + void RemoveFromOverdefinedPHIs(Instruction *I, PHINode *PN) { + if (UsersOfOverdefinedPHIs.empty()) return; + std::multimap<PHINode*, Instruction*>::iterator It, E; + tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN); + while (It != E) { + if (It->second == I) + UsersOfOverdefinedPHIs.erase(It++); + else + ++It; + } + } + + /// InsertInOverdefinedPHIs - Insert an entry in the UsersOfOverdefinedPHIS + /// map for I and PN, but if one is there already, do not create another. + /// (Duplicate entries do not break anything directly, but can lead to + /// exponential growth of the table in rare cases.) + void InsertInOverdefinedPHIs(Instruction *I, PHINode *PN) { + std::multimap<PHINode*, Instruction*>::iterator J, E; + tie(J, E) = UsersOfOverdefinedPHIs.equal_range(PN); + for (; J != E; ++J) + if (J->second == I) + return; + UsersOfOverdefinedPHIs.insert(std::make_pair(PN, I)); + } + +private: + friend class InstVisitor<SCCPSolver>; + + // visit implementations - Something changed in this instruction. Either an + // operand made a transition, or the instruction is newly executable. Change + // the value type of I to reflect these changes if appropriate. 
+ void visitPHINode(PHINode &I); + + // Terminators + void visitReturnInst(ReturnInst &I); + void visitTerminatorInst(TerminatorInst &TI); + + void visitCastInst(CastInst &I); + void visitSelectInst(SelectInst &I); + void visitBinaryOperator(Instruction &I); + void visitCmpInst(CmpInst &I); + void visitExtractElementInst(ExtractElementInst &I); + void visitInsertElementInst(InsertElementInst &I); + void visitShuffleVectorInst(ShuffleVectorInst &I); + void visitExtractValueInst(ExtractValueInst &EVI); + void visitInsertValueInst(InsertValueInst &IVI); + + // Instructions that cannot be folded away. + void visitStoreInst (StoreInst &I); + void visitLoadInst (LoadInst &I); + void visitGetElementPtrInst(GetElementPtrInst &I); + void visitCallInst (CallInst &I) { + visitCallSite(&I); + } + void visitInvokeInst (InvokeInst &II) { + visitCallSite(&II); + visitTerminatorInst(II); + } + void visitCallSite (CallSite CS); + void visitUnwindInst (TerminatorInst &I) { /*returns void*/ } + void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } + void visitAllocaInst (Instruction &I) { markOverdefined(&I); } + void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); } + + void visitInstruction(Instruction &I) { + // If a new instruction is added to LLVM that we don't handle. + dbgs() << "SCCP: Don't know how to handle: " << I; + markAnythingOverdefined(&I); // Just in case + } +}; + +} // end anonymous namespace + + +// getFeasibleSuccessors - Return a vector of booleans to indicate which +// successors are reachable from a given terminator instruction. 
+// +void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, + SmallVector<bool, 16> &Succs) { + Succs.resize(TI.getNumSuccessors()); + if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) { + if (BI->isUnconditional()) { + Succs[0] = true; + return; + } + + LatticeVal BCValue = getValueState(BI->getCondition()); + ConstantInt *CI = BCValue.getConstantInt(); + if (CI == 0) { + // Overdefined condition variables, and branches on unfoldable constant + // conditions, mean the branch could go either way. + if (!BCValue.isUndefined()) + Succs[0] = Succs[1] = true; + return; + } + + // Constant condition variables mean the branch can only go a single way. + Succs[CI->isZero()] = true; + return; + } + + if (isa<InvokeInst>(TI)) { + // Invoke instructions successors are always executable. + Succs[0] = Succs[1] = true; + return; + } + + if (SwitchInst *SI = dyn_cast<SwitchInst>(&TI)) { + LatticeVal SCValue = getValueState(SI->getCondition()); + ConstantInt *CI = SCValue.getConstantInt(); + + if (CI == 0) { // Overdefined or undefined condition? + // All destinations are executable! + if (!SCValue.isUndefined()) + Succs.assign(TI.getNumSuccessors(), true); + return; + } + + Succs[SI->findCaseValue(CI)] = true; + return; + } + + // TODO: This could be improved if the operand is a [cast of a] BlockAddress. + if (isa<IndirectBrInst>(&TI)) { + // Just mark all destinations executable! + Succs.assign(TI.getNumSuccessors(), true); + return; + } + +#ifndef NDEBUG + dbgs() << "Unknown terminator instruction: " << TI << '\n'; +#endif + llvm_unreachable("SCCP: Don't know how to handle this terminator!"); +} + + +// isEdgeFeasible - Return true if the control flow edge from the 'From' basic +// block to the 'To' basic block is currently feasible. +// +bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { + assert(BBExecutable.count(To) && "Dest should always be alive!"); + + // Make sure the source basic block is executable!! 
+ if (!BBExecutable.count(From)) return false; + + // Check to make sure this edge itself is actually feasible now. + TerminatorInst *TI = From->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (BI->isUnconditional()) + return true; + + LatticeVal BCValue = getValueState(BI->getCondition()); + + // Overdefined condition variables mean the branch could go either way, + // undef conditions mean that neither edge is feasible yet. + ConstantInt *CI = BCValue.getConstantInt(); + if (CI == 0) + return !BCValue.isUndefined(); + + // Constant condition variables mean the branch can only go a single way. + return BI->getSuccessor(CI->isZero()) == To; + } + + // Invoke instructions successors are always executable. + if (isa<InvokeInst>(TI)) + return true; + + if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + LatticeVal SCValue = getValueState(SI->getCondition()); + ConstantInt *CI = SCValue.getConstantInt(); + + if (CI == 0) + return !SCValue.isUndefined(); + + // Make sure to skip the "default value" which isn't a value + for (unsigned i = 1, E = SI->getNumSuccessors(); i != E; ++i) + if (SI->getSuccessorValue(i) == CI) // Found the taken branch. + return SI->getSuccessor(i) == To; + + // If the constant value is not equal to any of the branches, we must + // execute default branch. + return SI->getDefaultDest() == To; + } + + // Just mark all destinations executable! + // TODO: This could be improved if the operand is a [cast of a] BlockAddress. + if (isa<IndirectBrInst>(&TI)) + return true; + +#ifndef NDEBUG + dbgs() << "Unknown terminator instruction: " << *TI << '\n'; +#endif + llvm_unreachable(0); +} + +// visit Implementations - Something changed in this instruction, either an +// operand made a transition, or the instruction is newly executable. Change +// the value type of I to reflect these changes if appropriate. This method +// makes sure to do the following actions: +// +// 1. 
If a phi node merges two constants in, and has conflicting value coming +// from different branches, or if the PHI node merges in an overdefined +// value, then the PHI node becomes overdefined. +// 2. If a phi node merges only constants in, and they all agree on value, the +// PHI node becomes a constant value equal to that. +// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant +// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined +// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined +// 6. If a conditional branch has a value that is constant, make the selected +// destination executable +// 7. If a conditional branch has a value that is overdefined, make all +// successors executable. +// +void SCCPSolver::visitPHINode(PHINode &PN) { + // If this PN returns a struct, just mark the result overdefined. + // TODO: We could do a lot better than this if code actually uses this. + if (PN.getType()->isStructTy()) + return markAnythingOverdefined(&PN); + + if (getValueState(&PN).isOverdefined()) { + // There may be instructions using this PHI node that are not overdefined + // themselves. If so, make sure that they know that the PHI node operand + // changed. + std::multimap<PHINode*, Instruction*>::iterator I, E; + tie(I, E) = UsersOfOverdefinedPHIs.equal_range(&PN); + if (I == E) + return; + + SmallVector<Instruction*, 16> Users; + for (; I != E; ++I) + Users.push_back(I->second); + while (!Users.empty()) + visit(Users.pop_back_val()); + return; // Quick exit + } + + // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant, + // and slow us down a lot. Just mark them overdefined. + if (PN.getNumIncomingValues() > 64) + return markOverdefined(&PN); + + // Look at all of the executable operands of the PHI node. If any of them + // are overdefined, the PHI becomes overdefined as well. If they are all + // constant, and they agree with each other, the PHI becomes the identical + // constant. 
If they are constant and don't agree, the PHI is overdefined. + // If there are no executable operands, the PHI remains undefined. + // + Constant *OperandVal = 0; + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { + LatticeVal IV = getValueState(PN.getIncomingValue(i)); + if (IV.isUndefined()) continue; // Doesn't influence PHI node. + + if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) + continue; + + if (IV.isOverdefined()) // PHI node becomes overdefined! + return markOverdefined(&PN); + + if (OperandVal == 0) { // Grab the first value. + OperandVal = IV.getConstant(); + continue; + } + + // There is already a reachable operand. If we conflict with it, + // then the PHI node becomes overdefined. If we agree with it, we + // can continue on. + + // Check to see if there are two different constants merging, if so, the PHI + // node is overdefined. + if (IV.getConstant() != OperandVal) + return markOverdefined(&PN); + } + + // If we exited the loop, this means that the PHI node only has constant + // arguments that agree with each other(and OperandVal is the constant) or + // OperandVal is null because there are no defined incoming arguments. If + // this is the case, the PHI remains undefined. + // + if (OperandVal) + markConstant(&PN, OperandVal); // Acquire operand value +} + + + + +void SCCPSolver::visitReturnInst(ReturnInst &I) { + if (I.getNumOperands() == 0) return; // ret void + + Function *F = I.getParent()->getParent(); + Value *ResultOp = I.getOperand(0); + + // If we are tracking the return value of this function, merge it in. + if (!TrackedRetVals.empty() && !ResultOp->getType()->isStructTy()) { + DenseMap<Function*, LatticeVal>::iterator TFRVI = + TrackedRetVals.find(F); + if (TFRVI != TrackedRetVals.end()) { + mergeInValue(TFRVI->second, F, getValueState(ResultOp)); + return; + } + } + + // Handle functions that return multiple values. 
+ if (!TrackedMultipleRetVals.empty()) { + if (const StructType *STy = dyn_cast<StructType>(ResultOp->getType())) + if (MRVFunctionsTracked.count(F)) + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) + mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F, + getStructValueState(ResultOp, i)); + + } +} + +void SCCPSolver::visitTerminatorInst(TerminatorInst &TI) { + SmallVector<bool, 16> SuccFeasible; + getFeasibleSuccessors(TI, SuccFeasible); + + BasicBlock *BB = TI.getParent(); + + // Mark all feasible successors executable. + for (unsigned i = 0, e = SuccFeasible.size(); i != e; ++i) + if (SuccFeasible[i]) + markEdgeExecutable(BB, TI.getSuccessor(i)); +} + +void SCCPSolver::visitCastInst(CastInst &I) { + LatticeVal OpSt = getValueState(I.getOperand(0)); + if (OpSt.isOverdefined()) // Inherit overdefinedness of operand + markOverdefined(&I); + else if (OpSt.isConstant()) // Propagate constant value + markConstant(&I, ConstantExpr::getCast(I.getOpcode(), + OpSt.getConstant(), I.getType())); +} + + +void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) { + // If this returns a struct, mark all elements over defined, we don't track + // structs in structs. + if (EVI.getType()->isStructTy()) + return markAnythingOverdefined(&EVI); + + // If this is extracting from more than one level of struct, we don't know. + if (EVI.getNumIndices() != 1) + return markOverdefined(&EVI); + + Value *AggVal = EVI.getAggregateOperand(); + if (AggVal->getType()->isStructTy()) { + unsigned i = *EVI.idx_begin(); + LatticeVal EltVal = getStructValueState(AggVal, i); + mergeInValue(getValueState(&EVI), &EVI, EltVal); + } else { + // Otherwise, must be extracting from an array. 
+ return markOverdefined(&EVI); + } +} + +void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) { + const StructType *STy = dyn_cast<StructType>(IVI.getType()); + if (STy == 0) + return markOverdefined(&IVI); + + // If this has more than one index, we can't handle it, drive all results to + // undef. + if (IVI.getNumIndices() != 1) + return markAnythingOverdefined(&IVI); + + Value *Aggr = IVI.getAggregateOperand(); + unsigned Idx = *IVI.idx_begin(); + + // Compute the result based on what we're inserting. + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + // This passes through all values that aren't the inserted element. + if (i != Idx) { + LatticeVal EltVal = getStructValueState(Aggr, i); + mergeInValue(getStructValueState(&IVI, i), &IVI, EltVal); + continue; + } + + Value *Val = IVI.getInsertedValueOperand(); + if (Val->getType()->isStructTy()) + // We don't track structs in structs. + markOverdefined(getStructValueState(&IVI, i), &IVI); + else { + LatticeVal InVal = getValueState(Val); + mergeInValue(getStructValueState(&IVI, i), &IVI, InVal); + } + } +} + +void SCCPSolver::visitSelectInst(SelectInst &I) { + // If this select returns a struct, just mark the result overdefined. + // TODO: We could do a lot better than this if code actually uses this. + if (I.getType()->isStructTy()) + return markAnythingOverdefined(&I); + + LatticeVal CondValue = getValueState(I.getCondition()); + if (CondValue.isUndefined()) + return; + + if (ConstantInt *CondCB = CondValue.getConstantInt()) { + Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue(); + mergeInValue(&I, getValueState(OpVal)); + return; + } + + // Otherwise, the condition is overdefined or a constant we can't evaluate. + // See if we can produce something better than overdefined based on the T/F + // value. + LatticeVal TVal = getValueState(I.getTrueValue()); + LatticeVal FVal = getValueState(I.getFalseValue()); + + // select ?, C, C -> C. 
// Handle Binary Operators.
//
// Transfer function for binary arithmetic/logical instructions: the result is
// constant-folded if both operands are constant, stays undefined while any
// operand is still undefined, and otherwise becomes overdefined -- except for
// a few algebraic special cases (and/or identities, and the paired-PHI trick)
// handled below.
void SCCPSolver::visitBinaryOperator(Instruction &I) {
  LatticeVal V1State = getValueState(I.getOperand(0));
  LatticeVal V2State = getValueState(I.getOperand(1));

  // Cache a reference into the state map; once overdefined, nothing below can
  // lower the value again, so bail immediately.
  LatticeVal &IV = ValueState[&I];
  if (IV.isOverdefined()) return;

  if (V1State.isConstant() && V2State.isConstant())
    return markConstant(IV, &I,
                        ConstantExpr::get(I.getOpcode(), V1State.getConstant(),
                                          V2State.getConstant()));

  // If something is undef, wait for it to resolve.
  if (!V1State.isOverdefined() && !V2State.isOverdefined())
    return;

  // Otherwise, one of our operands is overdefined.  Try to produce something
  // better than overdefined with some tricks.

  // If this is an AND or OR with 0 or -1, it doesn't matter that the other
  // operand is overdefined.
  if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Or) {
    LatticeVal *NonOverdefVal = 0;
    if (!V1State.isOverdefined())
      NonOverdefVal = &V1State;
    else if (!V2State.isOverdefined())
      NonOverdefVal = &V2State;

    if (NonOverdefVal) {
      if (NonOverdefVal->isUndefined()) {
        // Could annihilate value: we are free to choose the undef operand to
        // be the annihilator (0 for and, -1 for or), making the result a
        // constant regardless of the overdefined operand.
        if (I.getOpcode() == Instruction::And)
          markConstant(IV, &I, Constant::getNullValue(I.getType()));
        else if (const VectorType *PT = dyn_cast<VectorType>(I.getType()))
          markConstant(IV, &I, Constant::getAllOnesValue(PT));
        else
          markConstant(IV, &I,
                       Constant::getAllOnesValue(I.getType()));
        return;
      }

      if (I.getOpcode() == Instruction::And) {
        // X and 0 = 0
        if (NonOverdefVal->getConstant()->isNullValue())
          return markConstant(IV, &I, NonOverdefVal->getConstant());
      } else {
        if (ConstantInt *CI = NonOverdefVal->getConstantInt())
          if (CI->isAllOnesValue())     // X or -1 = -1
            return markConstant(IV, &I, NonOverdefVal->getConstant());
      }
    }
  }


  // If both operands are PHI nodes, it is possible that this instruction has
  // a constant value, despite the fact that the PHI node doesn't.  Check for
  // this condition now.
  if (PHINode *PN1 = dyn_cast<PHINode>(I.getOperand(0)))
    if (PHINode *PN2 = dyn_cast<PHINode>(I.getOperand(1)))
      if (PN1->getParent() == PN2->getParent()) {
        // Since the two PHI nodes are in the same basic block, they must have
        // entries for the same predecessors.  Walk the predecessor list, and
        // if all of the incoming values are constants, and the result of
        // evaluating this expression with all incoming value pairs is the
        // same, then this expression is a constant even though the PHI node
        // is not a constant!
        LatticeVal Result;
        for (unsigned i = 0, e = PN1->getNumIncomingValues(); i != e; ++i) {
          LatticeVal In1 = getValueState(PN1->getIncomingValue(i));
          BasicBlock *InBlock = PN1->getIncomingBlock(i);
          LatticeVal In2 =getValueState(PN2->getIncomingValueForBlock(InBlock));

          if (In1.isOverdefined() || In2.isOverdefined()) {
            Result.markOverdefined();
            break;  // Cannot fold this operation over the PHI nodes!
          }

          if (In1.isConstant() && In2.isConstant()) {
            Constant *V = ConstantExpr::get(I.getOpcode(), In1.getConstant(),
                                            In2.getConstant());
            if (Result.isUndefined())
              Result.markConstant(V);
            else if (Result.isConstant() && Result.getConstant() != V) {
              Result.markOverdefined();
              break;
            }
          }
        }

        // If we found a constant value here, then we know the instruction is
        // constant despite the fact that the PHI nodes are overdefined.
        if (Result.isConstant()) {
          markConstant(IV, &I, Result.getConstant());
          // Remember that this instruction is virtually using the PHI node
          // operands.  This registration is what re-triggers us if either PHI
          // later changes, so the speculation stays sound.
          InsertInOverdefinedPHIs(&I, PN1);
          InsertInOverdefinedPHIs(&I, PN2);
          return;
        }

        if (Result.isUndefined())
          return;

        // Okay, this really is overdefined now.  Since we might have
        // speculatively thought that this was not overdefined before, and
        // added ourselves to the UsersOfOverdefinedPHIs list for the PHIs,
        // make sure to clean out any entries that we put there, for
        // efficiency.
        RemoveFromOverdefinedPHIs(&I, PN1);
        RemoveFromOverdefinedPHIs(&I, PN2);
      }

  markOverdefined(&I);
}
// Handle ICmpInst instruction.
//
// Transfer function for integer/FP comparisons: fold when both operands are
// constant, wait while either is still undefined, and otherwise try the
// paired-PHI speculation (identical to visitBinaryOperator's) before giving
// up and going overdefined.
void SCCPSolver::visitCmpInst(CmpInst &I) {
  LatticeVal V1State = getValueState(I.getOperand(0));
  LatticeVal V2State = getValueState(I.getOperand(1));

  LatticeVal &IV = ValueState[&I];
  if (IV.isOverdefined()) return;

  if (V1State.isConstant() && V2State.isConstant())
    return markConstant(IV, &I, ConstantExpr::getCompare(I.getPredicate(),
                                                         V1State.getConstant(),
                                                        V2State.getConstant()));

  // If operands are still undefined, wait for it to resolve.
  if (!V1State.isOverdefined() && !V2State.isOverdefined())
    return;

  // If something is overdefined, use some tricks to avoid ending up and over
  // defined if we can.

  // If both operands are PHI nodes, it is possible that this instruction has
  // a constant value, despite the fact that the PHI node doesn't.  Check for
  // this condition now.
  if (PHINode *PN1 = dyn_cast<PHINode>(I.getOperand(0)))
    if (PHINode *PN2 = dyn_cast<PHINode>(I.getOperand(1)))
      if (PN1->getParent() == PN2->getParent()) {
        // Since the two PHI nodes are in the same basic block, they must have
        // entries for the same predecessors.  Walk the predecessor list, and
        // if all of the incoming values are constants, and the result of
        // evaluating this expression with all incoming value pairs is the
        // same, then this expression is a constant even though the PHI node
        // is not a constant!
        LatticeVal Result;
        for (unsigned i = 0, e = PN1->getNumIncomingValues(); i != e; ++i) {
          LatticeVal In1 = getValueState(PN1->getIncomingValue(i));
          BasicBlock *InBlock = PN1->getIncomingBlock(i);
          LatticeVal In2 =getValueState(PN2->getIncomingValueForBlock(InBlock));

          if (In1.isOverdefined() || In2.isOverdefined()) {
            Result.markOverdefined();
            break;  // Cannot fold this operation over the PHI nodes!
          }

          if (In1.isConstant() && In2.isConstant()) {
            Constant *V = ConstantExpr::getCompare(I.getPredicate(),
                                                   In1.getConstant(),
                                                   In2.getConstant());
            if (Result.isUndefined())
              Result.markConstant(V);
            else if (Result.isConstant() && Result.getConstant() != V) {
              Result.markOverdefined();
              break;
            }
          }
        }

        // If we found a constant value here, then we know the instruction is
        // constant despite the fact that the PHI nodes are overdefined.
        if (Result.isConstant()) {
          markConstant(&I, Result.getConstant());
          // Remember that this instruction is virtually using the PHI node
          // operands.
          InsertInOverdefinedPHIs(&I, PN1);
          InsertInOverdefinedPHIs(&I, PN2);
          return;
        }

        if (Result.isUndefined())
          return;

        // Okay, this really is overdefined now.  Since we might have
        // speculatively thought that this was not overdefined before, and
        // added ourselves to the UsersOfOverdefinedPHIs list for the PHIs,
        // make sure to clean out any entries that we put there, for
        // efficiency.
        RemoveFromOverdefinedPHIs(&I, PN1);
        RemoveFromOverdefinedPHIs(&I, PN2);
      }

  markOverdefined(&I);
}
// Vector element/shuffle instructions are not modeled by this solver: each
// visitor simply pessimizes the result to overdefined.  The '#if 0' bodies
// are the intended (but currently disabled) precise transfer functions, kept
// for reference.

void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) {
  // TODO : SCCP does not handle vectors properly.
  return markOverdefined(&I);

#if 0
  LatticeVal &ValState = getValueState(I.getOperand(0));
  LatticeVal &IdxState = getValueState(I.getOperand(1));

  if (ValState.isOverdefined() || IdxState.isOverdefined())
    markOverdefined(&I);
  else if(ValState.isConstant() && IdxState.isConstant())
    markConstant(&I, ConstantExpr::getExtractElement(ValState.getConstant(),
                                                     IdxState.getConstant()));
#endif
}

void SCCPSolver::visitInsertElementInst(InsertElementInst &I) {
  // TODO : SCCP does not handle vectors properly.
  return markOverdefined(&I);
#if 0
  LatticeVal &ValState = getValueState(I.getOperand(0));
  LatticeVal &EltState = getValueState(I.getOperand(1));
  LatticeVal &IdxState = getValueState(I.getOperand(2));

  if (ValState.isOverdefined() || EltState.isOverdefined() ||
      IdxState.isOverdefined())
    markOverdefined(&I);
  else if(ValState.isConstant() && EltState.isConstant() &&
          IdxState.isConstant())
    markConstant(&I, ConstantExpr::getInsertElement(ValState.getConstant(),
                                                    EltState.getConstant(),
                                                    IdxState.getConstant()));
  else if (ValState.isUndefined() && EltState.isConstant() &&
           IdxState.isConstant())
    markConstant(&I,ConstantExpr::getInsertElement(UndefValue::get(I.getType()),
                                                   EltState.getConstant(),
                                                   IdxState.getConstant()));
#endif
}

void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) {
  // TODO : SCCP does not handle vectors properly.
  return markOverdefined(&I);
#if 0
  LatticeVal &V1State   = getValueState(I.getOperand(0));
  LatticeVal &V2State   = getValueState(I.getOperand(1));
  LatticeVal &MaskState = getValueState(I.getOperand(2));

  if (MaskState.isUndefined() ||
      (V1State.isUndefined() && V2State.isUndefined()))
    return;  // Undefined output if mask or both inputs undefined.

  if (V1State.isOverdefined() || V2State.isOverdefined() ||
      MaskState.isOverdefined()) {
    markOverdefined(&I);
  } else {
    // A mix of constant/undef inputs.
    Constant *V1 = V1State.isConstant() ?
        V1State.getConstant() : UndefValue::get(I.getType());
    Constant *V2 = V2State.isConstant() ?
        V2State.getConstant() : UndefValue::get(I.getType());
    Constant *Mask = MaskState.isConstant() ?
      MaskState.getConstant() : UndefValue::get(I.getOperand(2)->getType());
    markConstant(&I, ConstantExpr::getShuffleVector(V1, V2, Mask));
  }
#endif
}
// Handle getelementptr instructions.  If all operands are constants then we
// can turn this into a getelementptr ConstantExpr.
//
void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
  if (ValueState[&I].isOverdefined()) return;

  SmallVector<Constant*, 8> Operands;
  Operands.reserve(I.getNumOperands());

  // Collect constant operands; any undefined operand means we must wait, any
  // overdefined operand makes the whole GEP overdefined.
  for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
    LatticeVal State = getValueState(I.getOperand(i));
    if (State.isUndefined())
      return;  // Operands are not resolved yet.

    if (State.isOverdefined())
      return markOverdefined(&I);

    assert(State.isConstant() && "Unknown state!");
    Operands.push_back(State.getConstant());
  }

  // Operand 0 is the base pointer; the rest are indices.
  Constant *Ptr = Operands[0];
  markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, &Operands[0]+1,
                                                  Operands.size()-1));
}

// Stores only matter to the solver when they write to a tracked internal
// global: the stored lattice value is merged into the global's state.
void SCCPSolver::visitStoreInst(StoreInst &SI) {
  // If this store is of a struct, ignore it.
  if (SI.getOperand(0)->getType()->isStructTy())
    return;

  if (TrackedGlobals.empty() || !isa<GlobalVariable>(SI.getOperand(1)))
    return;

  GlobalVariable *GV = cast<GlobalVariable>(SI.getOperand(1));
  DenseMap<GlobalVariable*, LatticeVal>::iterator I = TrackedGlobals.find(GV);
  if (I == TrackedGlobals.end() || I->second.isOverdefined()) return;

  // Get the value we are storing into the global, then merge it.
  mergeInValue(I->second, GV, getValueState(SI.getOperand(0)));
  if (I->second.isOverdefined())
    TrackedGlobals.erase(I);      // No need to keep tracking this!
}
// Handle load instructions.  If the operand is a constant pointer to a constant
// global, we can replace the load with the loaded constant value!
void SCCPSolver::visitLoadInst(LoadInst &I) {
  // If this load is of a struct, just mark the result overdefined.
  if (I.getType()->isStructTy())
    return markAnythingOverdefined(&I);

  LatticeVal PtrVal = getValueState(I.getOperand(0));
  if (PtrVal.isUndefined()) return;   // The pointer is not resolved yet!

  LatticeVal &IV = ValueState[&I];
  if (IV.isOverdefined()) return;

  // Volatile loads and loads through non-constant pointers cannot be reasoned
  // about.
  if (!PtrVal.isConstant() || I.isVolatile())
    return markOverdefined(IV, &I);

  Constant *Ptr = PtrVal.getConstant();

  // load null -> null (only valid in address space 0, where null is known to
  // be unmapped).
  if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0)
    return markConstant(IV, &I, Constant::getNullValue(I.getType()));

  // Transform load (constant global) into the value loaded.
  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
    if (!TrackedGlobals.empty()) {
      // If we are tracking this global, merge in the known value for it.
      DenseMap<GlobalVariable*, LatticeVal>::iterator It =
        TrackedGlobals.find(GV);
      if (It != TrackedGlobals.end()) {
        mergeInValue(IV, &I, It->second);
        return;
      }
    }
  }

  // Transform load from a constant into a constant if possible.
  if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, TD))
    return markConstant(IV, &I, C);

  // Otherwise we cannot say for certain what value this load will produce.
  // Bail out.
  markOverdefined(IV, &I);
}
// visitCallSite - Transfer function for call/invoke instructions.  Handles
// three cases: (1) untracked/external callees, which are constant-folded when
// possible or go overdefined; (2) argument-tracked local callees, whose formal
// argument states absorb the actual arguments; (3) return-tracked callees,
// whose tracked return lattice value flows into this call's result.  Note the
// 'goto CallOverdefined' jumps back into case (1) when a callee turns out not
// to be tracked after all.
void SCCPSolver::visitCallSite(CallSite CS) {
  Function *F = CS.getCalledFunction();
  Instruction *I = CS.getInstruction();

  // The common case is that we aren't tracking the callee, either because we
  // are not doing interprocedural analysis or the callee is indirect, or is
  // external.  Handle these cases first.
  if (F == 0 || F->isDeclaration()) {
CallOverdefined:
    // Void return and not tracking callee, just bail.
    if (I->getType()->isVoidTy()) return;

    // Otherwise, if we have a single return value case, and if the function is
    // a declaration, maybe we can constant fold it.
    if (F && F->isDeclaration() && !I->getType()->isStructTy() &&
        canConstantFoldCallTo(F)) {

      SmallVector<Constant*, 8> Operands;
      for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
           AI != E; ++AI) {
        LatticeVal State = getValueState(*AI);

        if (State.isUndefined())
          return;  // Operands are not resolved yet.
        if (State.isOverdefined())
          return markOverdefined(I);
        assert(State.isConstant() && "Unknown state!");
        Operands.push_back(State.getConstant());
      }

      // If we can constant fold this, mark the result of the call as a
      // constant.
      if (Constant *C = ConstantFoldCall(F, Operands.data(), Operands.size()))
        return markConstant(I, C);
    }

    // Otherwise, we don't know anything about this call, mark it overdefined.
    return markAnythingOverdefined(I);
  }

  // If this is a local function that doesn't have its address taken, mark its
  // entry block executable and merge in the actual arguments to the call into
  // the formal arguments of the function.
  if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){
    MarkBlockExecutable(F->begin());

    // Propagate information from this call site into the callee.
    CallSite::arg_iterator CAI = CS.arg_begin();
    for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
         AI != E; ++AI, ++CAI) {
      // If this argument is byval, and if the function is not readonly, there
      // will be an implicit copy formed of the input aggregate.  The copy's
      // contents are unknown, so the formal must be overdefined.
      if (AI->hasByValAttr() && !F->onlyReadsMemory()) {
        markOverdefined(AI);
        continue;
      }

      if (const StructType *STy = dyn_cast<StructType>(AI->getType())) {
        // Struct arguments are tracked per-element.
        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
          LatticeVal CallArg = getStructValueState(*CAI, i);
          mergeInValue(getStructValueState(AI, i), AI, CallArg);
        }
      } else {
        mergeInValue(AI, getValueState(*CAI));
      }
    }
  }

  // If this is a single/zero retval case, see if we're tracking the function.
  if (const StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
    if (!MRVFunctionsTracked.count(F))
      goto CallOverdefined;  // Not tracking this callee.

    // If we are tracking this callee, propagate the result of the function
    // into this call site.
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
      mergeInValue(getStructValueState(I, i), I,
                   TrackedMultipleRetVals[std::make_pair(F, i)]);
  } else {
    DenseMap<Function*, LatticeVal>::iterator TFRVI = TrackedRetVals.find(F);
    if (TFRVI == TrackedRetVals.end())
      goto CallOverdefined;  // Not tracking this callee.

    // If so, propagate the return value of the callee into this call result.
    mergeInValue(I, TFRVI->second);
  }
}
// Solve - Run the main SCCP worklist algorithm to a fixed point.  Three
// worklists are drained in priority order: overdefined values first (they
// drag users to overdefined fastest, shrinking total work), then
// constant-transition values, then newly-executable basic blocks.
void SCCPSolver::Solve() {
  // Process the work lists until they are empty!
  while (!BBWorkList.empty() || !InstWorkList.empty() ||
         !OverdefinedInstWorkList.empty()) {
    // Process the overdefined instruction's work list first, which drives other
    // things to overdefined more quickly.
    while (!OverdefinedInstWorkList.empty()) {
      Value *I = OverdefinedInstWorkList.pop_back_val();

      DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');

      // "I" got into the work list because it either made the transition from
      // bottom to constant
      //
      // Anything on this worklist that is overdefined need not be visited
      // since all of its users will have already been marked as overdefined
      // Update all of the users of this instruction's value.
      //
      for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
           UI != E; ++UI)
        if (Instruction *I = dyn_cast<Instruction>(*UI))
          OperandChangedState(I);
    }

    // Process the instruction work list.
    while (!InstWorkList.empty()) {
      Value *I = InstWorkList.pop_back_val();

      DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');

      // "I" got into the work list because it made the transition from undef to
      // constant.
      //
      // Anything on this worklist that is overdefined need not be visited
      // since all of its users will have already been marked as overdefined.
      // Update all of the users of this instruction's value.
      //
      if (I->getType()->isStructTy() || !getValueState(I).isOverdefined())
        for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
             UI != E; ++UI)
          if (Instruction *I = dyn_cast<Instruction>(*UI))
            OperandChangedState(I);
    }

    // Process the basic block work list.
    while (!BBWorkList.empty()) {
      BasicBlock *BB = BBWorkList.back();
      BBWorkList.pop_back();

      DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');

      // Notify all instructions in this basic block that they are newly
      // executable.
      visit(BB);
    }
  }
}
/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
/// that branches on undef values cannot reach any of their successors.
/// However, this is not a safe assumption.  After we solve dataflow, this
/// method should be used to handle this.  If this returns true, the solver
/// should be rerun.
///
/// This method handles this by finding an unresolved branch and marking it one
/// of the edges from the block as being feasible, even though the condition
/// doesn't say it would otherwise be.  This allows SCCP to find the rest of the
/// CFG and only slightly pessimizes the analysis results (by marking one,
/// potentially infeasible, edge feasible).  This cannot usefully modify the
/// constraints on the condition of the branch, as that would impact other users
/// of the value.
///
/// This scan also checks for values that use undefs, whose results are actually
/// defined.  For example, 'zext i8 undef to i32' should produce all zeros
/// conservatively, as "(zext i8 X -> i32) & 0xFF00" must always return zero,
/// even if X isn't defined.
bool SCCPSolver::ResolvedUndefsIn(Function &F) {
  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
    if (!BBExecutable.count(BB))
      continue;

    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
      // Look for instructions which produce undef values.
      if (I->getType()->isVoidTy()) continue;

      if (const StructType *STy = dyn_cast<StructType>(I->getType())) {
        // Only a few things that can be structs matter for undef.  Just send
        // all their results to overdefined.  We could be more precise than this
        // but it isn't worth bothering.
        if (isa<CallInst>(I) || isa<SelectInst>(I)) {
          for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
            LatticeVal &LV = getStructValueState(I, i);
            if (LV.isUndefined())
              markOverdefined(LV, I);
          }
        }
        continue;
      }

      LatticeVal &LV = getValueState(I);
      if (!LV.isUndefined()) continue;

      // No instructions using structs need disambiguation.
      if (I->getOperand(0)->getType()->isStructTy())
        continue;

      // Get the lattice values of the first two operands for use below.
      LatticeVal Op0LV = getValueState(I->getOperand(0));
      LatticeVal Op1LV;
      if (I->getNumOperands() == 2) {
        // No instructions using structs need disambiguation.
        if (I->getOperand(1)->getType()->isStructTy())
          continue;

        // If this is a two-operand instruction, and if both operands are
        // undefs, the result stays undef.
        Op1LV = getValueState(I->getOperand(1));
        if (Op0LV.isUndefined() && Op1LV.isUndefined())
          continue;
      }

      // If this is an instructions whose result is defined even if the input is
      // not fully defined, propagate the information.  Each case returns true
      // so the caller reruns the solver with the new forced value.
      const Type *ITy = I->getType();
      switch (I->getOpcode()) {
      default: break;          // Leave the instruction as an undef.
      case Instruction::ZExt:
        // After a zero extend, we know the top part is zero.  SExt doesn't have
        // to be handled here, because we don't know whether the top part is 1's
        // or 0's.
      case Instruction::SIToFP:  // some FP values are not possible, just use 0.
      case Instruction::UIToFP:  // some FP values are not possible, just use 0.
        markForcedConstant(I, Constant::getNullValue(ITy));
        return true;
      case Instruction::Mul:
      case Instruction::And:
        // undef * X -> 0.   X could be zero.
        // undef & X -> 0.   X could be zero.
        markForcedConstant(I, Constant::getNullValue(ITy));
        return true;

      case Instruction::Or:
        // undef | X -> -1.   X could be -1.
        markForcedConstant(I, Constant::getAllOnesValue(ITy));
        return true;

      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::SRem:
      case Instruction::URem:
        // X / undef -> undef.  No change.
        // X % undef -> undef.  No change.
        if (Op1LV.isUndefined()) break;

        // undef / X -> 0.   X could be maxint.
        // undef % X -> 0.   X could be 1.
        markForcedConstant(I, Constant::getNullValue(ITy));
        return true;

      case Instruction::AShr:
        // undef >>s X -> undef.  No change.
        if (Op0LV.isUndefined()) break;

        // X >>s undef -> X.  X could be 0, X could have the high-bit known set.
        if (Op0LV.isConstant())
          markForcedConstant(I, Op0LV.getConstant());
        else
          markOverdefined(I);
        return true;
      case Instruction::LShr:
      case Instruction::Shl:
        // undef >> X -> undef.  No change.
        // undef << X -> undef.  No change.
        if (Op0LV.isUndefined()) break;

        // X >> undef -> 0.  X could be 0.
        // X << undef -> 0.  X could be 0.
        markForcedConstant(I, Constant::getNullValue(ITy));
        return true;
      case Instruction::Select:
        // undef ? X : Y  -> X or Y.  There could be commonality between X/Y.
        if (Op0LV.isUndefined()) {
          if (!Op1LV.isConstant())  // Pick the constant one if there is any.
            Op1LV = getValueState(I->getOperand(2));
        } else if (Op1LV.isUndefined()) {
          // c ? undef : undef -> undef.  No change.
          Op1LV = getValueState(I->getOperand(2));
          if (Op1LV.isUndefined())
            break;
          // Otherwise, c ? undef : x -> x.
        } else {
          // Leave Op1LV as Operand(1)'s LatticeValue.
        }

        if (Op1LV.isConstant())
          markForcedConstant(I, Op1LV.getConstant());
        else
          markOverdefined(I);
        return true;
      case Instruction::Call:
        // If a call has an undef result, it is because it is constant foldable
        // but one of the inputs was undef.  Just force the result to
        // overdefined.
        markOverdefined(I);
        return true;
      }
    }

    // Check to see if we have a branch or switch on an undefined value.  If so
    // we force the branch to go one way or the other to make the successor
    // values live.  It doesn't really matter which way we force it.
    TerminatorInst *TI = BB->getTerminator();
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (!BI->isConditional()) continue;
      if (!getValueState(BI->getCondition()).isUndefined())
        continue;

      // If the input to SCCP is actually branch on undef, fix the undef to
      // false.
      if (isa<UndefValue>(BI->getCondition())) {
        BI->setCondition(ConstantInt::getFalse(BI->getContext()));
        markEdgeExecutable(BB, TI->getSuccessor(1));
        return true;
      }

      // Otherwise, it is a branch on a symbolic value which is currently
      // considered to be undef.  Handle this by forcing the input value to the
      // branch to false.
      markForcedConstant(BI->getCondition(),
                         ConstantInt::getFalse(TI->getContext()));
      return true;
    }

    if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      if (SI->getNumSuccessors() < 2)   // no cases
        continue;
      if (!getValueState(SI->getCondition()).isUndefined())
        continue;

      // If the input to SCCP is actually switch on undef, fix the undef to
      // the first constant.
      if (isa<UndefValue>(SI->getCondition())) {
        SI->setCondition(SI->getCaseValue(1));
        markEdgeExecutable(BB, TI->getSuccessor(1));
        return true;
      }

      markForcedConstant(SI->getCondition(), SI->getCaseValue(1));
      return true;
    }
  }

  return false;
}
namespace {
  //===--------------------------------------------------------------------===//
  //
  /// SCCP Class - This class uses the SCCPSolver to implement a per-function
  /// Sparse Conditional Constant Propagator.
  ///
  struct SCCP : public FunctionPass {
    static char ID; // Pass identification, replacement for typeid
    SCCP() : FunctionPass(ID) {
      initializeSCCPPass(*PassRegistry::getPassRegistry());
    }

    // runOnFunction - Run the Sparse Conditional Constant Propagation
    // algorithm, and return true if the function was modified.
    //
    bool runOnFunction(Function &F);
  };
} // end anonymous namespace

char SCCP::ID = 0;
INITIALIZE_PASS(SCCP, "sccp",
                "Sparse Conditional Constant Propagation", false, false)

// createSCCPPass - This is the public interface to this file.
FunctionPass *llvm::createSCCPPass() {
  return new SCCP();
}

// DeleteInstructionInBlock - RAUW-to-undef and erase every non-terminator
// instruction in BB (the terminator itself is left in place; callers handle
// it separately).  Used on blocks the solver proved unreachable.
static void DeleteInstructionInBlock(BasicBlock *BB) {
  DEBUG(dbgs() << "  BasicBlock Dead:" << *BB);
  ++NumDeadBlocks;

  // Delete the instructions backwards, as it has a reduced likelihood of
  // having to update as many def-use and use-def chains.
  while (!isa<TerminatorInst>(BB->begin())) {
    Instruction *I = --BasicBlock::iterator(BB->getTerminator());

    if (!I->use_empty())
      I->replaceAllUsesWith(UndefValue::get(I->getType()));
    BB->getInstList().erase(I);
    ++NumInstRemoved;
  }
}

// runOnFunction() - Run the Sparse Conditional Constant Propagation algorithm,
// and return true if the function was modified.
//
bool SCCP::runOnFunction(Function &F) {
  DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
  SCCPSolver Solver(getAnalysisIfAvailable<TargetData>());

  // Mark the first block of the function as being executable.
  Solver.MarkBlockExecutable(F.begin());

  // Mark all arguments to the function as being overdefined.
  for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;++AI)
    Solver.markAnythingOverdefined(AI);

  // Solve for constants.  Re-solve whenever undef resolution changed any
  // lattice value.
  bool ResolvedUndefs = true;
  while (ResolvedUndefs) {
    Solver.Solve();
    DEBUG(dbgs() << "RESOLVING UNDEFs\n");
    ResolvedUndefs = Solver.ResolvedUndefsIn(F);
  }

  bool MadeChanges = false;

  // If we decided that there are basic blocks that are dead in this function,
  // delete their contents now.  Note that we cannot actually delete the blocks,
  // as we cannot modify the CFG of the function.

  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
    if (!Solver.isBlockExecutable(BB)) {
      DeleteInstructionInBlock(BB);
      MadeChanges = true;
      continue;
    }

    // Iterate over all of the instructions in a function, replacing them with
    // constants if we have found them to be of constant values.
    //
    for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
      Instruction *Inst = BI++;
      if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst))
        continue;

      // TODO: Reconstruct structs from their elements.
      if (Inst->getType()->isStructTy())
        continue;

      LatticeVal IV = Solver.getLatticeValueFor(Inst);
      if (IV.isOverdefined())
        continue;

      Constant *Const = IV.isConstant()
        ? IV.getConstant() : UndefValue::get(Inst->getType());
      DEBUG(dbgs() << "  Constant: " << *Const << " = " << *Inst);

      // Replaces all of the uses of a variable with uses of the constant.
      Inst->replaceAllUsesWith(Const);

      // Delete the instruction.
      Inst->eraseFromParent();

      // Hey, we just changed something!
      MadeChanges = true;
      ++NumInstRemoved;
    }
  }

  return MadeChanges;
}
namespace {
  //===--------------------------------------------------------------------===//
  //
  /// IPSCCP Class - This class implements interprocedural Sparse Conditional
  /// Constant Propagation.
  ///
  struct IPSCCP : public ModulePass {
    static char ID;
    IPSCCP() : ModulePass(ID) {
      initializeIPSCCPPass(*PassRegistry::getPassRegistry());
    }
    bool runOnModule(Module &M);
  };
} // end anonymous namespace

char IPSCCP::ID = 0;
INITIALIZE_PASS(IPSCCP, "ipsccp",
                "Interprocedural Sparse Conditional Constant Propagation",
                false, false)

// createIPSCCPPass - This is the public interface to this file.
ModulePass *llvm::createIPSCCPPass() {
  return new IPSCCP();
}
+ ImmutableCallSite CS(cast<Instruction>(U)); + if (!CS.isCallee(UI)) + return true; + } else if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { + if (LI->isVolatile()) + return true; + } else if (isa<BlockAddress>(U)) { + // blockaddress doesn't take the address of the function, it takes addr + // of label. + } else { + return true; + } + } + return false; +} + +bool IPSCCP::runOnModule(Module &M) { + SCCPSolver Solver(getAnalysisIfAvailable<TargetData>()); + + // AddressTakenFunctions - This set keeps track of the address-taken functions + // that are in the input. As IPSCCP runs through and simplifies code, + // functions that were address taken can end up losing their + // address-taken-ness. Because of this, we keep track of their addresses from + // the first pass so we can use them for the later simplification pass. + SmallPtrSet<Function*, 32> AddressTakenFunctions; + + // Loop over all functions, marking arguments to those with their addresses + // taken or that are external as overdefined. + // + for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { + if (F->isDeclaration()) + continue; + + // If this is a strong or ODR definition of this function, then we can + // propagate information about its result into callsites of it. + if (!F->mayBeOverridden()) + Solver.AddTrackedFunction(F); + + // If this function only has direct calls that we can see, we can track its + // arguments and return value aggressively, and can assume it is not called + // unless we see evidence to the contrary. + if (F->hasLocalLinkage()) { + if (AddressIsTaken(F)) + AddressTakenFunctions.insert(F); + else { + Solver.AddArgumentTrackedFunction(F); + continue; + } + } + + // Assume the function is called. + Solver.MarkBlockExecutable(F->begin()); + + // Assume nothing about the incoming arguments. + for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); + AI != E; ++AI) + Solver.markAnythingOverdefined(AI); + } + + // Loop over global variables. 
// runOnModule - Drive interprocedural SCCP: seed the solver with tracked
// functions/globals, solve to a fixed point (re-solving after each undef
// resolution), then rewrite the module -- replacing values with constants,
// gutting dead blocks, zapping now-unused returns, and deleting globals whose
// value was fully inferred.
bool IPSCCP::runOnModule(Module &M) {
  SCCPSolver Solver(getAnalysisIfAvailable<TargetData>());

  // AddressTakenFunctions - This set keeps track of the address-taken functions
  // that are in the input.  As IPSCCP runs through and simplifies code,
  // functions that were address taken can end up losing their
  // address-taken-ness.  Because of this, we keep track of their addresses from
  // the first pass so we can use them for the later simplification pass.
  SmallPtrSet<Function*, 32> AddressTakenFunctions;

  // Loop over all functions, marking arguments to those with their addresses
  // taken or that are external as overdefined.
  //
  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
    if (F->isDeclaration())
      continue;

    // If this is a strong or ODR definition of this function, then we can
    // propagate information about its result into callsites of it.
    if (!F->mayBeOverridden())
      Solver.AddTrackedFunction(F);

    // If this function only has direct calls that we can see, we can track its
    // arguments and return value aggressively, and can assume it is not called
    // unless we see evidence to the contrary.
    if (F->hasLocalLinkage()) {
      if (AddressIsTaken(F))
        AddressTakenFunctions.insert(F);
      else {
        Solver.AddArgumentTrackedFunction(F);
        continue;
      }
    }

    // Assume the function is called.
    Solver.MarkBlockExecutable(F->begin());

    // Assume nothing about the incoming arguments.
    for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
         AI != E; ++AI)
      Solver.markAnythingOverdefined(AI);
  }

  // Loop over global variables.  We inform the solver about any internal global
  // variables that do not have their 'addresses taken'.  If they don't have
  // their addresses taken, we can propagate constants through them.
  for (Module::global_iterator G = M.global_begin(), E = M.global_end();
       G != E; ++G)
    if (!G->isConstant() && G->hasLocalLinkage() && !AddressIsTaken(G))
      Solver.TrackValueOfGlobalVariable(G);

  // Solve for constants.
  bool ResolvedUndefs = true;
  while (ResolvedUndefs) {
    Solver.Solve();

    DEBUG(dbgs() << "RESOLVING UNDEFS\n");
    ResolvedUndefs = false;
    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
      ResolvedUndefs |= Solver.ResolvedUndefsIn(*F);
  }

  bool MadeChanges = false;

  // Iterate over all of the instructions in the module, replacing them with
  // constants if we have found them to be of constant values.
  //
  SmallVector<BasicBlock*, 512> BlocksToErase;

  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
    if (Solver.isBlockExecutable(F->begin())) {
      // Replace constant-valued formal arguments.
      for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
           AI != E; ++AI) {
        if (AI->use_empty() || AI->getType()->isStructTy()) continue;

        // TODO: Could use getStructLatticeValueFor to find out if the entire
        // result is a constant and replace it entirely if so.

        LatticeVal IV = Solver.getLatticeValueFor(AI);
        if (IV.isOverdefined()) continue;

        Constant *CST = IV.isConstant() ?
          IV.getConstant() : UndefValue::get(AI->getType());
        DEBUG(dbgs() << "***  Arg " << *AI << " = " << *CST <<"\n");

        // Replaces all of the uses of a variable with uses of the
        // constant.
        AI->replaceAllUsesWith(CST);
        ++IPNumArgsElimed;
      }
    }

    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
      if (!Solver.isBlockExecutable(BB)) {
        DeleteInstructionInBlock(BB);
        MadeChanges = true;

        // Detach the dead block from successor PHIs and drop its terminator;
        // the block itself can only be erased after the loop (erasing here
        // would invalidate the function iterator and leave dangling preds).
        TerminatorInst *TI = BB->getTerminator();
        for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
          BasicBlock *Succ = TI->getSuccessor(i);
          if (!Succ->empty() && isa<PHINode>(Succ->begin()))
            TI->getSuccessor(i)->removePredecessor(BB);
        }
        if (!TI->use_empty())
          TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
        TI->eraseFromParent();

        // The entry block cannot be erased; make it unreachable instead.
        if (&*BB != &F->front())
          BlocksToErase.push_back(BB);
        else
          new UnreachableInst(M.getContext(), BB);
        continue;
      }

      for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
        Instruction *Inst = BI++;
        if (Inst->getType()->isVoidTy() || Inst->getType()->isStructTy())
          continue;

        // TODO: Could use getStructLatticeValueFor to find out if the entire
        // result is a constant and replace it entirely if so.

        LatticeVal IV = Solver.getLatticeValueFor(Inst);
        if (IV.isOverdefined())
          continue;

        Constant *Const = IV.isConstant()
          ? IV.getConstant() : UndefValue::get(Inst->getType());
        DEBUG(dbgs() << "  Constant: " << *Const << " = " << *Inst);

        // Replaces all of the uses of a variable with uses of the
        // constant.
        Inst->replaceAllUsesWith(Const);

        // Delete the instruction.  Calls and terminators are kept: calls may
        // have side effects, and terminators shape the CFG.
        if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst))
          Inst->eraseFromParent();

        // Hey, we just changed something!
        MadeChanges = true;
        ++IPNumInstRemoved;
      }
    }

    // Now that all instructions in the function are constant folded, erase dead
    // blocks, because we can now use ConstantFoldTerminator to get rid of
    // in-edges.
    for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) {
      // If there are any PHI nodes in this successor, drop entries for BB now.
      BasicBlock *DeadBB = BlocksToErase[i];
      for (Value::use_iterator UI = DeadBB->use_begin(), UE = DeadBB->use_end();
           UI != UE; ) {
        // Grab the user and then increment the iterator early, as the user
        // will be deleted.  Step past all adjacent uses from the same user.
        Instruction *I = dyn_cast<Instruction>(*UI);
        do { ++UI; } while (UI != UE && *UI == I);

        // Ignore blockaddress users; BasicBlock's dtor will handle them.
        if (!I) continue;

        bool Folded = ConstantFoldTerminator(I->getParent());
        if (!Folded) {
          // The constant folder may not have been able to fold the terminator
          // if this is a branch or switch on undef.  Fold it manually as a
          // branch to the first successor.
#ifndef NDEBUG
          if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
            assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
                   "Branch should be foldable!");
          } else if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
            assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
          } else {
            llvm_unreachable("Didn't fold away reference to block!");
          }
#endif

          // Make this an uncond branch to the first successor.
          TerminatorInst *TI = I->getParent()->getTerminator();
          BranchInst::Create(TI->getSuccessor(0), TI);

          // Remove entries in successor phi nodes to remove edges.
          for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
            TI->getSuccessor(i)->removePredecessor(TI->getParent());

          // Remove the old terminator.
          TI->eraseFromParent();
        }
      }

      // Finally, delete the basic block.
      F->getBasicBlockList().erase(DeadBB);
    }
    BlocksToErase.clear();
  }

  // If we inferred constant or undef return values for a function, we replaced
  // all call uses with the inferred value.  This means we don't need to bother
  // actually returning anything from the function.  Replace all return
  // instructions with return undef.
  //
  // Do this in two stages: first identify the functions we should process, then
  // actually zap their returns.  This is important because we can only do this
  // if the address of the function isn't taken.  In cases where a return is the
  // last use of a function, the order of processing functions would affect
  // whether other functions are optimizable.
  SmallVector<ReturnInst*, 8> ReturnsToZap;

  // TODO: Process multiple value ret instructions also.
  const DenseMap<Function*, LatticeVal> &RV = Solver.getTrackedRetVals();
  for (DenseMap<Function*, LatticeVal>::const_iterator I = RV.begin(),
       E = RV.end(); I != E; ++I) {
    Function *F = I->first;
    if (I->second.isOverdefined() || F->getReturnType()->isVoidTy())
      continue;

    // We can only do this if we know that nothing else can call the function.
    if (!F->hasLocalLinkage() || AddressTakenFunctions.count(F))
      continue;

    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()))
        if (!isa<UndefValue>(RI->getOperand(0)))
          ReturnsToZap.push_back(RI);
  }

  // Zap all returns which we've identified as zap to change.
  for (unsigned i = 0, e = ReturnsToZap.size(); i != e; ++i) {
    Function *F = ReturnsToZap[i]->getParent()->getParent();
    ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
  }

  // If we inferred constant or undef values for global variables, we can delete
  // the global and any stores that remain to it.
  const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals();
  for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(),
       E = TG.end(); I != E; ++I) {
    GlobalVariable *GV = I->first;
    assert(!I->second.isOverdefined() &&
           "Overdefined values should have been taken out of the map!");
    DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n");
    while (!GV->use_empty()) {
      // Tracked globals can only have store users left at this point.
      StoreInst *SI = cast<StoreInst>(GV->use_back());
      SI->eraseFromParent();
    }
    M.getGlobalList().erase(GV);
    ++IPNumGlobalConst;
  }

  return MadeChanges;
}
void llvm::initializeScalarOpts(PassRegistry &Registry) {
  initializeADCEPass(Registry);
  initializeBlockPlacementPass(Registry);
  initializeCodeGenPreparePass(Registry);
  initializeConstantPropagationPass(Registry);
  initializeCorrelatedValuePropagationPass(Registry);
  initializeDCEPass(Registry);
  initializeDeadInstEliminationPass(Registry);
  initializeDSEPass(Registry);
  initializeGEPSplitterPass(Registry);
  initializeGVNPass(Registry);
  initializeEarlyCSEPass(Registry);
  initializeIndVarSimplifyPass(Registry);
  initializeJumpThreadingPass(Registry);
  initializeLICMPass(Registry);
  initializeLoopDeletionPass(Registry);
  initializeLoopInstSimplifyPass(Registry);
  initializeLoopRotatePass(Registry);
  initializeLoopStrengthReducePass(Registry);
  initializeLoopUnrollPass(Registry);
  initializeLoopUnswitchPass(Registry);
  initializeLoopIdiomRecognizePass(Registry);
  initializeLowerAtomicPass(Registry);
  initializeMemCpyOptPass(Registry);
  initializeReassociatePass(Registry);
  initializeRegToMemPass(Registry);
  initializeSCCPPass(Registry);
  initializeIPSCCPPass(Registry);
  initializeSROA_DTPass(Registry);
  initializeSROA_SSAUpPass(Registry);
  initializeCFGSimplifyPassPass(Registry);
  initializeSimplifyHalfPowrLibCallsPass(Registry);
  initializeSimplifyLibCallsPass(Registry);
  initializeSinkingPass(Registry);
  initializeTailDupPass(Registry);
  initializeTailCallElimPass(Registry);
}

// C binding: initialize every pass in the ScalarOpts library.
void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
  initializeScalarOpts(*unwrap(R));
}

// The functions below form the C API for the scalar transformations; each
// one simply wraps the corresponding create*Pass() factory and adds the
// result to the given pass manager.

void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createAggressiveDCEPass());
}

void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createCFGSimplificationPass());
}

void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createDeadStoreEliminationPass());
}

void LLVMAddGVNPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createGVNPass());
}

void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createIndVarSimplifyPass());
}

void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createInstructionCombiningPass());
}

void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createJumpThreadingPass());
}

void LLVMAddLICMPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createLICMPass());
}

void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createLoopDeletionPass());
}

void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createLoopRotatePass());
}

void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createLoopUnrollPass());
}

void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createLoopUnswitchPass());
}

void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createMemCpyOptPass());
}

void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createPromoteMemoryToRegisterPass());
}

void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createReassociatePass());
}

void LLVMAddSCCPPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createSCCPPass());
}

void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createScalarReplAggregatesPass());
}

void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM,
                                                  int Threshold) {
  unwrap(PM)->add(createScalarReplAggregatesPass(Threshold));
}

void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createSimplifyLibCallsPass());
}

void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createTailCallEliminationPass());
}

void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createConstantPropagationPass());
}

void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createDemoteRegisterToMemoryPass());
}

void LLVMAddVerifierPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createVerifierPass());
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
new file mode 100644
index 0000000..c3ca852
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -0,0 +1,2336 @@
//===- ScalarReplAggregates.cpp - Scalar Replacement of Aggregates --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This transformation implements the well known scalar replacement of
// aggregates transformation.  This xform breaks up alloca instructions of
// aggregate type (structure or array) into individual alloca instructions for
// each member (if possible).  Then, if possible, it transforms the individual
// alloca instructions into nice clean scalar SSA form.
//
// This combines a simple SRoA algorithm with the Mem2Reg algorithm because
// they often interact, especially for C++ programs.  As such, iterating
// between SRoA, then Mem2Reg until we run out of things to promote works well.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "scalarrepl"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/IRBuilder.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;

STATISTIC(NumReplaced,  "Number of allocas broken up");
STATISTIC(NumPromoted,  "Number of allocas promoted");
STATISTIC(NumAdjusted,  "Number of scalar allocas adjusted to allow promotion");
STATISTIC(NumConverted, "Number of aggregates converted to scalar");
STATISTIC(NumGlobals,   "Number of allocas copied from constant global");

namespace {
  /// SROA - Common implementation of the scalar-replacement-of-aggregates
  /// pass.  The SROA_DT and SROA_SSAUp subclasses below select the promotion
  /// strategy via the hasDT constructor flag.
  struct SROA : public FunctionPass {
    SROA(int T, bool hasDT, char &ID)
      : FunctionPass(ID), HasDomTree(hasDT) {
      // T == -1 requests the default size threshold (in bytes) above which
      // an aggregate is not scalarized.
      if (T == -1)
        SRThreshold = 128;
      else
        SRThreshold = T;
    }

    bool runOnFunction(Function &F);

    bool performScalarRepl(Function &F);
    bool performPromotion(Function &F);

  private:
    bool HasDomTree;
    TargetData *TD;

    /// DeadInsts - Keep track of instructions we have made dead, so that
    /// we can remove them after we are done working.
    SmallVector<Value*, 32> DeadInsts;

    /// AllocaInfo - When analyzing uses of an alloca instruction, this captures
    /// information about the uses.  All these fields are initialized to false
    /// and set to true when something is learned.
    struct AllocaInfo {
      /// The alloca to promote.
      AllocaInst *AI;

      /// CheckedPHIs - This is a set of verified PHI nodes, to prevent infinite
      /// looping and avoid redundant work.
      SmallPtrSet<PHINode*, 8> CheckedPHIs;

      /// isUnsafe - This is set to true if the alloca cannot be SROA'd.
      bool isUnsafe : 1;

      /// isMemCpySrc - This is true if this aggregate is memcpy'd from.
      bool isMemCpySrc : 1;

      /// isMemCpyDst - This is true if this aggregate is memcpy'd into.
      bool isMemCpyDst : 1;

      /// hasSubelementAccess - This is true if a subelement of the alloca is
      /// ever accessed, or false if the alloca is only accessed with mem
      /// intrinsics or load/store that only access the entire alloca at once.
      bool hasSubelementAccess : 1;

      /// hasALoadOrStore - This is true if there are any loads or stores to it.
      /// The alloca may just be accessed with memcpy, for example, which would
      /// not set this.
      bool hasALoadOrStore : 1;

      explicit AllocaInfo(AllocaInst *ai)
        : AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false),
          hasSubelementAccess(false), hasALoadOrStore(false) {}
    };

    unsigned SRThreshold;

    // MarkUnsafe - Record that AI cannot be scalarized, noting the offending
    // instruction in debug output.
    void MarkUnsafe(AllocaInfo &I, Instruction *User) {
      I.isUnsafe = true;
      DEBUG(dbgs() << "  Transformation preventing inst: " << *User << '\n');
    }

    bool isSafeAllocaToScalarRepl(AllocaInst *AI);

    void isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info);
    void isSafePHISelectUseForScalarRepl(Instruction *User, uint64_t Offset,
                                         AllocaInfo &Info);
    void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info);
    void isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
                         const Type *MemOpType, bool isStore, AllocaInfo &Info,
                         Instruction *TheAccess, bool AllowWholeAccess);
    bool TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size);
    uint64_t FindElementAndOffset(const Type *&T, uint64_t &Offset,
                                  const Type *&IdxTy);

    void DoScalarReplacement(AllocaInst *AI,
                             std::vector<AllocaInst*> &WorkList);
    void DeleteDeadInstructions();

    void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
                              SmallVector<AllocaInst*, 32> &NewElts);
    void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
                        SmallVector<AllocaInst*, 32> &NewElts);
    void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
                    SmallVector<AllocaInst*, 32> &NewElts);
    void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
                                      AllocaInst *AI,
                                      SmallVector<AllocaInst*, 32> &NewElts);
    void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
                                       SmallVector<AllocaInst*, 32> &NewElts);
    void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
                                      SmallVector<AllocaInst*, 32> &NewElts);

    static MemTransferInst *isOnlyCopiedFromConstantGlobal(AllocaInst *AI);
  };

  // SROA_DT - SROA that uses DominatorTree.
  struct SROA_DT : public SROA {
    static char ID;
  public:
    SROA_DT(int T = -1) : SROA(T, true, ID) {
      initializeSROA_DTPass(*PassRegistry::getPassRegistry());
    }

    // getAnalysisUsage - This pass does not require any passes, but we know it
    // will not alter the CFG, so say so.
    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.addRequired<DominatorTree>();
      AU.setPreservesCFG();
    }
  };

  // SROA_SSAUp - SROA that uses SSAUpdater.
  struct SROA_SSAUp : public SROA {
    static char ID;
  public:
    SROA_SSAUp(int T = -1) : SROA(T, false, ID) {
      initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry());
    }

    // getAnalysisUsage - This pass does not require any passes, but we know it
    // will not alter the CFG, so say so.
    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.setPreservesCFG();
    }
  };

}

char SROA_DT::ID = 0;
char SROA_SSAUp::ID = 0;

INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl",
                "Scalar Replacement of Aggregates (DT)", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_END(SROA_DT, "scalarrepl",
                "Scalar Replacement of Aggregates (DT)", false, false)

INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa",
                "Scalar Replacement of Aggregates (SSAUp)", false, false)
INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa",
                "Scalar Replacement of Aggregates (SSAUp)", false, false)

// Public interface to the ScalarReplAggregates pass.
FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold,
                                                   bool UseDomTree) {
  if (UseDomTree)
    return new SROA_DT(Threshold);
  return new SROA_SSAUp(Threshold);
}


//===----------------------------------------------------------------------===//
// Convert To Scalar Optimization.
//===----------------------------------------------------------------------===//

namespace {
/// ConvertToScalarInfo - This class implements the "Convert To Scalar"
/// optimization, which scans the uses of an alloca and determines if it can
/// rewrite it in terms of a single new alloca that can be mem2reg'd.
class ConvertToScalarInfo {
  /// AllocaSize - The size of the alloca being considered.
  unsigned AllocaSize;
  const TargetData &TD;

  /// IsNotTrivial - This is set to true if there is some access to the object
  /// which means that mem2reg can't promote it.
  bool IsNotTrivial;

  /// VectorTy - This tracks the type that we should promote the vector to if
  /// it is possible to turn it into a vector.  This starts out null, and if it
  /// isn't possible to turn into a vector type, it gets set to VoidTy.
  const Type *VectorTy;

  /// HadAVector - True if there is at least one vector access to the alloca.
  /// We don't want to turn random arrays into vectors and use vector element
  /// insert/extract, but if there are element accesses to something that is
  /// also declared as a vector, we do want to promote to a vector.
  bool HadAVector;

public:
  explicit ConvertToScalarInfo(unsigned Size, const TargetData &td)
    : AllocaSize(Size), TD(td) {
    // VectorTy stays null until a candidate vector type is found; VoidTy is
    // the "give up and use a big integer" sentinel (see MergeInType).
    IsNotTrivial = false;
    VectorTy = 0;
    HadAVector = false;
  }

  AllocaInst *TryConvert(AllocaInst *AI);

private:
  bool CanConvertToScalar(Value *V, uint64_t Offset);
  void MergeInType(const Type *In, uint64_t Offset);
  void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset);

  Value *ConvertScalar_ExtractValue(Value *NV, const Type *ToType,
                                    uint64_t Offset, IRBuilder<> &Builder);
  Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal,
                                   uint64_t Offset, IRBuilder<> &Builder);
};
} // end anonymous namespace.


/// TryConvert - Analyze the specified alloca, and if it is safe to do so,
/// rewrite it to be a new alloca which is mem2reg'able.  This returns the new
/// alloca if possible or null if not.
AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
  // If we can't convert this scalar, or if mem2reg can trivially do it, bail
  // out.
  if (!CanConvertToScalar(AI, 0) || !IsNotTrivial)
    return 0;

  // If we were able to find a vector type that can handle this with
  // insert/extract elements, and if there was at least one use that had
  // a vector type, promote this to a vector.  We don't want to promote
  // random stuff that doesn't use vectors (e.g. <9 x double>) because then
  // we just get a lot of insert/extracts.  If at least one vector is
  // involved, then we probably really do have a union of vector/array.
  const Type *NewTy;
  if (VectorTy && VectorTy->isVectorTy() && HadAVector) {
    DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n  TYPE = "
          << *VectorTy << '\n');
    NewTy = VectorTy;  // Use the vector type.
  } else {
    DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n");
    // Create and insert the integer alloca.
    NewTy = IntegerType::get(AI->getContext(), AllocaSize*8);
  }
  AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin());
  ConvertUsesToScalar(AI, NewAI, 0);
  return NewAI;
}

/// MergeInType - Add the 'In' type to the accumulated vector type (VectorTy)
/// so far at the offset specified by Offset (which is specified in bytes).
///
/// There are two cases we handle here:
///   1) A union of vector types of the same size and potentially its elements.
///      Here we turn element accesses into insert/extract element operations.
///      This promotes a <4 x float> with a store of float to the third element
///      into a <4 x float> that uses insert element.
///   2) A fully general blob of memory, which we turn into some (potentially
///      large) integer type with extract and insert operations where the loads
///      and stores would mutate the memory.  We mark this by setting VectorTy
///      to VoidTy.
void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset) {
  // If we already decided to turn this into a blob of integer memory, there is
  // nothing to be done.
  if (VectorTy && VectorTy->isVoidTy())
    return;

  // If this could be contributing to a vector, analyze it.

  // If the In type is a vector that is the same size as the alloca, see if it
  // matches the existing VecTy.
  if (const VectorType *VInTy = dyn_cast<VectorType>(In)) {
    // Remember if we saw a vector type.
    HadAVector = true;

    if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) {
      // If we're storing/loading a vector of the right size, allow it as a
      // vector.  If this the first vector we see, remember the type so that
      // we know the element size.  If this is a subsequent access, ignore it
      // even if it is a differing type but the same size.  Worst case we can
      // bitcast the resultant vectors.
      if (VectorTy == 0)
        VectorTy = VInTy;
      return;
    }
  } else if (In->isFloatTy() || In->isDoubleTy() ||
             (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 &&
              isPowerOf2_32(In->getPrimitiveSizeInBits()))) {
    // If we're accessing something that could be an element of a vector, see
    // if the implied vector agrees with what we already have and if Offset is
    // compatible with it.
    unsigned EltSize = In->getPrimitiveSizeInBits()/8;
    if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 &&
        (VectorTy == 0 ||
         cast<VectorType>(VectorTy)->getElementType()
               ->getPrimitiveSizeInBits()/8 == EltSize)) {
      if (VectorTy == 0)
        VectorTy = VectorType::get(In, AllocaSize/EltSize);
      return;
    }
  }

  // Otherwise, we have a case that we can't handle with an optimized vector
  // form.  We can still turn this into a large integer.
  VectorTy = Type::getVoidTy(In->getContext());
}

/// CanConvertToScalar - V is a pointer.  If we can convert the pointee and all
/// its accesses to a single vector type, return true and set VecTy to
/// the new type.
/// If we could convert the alloca into a single promotable
/// integer, return true but set VecTy to VoidTy.  Further, if the use is not a
/// completely trivial use that mem2reg could promote, set IsNotTrivial.  Offset
/// is the current offset from the base of the alloca being analyzed (in bytes;
/// see MergeInType — unlike ConvertUsesToScalar, which tracks bits).
///
/// If we see at least one access to the value that is as a vector type, set the
/// SawVec flag.
bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
    Instruction *User = cast<Instruction>(*UI);

    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
      // Don't break volatile loads.
      if (LI->isVolatile())
        return false;
      // Don't touch MMX operations.
      if (LI->getType()->isX86_MMXTy())
        return false;
      MergeInType(LI->getType(), Offset);
      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
      // Storing the pointer, not into the value?
      if (SI->getOperand(0) == V || SI->isVolatile()) return false;
      // Don't touch MMX operations.
      if (SI->getOperand(0)->getType()->isX86_MMXTy())
        return false;
      MergeInType(SI->getOperand(0)->getType(), Offset);
      continue;
    }

    if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
      IsNotTrivial = true;  // Can't be mem2reg'd.
      if (!CanConvertToScalar(BCI, Offset))
        return false;
      continue;
    }

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
      // If this is a GEP with a variable indices, we can't handle it.
      if (!GEP->hasAllConstantIndices())
        return false;

      // Compute the offset that this GEP adds to the pointer.
      SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
      uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
                                               &Indices[0], Indices.size());
      // See if all uses can be converted.
      if (!CanConvertToScalar(GEP, Offset+GEPOffset))
        return false;
      IsNotTrivial = true;  // Can't be mem2reg'd.
      continue;
    }

    // If this is a constant sized memset of a constant value (e.g. 0) we can
    // handle it.
    if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
      // Store of constant value and constant size.
      if (!isa<ConstantInt>(MSI->getValue()) ||
          !isa<ConstantInt>(MSI->getLength()))
        return false;
      IsNotTrivial = true;  // Can't be mem2reg'd.
      continue;
    }

    // If this is a memcpy or memmove into or out of the whole allocation, we
    // can handle it like a load or store of the scalar type.
    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
      ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength());
      if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0)
        return false;

      IsNotTrivial = true;  // Can't be mem2reg'd.
      continue;
    }

    // Otherwise, we cannot handle this!
    return false;
  }

  return true;
}

/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca
/// directly.  This happens when we are converting an "integer union" to a
/// single integer scalar, or when we are converting a "vector union" to a
/// vector with insert/extractelement instructions.
///
/// Offset is an offset from the original alloca, in bits that need to be
/// shifted to the right.  By the end of this, there should be no uses of Ptr.
void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
                                              uint64_t Offset) {
  while (!Ptr->use_empty()) {
    Instruction *User = cast<Instruction>(Ptr->use_back());

    if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
      ConvertUsesToScalar(CI, NewAI, Offset);
      CI->eraseFromParent();
      continue;
    }

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
      // Compute the offset that this GEP adds to the pointer.
      // GEPOffset is in bytes; this routine tracks Offset in bits, hence *8.
      SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
      uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
                                               &Indices[0], Indices.size());
      ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8);
      GEP->eraseFromParent();
      continue;
    }

    IRBuilder<> Builder(User);

    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
      // The load is a bit extract from NewAI shifted right by Offset bits.
      Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp");
      Value *NewLoadVal
        = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder);
      LI->replaceAllUsesWith(NewLoadVal);
      LI->eraseFromParent();
      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
      assert(SI->getOperand(0) != Ptr && "Consistency error!");
      Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
      Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset,
                                             Builder);
      Builder.CreateStore(New, NewAI);
      SI->eraseFromParent();

      // If the load we just inserted is now dead, then the inserted store
      // overwrote the entire thing.
      if (Old->use_empty())
        Old->eraseFromParent();
      continue;
    }

    // If this is a constant sized memset of a constant value (e.g. 0) we can
    // transform it into a store of the expanded constant value.
    if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
      assert(MSI->getRawDest() == Ptr && "Consistency error!");
      unsigned NumBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
      if (NumBytes != 0) {
        unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue();

        // Compute the value replicated the right number of times.
        APInt APVal(NumBytes*8, Val);

        // Splat the value if non-zero.
        if (Val)
          for (unsigned i = 1; i != NumBytes; ++i)
            APVal |= APVal << 8;

        Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
        Value *New = ConvertScalar_InsertValue(
                                    ConstantInt::get(User->getContext(), APVal),
                                               Old, Offset, Builder);
        Builder.CreateStore(New, NewAI);

        // If the load we just inserted is now dead, then the memset overwrote
        // the entire thing.
        if (Old->use_empty())
          Old->eraseFromParent();
      }
      MSI->eraseFromParent();
      continue;
    }

    // If this is a memcpy or memmove into or out of the whole allocation, we
    // can handle it like a load or store of the scalar type.
    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
      assert(Offset == 0 && "must be store to start of alloca");

      // If the source and destination are both to the same alloca, then this is
      // a noop copy-to-self, just delete it.  Otherwise, emit a load and store
      // as appropriate.
      AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, &TD, 0));

      if (GetUnderlyingObject(MTI->getSource(), &TD, 0) != OrigAI) {
        // Dest must be OrigAI, change this to be a load from the original
        // pointer (bitcasted), then a store to our new alloca.
        assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?");
        Value *SrcPtr = MTI->getSource();
        const PointerType* SPTy = cast<PointerType>(SrcPtr->getType());
        const PointerType* AIPTy = cast<PointerType>(NewAI->getType());
        // Preserve the source's address space when bitcasting the pointer.
        if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
          AIPTy = PointerType::get(AIPTy->getElementType(),
                                   SPTy->getAddressSpace());
        }
        SrcPtr = Builder.CreateBitCast(SrcPtr, AIPTy);

        LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval");
        SrcVal->setAlignment(MTI->getAlignment());
        Builder.CreateStore(SrcVal, NewAI);
      } else if (GetUnderlyingObject(MTI->getDest(), &TD, 0) != OrigAI) {
        // Src must be OrigAI, change this to be a load from NewAI then a store
        // through the original dest pointer (bitcasted).
        assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?");
        LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval");

        const PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType());
        const PointerType* AIPTy = cast<PointerType>(NewAI->getType());
        // Preserve the destination's address space when bitcasting.
        if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
          AIPTy = PointerType::get(AIPTy->getElementType(),
                                   DPTy->getAddressSpace());
        }
        Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), AIPTy);

        StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr);
        NewStore->setAlignment(MTI->getAlignment());
      } else {
        // Noop transfer. Src == Dst
      }

      MTI->eraseFromParent();
      continue;
    }

    // CanConvertToScalar already rejected anything else.
    llvm_unreachable("Unsupported operation!");
  }
}

/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer
/// or vector value FromVal, extracting the bits from the offset specified by
/// Offset.  This returns the value, which is of type ToType.
///
/// This happens when we are converting an "integer union" to a single
/// integer scalar, or when we are converting a "vector union" to a vector with
/// insert/extractelement instructions.
///
/// Offset is an offset from the original alloca, in bits that need to be
/// shifted to the right.
Value *ConvertToScalarInfo::
ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
                           uint64_t Offset, IRBuilder<> &Builder) {
  // If the load is of the whole new alloca, no conversion is needed.
  if (FromVal->getType() == ToType && Offset == 0)
    return FromVal;

  // If the result alloca is a vector type, this is either an element
  // access or a bitcast to another vector type of the same size.
  if (const VectorType *VTy = dyn_cast<VectorType>(FromVal->getType())) {
    if (ToType->isVectorTy())
      return Builder.CreateBitCast(FromVal, ToType, "tmp");

    // Otherwise it must be an element access.
    unsigned Elt = 0;
    if (Offset) {
      unsigned EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType());
      Elt = Offset/EltSize;
      assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
    }
    // Return the element extracted out of it.
    Value *V = Builder.CreateExtractElement(FromVal, ConstantInt::get(
                    Type::getInt32Ty(FromVal->getContext()), Elt), "tmp");
    if (V->getType() != ToType)
      V = Builder.CreateBitCast(V, ToType, "tmp");
    return V;
  }

  // If ToType is a first class aggregate, extract out each of the pieces and
  // use insertvalue's to form the FCA.
  if (const StructType *ST = dyn_cast<StructType>(ToType)) {
    const StructLayout &Layout = *TD.getStructLayout(ST);
    Value *Res = UndefValue::get(ST);
    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
      Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
                                        Offset+Layout.getElementOffsetInBits(i),
                                              Builder);
      Res = Builder.CreateInsertValue(Res, Elt, i, "tmp");
    }
    return Res;
  }

  // Likewise for arrays: extract each element recursively.
  if (const ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
    uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
    Value *Res = UndefValue::get(AT);
    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
      Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
                                              Offset+i*EltSize, Builder);
      Res = Builder.CreateInsertValue(Res, Elt, i, "tmp");
    }
    return Res;
  }

  // Otherwise, this must be a union that was converted to an integer value.
  const IntegerType *NTy = cast<IntegerType>(FromVal->getType());

  // If this is a big-endian system and the load is narrower than the
  // full alloca type, we need to do a shift to get the right bits.
  int ShAmt = 0;
  if (TD.isBigEndian()) {
    // On big-endian machines, the lowest bit is stored at the bit offset
    // from the pointer given by getTypeStoreSizeInBits.  This matters for
    // integers with a bitwidth that is not a multiple of 8.
    ShAmt = TD.getTypeStoreSizeInBits(NTy) -
            TD.getTypeStoreSizeInBits(ToType) - Offset;
  } else {
    ShAmt = Offset;
  }

  // Note: we support negative bitwidths (with shl) which are not defined.
  // We do this to support (f.e.) loads off the end of a structure where
  // only some bits are used.
  if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth())
    FromVal = Builder.CreateLShr(FromVal,
                                 ConstantInt::get(FromVal->getType(),
                                                  ShAmt), "tmp");
  else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth())
    FromVal = Builder.CreateShl(FromVal,
                                ConstantInt::get(FromVal->getType(),
                                                 -ShAmt), "tmp");

  // Finally, unconditionally truncate the integer to the right width.
  unsigned LIBitWidth = TD.getTypeSizeInBits(ToType);
  if (LIBitWidth < NTy->getBitWidth())
    FromVal =
      Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(),
                                                    LIBitWidth), "tmp");
  else if (LIBitWidth > NTy->getBitWidth())
    FromVal =
      Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(),
                                                   LIBitWidth), "tmp");

  // If the result is an integer, this is a trunc or bitcast.
  if (ToType->isIntegerTy()) {
    // Should be done.
  } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) {
    // Just do a bitcast, we know the sizes match up.
    FromVal = Builder.CreateBitCast(FromVal, ToType, "tmp");
  } else {
    // Otherwise must be a pointer.
    FromVal = Builder.CreateIntToPtr(FromVal, ToType, "tmp");
  }
  assert(FromVal->getType() == ToType && "Didn't convert right?");
  return FromVal;
}

/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer
/// or vector value "Old" at the offset specified by Offset.
///
/// This happens when we are converting an "integer union" to a
/// single integer scalar, or when we are converting a "vector union" to a
/// vector with insert/extractelement instructions.
///
/// Offset is an offset from the original alloca, in bits that need to be
/// shifted to the right.
+Value *ConvertToScalarInfo::
+ConvertScalar_InsertValue(Value *SV, Value *Old,
+                          uint64_t Offset, IRBuilder<> &Builder) {
+  // Convert the stored type to the actual type, shift it left to insert
+  // then 'or' into place.
+  const Type *AllocaType = Old->getType();
+  LLVMContext &Context = Old->getContext();
+
+  if (const VectorType *VTy = dyn_cast<VectorType>(AllocaType)) {
+    uint64_t VecSize = TD.getTypeAllocSizeInBits(VTy);
+    uint64_t ValSize = TD.getTypeAllocSizeInBits(SV->getType());
+
+    // Changing the whole vector with memset or with an access of a different
+    // vector type?
+    if (ValSize == VecSize)
+      return Builder.CreateBitCast(SV, AllocaType, "tmp");
+
+    uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType());
+
+    // Must be an element insertion.
+    // NOTE(review): assumes Offset is an exact multiple of EltSize; the
+    // extract path (ConvertScalar_ExtractValue) asserts this -- TODO confirm
+    // the callers guarantee it here as well.
+    unsigned Elt = Offset/EltSize;
+
+    if (SV->getType() != VTy->getElementType())
+      SV = Builder.CreateBitCast(SV, VTy->getElementType(), "tmp");
+
+    SV = Builder.CreateInsertElement(Old, SV,
+                     ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt),
+                                     "tmp");
+    return SV;
+  }
+
+  // If SV is a first-class aggregate value, insert each value recursively.
+  if (const StructType *ST = dyn_cast<StructType>(SV->getType())) {
+    const StructLayout &Layout = *TD.getStructLayout(ST);
+    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+      Value *Elt = Builder.CreateExtractValue(SV, i, "tmp");
+      Old = ConvertScalar_InsertValue(Elt, Old,
+                                      Offset+Layout.getElementOffsetInBits(i),
+                                      Builder);
+    }
+    return Old;
+  }
+
+  if (const ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
+    uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
+    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+      Value *Elt = Builder.CreateExtractValue(SV, i, "tmp");
+      Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder);
+    }
+    return Old;
+  }
+
+  // If SV is a float, convert it to the appropriate integer type.
+  // If it is a pointer, do the same.
+  unsigned SrcWidth = TD.getTypeSizeInBits(SV->getType());
+  unsigned DestWidth = TD.getTypeSizeInBits(AllocaType);
+  unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType());
+  unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType);
+  if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy())
+    SV = Builder.CreateBitCast(SV,
+                            IntegerType::get(SV->getContext(),SrcWidth), "tmp");
+  else if (SV->getType()->isPointerTy())
+    SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()), "tmp");
+
+  // Zero extend or truncate the value if needed.
+  if (SV->getType() != AllocaType) {
+    if (SV->getType()->getPrimitiveSizeInBits() <
+             AllocaType->getPrimitiveSizeInBits())
+      SV = Builder.CreateZExt(SV, AllocaType, "tmp");
+    else {
+      // Truncation may be needed if storing more than the alloca can hold
+      // (undefined behavior).
+      SV = Builder.CreateTrunc(SV, AllocaType, "tmp");
+      SrcWidth = DestWidth;
+      SrcStoreWidth = DestStoreWidth;
+    }
+  }
+
+  // If this is a big-endian system and the store is narrower than the
+  // full alloca type, we need to do a shift to get the right bits.
+  int ShAmt = 0;
+  if (TD.isBigEndian()) {
+    // On big-endian machines, the lowest bit is stored at the bit offset
+    // from the pointer given by getTypeStoreSizeInBits.  This matters for
+    // integers with a bitwidth that is not a multiple of 8.
+    ShAmt = DestStoreWidth - SrcStoreWidth - Offset;
+  } else {
+    ShAmt = Offset;
+  }
+
+  // Note: we support negative bitwidths (with shr) which are not defined.
+  // We do this to support (f.e.) stores off the end of a structure where
+  // only some bits in the structure are set.
+  APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth));
+  if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) {
+    SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(),
+                           ShAmt), "tmp");
+    Mask <<= ShAmt;
+  } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) {
+    SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(),
+                            -ShAmt), "tmp");
+    Mask = Mask.lshr(-ShAmt);
+  }
+
+  // Mask out the bits we are about to insert from the old value, and or
+  // in the new bits.  If the widths are equal, SV fully replaces Old and no
+  // masking is needed.
+  if (SrcWidth != DestWidth) {
+    assert(DestWidth > SrcWidth);
+    Old = Builder.CreateAnd(Old, ConstantInt::get(Context, ~Mask), "mask");
+    SV = Builder.CreateOr(Old, SV, "ins");
+  }
+  return SV;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SRoA Driver
+//===----------------------------------------------------------------------===//
+
+
+// runOnFunction - Pass entry point.  Alternates mem2reg-style promotion and
+// scalar replacement until neither makes further progress.
+bool SROA::runOnFunction(Function &F) {
+  TD = getAnalysisIfAvailable<TargetData>();
+
+  bool Changed = performPromotion(F);
+
+  // FIXME: ScalarRepl currently depends on TargetData more than it
+  // theoretically needs to. It should be refactored in order to support
+  // target-independent IR. Until this is done, just skip the actual
+  // scalar-replacement portion of this pass.
+  if (!TD) return Changed;
+
+  while (1) {
+    bool LocalChange = performScalarRepl(F);
+    if (!LocalChange) break;   // No need to repromote if no scalarrepl
+    Changed = true;
+    LocalChange = performPromotion(F);
+    if (!LocalChange) break;   // No need to re-scalarrepl if no promotion
+  }
+
+  return Changed;
+}
+
+namespace {
+// AllocaPromoter - SSAUpdater-based helper used when no DominatorTree is
+// available: promotes the loads/stores of a single alloca and then erases it.
+class AllocaPromoter : public LoadAndStorePromoter {
+  AllocaInst *AI;
+public:
+  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S)
+    : LoadAndStorePromoter(Insts, S), AI(0) {}
+
+  void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) {
+    // Remember which alloca we're promoting (for isInstInList).
+    this->AI = AI;
+    LoadAndStorePromoter::run(Insts);
+    AI->eraseFromParent();
+  }
+
+  // A load/store belongs to this promotion iff its pointer operand is the
+  // alloca currently being promoted.
+  virtual bool isInstInList(Instruction *I,
+                            const SmallVectorImpl<Instruction*> &Insts) const {
+    if (LoadInst *LI = dyn_cast<LoadInst>(I))
+      return LI->getOperand(0) == AI;
+    return cast<StoreInst>(I)->getPointerOperand() == AI;
+  }
+};
+} // end anon namespace
+
+/// isSafeSelectToSpeculate - Select instructions that use an alloca and are
+/// subsequently loaded can be rewritten to load both input pointers and then
+/// select between the result, allowing the load of the alloca to be promoted.
+/// From this:
+///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+///   %V = load i32* %P2
+/// to:
+///   %V1 = load i32* %Alloca      -> will be mem2reg'd
+///   %V2 = load i32* %Other
+///   %V = select i1 %cond, i32 %V1, i32 %V2
+///
+/// We can do this to a select if its only uses are loads and if the operand to
+/// the select can be loaded unconditionally.
+static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) {
+  bool TDerefable = SI->getTrueValue()->isDereferenceablePointer();
+  bool FDerefable = SI->getFalseValue()->isDereferenceablePointer();
+
+  for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end();
+       UI != UE; ++UI) {
+    // Every use must be a non-volatile load.
+    LoadInst *LI = dyn_cast<LoadInst>(*UI);
+    if (LI == 0 || LI->isVolatile()) return false;
+
+    // Both operands to the select need to be dereferencable, either absolutely
+    // (e.g. allocas) or at this point because we can see other accesses to it.
+    if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI,
+                                                    LI->getAlignment(), TD))
+      return false;
+    if (!FDerefable && !isSafeToLoadUnconditionally(SI->getFalseValue(), LI,
+                                                    LI->getAlignment(), TD))
+      return false;
+  }
+
+  return true;
+}
+
+/// isSafePHIToSpeculate - PHI instructions that use an alloca and are
+/// subsequently loaded can be rewritten to load both input pointers in the pred
+/// blocks and then PHI the results, allowing the load of the alloca to be
+/// promoted.
+/// From this:
+///   %P2 = phi [i32* %Alloca, i32* %Other]
+///   %V = load i32* %P2
+/// to:
+///   %V1 = load i32* %Alloca      -> will be mem2reg'd
+///   ...
+///   %V2 = load i32* %Other
+///   ...
+///   %V = phi [i32 %V1, i32 %V2]
+///
+/// We can do this to a select if its only uses are loads and if the operand to
+/// the select can be loaded unconditionally.
+static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) {
+  // For now, we can only do this promotion if the load is in the same block as
+  // the PHI, and if there are no stores between the phi and load.
+  // TODO: Allow recursive phi users.
+  // TODO: Allow stores.
+  BasicBlock *BB = PN->getParent();
+  unsigned MaxAlign = 0;
+  for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end();
+       UI != UE; ++UI) {
+    LoadInst *LI = dyn_cast<LoadInst>(*UI);
+    if (LI == 0 || LI->isVolatile()) return false;
+
+    // For now we only allow loads in the same block as the PHI.  This is a
+    // common case that happens when instcombine merges two loads through a PHI.
+    if (LI->getParent() != BB) return false;
+
+    // Ensure that there are no instructions between the PHI and the load that
+    // could store.
+    for (BasicBlock::iterator BBI = PN; &*BBI != LI; ++BBI)
+      if (BBI->mayWriteToMemory())
+        return false;
+
+    MaxAlign = std::max(MaxAlign, LI->getAlignment());
+  }
+
+  // Okay, we know that we have one or more loads in the same block as the PHI.
+  // We can transform this if it is safe to push the loads into the predecessor
+  // blocks.  The only thing to watch out for is that we can't put a possibly
+  // trapping load in the predecessor if it is a critical edge.
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    BasicBlock *Pred = PN->getIncomingBlock(i);
+
+    // If the predecessor has a single successor, then the edge isn't critical.
+    if (Pred->getTerminator()->getNumSuccessors() == 1)
+      continue;
+
+    Value *InVal = PN->getIncomingValue(i);
+
+    // If the InVal is an invoke in the pred, we can't put a load on the edge.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(InVal))
+      if (II->getParent() == Pred)
+        return false;
+
+    // If this pointer is always safe to load, or if we can prove that there is
+    // already a load in the block, then we can move the load to the pred block.
+    if (InVal->isDereferenceablePointer() ||
+        isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, TD))
+      continue;
+
+    return false;
+  }
+
+  return true;
+}
+
+
+/// tryToMakeAllocaBePromotable - This returns true if the alloca only has
+/// direct (non-volatile) loads and stores to it.  If the alloca is close but
+/// not quite there, this will transform the code to allow promotion.  As such,
+/// it is a non-pure predicate.
+static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
+  SetVector<Instruction*, SmallVector<Instruction*, 4>,
+            SmallPtrSet<Instruction*, 4> > InstsToRewrite;
+
+  // First pass: classify every use of the alloca; collect the selects/phis
+  // that must be rewritten before promotion can happen.
+  for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
+       UI != UE; ++UI) {
+    User *U = *UI;
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      if (LI->isVolatile())
+        return false;
+      continue;
+    }
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      if (SI->getOperand(0) == AI || SI->isVolatile())
+        return false;   // Don't allow a store OF the AI, only INTO the AI.
+      continue;
+    }
+
+    if (SelectInst *SI = dyn_cast<SelectInst>(U)) {
+      // If the condition being selected on is a constant, fold the select, yes
+      // this does (rarely) happen early on.
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition())) {
+        Value *Result = SI->getOperand(1+CI->isZero());
+        SI->replaceAllUsesWith(Result);
+        SI->eraseFromParent();
+
+        // This is very rare and we just scrambled the use list of AI, start
+        // over completely.
+        return tryToMakeAllocaBePromotable(AI, TD);
+      }
+
+      // If it is safe to turn "load (select c, AI, ptr)" into a select of two
+      // loads, then we can transform this by rewriting the select.
+      if (!isSafeSelectToSpeculate(SI, TD))
+        return false;
+
+      InstsToRewrite.insert(SI);
+      continue;
+    }
+
+    if (PHINode *PN = dyn_cast<PHINode>(U)) {
+      if (PN->use_empty()) {  // Dead PHIs can be stripped.
+        InstsToRewrite.insert(PN);
+        continue;
+      }
+
+      // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads
+      // in the pred blocks, then we can transform this by rewriting the PHI.
+      if (!isSafePHIToSpeculate(PN, TD))
+        return false;
+
+      InstsToRewrite.insert(PN);
+      continue;
+    }
+
+    // Any other kind of use blocks promotion.
+    return false;
+  }
+
+  // If there are no instructions to rewrite, then all uses are load/stores and
+  // we're done!
+  if (InstsToRewrite.empty())
+    return true;
+
+  // If we have instructions that need to be rewritten for this to be promotable
+  // take care of it now.
+  for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) {
+    if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) {
+      // Selects in InstsToRewrite only have load uses.  Rewrite each as two
+      // loads with a new select.
+      while (!SI->use_empty()) {
+        LoadInst *LI = cast<LoadInst>(SI->use_back());
+
+        IRBuilder<> Builder(LI);
+        LoadInst *TrueLoad =
+          Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t");
+        LoadInst *FalseLoad =
+          Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".t");
+
+        // Transfer alignment and TBAA info if present.
+        TrueLoad->setAlignment(LI->getAlignment());
+        FalseLoad->setAlignment(LI->getAlignment());
+        if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
+          TrueLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
+          FalseLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
+        }
+
+        Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad);
+        V->takeName(LI);
+        LI->replaceAllUsesWith(V);
+        LI->eraseFromParent();
+      }
+
+      // Now that all the loads are gone, the select is gone too.
+      SI->eraseFromParent();
+      continue;
+    }
+
+    // Otherwise, we have a PHI node which allows us to push the loads into the
+    // predecessors.
+    PHINode *PN = cast<PHINode>(InstsToRewrite[i]);
+    if (PN->use_empty()) {
+      PN->eraseFromParent();
+      continue;
+    }
+
+    const Type *LoadTy = cast<PointerType>(PN->getType())->getElementType();
+    PHINode *NewPN = PHINode::Create(LoadTy, PN->getName()+".ld", PN);
+
+    // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
+    // matter which one we get and if any differ, it doesn't matter.
+    LoadInst *SomeLoad = cast<LoadInst>(PN->use_back());
+    MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
+    unsigned Align = SomeLoad->getAlignment();
+
+    // Rewrite all loads of the PN to use the new PHI.
+    while (!PN->use_empty()) {
+      LoadInst *LI = cast<LoadInst>(PN->use_back());
+      LI->replaceAllUsesWith(NewPN);
+      LI->eraseFromParent();
+    }
+
+    // Inject loads into all of the pred blocks.  Keep track of which blocks we
+    // insert them into in case we have multiple edges from the same block.
+    DenseMap<BasicBlock*, LoadInst*> InsertedLoads;
+
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      BasicBlock *Pred = PN->getIncomingBlock(i);
+      LoadInst *&Load = InsertedLoads[Pred];
+      if (Load == 0) {
+        Load = new LoadInst(PN->getIncomingValue(i),
+                            PN->getName() + "." + Pred->getName(),
+                            Pred->getTerminator());
+        Load->setAlignment(Align);
+        if (TBAATag) Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+      }
+
+      NewPN->addIncoming(Load, Pred);
+    }
+
+    PN->eraseFromParent();
+  }
+
+  ++NumAdjusted;
+  return true;
+}
+
+
+// performPromotion - Promote the promotable allocas in the entry block to SSA
+// registers: via PromoteMemToReg when a DominatorTree is available, otherwise
+// via SSAUpdater-based AllocaPromoter.  Repeats until no more allocas qualify.
+bool SROA::performPromotion(Function &F) {
+  std::vector<AllocaInst*> Allocas;
+  DominatorTree *DT = 0;
+  if (HasDomTree)
+    DT = &getAnalysis<DominatorTree>();
+
+  BasicBlock &BB = F.getEntryBlock();  // Get the entry node for the function
+
+  bool Changed = false;
+  SmallVector<Instruction*, 64> Insts;
+  while (1) {
+    Allocas.clear();
+
+    // Find allocas that are safe to promote, by looking at all instructions in
+    // the entry node
+    for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(I))       // Is it an alloca?
+        if (tryToMakeAllocaBePromotable(AI, TD))
+          Allocas.push_back(AI);
+
+    if (Allocas.empty()) break;
+
+    if (HasDomTree)
+      PromoteMemToReg(Allocas, *DT);
+    else {
+      SSAUpdater SSA;
+      for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
+        AllocaInst *AI = Allocas[i];
+
+        // Build list of instructions to promote.
+        for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+             UI != E; ++UI)
+          Insts.push_back(cast<Instruction>(*UI));
+
+        AllocaPromoter(Insts, SSA).run(AI, Insts);
+        Insts.clear();
+      }
+    }
+    NumPromoted += Allocas.size();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+
+/// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for
+/// SROA. It must be a struct or array type with a small number of elements.
+static bool ShouldAttemptScalarRepl(AllocaInst *AI) {
+  const Type *T = AI->getAllocatedType();
+  // Do not promote any struct into more than 32 separate vars.
+  if (const StructType *ST = dyn_cast<StructType>(T))
+    return ST->getNumElements() <= 32;
+  // Arrays are much less likely to be safe for SROA; only consider
+  // them if they are very small.
+  if (const ArrayType *AT = dyn_cast<ArrayType>(T))
+    return AT->getNumElements() <= 8;
+  return false;
+}
+
+
+// performScalarRepl - This algorithm is a simple worklist driven algorithm,
+// which runs on all of the malloc/alloca instructions in the function, removing
+// them if they are only used by getelementptr instructions.
+//
+bool SROA::performScalarRepl(Function &F) {
+  std::vector<AllocaInst*> WorkList;
+
+  // Scan the entry basic block, adding allocas to the worklist.
+  BasicBlock &BB = F.getEntryBlock();
+  for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I)
+    if (AllocaInst *A = dyn_cast<AllocaInst>(I))
+      WorkList.push_back(A);
+
+  // Process the worklist
+  bool Changed = false;
+  while (!WorkList.empty()) {
+    AllocaInst *AI = WorkList.back();
+    WorkList.pop_back();
+
+    // Handle dead allocas trivially.  These can be formed by SROA'ing arrays
+    // with unused elements.
+    if (AI->use_empty()) {
+      AI->eraseFromParent();
+      Changed = true;
+      continue;
+    }
+
+    // If this alloca is impossible for us to promote, reject it early.
+    if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized())
+      continue;
+
+    // Check to see if this allocation is only modified by a memcpy/memmove from
+    // a constant global.  If this is the case, we can change all users to use
+    // the constant global instead.  This is commonly produced by the CFE by
+    // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
+    // is only subsequently read.
+    if (MemTransferInst *TheCopy = isOnlyCopiedFromConstantGlobal(AI)) {
+      DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n');
+      DEBUG(dbgs() << "  memcpy = " << *TheCopy << '\n');
+      Constant *TheSrc = cast<Constant>(TheCopy->getSource());
+      AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType()));
+      TheCopy->eraseFromParent();  // Don't mutate the global.
+      AI->eraseFromParent();
+      ++NumGlobals;
+      Changed = true;
+      continue;
+    }
+
+    // Check to see if we can perform the core SROA transformation.  We cannot
+    // transform the allocation instruction if it is an array allocation
+    // (allocations OF arrays are ok though), and an allocation of a scalar
+    // value cannot be decomposed at all.
+    uint64_t AllocaSize = TD->getTypeAllocSize(AI->getAllocatedType());
+
+    // Do not promote [0 x %struct].
+    if (AllocaSize == 0) continue;
+
+    // Do not promote any struct whose size is too big.
+    if (AllocaSize > SRThreshold) continue;
+
+    // If the alloca looks like a good candidate for scalar replacement, and if
+    // all its users can be transformed, then split up the aggregate into its
+    // separate elements.
+    if (ShouldAttemptScalarRepl(AI) && isSafeAllocaToScalarRepl(AI)) {
+      DoScalarReplacement(AI, WorkList);
+      Changed = true;
+      continue;
+    }
+
+    // If we can turn this aggregate value (potentially with casts) into a
+    // simple scalar value that can be mem2reg'd into a register value.
+    // IsNotTrivial tracks whether this is something that mem2reg could have
+    // promoted itself.  If so, we don't want to transform it needlessly.  Note
+    // that we can't just check based on the type: the alloca may be of an i32
+    // but that has pointer arithmetic to set byte 3 of it or something.
+    if (AllocaInst *NewAI =
+          ConvertToScalarInfo((unsigned)AllocaSize, *TD).TryConvert(AI)) {
+      NewAI->takeName(AI);
+      AI->eraseFromParent();
+      ++NumConverted;
+      Changed = true;
+      continue;
+    }
+
+    // Otherwise, couldn't process this alloca.
+  }
+
+  return Changed;
+}
+
+/// DoScalarReplacement - This alloca satisfied the isSafeAllocaToScalarRepl
+/// predicate, do SROA now.
+// Creates one new alloca per struct field / array element, rewrites every use
+// of the old aggregate alloca to the element allocas, then deletes the old one.
+void SROA::DoScalarReplacement(AllocaInst *AI,
+                               std::vector<AllocaInst*> &WorkList) {
+  DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n');
+  SmallVector<AllocaInst*, 32> ElementAllocas;
+  if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
+    ElementAllocas.reserve(ST->getNumContainedTypes());
+    for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) {
+      AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0,
+                                      AI->getAlignment(),
+                                      AI->getName() + "." + Twine(i), AI);
+      ElementAllocas.push_back(NA);
+      WorkList.push_back(NA);  // Add to worklist for recursive processing
+    }
+  } else {
+    const ArrayType *AT = cast<ArrayType>(AI->getAllocatedType());
+    ElementAllocas.reserve(AT->getNumElements());
+    const Type *ElTy = AT->getElementType();
+    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+      AllocaInst *NA = new AllocaInst(ElTy, 0, AI->getAlignment(),
+                                      AI->getName() + "." + Twine(i), AI);
+      ElementAllocas.push_back(NA);
+      WorkList.push_back(NA);  // Add to worklist for recursive processing
+    }
+  }
+
+  // Now that we have created the new alloca instructions, rewrite all the
+  // uses of the old alloca.
+  RewriteForScalarRepl(AI, AI, 0, ElementAllocas);
+
+  // Now erase any instructions that were made dead while rewriting the alloca.
+  DeleteDeadInstructions();
+  AI->eraseFromParent();
+
+  ++NumReplaced;
+}
+
+/// DeleteDeadInstructions - Erase instructions on the DeadInstrs list,
+/// recursively including all their operands that become trivially dead.
+void SROA::DeleteDeadInstructions() {
+  while (!DeadInsts.empty()) {
+    Instruction *I = cast<Instruction>(DeadInsts.pop_back_val());
+
+    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
+      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
+        // Zero out the operand and see if it becomes trivially dead.
+        // (But, don't add allocas to the dead instruction list -- they are
+        // already on the worklist and will be deleted separately.)
+        *OI = 0;
+        if (isInstructionTriviallyDead(U) && !isa<AllocaInst>(U))
+          DeadInsts.push_back(U);
+      }
+
+    I->eraseFromParent();
+  }
+}
+
+/// isSafeForScalarRepl - Check if instruction I is a safe use with regard to
+/// performing scalar replacement of alloca AI.  The results are flagged in
+/// the Info parameter.  Offset indicates the position within AI that is
+/// referenced by this instruction.
+void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
+                               AllocaInfo &Info) {
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+
+    if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
+      // Bitcasts don't change the offset; recurse into their users.
+      isSafeForScalarRepl(BC, Offset, Info);
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+      uint64_t GEPOffset = Offset;
+      isSafeGEP(GEPI, GEPOffset, Info);
+      if (!Info.isUnsafe)
+        isSafeForScalarRepl(GEPI, GEPOffset, Info);
+    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
+      // Only constant-length mem intrinsics can be checked.
+      ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
+      if (Length == 0)
+        return MarkUnsafe(Info, User);
+      // Operand 0 of a mem intrinsic is the destination pointer, so using the
+      // alloca there makes this a store-like access.
+      isSafeMemAccess(Offset, Length->getZExtValue(), 0,
+                      UI.getOperandNo() == 0, Info, MI,
+                      true /*AllowWholeAccess*/);
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      if (LI->isVolatile())
+        return MarkUnsafe(Info, User);
+      const Type *LIType = LI->getType();
+      isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
+                      LIType, false, Info, LI, true /*AllowWholeAccess*/);
+      Info.hasALoadOrStore = true;
+
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      // Store is ok if storing INTO the pointer, not storing the pointer
+      if (SI->isVolatile() || SI->getOperand(0) == I)
+        return MarkUnsafe(Info, User);
+
+      const Type *SIType = SI->getOperand(0)->getType();
+      isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
+                      SIType, true, Info, SI, true /*AllowWholeAccess*/);
+      Info.hasALoadOrStore = true;
+    } else if (isa<PHINode>(User) || isa<SelectInst>(User)) {
+      isSafePHISelectUseForScalarRepl(User, Offset, Info);
+    } else {
+      return MarkUnsafe(Info, User);
+    }
+    if (Info.isUnsafe) return;
+  }
+}
+
+
+/// isSafePHIUseForScalarRepl - If we see a PHI node or select using a pointer
+/// derived from the alloca, we can often still split the alloca into elements.
+/// This is useful if we have a large alloca where one element is phi'd
+/// together somewhere: we can SRoA and promote all the other elements even if
+/// we end up not being able to promote this one.
+///
+/// All we require is that the uses of the PHI do not index into other parts of
+/// the alloca.  The most important use case for this is single load and stores
+/// that are PHI'd together, which can happen due to code sinking.
+void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
+                                           AllocaInfo &Info) {
+  // If we've already checked this PHI, don't do it again.
+  if (PHINode *PN = dyn_cast<PHINode>(I))
+    if (!Info.CheckedPHIs.insert(PN))
+      return;
+
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+
+    if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
+      isSafePHISelectUseForScalarRepl(BC, Offset, Info);
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+      // Only allow "bitcast" GEPs for simplicity.  We could generalize this,
+      // but would have to prove that we're staying inside of an element being
+      // promoted.
+      if (!GEPI->hasAllZeroIndices())
+        return MarkUnsafe(Info, User);
+      isSafePHISelectUseForScalarRepl(GEPI, Offset, Info);
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      if (LI->isVolatile())
+        return MarkUnsafe(Info, User);
+      const Type *LIType = LI->getType();
+      // Note: unlike isSafeForScalarRepl, whole-alloca accesses are NOT
+      // allowed through a PHI/select (AllowWholeAccess is false).
+      isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
+                      LIType, false, Info, LI, false /*AllowWholeAccess*/);
+      Info.hasALoadOrStore = true;
+
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      // Store is ok if storing INTO the pointer, not storing the pointer
+      if (SI->isVolatile() || SI->getOperand(0) == I)
+        return MarkUnsafe(Info, User);
+
+      const Type *SIType = SI->getOperand(0)->getType();
+      isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
+                      SIType, true, Info, SI, false /*AllowWholeAccess*/);
+      Info.hasALoadOrStore = true;
+    } else if (isa<PHINode>(User) || isa<SelectInst>(User)) {
+      isSafePHISelectUseForScalarRepl(User, Offset, Info);
+    } else {
+      return MarkUnsafe(Info, User);
+    }
+    if (Info.isUnsafe) return;
+  }
+}
+
+/// isSafeGEP - Check if a GEP instruction can be handled for scalar
+/// replacement.  It is safe when all the indices are constant, in-bounds
+/// references, and when the resulting offset corresponds to an element within
+/// the alloca type.  The results are flagged in the Info parameter.  Upon
+/// return, Offset is adjusted as specified by the GEP indices.
+void SROA::isSafeGEP(GetElementPtrInst *GEPI,
+                     uint64_t &Offset, AllocaInfo &Info) {
+  gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI);
+  if (GEPIt == E)
+    return;
+
+  // Walk through the GEP type indices, checking the types that this indexes
+  // into.
+  for (; GEPIt != E; ++GEPIt) {
+    // Ignore struct elements, no extra checking needed for these.
+    if ((*GEPIt)->isStructTy())
+      continue;
+
+    // Array/vector indices must be constants for the offset to be computable.
+    ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand());
+    if (!IdxVal)
+      return MarkUnsafe(Info, GEPI);
+  }
+
+  // Compute the offset due to this GEP and check if the alloca has a
+  // component element at that offset.
+  SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
+  Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(),
+                                 &Indices[0], Indices.size());
+  if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, 0))
+    MarkUnsafe(Info, GEPI);
+}
+
+/// isHomogeneousAggregate - Check if type T is a struct or array containing
+/// elements of the same type (which is always true for arrays).  If so,
+/// return true with NumElts and EltTy set to the number of elements and the
+/// element type, respectively.
+static bool isHomogeneousAggregate(const Type *T, unsigned &NumElts,
+                                   const Type *&EltTy) {
+  if (const ArrayType *AT = dyn_cast<ArrayType>(T)) {
+    NumElts = AT->getNumElements();
+    EltTy = (NumElts == 0 ? 0 : AT->getElementType());
+    return true;
+  }
+  if (const StructType *ST = dyn_cast<StructType>(T)) {
+    NumElts = ST->getNumContainedTypes();
+    EltTy = (NumElts == 0 ? 0 : ST->getContainedType(0));
+    // Every field must match the first field's type.
+    for (unsigned n = 1; n < NumElts; ++n) {
+      if (ST->getContainedType(n) != EltTy)
+        return false;
+    }
+    return true;
+  }
+  return false;
+}
+
+/// isCompatibleAggregate - Check if T1 and T2 are either the same type or are
+/// "homogeneous" aggregates with the same element type and number of elements.
+static bool isCompatibleAggregate(const Type *T1, const Type *T2) { + if (T1 == T2) + return true; + + unsigned NumElts1, NumElts2; + const Type *EltTy1, *EltTy2; + if (isHomogeneousAggregate(T1, NumElts1, EltTy1) && + isHomogeneousAggregate(T2, NumElts2, EltTy2) && + NumElts1 == NumElts2 && + EltTy1 == EltTy2) + return true; + + return false; +} + +/// isSafeMemAccess - Check if a load/store/memcpy operates on the entire AI +/// alloca or has an offset and size that corresponds to a component element +/// within it. The offset checked here may have been formed from a GEP with a +/// pointer bitcasted to a different type. +/// +/// If AllowWholeAccess is true, then this allows uses of the entire alloca as a +/// unit. If false, it only allows accesses known to be in a single element. +void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, + const Type *MemOpType, bool isStore, + AllocaInfo &Info, Instruction *TheAccess, + bool AllowWholeAccess) { + // Check if this is a load/store of the entire alloca. + if (Offset == 0 && AllowWholeAccess && + MemSize == TD->getTypeAllocSize(Info.AI->getAllocatedType())) { + // This can be safe for MemIntrinsics (where MemOpType is 0) and integer + // loads/stores (which are essentially the same as the MemIntrinsics with + // regard to copying padding between elements). But, if an alloca is + // flagged as both a source and destination of such operations, we'll need + // to check later for padding between elements. + if (!MemOpType || MemOpType->isIntegerTy()) { + if (isStore) + Info.isMemCpyDst = true; + else + Info.isMemCpySrc = true; + return; + } + // This is also safe for references using a type that is compatible with + // the type of the alloca, so that loads/stores can be rewritten using + // insertvalue/extractvalue. 
+    if (isCompatibleAggregate(MemOpType, Info.AI->getAllocatedType())) {
+      Info.hasSubelementAccess = true;
+      return;
+    }
+  }
+  // Check if the offset/size correspond to a component within the alloca type.
+  const Type *T = Info.AI->getAllocatedType();
+  if (TypeHasComponent(T, Offset, MemSize)) {
+    Info.hasSubelementAccess = true;
+    return;
+  }
+
+  // Anything else (partial, overlapping, or out-of-bounds access) blocks SROA.
+  return MarkUnsafe(Info, TheAccess);
+}
+
+/// TypeHasComponent - Return true if T has a component type with the
+/// specified offset and size.  If Size is zero, do not check the size.
+/// Recurses through nested aggregates until the (Offset, Size) pair either
+/// lands exactly on an element or provably straddles one.
+bool SROA::TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size) {
+  const Type *EltTy;
+  uint64_t EltSize;
+  if (const StructType *ST = dyn_cast<StructType>(T)) {
+    const StructLayout *Layout = TD->getStructLayout(ST);
+    unsigned EltIdx = Layout->getElementContainingOffset(Offset);
+    EltTy = ST->getContainedType(EltIdx);
+    EltSize = TD->getTypeAllocSize(EltTy);
+    // Rebase the offset relative to the containing field.
+    Offset -= Layout->getElementOffset(EltIdx);
+  } else if (const ArrayType *AT = dyn_cast<ArrayType>(T)) {
+    EltTy = AT->getElementType();
+    EltSize = TD->getTypeAllocSize(EltTy);
+    if (Offset >= AT->getNumElements() * EltSize)
+      return false;
+    // Rebase the offset relative to the containing array element.
+    Offset %= EltSize;
+  } else {
+    // Scalars have no sub-components.
+    return false;
+  }
+  if (Offset == 0 && (Size == 0 || EltSize == Size))
+    return true;
+  // Check if the component spans multiple elements.
+  if (Offset + Size > EltSize)
+    return false;
+  return TypeHasComponent(EltTy, Offset, Size);
+}
+
+/// RewriteForScalarRepl - Alloca AI is being split into NewElts, so rewrite
+/// the instruction I, which references it, to use the separate elements.
+/// Offset indicates the position within AI that is referenced by this
+/// instruction.
+void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
+                                SmallVector<AllocaInst*, 32> &NewElts) {
+  // Walk every use of I (initially the alloca itself; recursively the
+  // bitcasts and GEPs derived from it) and rewrite each user to reference
+  // the per-element allocas in NewElts instead.
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) {
+    Use &TheUse = UI.getUse();
+    // Advance before rewriting: the rewrite may delete or replace this use.
+    Instruction *User = cast<Instruction>(*UI++);
+
+    if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
+      RewriteBitCast(BC, AI, Offset, NewElts);
+      continue;
+    }
+
+    if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+      RewriteGEP(GEPI, AI, Offset, NewElts);
+      continue;
+    }
+
+    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
+      // The earlier safety analysis rejects mem intrinsics with a
+      // non-constant length (MarkUnsafe), so the length is expected to be a
+      // ConstantInt by the time we rewrite.  Use cast<> (which asserts on a
+      // violated invariant) rather than dereferencing an unchecked
+      // dyn_cast<> result, which would be a null dereference.
+      ConstantInt *Length = cast<ConstantInt>(MI->getLength());
+      uint64_t MemSize = Length->getZExtValue();
+      if (Offset == 0 &&
+          MemSize == TD->getTypeAllocSize(AI->getAllocatedType()))
+        RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts);
+      // Otherwise the intrinsic can only touch a single element and the
+      // address operand will be updated, so nothing else needs to be done.
+      continue;
+    }
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      const Type *LIType = LI->getType();
+
+      if (isCompatibleAggregate(LIType, AI->getAllocatedType())) {
+        // Replace:
+        //   %res = load { i32, i32 }* %alloc
+        // with:
+        //   %load.0 = load i32* %alloc.0
+        //   %insert.0 insertvalue { i32, i32 } zeroinitializer, i32 %load.0, 0
+        //   %load.1 = load i32* %alloc.1
+        //   %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1
+        // (Also works for arrays instead of structs)
+        Value *Insert = UndefValue::get(LIType);
+        for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+          Value *Load = new LoadInst(NewElts[i], "load", LI);
+          Insert = InsertValueInst::Create(Insert, Load, i, "insert", LI);
+        }
+        LI->replaceAllUsesWith(Insert);
+        DeadInsts.push_back(LI);
+      } else if (LIType->isIntegerTy() &&
+                 TD->getTypeAllocSize(LIType) ==
+                 TD->getTypeAllocSize(AI->getAllocatedType())) {
+        // If this is a load of the entire alloca to an integer, rewrite it.
+        RewriteLoadUserOfWholeAlloca(LI, AI, NewElts);
+      }
+      continue;
+    }
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      Value *Val = SI->getOperand(0);
+      const Type *SIType = Val->getType();
+      if (isCompatibleAggregate(SIType, AI->getAllocatedType())) {
+        // Replace:
+        //   store { i32, i32 } %val, { i32, i32 }* %alloc
+        // with:
+        //   %val.0 = extractvalue { i32, i32 } %val, 0
+        //   store i32 %val.0, i32* %alloc.0
+        //   %val.1 = extractvalue { i32, i32 } %val, 1
+        //   store i32 %val.1, i32* %alloc.1
+        // (Also works for arrays instead of structs)
+        for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+          Value *Extract = ExtractValueInst::Create(Val, i, Val->getName(), SI);
+          new StoreInst(Extract, NewElts[i], SI);
+        }
+        DeadInsts.push_back(SI);
+      } else if (SIType->isIntegerTy() &&
+                 TD->getTypeAllocSize(SIType) ==
+                 TD->getTypeAllocSize(AI->getAllocatedType())) {
+        // If this is a store of the entire alloca from an integer, rewrite it.
+        RewriteStoreUserOfWholeAlloca(SI, AI, NewElts);
+      }
+      continue;
+    }
+
+    if (isa<SelectInst>(User) || isa<PHINode>(User)) {
+      // If we have a PHI user of the alloca itself (as opposed to a GEP or
+      // bitcast) we have to rewrite it.  GEP and bitcast uses will be RAUW'd to
+      // the new pointer.
+      if (!isa<AllocaInst>(I)) continue;
+
+      assert(Offset == 0 && NewElts[0] &&
+             "Direct alloca use should have a zero offset");
+
+      // If we have a use of the alloca, we know the derived uses will be
+      // utilizing just the first element of the scalarized result.  Insert a
+      // bitcast of the first alloca before the user as required.
+      AllocaInst *NewAI = NewElts[0];
+      BitCastInst *BCI = new BitCastInst(NewAI, AI->getType(), "", NewAI);
+      NewAI->moveBefore(BCI);
+      TheUse = BCI;
+      continue;
+    }
+  }
+}
+
+/// RewriteBitCast - Update a bitcast reference to the alloca being replaced
+/// and recursively continue updating all of its uses.
+void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
+                          SmallVector<AllocaInst*, 32> &NewElts) {
+  // First rewrite everything reached through this bitcast.
+  RewriteForScalarRepl(BC, AI, Offset, NewElts);
+  if (BC->getOperand(0) != AI)
+    return;
+
+  // The bitcast references the original alloca.  Replace its uses with
+  // references to the first new element alloca.
+  Instruction *Val = NewElts[0];
+  if (Val->getType() != BC->getDestTy()) {
+    Val = new BitCastInst(Val, BC->getDestTy(), "", BC);
+    Val->takeName(BC);
+  }
+  BC->replaceAllUsesWith(Val);
+  DeadInsts.push_back(BC);
+}
+
+/// FindElementAndOffset - Return the index of the element containing Offset
+/// within the specified type, which must be either a struct or an array.
+/// Sets T to the type of the element and Offset to the offset within that
+/// element.  IdxTy is set to the type of the index result to be used in a
+/// GEP instruction.
+uint64_t SROA::FindElementAndOffset(const Type *&T, uint64_t &Offset,
+                                    const Type *&IdxTy) {
+  uint64_t Idx = 0;
+  if (const StructType *ST = dyn_cast<StructType>(T)) {
+    const StructLayout *Layout = TD->getStructLayout(ST);
+    Idx = Layout->getElementContainingOffset(Offset);
+    T = ST->getContainedType(Idx);
+    Offset -= Layout->getElementOffset(Idx);
+    // Struct GEP indices are i32 constants.
+    IdxTy = Type::getInt32Ty(T->getContext());
+    return Idx;
+  }
+  const ArrayType *AT = cast<ArrayType>(T);
+  T = AT->getElementType();
+  uint64_t EltSize = TD->getTypeAllocSize(T);
+  Idx = Offset / EltSize;
+  Offset -= Idx * EltSize;
+  // Array GEP indices are conventionally i64.
+  IdxTy = Type::getInt64Ty(T->getContext());
+  return Idx;
+}
+
+/// RewriteGEP - Check if this GEP instruction moves the pointer across
+/// elements of the alloca that are being split apart, and if so, rewrite
+/// the GEP to be relative to the new element.
+void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
+                      SmallVector<AllocaInst*, 32> &NewElts) {
+  uint64_t OldOffset = Offset;
+  SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
+  Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(),
+                                 &Indices[0], Indices.size());
+
+  // Rewrite everything reached through this GEP first, at the new offset.
+  RewriteForScalarRepl(GEPI, AI, Offset, NewElts);
+
+  const Type *T = AI->getAllocatedType();
+  const Type *IdxTy;
+  uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy);
+  if (GEPI->getOperand(0) == AI)
+    OldIdx = ~0ULL; // Force the GEP to be rewritten.
+
+  T = AI->getAllocatedType();
+  uint64_t EltOffset = Offset;
+  uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy);
+
+  // If this GEP does not move the pointer across elements of the alloca
+  // being split, then it does not need to be rewritten.
+  if (Idx == OldIdx)
+    return;
+
+  const Type *i32Ty = Type::getInt32Ty(AI->getContext());
+  SmallVector<Value*, 8> NewArgs;
+  NewArgs.push_back(Constant::getNullValue(i32Ty));
+  // Descend through nested aggregates, emitting one index per level, until
+  // the residual offset within the element reaches zero.
+  while (EltOffset != 0) {
+    uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy);
+    NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx));
+  }
+  Instruction *Val = NewElts[Idx];
+  if (NewArgs.size() > 1) {
+    Val = GetElementPtrInst::CreateInBounds(Val, NewArgs.begin(),
+                                            NewArgs.end(), "", GEPI);
+    Val->takeName(GEPI);
+  }
+  if (Val->getType() != GEPI->getType())
+    Val = new BitCastInst(Val, GEPI->getType(), Val->getName(), GEPI);
+  GEPI->replaceAllUsesWith(Val);
+  DeadInsts.push_back(GEPI);
+}
+
+/// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI.
+/// Rewrite it to copy or set the elements of the scalarized memory.
+void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
+                                        AllocaInst *AI,
+                                        SmallVector<AllocaInst*, 32> &NewElts) {
+  // If this is a memcpy/memmove, construct the other pointer as the
+  // appropriate type.  The "Other" pointer is the pointer that goes to memory
+  // that doesn't have anything to do with the alloca that we are promoting. For
+  // memset, this Value* stays null.
+  Value *OtherPtr = 0;
+  unsigned MemAlignment = MI->getAlignment();
+  if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcpy
+    if (Inst == MTI->getRawDest())
+      OtherPtr = MTI->getRawSource();
+    else {
+      assert(Inst == MTI->getRawSource());
+      OtherPtr = MTI->getRawDest();
+    }
+  }
+
+  // If there is an other pointer, we want to convert it to the same pointer
+  // type as AI has, so we can GEP through it safely.
+  if (OtherPtr) {
+    unsigned AddrSpace =
+      cast<PointerType>(OtherPtr->getType())->getAddressSpace();
+
+    // Remove bitcasts and all-zero GEPs from OtherPtr.  This is an
+    // optimization, but it's also required to detect the corner case where
+    // both pointer operands are referencing the same memory, and where
+    // OtherPtr may be a bitcast or GEP that is currently being rewritten.
+    // (This function is only called for mem intrinsics that access the whole
+    // aggregate, so non-zero GEPs are not an issue here.)
+    OtherPtr = OtherPtr->stripPointerCasts();
+
+    // Copying the alloca to itself is a no-op: just delete it.
+    if (OtherPtr == AI || OtherPtr == NewElts[0]) {
+      // This code will run twice for a no-op memcpy -- once for each operand.
+      // Put only one reference to MI on the DeadInsts list.
+      for (SmallVector<Value*, 32>::const_iterator I = DeadInsts.begin(),
+             E = DeadInsts.end(); I != E; ++I)
+        if (*I == MI) return;
+      DeadInsts.push_back(MI);
+      return;
+    }
+
+    // If the pointer is not the right type, insert a bitcast to the right
+    // type.
+    const Type *NewTy =
+      PointerType::get(AI->getType()->getElementType(), AddrSpace);
+
+    if (OtherPtr->getType() != NewTy)
+      OtherPtr = new BitCastInst(OtherPtr, NewTy, OtherPtr->getName(), MI);
+  }
+
+  // Process each element of the aggregate.
+ bool SROADest = MI->getRawDest() == Inst; + + Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext())); + + for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { + // If this is a memcpy/memmove, emit a GEP of the other element address. + Value *OtherElt = 0; + unsigned OtherEltAlign = MemAlignment; + + if (OtherPtr) { + Value *Idx[2] = { Zero, + ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) }; + OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, Idx + 2, + OtherPtr->getName()+"."+Twine(i), + MI); + uint64_t EltOffset; + const PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType()); + const Type *OtherTy = OtherPtrTy->getElementType(); + if (const StructType *ST = dyn_cast<StructType>(OtherTy)) { + EltOffset = TD->getStructLayout(ST)->getElementOffset(i); + } else { + const Type *EltTy = cast<SequentialType>(OtherTy)->getElementType(); + EltOffset = TD->getTypeAllocSize(EltTy)*i; + } + + // The alignment of the other pointer is the guaranteed alignment of the + // element, which is affected by both the known alignment of the whole + // mem intrinsic and the alignment of the element. If the alignment of + // the memcpy (f.e.) is 32 but the element is at a 4-byte offset, then the + // known alignment is just 4 bytes. + OtherEltAlign = (unsigned)MinAlign(OtherEltAlign, EltOffset); + } + + Value *EltPtr = NewElts[i]; + const Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType(); + + // If we got down to a scalar, insert a load or store as appropriate. + if (EltTy->isSingleValueType()) { + if (isa<MemTransferInst>(MI)) { + if (SROADest) { + // From Other to Alloca. + Value *Elt = new LoadInst(OtherElt, "tmp", false, OtherEltAlign, MI); + new StoreInst(Elt, EltPtr, MI); + } else { + // From Alloca to Other. 
+ Value *Elt = new LoadInst(EltPtr, "tmp", MI); + new StoreInst(Elt, OtherElt, false, OtherEltAlign, MI); + } + continue; + } + assert(isa<MemSetInst>(MI)); + + // If the stored element is zero (common case), just store a null + // constant. + Constant *StoreVal; + if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getArgOperand(1))) { + if (CI->isZero()) { + StoreVal = Constant::getNullValue(EltTy); // 0.0, null, 0, <0,0> + } else { + // If EltTy is a vector type, get the element type. + const Type *ValTy = EltTy->getScalarType(); + + // Construct an integer with the right value. + unsigned EltSize = TD->getTypeSizeInBits(ValTy); + APInt OneVal(EltSize, CI->getZExtValue()); + APInt TotalVal(OneVal); + // Set each byte. + for (unsigned i = 0; 8*i < EltSize; ++i) { + TotalVal = TotalVal.shl(8); + TotalVal |= OneVal; + } + + // Convert the integer value to the appropriate type. + StoreVal = ConstantInt::get(CI->getContext(), TotalVal); + if (ValTy->isPointerTy()) + StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy); + else if (ValTy->isFloatingPointTy()) + StoreVal = ConstantExpr::getBitCast(StoreVal, ValTy); + assert(StoreVal->getType() == ValTy && "Type mismatch!"); + + // If the requested value was a vector constant, create it. + if (EltTy != ValTy) { + unsigned NumElts = cast<VectorType>(ValTy)->getNumElements(); + SmallVector<Constant*, 16> Elts(NumElts, StoreVal); + StoreVal = ConstantVector::get(Elts); + } + } + new StoreInst(StoreVal, EltPtr, MI); + continue; + } + // Otherwise, if we're storing a byte variable, use a memset call for + // this element. + } + + unsigned EltSize = TD->getTypeAllocSize(EltTy); + + IRBuilder<> Builder(MI); + + // Finally, insert the meminst for this element. + if (isa<MemSetInst>(MI)) { + Builder.CreateMemSet(EltPtr, MI->getArgOperand(1), EltSize, + MI->isVolatile()); + } else { + assert(isa<MemTransferInst>(MI)); + Value *Dst = SROADest ? EltPtr : OtherElt; // Dest ptr + Value *Src = SROADest ? 
OtherElt : EltPtr; // Src ptr + + if (isa<MemCpyInst>(MI)) + Builder.CreateMemCpy(Dst, Src, EltSize, OtherEltAlign,MI->isVolatile()); + else + Builder.CreateMemMove(Dst, Src, EltSize,OtherEltAlign,MI->isVolatile()); + } + } + DeadInsts.push_back(MI); +} + +/// RewriteStoreUserOfWholeAlloca - We found a store of an integer that +/// overwrites the entire allocation. Extract out the pieces of the stored +/// integer and store them individually. +void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, + SmallVector<AllocaInst*, 32> &NewElts){ + // Extract each element out of the integer according to its structure offset + // and store the element value to the individual alloca. + Value *SrcVal = SI->getOperand(0); + const Type *AllocaEltTy = AI->getAllocatedType(); + uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); + + IRBuilder<> Builder(SI); + + // Handle tail padding by extending the operand + if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) + SrcVal = Builder.CreateZExt(SrcVal, + IntegerType::get(SI->getContext(), AllocaSizeBits)); + + DEBUG(dbgs() << "PROMOTING STORE TO WHOLE ALLOCA: " << *AI << '\n' << *SI + << '\n'); + + // There are two forms here: AI could be an array or struct. Both cases + // have different ways to compute the element offset. + if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { + const StructLayout *Layout = TD->getStructLayout(EltSTy); + + for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { + // Get the number of bits to shift SrcVal to get the value. 
+      const Type *FieldTy = EltSTy->getElementType(i);
+      uint64_t Shift = Layout->getElementOffsetInBits(i);
+
+      // On big-endian targets, field 0 occupies the most significant bits.
+      if (TD->isBigEndian())
+        Shift = AllocaSizeBits-Shift-TD->getTypeAllocSizeInBits(FieldTy);
+
+      Value *EltVal = SrcVal;
+      if (Shift) {
+        Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
+        EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt");
+      }
+
+      // Truncate down to an integer of the right size.
+      uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy);
+
+      // Ignore zero sized fields like {}, they obviously contain no data.
+      if (FieldSizeBits == 0) continue;
+
+      if (FieldSizeBits != AllocaSizeBits)
+        EltVal = Builder.CreateTrunc(EltVal,
+                             IntegerType::get(SI->getContext(), FieldSizeBits));
+      Value *DestField = NewElts[i];
+      if (EltVal->getType() == FieldTy) {
+        // Storing to an integer field of this size, just do it.
+      } else if (FieldTy->isFloatingPointTy() || FieldTy->isVectorTy()) {
+        // Bitcast to the right element type (for fp/vector values).
+        EltVal = Builder.CreateBitCast(EltVal, FieldTy);
+      } else {
+        // Otherwise, bitcast the dest pointer (for aggregates).
+        DestField = Builder.CreateBitCast(DestField,
+                                     PointerType::getUnqual(EltVal->getType()));
+      }
+      new StoreInst(EltVal, DestField, SI);
+    }
+
+  } else {
+    const ArrayType *ATy = cast<ArrayType>(AllocaEltTy);
+    const Type *ArrayEltTy = ATy->getElementType();
+    // ElementOffset is the stride between elements in bits; ElementSizeBits
+    // is the number of meaningful bits in each element.
+    uint64_t ElementOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
+    uint64_t ElementSizeBits = TD->getTypeSizeInBits(ArrayEltTy);
+
+    uint64_t Shift;
+
+    // Big-endian: start at the most significant element and walk down.
+    if (TD->isBigEndian())
+      Shift = AllocaSizeBits-ElementOffset;
+    else
+      Shift = 0;
+
+    for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+      // Ignore zero sized fields like {}, they obviously contain no data.
+      if (ElementSizeBits == 0) continue;
+
+      Value *EltVal = SrcVal;
+      if (Shift) {
+        Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
+        EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt");
+      }
+
+      // Truncate down to an integer of the right size.
+      if (ElementSizeBits != AllocaSizeBits)
+        EltVal = Builder.CreateTrunc(EltVal,
+                                     IntegerType::get(SI->getContext(),
+                                                      ElementSizeBits));
+      Value *DestField = NewElts[i];
+      if (EltVal->getType() == ArrayEltTy) {
+        // Storing to an integer field of this size, just do it.
+      } else if (ArrayEltTy->isFloatingPointTy() ||
+                 ArrayEltTy->isVectorTy()) {
+        // Bitcast to the right element type (for fp/vector values).
+        EltVal = Builder.CreateBitCast(EltVal, ArrayEltTy);
+      } else {
+        // Otherwise, bitcast the dest pointer (for aggregates).
+        DestField = Builder.CreateBitCast(DestField,
+                                     PointerType::getUnqual(EltVal->getType()));
+      }
+      new StoreInst(EltVal, DestField, SI);
+
+      // Step the shift toward the next element (direction depends on
+      // endianness).
+      if (TD->isBigEndian())
+        Shift -= ElementOffset;
+      else
+        Shift += ElementOffset;
+    }
+  }
+
+  DeadInsts.push_back(SI);
+}
+
+/// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to
+/// an integer.  Load the individual pieces to form the aggregate value.
+void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
+                                        SmallVector<AllocaInst*, 32> &NewElts) {
+  // Extract each element out of the NewElts according to its structure offset
+  // and form the result value.
+  const Type *AllocaEltTy = AI->getAllocatedType();
+  uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
+
+  DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI
+               << '\n');
+
+  // There are two forms here: AI could be an array or struct.  Both cases
+  // have different ways to compute the element offset.
+  // Layout is non-null only for the struct case; ArrayEltBitOffset is the
+  // per-element bit stride for the array case.
+  const StructLayout *Layout = 0;
+  uint64_t ArrayEltBitOffset = 0;
+  if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
+    Layout = TD->getStructLayout(EltSTy);
+  } else {
+    const Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType();
+    ArrayEltBitOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
+  }
+
+  Value *ResultVal =
+    Constant::getNullValue(IntegerType::get(LI->getContext(), AllocaSizeBits));
+
+  for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+    // Load the value from the alloca.  If the NewElt is an aggregate, cast
+    // the pointer to an integer of the same size before doing the load.
+    Value *SrcField = NewElts[i];
+    const Type *FieldTy =
+      cast<PointerType>(SrcField->getType())->getElementType();
+    uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy);
+
+    // Ignore zero sized fields like {}, they obviously contain no data.
+    if (FieldSizeBits == 0) continue;
+
+    const IntegerType *FieldIntTy = IntegerType::get(LI->getContext(),
+                                                     FieldSizeBits);
+    if (!FieldTy->isIntegerTy() && !FieldTy->isFloatingPointTy() &&
+        !FieldTy->isVectorTy())
+      SrcField = new BitCastInst(SrcField,
+                                 PointerType::getUnqual(FieldIntTy),
+                                 "", LI);
+    SrcField = new LoadInst(SrcField, "sroa.load.elt", LI);
+
+    // If SrcField is a fp or vector of the right size but that isn't an
+    // integer type, bitcast to an integer so we can shift it.
+    if (SrcField->getType() != FieldIntTy)
+      SrcField = new BitCastInst(SrcField, FieldIntTy, "", LI);
+
+    // Zero extend the field to be the same size as the final alloca so that
+    // we can shift and insert it.
+    if (SrcField->getType() != ResultVal->getType())
+      SrcField = new ZExtInst(SrcField, ResultVal->getType(), "", LI);
+
+    // Determine the number of bits to shift SrcField.
+    uint64_t Shift;
+    if (Layout) // Struct case.
+      Shift = Layout->getElementOffsetInBits(i);
+    else // Array case.
+      Shift = i*ArrayEltBitOffset;
+
+    // On big-endian targets element 0 lives in the most significant bits.
+    if (TD->isBigEndian())
+      Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth();
+
+    if (Shift) {
+      Value *ShiftVal = ConstantInt::get(SrcField->getType(), Shift);
+      SrcField = BinaryOperator::CreateShl(SrcField, ShiftVal, "", LI);
+    }
+
+    // Don't create an 'or x, 0' on the first iteration.
+    if (!isa<Constant>(ResultVal) ||
+        !cast<Constant>(ResultVal)->isNullValue())
+      ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI);
+    else
+      ResultVal = SrcField;
+  }
+
+  // Handle tail padding by truncating the result
+  if (TD->getTypeSizeInBits(LI->getType()) != AllocaSizeBits)
+    ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI);
+
+  LI->replaceAllUsesWith(ResultVal);
+  DeadInsts.push_back(LI);
+}
+
+/// HasPadding - Return true if the specified type has any structure or
+/// alignment padding in between the elements that would be split apart
+/// by SROA; return false otherwise.
+static bool HasPadding(const Type *Ty, const TargetData &TD) {
+  if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    // For arrays, only the tail padding of each element matters: elements
+    // are rewritten as whole units, so only a size/alloc-size mismatch of
+    // the element type can lose bytes.
+    Ty = ATy->getElementType();
+    return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty);
+  }
+
+  // SROA currently handles only Arrays and Structs.
+  const StructType *STy = cast<StructType>(Ty);
+  const StructLayout *SL = TD.getStructLayout(STy);
+  unsigned PrevFieldBitOffset = 0;
+  for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+    unsigned FieldBitOffset = SL->getElementOffsetInBits(i);
+
+    // Check to see if there is any padding between this element and the
+    // previous one.
+    if (i) {
+      unsigned PrevFieldEnd =
+        PrevFieldBitOffset+TD.getTypeSizeInBits(STy->getElementType(i-1));
+      if (PrevFieldEnd < FieldBitOffset)
+        return true;
+    }
+    PrevFieldBitOffset = FieldBitOffset;
+  }
+  // Check for tail padding.
+  if (unsigned EltCount = STy->getNumElements()) {
+    unsigned PrevFieldEnd = PrevFieldBitOffset +
+                   TD.getTypeSizeInBits(STy->getElementType(EltCount-1));
+    if (PrevFieldEnd < SL->getSizeInBits())
+      return true;
+  }
+  return false;
+}
+
+/// isSafeAllocaToScalarRepl - Check to see if the specified allocation of
+/// an aggregate can be broken down into elements.  Return false if not, or
+/// true if it is safe to scalarize.
+bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
+  // Loop over the use list of the alloca.  We can only transform it if all of
+  // the users are safe to transform.
+  AllocaInfo Info(AI);
+
+  isSafeForScalarRepl(AI, 0, Info);
+  if (Info.isUnsafe) {
+    DEBUG(dbgs() << "Cannot transform: " << *AI << '\n');
+    return false;
+  }
+
+  // Okay, we know all the users are promotable.  If the aggregate is a memcpy
+  // source and destination, we have to be careful.  In particular, the memcpy
+  // could be moving around elements that live in structure padding of the LLVM
+  // types, but may actually be used.  In these cases, we refuse to promote the
+  // struct.
+  if (Info.isMemCpySrc && Info.isMemCpyDst &&
+      HasPadding(AI->getAllocatedType(), *TD))
+    return false;
+
+  // If the alloca never has an access to just *part* of it, but is accessed
+  // via loads and stores, then we should use ConvertToScalarInfo to promote
+  // the alloca instead of promoting each piece at a time and inserting fission
+  // and fusion code.
+  if (!Info.hasSubelementAccess && Info.hasALoadOrStore) {
+    // If the struct/array just has one element, use basic SRoA.
+    if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
+      if (ST->getNumElements() > 1) return false;
+    } else {
+      if (cast<ArrayType>(AI->getAllocatedType())->getNumElements() > 1)
+        return false;
+    }
+  }
+
+  return true;
+}
+
+
+
+/// PointsToConstantGlobal - Return true if V (possibly indirectly) points to
+/// some part of a constant global variable.  This intentionally only accepts
+/// constant expressions because we can't rewrite arbitrary instructions.
+static bool PointsToConstantGlobal(Value *V) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    return GV->isConstant();
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    if (CE->getOpcode() == Instruction::BitCast ||
+        CE->getOpcode() == Instruction::GetElementPtr)
+      return PointsToConstantGlobal(CE->getOperand(0));
+  return false;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
+/// pointer to an alloca.  Ignore any reads of the pointer, return false if we
+/// see any stores or other unknown uses.  If we see pointer arithmetic, keep
+/// track of whether it moves the pointer (with isOffset) but otherwise traverse
+/// the uses.  If we see a memcpy/memmove that targets an unoffseted pointer to
+/// the alloca, and if the source pointer is a pointer to a constant global, we
+/// can optimize this.
+static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
+                                           bool isOffset) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
+    // All users of an alloca-derived pointer are instructions (constants
+    // cannot reference instructions), so this cast<> cannot fail.
+    User *U = cast<Instruction>(*UI);
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      // Ignore non-volatile loads, they are always ok.
+      if (LI->isVolatile()) return false;
+      continue;
+    }
+
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+      // If uses of the bitcast are ok, we are ok.
+      if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, isOffset))
+        return false;
+      continue;
+    }
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+      // If the GEP has all zero indices, it doesn't offset the pointer.  If it
+      // doesn't, it does.
+      if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy,
+                                          isOffset || !GEP->hasAllZeroIndices()))
+        return false;
+      continue;
+    }
+
+    if (CallSite CS = U) {
+      // If this is a readonly/readnone call site, then we know it is just a
+      // load and we can ignore it.
+      if (CS.onlyReadsMemory())
+        continue;
+
+      // If this is the function being called then we treat it like a load and
+      // ignore it.
+      if (CS.isCallee(UI))
+        continue;
+
+      // If this is being passed as a byval argument, the caller is making a
+      // copy, so it is only a read of the alloca.
+      unsigned ArgNo = CS.getArgumentNo(UI);
+      if (CS.paramHasAttr(ArgNo+1, Attribute::ByVal))
+        continue;
+    }
+
+    // If this isn't our memcpy/memmove, reject it as something we can't
+    // handle.
+    MemTransferInst *MI = dyn_cast<MemTransferInst>(U);
+    if (MI == 0)
+      return false;
+
+    // If the transfer is using the alloca as a source of the transfer, then
+    // ignore it since it is a load (unless the transfer is volatile).
+    if (UI.getOperandNo() == 1) {
+      if (MI->isVolatile()) return false;
+      continue;
+    }
+
+    // If we already have seen a copy, reject the second one.
+    if (TheCopy) return false;
+
+    // If the pointer has been offset from the start of the alloca, we can't
+    // safely handle this.
+    if (isOffset) return false;
+
+    // If the memintrinsic isn't using the alloca as the dest, reject it.
+    if (UI.getOperandNo() != 0) return false;
+
+    // If the source of the memcpy/move is not a constant global, reject it.
+    if (!PointsToConstantGlobal(MI->getSource()))
+      return false;
+
+    // Otherwise, the transform is safe.  Remember the copy instruction.
+    TheCopy = MI;
+  }
+  return true;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Return true if the specified alloca is only
+/// modified by a copy from a constant global.  If we can prove this, we can
+/// replace any uses of the alloca with uses of the global directly.
+MemTransferInst *SROA::isOnlyCopiedFromConstantGlobal(AllocaInst *AI) {
+  // Delegate to the file-local recursive walker; return the single memcpy
+  // that initializes the alloca, or null if the alloca has any other writes.
+  MemTransferInst *TheCopy = 0;
+  if (::isOnlyCopiedFromConstantGlobal(AI, TheCopy, false))
+    return TheCopy;
+  return 0;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
new file mode 100644
index 0000000..ce5dd73
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -0,0 +1,329 @@
+//===- SimplifyCFGPass.cpp - CFG Simplification Pass ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead code elimination and basic block merging, along
+// with a collection of other peephole control flow optimizations.  For example:
+//
+//   * Removes basic blocks with no predecessors.
+//   * Merges a basic block into its predecessor if there is only one and the
+//     predecessor only has one successor.
+//   * Eliminates PHI nodes for basic blocks with a single predecessor.
+//   * Eliminates a basic block that only contains an unconditional branch.
+//   * Changes invoke instructions to nounwind functions to be calls.
+//   * Change things like "if (x) if (y)" into "if (x&y)".
+//   * etc..
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplifycfg"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#include "llvm/Attributes.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumSimpl, "Number of blocks simplified");
+
+namespace {
+  // Function pass wrapper that drives the CFG simplification utilities.
+  struct CFGSimplifyPass : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    CFGSimplifyPass() : FunctionPass(ID) {
+      initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F);
+  };
+}
+
+char CFGSimplifyPass::ID = 0;
+INITIALIZE_PASS(CFGSimplifyPass, "simplifycfg",
+                "Simplify the CFG", false, false)
+
+// Public interface to the CFGSimplification pass
+FunctionPass *llvm::createCFGSimplificationPass() {
+  return new CFGSimplifyPass();
+}
+
+/// ChangeToUnreachable - Insert an unreachable instruction before the specified
+/// instruction, making it and the rest of the code in the block dead.
+static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) {
+  BasicBlock *BB = I->getParent();
+  // Loop over all of the successors, removing BB's entry from any PHI
+  // nodes.
+  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+    (*SI)->removePredecessor(BB);
+
+  // Insert a call to llvm.trap right before this.  This turns the undefined
+  // behavior into a hard fail instead of falling through into random code.
+ if (UseLLVMTrap) { + Function *TrapFn = + Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap); + CallInst::Create(TrapFn, "", I); + } + new UnreachableInst(I->getContext(), I); + + // All instructions after this are dead. + BasicBlock::iterator BBI = I, BBE = BB->end(); + while (BBI != BBE) { + if (!BBI->use_empty()) + BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); + BB->getInstList().erase(BBI++); + } +} + +/// ChangeToCall - Convert the specified invoke into a normal call. +static void ChangeToCall(InvokeInst *II) { + BasicBlock *BB = II->getParent(); + SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); + CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args.begin(), + Args.end(), "", II); + NewCall->takeName(II); + NewCall->setCallingConv(II->getCallingConv()); + NewCall->setAttributes(II->getAttributes()); + II->replaceAllUsesWith(NewCall); + + // Follow the call by a branch to the normal destination. + BranchInst::Create(II->getNormalDest(), II); + + // Update PHI nodes in the unwind destination + II->getUnwindDest()->removePredecessor(BB); + BB->getInstList().erase(II); +} + +static bool MarkAliveBlocks(BasicBlock *BB, + SmallPtrSet<BasicBlock*, 128> &Reachable) { + + SmallVector<BasicBlock*, 128> Worklist; + Worklist.push_back(BB); + bool Changed = false; + do { + BB = Worklist.pop_back_val(); + + if (!Reachable.insert(BB)) + continue; + + // Do a quick scan of the basic block, turning any obviously unreachable + // instructions into LLVM unreachable insts. The instruction combining pass + // canonicalizes unreachable insts into stores to null or undef. + for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){ + if (CallInst *CI = dyn_cast<CallInst>(BBI)) { + if (CI->doesNotReturn()) { + // If we found a call to a no-return function, insert an unreachable + // instruction after it. Make sure there isn't *already* one there + // though. 
+ ++BBI; + if (!isa<UnreachableInst>(BBI)) { + // Don't insert a call to llvm.trap right before the unreachable. + ChangeToUnreachable(BBI, false); + Changed = true; + } + break; + } + } + + // Store to undef and store to null are undefined and used to signal that + // they should be changed to unreachable by passes that can't modify the + // CFG. + if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) { + // Don't touch volatile stores. + if (SI->isVolatile()) continue; + + Value *Ptr = SI->getOperand(1); + + if (isa<UndefValue>(Ptr) || + (isa<ConstantPointerNull>(Ptr) && + SI->getPointerAddressSpace() == 0)) { + ChangeToUnreachable(SI, true); + Changed = true; + break; + } + } + } + + // Turn invokes that call 'nounwind' functions into ordinary calls. + if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) + if (II->doesNotThrow()) { + ChangeToCall(II); + Changed = true; + } + + Changed |= ConstantFoldTerminator(BB); + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) + Worklist.push_back(*SI); + } while (!Worklist.empty()); + return Changed; +} + +/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even +/// if they are in a dead cycle. Return true if a change was made, false +/// otherwise. +static bool RemoveUnreachableBlocksFromFn(Function &F) { + SmallPtrSet<BasicBlock*, 128> Reachable; + bool Changed = MarkAliveBlocks(F.begin(), Reachable); + + // If there are unreachable blocks in the CFG... + if (Reachable.size() == F.size()) + return Changed; + + assert(Reachable.size() < F.size()); + NumSimpl += F.size()-Reachable.size(); + + // Loop over all of the basic blocks that are not reachable, dropping all of + // their internal references... 
+ for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) { + if (Reachable.count(BB)) + continue; + + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) + if (Reachable.count(*SI)) + (*SI)->removePredecessor(BB); + BB->dropAllReferences(); + } + + for (Function::iterator I = ++F.begin(); I != F.end();) + if (!Reachable.count(I)) + I = F.getBasicBlockList().erase(I); + else + ++I; + + return true; +} + +/// MergeEmptyReturnBlocks - If we have more than one empty (other than phi +/// node) return blocks, merge them together to promote recursive block merging. +static bool MergeEmptyReturnBlocks(Function &F) { + bool Changed = false; + + BasicBlock *RetBlock = 0; + + // Scan all the blocks in the function, looking for empty return blocks. + for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) { + BasicBlock &BB = *BBI++; + + // Only look at return blocks. + ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator()); + if (Ret == 0) continue; + + // Only look at the block if it is empty or the only other thing in it is a + // single PHI node that is the operand to the return. + if (Ret != &BB.front()) { + // Check for something else in the block. + BasicBlock::iterator I = Ret; + --I; + // Skip over debug info. + while (isa<DbgInfoIntrinsic>(I) && I != BB.begin()) + --I; + if (!isa<DbgInfoIntrinsic>(I) && + (!isa<PHINode>(I) || I != BB.begin() || + Ret->getNumOperands() == 0 || + Ret->getOperand(0) != I)) + continue; + } + + // If this is the first returning block, remember it and keep going. + if (RetBlock == 0) { + RetBlock = &BB; + continue; + } + + // Otherwise, we found a duplicate return block. Merge the two. + Changed = true; + + // Case when there is no input to the return or when the returned values + // agree is trivial. Note that they can't agree if there are phis in the + // blocks. 
+ if (Ret->getNumOperands() == 0 || + Ret->getOperand(0) == + cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) { + BB.replaceAllUsesWith(RetBlock); + BB.eraseFromParent(); + continue; + } + + // If the canonical return block has no PHI node, create one now. + PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin()); + if (RetBlockPHI == 0) { + Value *InVal = cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0); + RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(), "merge", + &RetBlock->front()); + + for (pred_iterator PI = pred_begin(RetBlock), E = pred_end(RetBlock); + PI != E; ++PI) + RetBlockPHI->addIncoming(InVal, *PI); + RetBlock->getTerminator()->setOperand(0, RetBlockPHI); + } + + // Turn BB into a block that just unconditionally branches to the return + // block. This handles the case when the two return blocks have a common + // predecessor but that return different things. + RetBlockPHI->addIncoming(Ret->getOperand(0), &BB); + BB.getTerminator()->eraseFromParent(); + BranchInst::Create(RetBlock, &BB); + } + + return Changed; +} + +/// IterativeSimplifyCFG - Call SimplifyCFG on all the blocks in the function, +/// iterating until no more changes are made. +static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) { + bool Changed = false; + bool LocalChange = true; + while (LocalChange) { + LocalChange = false; + + // Loop over all of the basic blocks and remove them if they are unneeded... + // + for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { + if (SimplifyCFG(BBIt++, TD)) { + LocalChange = true; + ++NumSimpl; + } + } + Changed |= LocalChange; + } + return Changed; +} + +// It is possible that we may require multiple passes over the code to fully +// simplify the CFG. 
+// +bool CFGSimplifyPass::runOnFunction(Function &F) { + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + bool EverChanged = RemoveUnreachableBlocksFromFn(F); + EverChanged |= MergeEmptyReturnBlocks(F); + EverChanged |= IterativeSimplifyCFG(F, TD); + + // If neither pass changed anything, we're done. + if (!EverChanged) return false; + + // IterativeSimplifyCFG can (rarely) make some loops dead. If this happens, + // RemoveUnreachableBlocksFromFn is needed to nuke them, which means we should + // iterate between the two optimizations. We structure the code like this to + // avoid rerunning IterativeSimplifyCFG if the second pass of + // RemoveUnreachableBlocksFromFn doesn't do anything. + if (!RemoveUnreachableBlocksFromFn(F)) + return true; + + do { + EverChanged = IterativeSimplifyCFG(F, TD); + EverChanged |= RemoveUnreachableBlocksFromFn(F); + } while (EverChanged); + + return true; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp new file mode 100644 index 0000000..70ff32e --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp @@ -0,0 +1,160 @@ +//===- SimplifyHalfPowrLibCalls.cpp - Optimize specific half_powr calls ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple pass that applies an experimental +// transformation on calls to specific functions. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "simplify-libcalls-halfpowr" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Target/TargetData.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +using namespace llvm; + +namespace { + /// This pass optimizes calls to the well-known half_powr library function. + /// + class SimplifyHalfPowrLibCalls : public FunctionPass { + const TargetData *TD; + public: + static char ID; // Pass identification + SimplifyHalfPowrLibCalls() : FunctionPass(ID) { + initializeSimplifyHalfPowrLibCallsPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + } + + Instruction * + InlineHalfPowrs(const std::vector<Instruction *> &HalfPowrs, + Instruction *InsertPt); + }; + char SimplifyHalfPowrLibCalls::ID = 0; +} // end anonymous namespace. + +INITIALIZE_PASS(SimplifyHalfPowrLibCalls, "simplify-libcalls-halfpowr", + "Simplify half_powr library calls", false, false) + +// Public interface to the Simplify HalfPowr LibCalls pass. +FunctionPass *llvm::createSimplifyHalfPowrLibCallsPass() { + return new SimplifyHalfPowrLibCalls(); +} + +/// InlineHalfPowrs - Inline a sequence of adjacent half_powr calls, rearranging +/// their control flow to better facilitate subsequent optimization. 
+Instruction * +SimplifyHalfPowrLibCalls:: +InlineHalfPowrs(const std::vector<Instruction *> &HalfPowrs, + Instruction *InsertPt) { + std::vector<BasicBlock *> Bodies; + BasicBlock *NewBlock = 0; + + for (unsigned i = 0, e = HalfPowrs.size(); i != e; ++i) { + CallInst *Call = cast<CallInst>(HalfPowrs[i]); + Function *Callee = Call->getCalledFunction(); + + // Minimally sanity-check the CFG of half_powr to ensure that it contains + // the kind of code we expect. If we're running this pass, we have + // reason to believe it will be what we expect. + Function::iterator I = Callee->begin(); + BasicBlock *Prologue = I++; + if (I == Callee->end()) break; + BasicBlock *SubnormalHandling = I++; + if (I == Callee->end()) break; + BasicBlock *Body = I++; + if (I != Callee->end()) break; + if (SubnormalHandling->getSinglePredecessor() != Prologue) + break; + BranchInst *PBI = dyn_cast<BranchInst>(Prologue->getTerminator()); + if (!PBI || !PBI->isConditional()) + break; + BranchInst *SNBI = dyn_cast<BranchInst>(SubnormalHandling->getTerminator()); + if (!SNBI || SNBI->isConditional()) + break; + if (!isa<ReturnInst>(Body->getTerminator())) + break; + + Instruction *NextInst = llvm::next(BasicBlock::iterator(Call)); + + // Inline the call, taking care of what code ends up where. + NewBlock = SplitBlock(NextInst->getParent(), NextInst, this); + + InlineFunctionInfo IFI(0, TD); + bool B = InlineFunction(Call, IFI); + assert(B && "half_powr didn't inline?"); + (void)B; + + BasicBlock *NewBody = NewBlock->getSinglePredecessor(); + assert(NewBody); + Bodies.push_back(NewBody); + } + + if (!NewBlock) + return InsertPt; + + // Put the code for all the bodies into one block, to facilitate + // subsequent optimization. + (void)SplitEdge(NewBlock->getSinglePredecessor(), NewBlock, this); + for (unsigned i = 0, e = Bodies.size(); i != e; ++i) { + BasicBlock *Body = Bodies[i]; + Instruction *FNP = Body->getFirstNonPHI(); + // Splice the insts from body into NewBlock. 
+ NewBlock->getInstList().splice(NewBlock->begin(), Body->getInstList(), + FNP, Body->getTerminator()); + } + + return NewBlock->begin(); +} + +/// runOnFunction - Top level algorithm. +/// +bool SimplifyHalfPowrLibCalls::runOnFunction(Function &F) { + TD = getAnalysisIfAvailable<TargetData>(); + + bool Changed = false; + std::vector<Instruction *> HalfPowrs; + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + // Look for calls. + bool IsHalfPowr = false; + if (CallInst *CI = dyn_cast<CallInst>(I)) { + // Look for direct calls and calls to non-external functions. + Function *Callee = CI->getCalledFunction(); + if (Callee && Callee->hasExternalLinkage()) { + // Look for calls with well-known names. + if (Callee->getName() == "__half_powrf4") + IsHalfPowr = true; + } + } + if (IsHalfPowr) + HalfPowrs.push_back(I); + // We're looking for sequences of up to three such calls, which we'll + // simplify as a group. + if ((!IsHalfPowr && !HalfPowrs.empty()) || HalfPowrs.size() == 3) { + I = InlineHalfPowrs(HalfPowrs, I); + E = I->getParent()->end(); + HalfPowrs.clear(); + Changed = true; + } + } + assert(HalfPowrs.empty() && "Block had no terminator!"); + } + + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp new file mode 100644 index 0000000..ec45b71 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -0,0 +1,2339 @@ +//===- SimplifyLibCalls.cpp - Optimize specific well-known library calls --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements a simple pass that applies a variety of small +// optimizations for calls to specific well-known function calls (e.g. runtime +// library functions). Any optimization that takes the very simple form +// "replace call to library function with simpler code that provides the same +// result" belongs in this file. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "simplify-libcalls" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/IRBuilder.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Target/TargetData.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Config/config.h" +using namespace llvm; + +STATISTIC(NumSimplified, "Number of library calls simplified"); +STATISTIC(NumAnnotated, "Number of attributes added to library functions"); + +//===----------------------------------------------------------------------===// +// Optimizer Base Class +//===----------------------------------------------------------------------===// + +/// This class is the abstract base class for the set of optimizations that +/// corresponds to one library call. +namespace { +class LibCallOptimization { +protected: + Function *Caller; + const TargetData *TD; + LLVMContext* Context; +public: + LibCallOptimization() { } + virtual ~LibCallOptimization() {} + + /// CallOptimizer - This pure virtual method is implemented by base classes to + /// do various optimizations. If this returns null then no transformation was + /// performed. 
If it returns CI, then it transformed the call and CI is to be + /// deleted. If it returns something else, replace CI with the new value and + /// delete CI. + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) + =0; + + Value *OptimizeCall(CallInst *CI, const TargetData *TD, IRBuilder<> &B) { + Caller = CI->getParent()->getParent(); + this->TD = TD; + if (CI->getCalledFunction()) + Context = &CI->getCalledFunction()->getContext(); + + // We never change the calling convention. + if (CI->getCallingConv() != llvm::CallingConv::C) + return NULL; + + return CallOptimizer(CI->getCalledFunction(), CI, B); + } +}; +} // End anonymous namespace. + + +//===----------------------------------------------------------------------===// +// Helper Functions +//===----------------------------------------------------------------------===// + +/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the +/// value is equal or not-equal to zero. +static bool IsOnlyUsedInZeroEqualityComparison(Value *V) { + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); + UI != E; ++UI) { + if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) + if (IC->isEquality()) + if (Constant *C = dyn_cast<Constant>(IC->getOperand(1))) + if (C->isNullValue()) + continue; + // Unknown instruction. + return false; + } + return true; +} + +/// IsOnlyUsedInEqualityComparison - Return true if it is only used in equality +/// comparisons with With. +static bool IsOnlyUsedInEqualityComparison(Value *V, Value *With) { + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); + UI != E; ++UI) { + if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) + if (IC->isEquality() && IC->getOperand(1) == With) + continue; + // Unknown instruction. 
+ return false; + } + return true; +} + +//===----------------------------------------------------------------------===// +// String and Memory LibCall Optimizations +//===----------------------------------------------------------------------===// + +//===---------------------------------------===// +// 'strcat' Optimizations +namespace { +struct StrCatOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strcat" function prototype. + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getReturnType() != B.getInt8PtrTy() || + FT->getParamType(0) != FT->getReturnType() || + FT->getParamType(1) != FT->getReturnType()) + return 0; + + // Extract some information from the instruction + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + + // See if we can get the length of the input string. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + --Len; // Unbias length. + + // Handle the simple, do-nothing case: strcat(x, "") -> x + if (Len == 0) + return Dst; + + // These optimizations require TargetData. + if (!TD) return 0; + + EmitStrLenMemCpy(Src, Dst, Len, B); + return Dst; + } + + void EmitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B) { + // We need to find the end of the destination string. That's where the + // memory is to be moved to. We just generate a call to strlen. + Value *DstLen = EmitStrLen(Dst, B, TD); + + // Now that we have the destination's length, we must index into the + // destination's pointer to get the actual memcpy destination (end of + // the string .. we're concatenating). + Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr"); + + // We have enough information to now generate the memcpy call to do the + // concatenation for us. Make a memcpy to copy the nul byte with align = 1. 
+ B.CreateMemCpy(CpyDst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1); + } +}; + +//===---------------------------------------===// +// 'strncat' Optimizations + +struct StrNCatOpt : public StrCatOpt { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strncat" function prototype. + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || + FT->getReturnType() != B.getInt8PtrTy() || + FT->getParamType(0) != FT->getReturnType() || + FT->getParamType(1) != FT->getReturnType() || + !FT->getParamType(2)->isIntegerTy()) + return 0; + + // Extract some information from the instruction + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + uint64_t Len; + + // We don't do anything if length is not constant + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) + Len = LengthArg->getZExtValue(); + else + return 0; + + // See if we can get the length of the input string. + uint64_t SrcLen = GetStringLength(Src); + if (SrcLen == 0) return 0; + --SrcLen; // Unbias length. + + // Handle the simple, do-nothing cases: + // strncat(x, "", c) -> x + // strncat(x, c, 0) -> x + if (SrcLen == 0 || Len == 0) return Dst; + + // These optimizations require TargetData. + if (!TD) return 0; + + // We don't optimize this case + if (Len < SrcLen) return 0; + + // strncat(x, s, c) -> strcat(x, s) + // s is constant so the strcat can be optimized further + EmitStrLenMemCpy(Src, Dst, SrcLen, B); + return Dst; + } +}; + +//===---------------------------------------===// +// 'strchr' Optimizations + +struct StrChrOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strchr" function prototype. 
+ const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getReturnType() != B.getInt8PtrTy() || + FT->getParamType(0) != FT->getReturnType() || + !FT->getParamType(1)->isIntegerTy(32)) + return 0; + + Value *SrcStr = CI->getArgOperand(0); + + // If the second operand is non-constant, see if we can compute the length + // of the input string and turn this into memchr. + ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + if (CharC == 0) { + // These optimizations require TargetData. + if (!TD) return 0; + + uint64_t Len = GetStringLength(SrcStr); + if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32. + return 0; + + return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. + ConstantInt::get(TD->getIntPtrType(*Context), Len), + B, TD); + } + + // Otherwise, the character is a constant, see if the first argument is + // a string literal. If so, we can constant fold. + std::string Str; + if (!GetConstantStringInfo(SrcStr, Str)) + return 0; + + // strchr can find the nul character. + Str += '\0'; + + // Compute the offset. + size_t I = Str.find(CharC->getSExtValue()); + if (I == std::string::npos) // Didn't find the char. strchr returns null. + return Constant::getNullValue(CI->getType()); + + // strchr(s+n,c) -> gep(s+n+i,c) + return B.CreateGEP(SrcStr, B.getInt64(I), "strchr"); + } +}; + +//===---------------------------------------===// +// 'strrchr' Optimizations + +struct StrRChrOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strrchr" function prototype. 
+ const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getReturnType() != B.getInt8PtrTy() || + FT->getParamType(0) != FT->getReturnType() || + !FT->getParamType(1)->isIntegerTy(32)) + return 0; + + Value *SrcStr = CI->getArgOperand(0); + ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + + // Cannot fold anything if we're not looking for a constant. + if (!CharC) + return 0; + + std::string Str; + if (!GetConstantStringInfo(SrcStr, Str)) { + // strrchr(s, 0) -> strchr(s, 0) + if (TD && CharC->isZero()) + return EmitStrChr(SrcStr, '\0', B, TD); + return 0; + } + + // strrchr can find the nul character. + Str += '\0'; + + // Compute the offset. + size_t I = Str.rfind(CharC->getSExtValue()); + if (I == std::string::npos) // Didn't find the char. Return null. + return Constant::getNullValue(CI->getType()); + + // strrchr(s+n,c) -> gep(s+n+i,c) + return B.CreateGEP(SrcStr, B.getInt64(I), "strrchr"); + } +}; + +//===---------------------------------------===// +// 'strcmp' Optimizations + +struct StrCmpOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strcmp" function prototype. 
+ const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + !FT->getReturnType()->isIntegerTy(32) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy()) + return 0; + + Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); + if (Str1P == Str2P) // strcmp(x,x) -> 0 + return ConstantInt::get(CI->getType(), 0); + + std::string Str1, Str2; + bool HasStr1 = GetConstantStringInfo(Str1P, Str1); + bool HasStr2 = GetConstantStringInfo(Str2P, Str2); + + if (HasStr1 && Str1.empty()) // strcmp("", x) -> *x + return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType()); + + if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x + return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); + + // strcmp(x, y) -> cnst (if both x and y are constant strings) + if (HasStr1 && HasStr2) + return ConstantInt::get(CI->getType(), + strcmp(Str1.c_str(),Str2.c_str())); + + // strcmp(P, "x") -> memcmp(P, "x", 2) + uint64_t Len1 = GetStringLength(Str1P); + uint64_t Len2 = GetStringLength(Str2P); + if (Len1 && Len2) { + // These optimizations require TargetData. + if (!TD) return 0; + + return EmitMemCmp(Str1P, Str2P, + ConstantInt::get(TD->getIntPtrType(*Context), + std::min(Len1, Len2)), B, TD); + } + + return 0; + } +}; + +//===---------------------------------------===// +// 'strncmp' Optimizations + +struct StrNCmpOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strncmp" function prototype. 
+ const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || + !FT->getReturnType()->isIntegerTy(32) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy() || + !FT->getParamType(2)->isIntegerTy()) + return 0; + + Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); + if (Str1P == Str2P) // strncmp(x,x,n) -> 0 + return ConstantInt::get(CI->getType(), 0); + + // Get the length argument if it is constant. + uint64_t Length; + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) + Length = LengthArg->getZExtValue(); + else + return 0; + + if (Length == 0) // strncmp(x,y,0) -> 0 + return ConstantInt::get(CI->getType(), 0); + + if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) + return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD); + + std::string Str1, Str2; + bool HasStr1 = GetConstantStringInfo(Str1P, Str1); + bool HasStr2 = GetConstantStringInfo(Str2P, Str2); + + if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> *x + return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType()); + + if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x + return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); + + // strncmp(x, y) -> cnst (if both x and y are constant strings) + if (HasStr1 && HasStr2) + return ConstantInt::get(CI->getType(), + strncmp(Str1.c_str(), Str2.c_str(), Length)); + return 0; + } +}; + + +//===---------------------------------------===// +// 'strcpy' Optimizations + +struct StrCpyOpt : public LibCallOptimization { + bool OptChkCall; // True if it's optimizing a __strcpy_chk libcall. + + StrCpyOpt(bool c) : OptChkCall(c) {} + + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strcpy" function prototype. + unsigned NumParams = OptChkCall ? 
3 : 2; + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != NumParams || + FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy()) + return 0; + + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); + if (Dst == Src) // strcpy(x,x) -> x + return Src; + + // These optimizations require TargetData. + if (!TD) return 0; + + // See if we can get the length of the input string. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + + // We have enough information to now generate the memcpy call to do the + // concatenation for us. Make a memcpy to copy the nul byte with align = 1. + if (OptChkCall) + EmitMemCpyChk(Dst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len), + CI->getArgOperand(2), B, TD); + else + B.CreateMemCpy(Dst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); + return Dst; + } +}; + +//===---------------------------------------===// +// 'strncpy' Optimizations + +struct StrNCpyOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy() || + !FT->getParamType(2)->isIntegerTy()) + return 0; + + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + Value *LenOp = CI->getArgOperand(2); + + // See if we can get the length of the input string. 
+ uint64_t SrcLen = GetStringLength(Src); + if (SrcLen == 0) return 0; + --SrcLen; + + if (SrcLen == 0) { + // strncpy(x, "", y) -> memset(x, '\0', y, 1) + B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1); + return Dst; + } + + uint64_t Len; + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp)) + Len = LengthArg->getZExtValue(); + else + return 0; + + if (Len == 0) return Dst; // strncpy(x, y, 0) -> x + + // These optimizations require TargetData. + if (!TD) return 0; + + // Let strncpy handle the zero padding + if (Len > SrcLen+1) return 0; + + // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] + B.CreateMemCpy(Dst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); + + return Dst; + } +}; + +//===---------------------------------------===// +// 'strlen' Optimizations + +struct StrLenOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 1 || + FT->getParamType(0) != B.getInt8PtrTy() || + !FT->getReturnType()->isIntegerTy()) + return 0; + + Value *Src = CI->getArgOperand(0); + + // Constant folding: strlen("xyz") -> 3 + if (uint64_t Len = GetStringLength(Src)) + return ConstantInt::get(CI->getType(), Len-1); + + // strlen(x) != 0 --> *x != 0 + // strlen(x) == 0 --> *x == 0 + if (IsOnlyUsedInZeroEqualityComparison(CI)) + return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType()); + return 0; + } +}; + + +//===---------------------------------------===// +// 'strpbrk' Optimizations + +struct StrPBrkOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getParamType(0) != B.getInt8PtrTy() || + FT->getParamType(1) != FT->getParamType(0) || + FT->getReturnType() != FT->getParamType(0)) + return 0; + + std::string S1, S2; + bool HasS1 = 
GetConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = GetConstantStringInfo(CI->getArgOperand(1), S2); + + // strpbrk(s, "") -> NULL + // strpbrk("", s) -> NULL + if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) + return Constant::getNullValue(CI->getType()); + + // Constant folding. + if (HasS1 && HasS2) { + size_t I = S1.find_first_of(S2); + if (I == std::string::npos) // No match. + return Constant::getNullValue(CI->getType()); + + return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk"); + } + + // strpbrk(s, "a") -> strchr(s, 'a') + if (TD && HasS2 && S2.size() == 1) + return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD); + + return 0; + } +}; + +//===---------------------------------------===// +// 'strto*' Optimizations. This handles strtol, strtod, strtof, strtoul, etc. + +struct StrToOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy()) + return 0; + + Value *EndPtr = CI->getArgOperand(1); + if (isa<ConstantPointerNull>(EndPtr)) { + // With a null EndPtr, this function won't capture the main argument. + // It would be readonly too, except that it still may write to errno. 
+ CI->addAttribute(1, Attribute::NoCapture); + } + + return 0; + } +}; + +//===---------------------------------------===// +// 'strspn' Optimizations + +struct StrSpnOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getParamType(0) != B.getInt8PtrTy() || + FT->getParamType(1) != FT->getParamType(0) || + !FT->getReturnType()->isIntegerTy()) + return 0; + + std::string S1, S2; + bool HasS1 = GetConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = GetConstantStringInfo(CI->getArgOperand(1), S2); + + // strspn(s, "") -> 0 + // strspn("", s) -> 0 + if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) + return Constant::getNullValue(CI->getType()); + + // Constant folding. + if (HasS1 && HasS2) + return ConstantInt::get(CI->getType(), strspn(S1.c_str(), S2.c_str())); + + return 0; + } +}; + +//===---------------------------------------===// +// 'strcspn' Optimizations + +struct StrCSpnOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getParamType(0) != B.getInt8PtrTy() || + FT->getParamType(1) != FT->getParamType(0) || + !FT->getReturnType()->isIntegerTy()) + return 0; + + std::string S1, S2; + bool HasS1 = GetConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = GetConstantStringInfo(CI->getArgOperand(1), S2); + + // strcspn("", s) -> 0 + if (HasS1 && S1.empty()) + return Constant::getNullValue(CI->getType()); + + // Constant folding. 
+ if (HasS1 && HasS2) + return ConstantInt::get(CI->getType(), strcspn(S1.c_str(), S2.c_str())); + + // strcspn(s, "") -> strlen(s) + if (TD && HasS2 && S2.empty()) + return EmitStrLen(CI->getArgOperand(0), B, TD); + + return 0; + } +}; + +//===---------------------------------------===// +// 'strstr' Optimizations + +struct StrStrOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + !FT->getReturnType()->isPointerTy()) + return 0; + + // fold strstr(x, x) -> x. + if (CI->getArgOperand(0) == CI->getArgOperand(1)) + return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); + + // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0 + if (TD && IsOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { + Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD); + Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1), + StrLen, B, TD); + for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end(); + UI != UE; ) { + ICmpInst *Old = cast<ICmpInst>(*UI++); + Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp, + ConstantInt::getNullValue(StrNCmp->getType()), + "cmp"); + Old->replaceAllUsesWith(Cmp); + Old->eraseFromParent(); + } + return CI; + } + + // See if either input string is a constant string. + std::string SearchStr, ToFindStr; + bool HasStr1 = GetConstantStringInfo(CI->getArgOperand(0), SearchStr); + bool HasStr2 = GetConstantStringInfo(CI->getArgOperand(1), ToFindStr); + + // fold strstr(x, "") -> x. + if (HasStr2 && ToFindStr.empty()) + return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); + + // If both strings are known, constant fold it. 
+ if (HasStr1 && HasStr2) { + std::string::size_type Offset = SearchStr.find(ToFindStr); + + if (Offset == std::string::npos) // strstr("foo", "bar") -> null + return Constant::getNullValue(CI->getType()); + + // strstr("abcd", "bc") -> gep((char*)"abcd", 1) + Value *Result = CastToCStr(CI->getArgOperand(0), B); + Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr"); + return B.CreateBitCast(Result, CI->getType()); + } + + // fold strstr(x, "y") -> strchr(x, 'y'). + if (HasStr2 && ToFindStr.size() == 1) + return B.CreateBitCast(EmitStrChr(CI->getArgOperand(0), + ToFindStr[0], B, TD), CI->getType()); + return 0; + } +}; + + +//===---------------------------------------===// +// 'memcmp' Optimizations + +struct MemCmpOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + !FT->getReturnType()->isIntegerTy(32)) + return 0; + + Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); + + if (LHS == RHS) // memcmp(s,s,x) -> 0 + return Constant::getNullValue(CI->getType()); + + // Make sure we have a constant length. 
+ ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + if (!LenC) return 0; + uint64_t Len = LenC->getZExtValue(); + + if (Len == 0) // memcmp(s1,s2,0) -> 0 + return Constant::getNullValue(CI->getType()); + + // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS + if (Len == 1) { + Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"), + CI->getType(), "lhsv"); + Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"), + CI->getType(), "rhsv"); + return B.CreateSub(LHSV, RHSV, "chardiff"); + } + + // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant) + std::string LHSStr, RHSStr; + if (GetConstantStringInfo(LHS, LHSStr) && + GetConstantStringInfo(RHS, RHSStr)) { + // Make sure we're not reading out-of-bounds memory. + if (Len > LHSStr.length() || Len > RHSStr.length()) + return 0; + uint64_t Ret = memcmp(LHSStr.data(), RHSStr.data(), Len); + return ConstantInt::get(CI->getType(), Ret); + } + + return 0; + } +}; + +//===---------------------------------------===// +// 'memcpy' Optimizations + +struct MemCpyOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // These optimizations require TargetData. + if (!TD) return 0; + + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + FT->getParamType(2) != TD->getIntPtrType(*Context)) + return 0; + + // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } +}; + +//===---------------------------------------===// +// 'memmove' Optimizations + +struct MemMoveOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // These optimizations require TargetData. 
+ if (!TD) return 0; + + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + FT->getParamType(2) != TD->getIntPtrType(*Context)) + return 0; + + // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) + B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } +}; + +//===---------------------------------------===// +// 'memset' Optimizations + +struct MemSetOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // These optimizations require TargetData. + if (!TD) return 0; + + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isIntegerTy() || + FT->getParamType(2) != TD->getIntPtrType(*Context)) + return 0; + + // memset(p, v, n) -> llvm.memset(p, v, n, 1) + Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); + B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } +}; + +//===----------------------------------------------------------------------===// +// Math Library Optimizations +//===----------------------------------------------------------------------===// + +//===---------------------------------------===// +// 'pow*' Optimizations + +struct PowOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + // Just make sure this has 2 arguments of the same FP type, which match the + // result type. 
+ if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + !FT->getParamType(0)->isFloatingPointTy()) + return 0; + + Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1); + if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { + if (Op1C->isExactlyValue(1.0)) // pow(1.0, x) -> 1.0 + return Op1C; + if (Op1C->isExactlyValue(2.0)) // pow(2.0, x) -> exp2(x) + return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes()); + } + + ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2); + if (Op2C == 0) return 0; + + if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0 + return ConstantFP::get(CI->getType(), 1.0); + + if (Op2C->isExactlyValue(0.5)) { + // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))). + // This is faster than calling pow, and still handles negative zero + // and negative infinite correctly. + // TODO: In fast-math mode, this could be just sqrt(x). + // TODO: In finite-only mode, this could be just fabs(sqrt(x)). 
+ Value *Inf = ConstantFP::getInfinity(CI->getType()); + Value *NegInf = ConstantFP::getInfinity(CI->getType(), true); + Value *Sqrt = EmitUnaryFloatFnCall(Op1, "sqrt", B, + Callee->getAttributes()); + Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B, + Callee->getAttributes()); + Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf, "tmp"); + Value *Sel = B.CreateSelect(FCmp, Inf, FAbs, "tmp"); + return Sel; + } + + if (Op2C->isExactlyValue(1.0)) // pow(x, 1.0) -> x + return Op1; + if (Op2C->isExactlyValue(2.0)) // pow(x, 2.0) -> x*x + return B.CreateFMul(Op1, Op1, "pow2"); + if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x + return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), + Op1, "powrecip"); + return 0; + } +}; + +//===---------------------------------------===// +// 'exp2' Optimizations + +struct Exp2Opt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + // Just make sure this has 1 argument of FP type, which matches the + // result type. 
+ if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isFloatingPointTy()) + return 0; + + Value *Op = CI->getArgOperand(0); + // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32 + // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32 + Value *LdExpArg = 0; + if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { + if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) + LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty(), "tmp"); + } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) { + if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) + LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty(), "tmp"); + } + + if (LdExpArg) { + const char *Name; + if (Op->getType()->isFloatTy()) + Name = "ldexpf"; + else if (Op->getType()->isDoubleTy()) + Name = "ldexp"; + else + Name = "ldexpl"; + + Constant *One = ConstantFP::get(*Context, APFloat(1.0f)); + if (!Op->getType()->isFloatTy()) + One = ConstantExpr::getFPExtend(One, Op->getType()); + + Module *M = Caller->getParent(); + Value *Callee = M->getOrInsertFunction(Name, Op->getType(), + Op->getType(), + B.getInt32Ty(), NULL); + CallInst *CI = B.CreateCall2(Callee, One, LdExpArg); + if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + + return CI; + } + return 0; + } +}; + +//===---------------------------------------===// +// Double -> Float Shrinking Optimizations for Unary Functions like 'floor' + +struct UnaryDoubleFPOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || + !FT->getParamType(0)->isDoubleTy()) + return 0; + + // If this is something like 'floor((double)floatval)', convert to floorf. 
+ FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0)); + if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy()) + return 0; + + // floor((double)floatval) -> (double)floorf(floatval) + Value *V = Cast->getOperand(0); + V = EmitUnaryFloatFnCall(V, Callee->getName().data(), B, + Callee->getAttributes()); + return B.CreateFPExt(V, B.getDoubleTy()); + } +}; + +//===----------------------------------------------------------------------===// +// Integer Optimizations +//===----------------------------------------------------------------------===// + +//===---------------------------------------===// +// 'ffs*' Optimizations + +struct FFSOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + // Just make sure this has 2 arguments of the same FP type, which match the + // result type. + if (FT->getNumParams() != 1 || + !FT->getReturnType()->isIntegerTy(32) || + !FT->getParamType(0)->isIntegerTy()) + return 0; + + Value *Op = CI->getArgOperand(0); + + // Constant fold. + if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + if (CI->getValue() == 0) // ffs(0) -> 0. + return Constant::getNullValue(CI->getType()); + // ffs(c) -> cttz(c)+1 + return B.getInt32(CI->getValue().countTrailingZeros() + 1); + } + + // ffs(x) -> x != 0 ? 
(i32)llvm.cttz(x)+1 : 0 + const Type *ArgType = Op->getType(); + Value *F = Intrinsic::getDeclaration(Callee->getParent(), + Intrinsic::cttz, &ArgType, 1); + Value *V = B.CreateCall(F, Op, "cttz"); + V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1), "tmp"); + V = B.CreateIntCast(V, B.getInt32Ty(), false, "tmp"); + + Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType), "tmp"); + return B.CreateSelect(Cond, V, B.getInt32(0)); + } +}; + +//===---------------------------------------===// +// 'isdigit' Optimizations + +struct IsDigitOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + // We require integer(i32) + if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || + !FT->getParamType(0)->isIntegerTy(32)) + return 0; + + // isdigit(c) -> (c-'0') <u 10 + Value *Op = CI->getArgOperand(0); + Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp"); + Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit"); + return B.CreateZExt(Op, CI->getType()); + } +}; + +//===---------------------------------------===// +// 'isascii' Optimizations + +struct IsAsciiOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + // We require integer(i32) + if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || + !FT->getParamType(0)->isIntegerTy(32)) + return 0; + + // isascii(c) -> c <u 128 + Value *Op = CI->getArgOperand(0); + Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii"); + return B.CreateZExt(Op, CI->getType()); + } +}; + +//===---------------------------------------===// +// 'abs', 'labs', 'llabs' Optimizations + +struct AbsOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + // We require 
integer(integer) where the types agree. + if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || + FT->getParamType(0) != FT->getReturnType()) + return 0; + + // abs(x) -> x >s -1 ? x : -x + Value *Op = CI->getArgOperand(0); + Value *Pos = B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()), + "ispos"); + Value *Neg = B.CreateNeg(Op, "neg"); + return B.CreateSelect(Pos, Op, Neg); + } +}; + + +//===---------------------------------------===// +// 'toascii' Optimizations + +struct ToAsciiOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + // We require i32(i32) + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isIntegerTy(32)) + return 0; + + // isascii(c) -> c & 0x7f + return B.CreateAnd(CI->getArgOperand(0), + ConstantInt::get(CI->getType(),0x7F)); + } +}; + +//===----------------------------------------------------------------------===// +// Formatting and IO Optimizations +//===----------------------------------------------------------------------===// + +//===---------------------------------------===// +// 'printf' Optimizations + +struct PrintFOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Require one fixed pointer argument and an integer/void result. + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || + !(FT->getReturnType()->isIntegerTy() || + FT->getReturnType()->isVoidTy())) + return 0; + + // Check for a fixed format string. + std::string FormatStr; + if (!GetConstantStringInfo(CI->getArgOperand(0), FormatStr)) + return 0; + + // Empty format string -> noop. + if (FormatStr.empty()) // Tolerate printf's declared void. + return CI->use_empty() ? 
(Value*)CI : + ConstantInt::get(CI->getType(), 0); + + // Do not do any of the following transformations if the printf return value + // is used, in general the printf return value is not compatible with either + // putchar() or puts(). + if (!CI->use_empty()) + return 0; + + // printf("x") -> putchar('x'), even for '%'. + if (FormatStr.size() == 1) { + Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD); + if (CI->use_empty()) return CI; + return B.CreateIntCast(Res, CI->getType(), true); + } + + // printf("foo\n") --> puts("foo") + if (FormatStr[FormatStr.size()-1] == '\n' && + FormatStr.find('%') == std::string::npos) { // no format characters. + // Create a string literal with no \n on it. We expect the constant merge + // pass to be run after this pass, to merge duplicate strings. + FormatStr.erase(FormatStr.end()-1); + Constant *C = ConstantArray::get(*Context, FormatStr, true); + C = new GlobalVariable(*Callee->getParent(), C->getType(), true, + GlobalVariable::InternalLinkage, C, "str"); + EmitPutS(C, B, TD); + return CI->use_empty() ? (Value*)CI : + ConstantInt::get(CI->getType(), FormatStr.size()+1); + } + + // Optimize specific format strings. + // printf("%c", chr) --> putchar(chr) + if (FormatStr == "%c" && CI->getNumArgOperands() > 1 && + CI->getArgOperand(1)->getType()->isIntegerTy()) { + Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD); + + if (CI->use_empty()) return CI; + return B.CreateIntCast(Res, CI->getType(), true); + } + + // printf("%s\n", str) --> puts(str) + if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 && + CI->getArgOperand(1)->getType()->isPointerTy()) { + EmitPutS(CI->getArgOperand(1), B, TD); + return CI; + } + return 0; + } +}; + +//===---------------------------------------===// +// 'sprintf' Optimizations + +struct SPrintFOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Require two fixed pointer arguments and an integer result. 
+ const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + !FT->getReturnType()->isIntegerTy()) + return 0; + + // Check for a fixed format string. + std::string FormatStr; + if (!GetConstantStringInfo(CI->getArgOperand(1), FormatStr)) + return 0; + + // If we just have a format string (nothing else crazy) transform it. + if (CI->getNumArgOperands() == 2) { + // Make sure there's no % in the constant array. We could try to handle + // %% -> % in the future if we cared. + for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) + if (FormatStr[i] == '%') + return 0; // we found a format specifier, bail out. + + // These optimizations require TargetData. + if (!TD) return 0; + + // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + ConstantInt::get(TD->getIntPtrType(*Context), // Copy the + FormatStr.size() + 1), 1); // nul byte. + return ConstantInt::get(CI->getType(), FormatStr.size()); + } + + // The remaining optimizations require the format string to be "%s" or "%c" + // and have an extra operand. + if (FormatStr.size() != 2 || FormatStr[0] != '%' || + CI->getNumArgOperands() < 3) + return 0; + + // Decode the second character of the format string. + if (FormatStr[1] == 'c') { + // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 + if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; + Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char"); + Value *Ptr = CastToCStr(CI->getArgOperand(0), B); + B.CreateStore(V, Ptr); + Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul"); + B.CreateStore(B.getInt8(0), Ptr); + + return ConstantInt::get(CI->getType(), 1); + } + + if (FormatStr[1] == 's') { + // These optimizations require TargetData. 
+ if (!TD) return 0; + + // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1) + if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0; + + Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD); + Value *IncLen = B.CreateAdd(Len, + ConstantInt::get(Len->getType(), 1), + "leninc"); + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1); + + // The sprintf result is the unincremented number of bytes in the string. + return B.CreateIntCast(Len, CI->getType(), false); + } + return 0; + } +}; + +//===---------------------------------------===// +// 'fwrite' Optimizations + +struct FWriteOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Require a pointer, an integer, an integer, a pointer, returning integer. + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isIntegerTy() || + !FT->getParamType(2)->isIntegerTy() || + !FT->getParamType(3)->isPointerTy() || + !FT->getReturnType()->isIntegerTy()) + return 0; + + // Get the element size and count. + ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + if (!SizeC || !CountC) return 0; + uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue(); + + // If this is writing zero records, remove the call (it's a noop). + if (Bytes == 0) + return ConstantInt::get(CI->getType(), 0); + + // If this is writing one byte, turn it into fputc. 
    if (Bytes == 1) {  // fwrite(S,1,1,F) -> fputc(S[0],F)
      Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char");
      EmitFPutC(Char, CI->getArgOperand(3), B, TD);
      return ConstantInt::get(CI->getType(), 1);
    }

    return 0;
  }
};

//===---------------------------------------===//
// 'fputs' Optimizations

/// Simplifies fputs of a string of statically known length into fwrite.
struct FPutsOpt : public LibCallOptimization {
  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
    // These optimizations require TargetData.
    if (!TD) return 0;

    // Require two pointers.  Also, we can't optimize if return value is used.
    const FunctionType *FT = Callee->getFunctionType();
    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
        !FT->getParamType(1)->isPointerTy() ||
        !CI->use_empty())
      return 0;

    // fputs(s,F) --> fwrite(s,1,strlen(s),F)
    uint64_t Len = GetStringLength(CI->getArgOperand(0));
    if (!Len) return 0;
    // GetStringLength includes the nul terminator; fwrite must not write it,
    // hence Len-1 below.
    EmitFWrite(CI->getArgOperand(0),
               ConstantInt::get(TD->getIntPtrType(*Context), Len-1),
               CI->getArgOperand(1), B, TD);
    return CI;  // Known to have no uses (see above).
  }
};

//===---------------------------------------===//
// 'fprintf' Optimizations

/// Simplifies fprintf calls with constant format strings into
/// fwrite/fputc/fputs.
struct FPrintFOpt : public LibCallOptimization {
  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
    // Require two fixed parameters as pointers and an integer result.
    const FunctionType *FT = Callee->getFunctionType();
    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
        !FT->getParamType(1)->isPointerTy() ||
        !FT->getReturnType()->isIntegerTy())
      return 0;

    // All the optimizations depend on the format string.
    std::string FormatStr;
    if (!GetConstantStringInfo(CI->getArgOperand(1), FormatStr))
      return 0;

    // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
    if (CI->getNumArgOperands() == 2) {
      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
        if (FormatStr[i] == '%')  // Could handle %% -> % if we cared.
          return 0; // We found a format specifier.

      // These optimizations require TargetData.
      if (!TD) return 0;

      // No format specifiers: write the format string out verbatim.
      EmitFWrite(CI->getArgOperand(1),
                 ConstantInt::get(TD->getIntPtrType(*Context),
                                  FormatStr.size()),
                 CI->getArgOperand(0), B, TD);
      return ConstantInt::get(CI->getType(), FormatStr.size());
    }

    // The remaining optimizations require the format string to be "%s" or "%c"
    // and have an extra operand.
    if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
        CI->getNumArgOperands() < 3)
      return 0;

    // Decode the second character of the format string.
    if (FormatStr[1] == 'c') {
      // fprintf(F, "%c", chr) --> fputc(chr, F)
      if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
      EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TD);
      return ConstantInt::get(CI->getType(), 1);
    }

    if (FormatStr[1] == 's') {
      // fprintf(F, "%s", str) --> fputs(str, F)
      // fputs does not return the character count, so only fire when the
      // fprintf result is unused.
      if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty())
        return 0;
      EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD);
      return CI;
    }
    return 0;
  }
};

//===---------------------------------------===//
// 'puts' Optimizations

/// Simplifies puts of a constant empty string into putchar('\n').
struct PutsOpt : public LibCallOptimization {
  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
    // Require one fixed pointer argument and an integer/void result.
    const FunctionType *FT = Callee->getFunctionType();
    if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
        !(FT->getReturnType()->isIntegerTy() ||
          FT->getReturnType()->isVoidTy()))
      return 0;

    // Check for a constant string.
    std::string Str;
    if (!GetConstantStringInfo(CI->getArgOperand(0), Str))
      return 0;

    if (Str.empty() && CI->use_empty()) {
      // puts("") -> putchar('\n')
      Value *Res = EmitPutChar(B.getInt32('\n'), B, TD);
      // use_empty() is always true here given the guard above, so the call
      // is simply deleted; the cast path is kept for safety.
      if (CI->use_empty()) return CI;
      return B.CreateIntCast(Res, CI->getType(), true);
    }

    return 0;
  }
};

} // end anonymous namespace.

//===----------------------------------------------------------------------===//
// SimplifyLibCalls Pass Implementation
//===----------------------------------------------------------------------===//

namespace {
  /// This pass optimizes well known library functions from libc and libm.
  ///
  class SimplifyLibCalls : public FunctionPass {
    // Maps a library function name to the handler object for it.  Filled
    // lazily by InitOptimizations() on the first runOnFunction call.
    StringMap<LibCallOptimization*> Optimizations;
    // String and Memory LibCall Optimizations
    StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrRChrOpt StrRChr;
    StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; StrCpyOpt StrCpy; StrCpyOpt StrCpyChk;
    StrNCpyOpt StrNCpy; StrLenOpt StrLen; StrPBrkOpt StrPBrk;
    StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr;
    MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet;
    // Math Library Optimizations
    PowOpt Pow; Exp2Opt Exp2; UnaryDoubleFPOpt UnaryDoubleFP;
    // Integer Optimizations
    FFSOpt FFS; AbsOpt Abs; IsDigitOpt IsDigit; IsAsciiOpt IsAscii;
    ToAsciiOpt ToAscii;
    // Formatting and IO Optimizations
    SPrintFOpt SPrintF; PrintFOpt PrintF;
    FWriteOpt FWrite; FPutsOpt FPuts; FPrintFOpt FPrintF;
    PutsOpt Puts;

    bool Modified;  // This is only used by doInitialization.
  public:
    static char ID; // Pass identification
    // StrCpy and StrCpyChk share one handler type; the bool ctor argument
    // presumably selects the __strcpy_chk behavior — the StrCpyOpt
    // constructor is declared earlier in this file; verify there.
    SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true) {
      initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
    }
    void InitOptimizations();
    bool runOnFunction(Function &F);

    // Helpers used by doInitialization to annotate known library functions;
    // each bumps NumAnnotated and sets Modified when it changes anything.
    void setDoesNotAccessMemory(Function &F);
    void setOnlyReadsMemory(Function &F);
    void setDoesNotThrow(Function &F);
    void setDoesNotCapture(Function &F, unsigned n);
    void setDoesNotAlias(Function &F, unsigned n);
    bool doInitialization(Module &M);

    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      // Deliberately empty: this pass requires no analyses and declares
      // nothing preserved.
    }
  };
  char SimplifyLibCalls::ID = 0;
} // end anonymous namespace.
+ +INITIALIZE_PASS(SimplifyLibCalls, "simplify-libcalls", + "Simplify well-known library calls", false, false) + +// Public interface to the Simplify LibCalls pass. +FunctionPass *llvm::createSimplifyLibCallsPass() { + return new SimplifyLibCalls(); +} + +/// Optimizations - Populate the Optimizations map with all the optimizations +/// we know. +void SimplifyLibCalls::InitOptimizations() { + // String and Memory LibCall Optimizations + Optimizations["strcat"] = &StrCat; + Optimizations["strncat"] = &StrNCat; + Optimizations["strchr"] = &StrChr; + Optimizations["strrchr"] = &StrRChr; + Optimizations["strcmp"] = &StrCmp; + Optimizations["strncmp"] = &StrNCmp; + Optimizations["strcpy"] = &StrCpy; + Optimizations["strncpy"] = &StrNCpy; + Optimizations["strlen"] = &StrLen; + Optimizations["strpbrk"] = &StrPBrk; + Optimizations["strtol"] = &StrTo; + Optimizations["strtod"] = &StrTo; + Optimizations["strtof"] = &StrTo; + Optimizations["strtoul"] = &StrTo; + Optimizations["strtoll"] = &StrTo; + Optimizations["strtold"] = &StrTo; + Optimizations["strtoull"] = &StrTo; + Optimizations["strspn"] = &StrSpn; + Optimizations["strcspn"] = &StrCSpn; + Optimizations["strstr"] = &StrStr; + Optimizations["memcmp"] = &MemCmp; + Optimizations["memcpy"] = &MemCpy; + Optimizations["memmove"] = &MemMove; + Optimizations["memset"] = &MemSet; + + // _chk variants of String and Memory LibCall Optimizations. 
+ Optimizations["__strcpy_chk"] = &StrCpyChk; + + // Math Library Optimizations + Optimizations["powf"] = &Pow; + Optimizations["pow"] = &Pow; + Optimizations["powl"] = &Pow; + Optimizations["llvm.pow.f32"] = &Pow; + Optimizations["llvm.pow.f64"] = &Pow; + Optimizations["llvm.pow.f80"] = &Pow; + Optimizations["llvm.pow.f128"] = &Pow; + Optimizations["llvm.pow.ppcf128"] = &Pow; + Optimizations["exp2l"] = &Exp2; + Optimizations["exp2"] = &Exp2; + Optimizations["exp2f"] = &Exp2; + Optimizations["llvm.exp2.ppcf128"] = &Exp2; + Optimizations["llvm.exp2.f128"] = &Exp2; + Optimizations["llvm.exp2.f80"] = &Exp2; + Optimizations["llvm.exp2.f64"] = &Exp2; + Optimizations["llvm.exp2.f32"] = &Exp2; + +#ifdef HAVE_FLOORF + Optimizations["floor"] = &UnaryDoubleFP; +#endif +#ifdef HAVE_CEILF + Optimizations["ceil"] = &UnaryDoubleFP; +#endif +#ifdef HAVE_ROUNDF + Optimizations["round"] = &UnaryDoubleFP; +#endif +#ifdef HAVE_RINTF + Optimizations["rint"] = &UnaryDoubleFP; +#endif +#ifdef HAVE_NEARBYINTF + Optimizations["nearbyint"] = &UnaryDoubleFP; +#endif + + // Integer Optimizations + Optimizations["ffs"] = &FFS; + Optimizations["ffsl"] = &FFS; + Optimizations["ffsll"] = &FFS; + Optimizations["abs"] = &Abs; + Optimizations["labs"] = &Abs; + Optimizations["llabs"] = &Abs; + Optimizations["isdigit"] = &IsDigit; + Optimizations["isascii"] = &IsAscii; + Optimizations["toascii"] = &ToAscii; + + // Formatting and IO Optimizations + Optimizations["sprintf"] = &SPrintF; + Optimizations["printf"] = &PrintF; + Optimizations["fwrite"] = &FWrite; + Optimizations["fputs"] = &FPuts; + Optimizations["fprintf"] = &FPrintF; + Optimizations["puts"] = &Puts; +} + + +/// runOnFunction - Top level algorithm. 
///
bool SimplifyLibCalls::runOnFunction(Function &F) {
  // Lazily build the name -> optimization table on first use.
  if (Optimizations.empty())
    InitOptimizations();

  // TargetData is optional; optimizations that require it must handle a
  // null pointer themselves.
  const TargetData *TD = getAnalysisIfAvailable<TargetData>();

  IRBuilder<> Builder(F.getContext());

  bool Changed = false;
  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
      // Ignore non-calls.  I is advanced here, before any transformation,
      // so erasing CI later cannot invalidate the loop iterator.
      CallInst *CI = dyn_cast<CallInst>(I++);
      if (!CI) continue;

      // Ignore indirect calls and calls to non-external functions.
      Function *Callee = CI->getCalledFunction();
      if (Callee == 0 || !Callee->isDeclaration() ||
          !(Callee->hasExternalLinkage() || Callee->hasDLLImportLinkage()))
        continue;

      // Ignore unknown calls.
      LibCallOptimization *LCO = Optimizations.lookup(Callee->getName());
      if (!LCO) continue;

      // Set the builder to the instruction after the call.
      Builder.SetInsertPoint(BB, I);

      // Try to optimize this call.  A null result means "no change".
      Value *Result = LCO->OptimizeCall(CI, TD, Builder);
      if (Result == 0) continue;

      DEBUG(dbgs() << "SimplifyLibCalls simplified: " << *CI;
            dbgs() << " into: " << *Result << "\n");

      // Something changed!
      Changed = true;
      ++NumSimplified;

      // Inspect the instruction after the call (which was potentially just
      // added) next.  Resetting I from CI (rather than keeping the advanced
      // iterator) ensures instructions the optimizer inserted after the call
      // are themselves visited.
      I = CI; ++I;

      // Replace uses and drop the original call.  CI is erased only after I
      // has been moved past it, so the iterator stays valid.
      if (CI != Result && !CI->use_empty()) {
        CI->replaceAllUsesWith(Result);
        if (!Result->hasName())
          Result->takeName(CI);
      }
      CI->eraseFromParent();
    }
  }
  return Changed;
}

// Utility methods for doInitialization.
+ +void SimplifyLibCalls::setDoesNotAccessMemory(Function &F) { + if (!F.doesNotAccessMemory()) { + F.setDoesNotAccessMemory(); + ++NumAnnotated; + Modified = true; + } +} +void SimplifyLibCalls::setOnlyReadsMemory(Function &F) { + if (!F.onlyReadsMemory()) { + F.setOnlyReadsMemory(); + ++NumAnnotated; + Modified = true; + } +} +void SimplifyLibCalls::setDoesNotThrow(Function &F) { + if (!F.doesNotThrow()) { + F.setDoesNotThrow(); + ++NumAnnotated; + Modified = true; + } +} +void SimplifyLibCalls::setDoesNotCapture(Function &F, unsigned n) { + if (!F.doesNotCapture(n)) { + F.setDoesNotCapture(n); + ++NumAnnotated; + Modified = true; + } +} +void SimplifyLibCalls::setDoesNotAlias(Function &F, unsigned n) { + if (!F.doesNotAlias(n)) { + F.setDoesNotAlias(n); + ++NumAnnotated; + Modified = true; + } +} + +/// doInitialization - Add attributes to well-known functions. +/// +bool SimplifyLibCalls::doInitialization(Module &M) { + Modified = false; + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function &F = *I; + if (!F.isDeclaration()) + continue; + + if (!F.hasName()) + continue; + + const FunctionType *FTy = F.getFunctionType(); + + StringRef Name = F.getName(); + switch (Name[0]) { + case 's': + if (Name == "strlen") { + if (FTy->getNumParams() != 1 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "strchr" || + Name == "strrchr") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isIntegerTy()) + continue; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + } else if (Name == "strcpy" || + Name == "stpcpy" || + Name == "strcat" || + Name == "strtol" || + Name == "strtod" || + Name == "strtof" || + Name == "strtoul" || + Name == "strtoll" || + Name == "strtold" || + Name == "strncat" || + Name == "strncpy" || + Name == "strtoull") { + if (FTy->getNumParams() < 2 || + 
!FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "strxfrm") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "strcmp" || + Name == "strspn" || + Name == "strncmp" || + Name == "strcspn" || + Name == "strcoll" || + Name == "strcasecmp" || + Name == "strncasecmp") { + if (FTy->getNumParams() < 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "strstr" || + Name == "strpbrk") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(1)->isPointerTy()) + continue; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "strtok" || + Name == "strtok_r") { + if (FTy->getNumParams() < 2 || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "scanf" || + Name == "setbuf" || + Name == "setvbuf") { + if (FTy->getNumParams() < 1 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "strdup" || + Name == "strndup") { + if (FTy->getNumParams() < 1 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + } else if (Name == "stat" || + Name == "sscanf" || + Name == "sprintf" || + Name == "statvfs") { + if (FTy->getNumParams() < 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "snprintf") { + if (FTy->getNumParams() != 3 || + 
!FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 3); + } else if (Name == "setitimer") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + setDoesNotCapture(F, 3); + } else if (Name == "system") { + if (FTy->getNumParams() != 1 || + !FTy->getParamType(0)->isPointerTy()) + continue; + // May throw; "system" is a valid pthread cancellation point. + setDoesNotCapture(F, 1); + } + break; + case 'm': + if (Name == "malloc") { + if (FTy->getNumParams() != 1 || + !FTy->getReturnType()->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + } else if (Name == "memcmp") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "memchr" || + Name == "memrchr") { + if (FTy->getNumParams() != 3) + continue; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + } else if (Name == "modf" || + Name == "modff" || + Name == "modfl" || + Name == "memcpy" || + Name == "memccpy" || + Name == "memmove") { + if (FTy->getNumParams() < 2 || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "memalign") { + if (!FTy->getReturnType()->isPointerTy()) + continue; + setDoesNotAlias(F, 0); + } else if (Name == "mkdir" || + Name == "mktime") { + if (FTy->getNumParams() == 0 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'r': + if (Name == "realloc") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getReturnType()->isPointerTy()) + continue; + setDoesNotThrow(F); + 
setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + } else if (Name == "read") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(1)->isPointerTy()) + continue; + // May throw; "read" is a valid pthread cancellation point. + setDoesNotCapture(F, 2); + } else if (Name == "rmdir" || + Name == "rewind" || + Name == "remove" || + Name == "realpath") { + if (FTy->getNumParams() < 1 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "rename" || + Name == "readlink") { + if (FTy->getNumParams() < 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } + break; + case 'w': + if (Name == "write") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(1)->isPointerTy()) + continue; + // May throw; "write" is a valid pthread cancellation point. + setDoesNotCapture(F, 2); + } + break; + case 'b': + if (Name == "bcopy") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "bcmp") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setOnlyReadsMemory(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "bzero") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'c': + if (Name == "calloc") { + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + } else if (Name == "chmod" || + Name == "chown" || + Name == "ctermid" || + Name == "clearerr" || + Name == "closedir") { + if (FTy->getNumParams() == 0 || 
+ !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'a': + if (Name == "atoi" || + Name == "atol" || + Name == "atof" || + Name == "atoll") { + if (FTy->getNumParams() != 1 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setOnlyReadsMemory(F); + setDoesNotCapture(F, 1); + } else if (Name == "access") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'f': + if (Name == "fopen") { + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "fdopen") { + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 2); + } else if (Name == "feof" || + Name == "free" || + Name == "fseek" || + Name == "ftell" || + Name == "fgetc" || + Name == "fseeko" || + Name == "ftello" || + Name == "fileno" || + Name == "fflush" || + Name == "fclose" || + Name == "fsetpos" || + Name == "flockfile" || + Name == "funlockfile" || + Name == "ftrylockfile") { + if (FTy->getNumParams() == 0 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "ferror") { + if (FTy->getNumParams() != 1 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setOnlyReadsMemory(F); + } else if (Name == "fputc" || + Name == "fstat" || + Name == "frexp" || + Name == "frexpf" || + Name == "frexpl" || + Name == "fstatvfs") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); 
+ setDoesNotCapture(F, 2); + } else if (Name == "fgets") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 3); + } else if (Name == "fread" || + Name == "fwrite") { + if (FTy->getNumParams() != 4 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(3)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 4); + } else if (Name == "fputs" || + Name == "fscanf" || + Name == "fprintf" || + Name == "fgetpos") { + if (FTy->getNumParams() < 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } + break; + case 'g': + if (Name == "getc" || + Name == "getlogin_r" || + Name == "getc_unlocked") { + if (FTy->getNumParams() == 0 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "getenv") { + if (FTy->getNumParams() != 1 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setOnlyReadsMemory(F); + setDoesNotCapture(F, 1); + } else if (Name == "gets" || + Name == "getchar") { + setDoesNotThrow(F); + } else if (Name == "getitimer") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "getpwnam") { + if (FTy->getNumParams() != 1 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'u': + if (Name == "ungetc") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "uname" || + Name == "unlink" || + Name == "unsetenv") { + if (FTy->getNumParams() != 1 || + !FTy->getParamType(0)->isPointerTy()) + continue; + 
setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "utime" || + Name == "utimes") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } + break; + case 'p': + if (Name == "putc") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "puts" || + Name == "printf" || + Name == "perror") { + if (FTy->getNumParams() != 1 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "pread" || + Name == "pwrite") { + if (FTy->getNumParams() != 4 || + !FTy->getParamType(1)->isPointerTy()) + continue; + // May throw; these are valid pthread cancellation points. + setDoesNotCapture(F, 2); + } else if (Name == "putchar") { + setDoesNotThrow(F); + } else if (Name == "popen") { + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "pclose") { + if (FTy->getNumParams() != 1 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'v': + if (Name == "vscanf") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "vsscanf" || + Name == "vfscanf") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "valloc") { + if (!FTy->getReturnType()->isPointerTy()) + continue; + 
setDoesNotThrow(F); + setDoesNotAlias(F, 0); + } else if (Name == "vprintf") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "vfprintf" || + Name == "vsprintf") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "vsnprintf") { + if (FTy->getNumParams() != 4 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 3); + } + break; + case 'o': + if (Name == "open") { + if (FTy->getNumParams() < 2 || + !FTy->getParamType(0)->isPointerTy()) + continue; + // May throw; "open" is a valid pthread cancellation point. + setDoesNotCapture(F, 1); + } else if (Name == "opendir") { + if (FTy->getNumParams() != 1 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + } + break; + case 't': + if (Name == "tmpfile") { + if (!FTy->getReturnType()->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + } else if (Name == "times") { + if (FTy->getNumParams() != 1 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'h': + if (Name == "htonl" || + Name == "htons") { + setDoesNotThrow(F); + setDoesNotAccessMemory(F); + } + break; + case 'n': + if (Name == "ntohl" || + Name == "ntohs") { + setDoesNotThrow(F); + setDoesNotAccessMemory(F); + } + break; + case 'l': + if (Name == "lstat") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } 
else if (Name == "lchown") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'q': + if (Name == "qsort") { + if (FTy->getNumParams() != 4 || + !FTy->getParamType(3)->isPointerTy()) + continue; + // May throw; places call through function pointer. + setDoesNotCapture(F, 4); + } + break; + case '_': + if (Name == "__strdup" || + Name == "__strndup") { + if (FTy->getNumParams() < 1 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + } else if (Name == "__strtok_r") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "_IO_getc") { + if (FTy->getNumParams() != 1 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "_IO_putc") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } + break; + case 1: + if (Name == "\1__isoc99_scanf") { + if (FTy->getNumParams() < 1 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "\1stat64" || + Name == "\1lstat64" || + Name == "\1statvfs64" || + Name == "\1__isoc99_sscanf") { + if (FTy->getNumParams() < 1 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "\1fopen64") { + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } 
else if (Name == "\1fseeko64" || + Name == "\1ftello64") { + if (FTy->getNumParams() == 0 || + !FTy->getParamType(0)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "\1tmpfile64") { + if (!FTy->getReturnType()->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + } else if (Name == "\1fstat64" || + Name == "\1fstatvfs64") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(1)->isPointerTy()) + continue; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "\1open64") { + if (FTy->getNumParams() < 2 || + !FTy->getParamType(0)->isPointerTy()) + continue; + // May throw; "open" is a valid pthread cancellation point. + setDoesNotCapture(F, 1); + } + break; + } + } + return Modified; +} + +// TODO: +// Additional cases that we need to add to this file: +// +// cbrt: +// * cbrt(expN(X)) -> expN(x/3) +// * cbrt(sqrt(x)) -> pow(x,1/6) +// * cbrt(sqrt(x)) -> pow(x,1/9) +// +// cos, cosf, cosl: +// * cos(-x) -> cos(x) +// +// exp, expf, expl: +// * exp(log(x)) -> x +// +// log, logf, logl: +// * log(exp(x)) -> x +// * log(x**y) -> y*log(x) +// * log(exp(y)) -> y*log(e) +// * log(exp2(y)) -> y*log(2) +// * log(exp10(y)) -> y*log(10) +// * log(sqrt(x)) -> 0.5*log(x) +// * log(pow(x,y)) -> y*log(x) +// +// lround, lroundf, lroundl: +// * lround(cnst) -> cnst' +// +// pow, powf, powl: +// * pow(exp(x),y) -> exp(x*y) +// * pow(sqrt(x),y) -> pow(x,y*0.5) +// * pow(pow(x,y),z)-> pow(x,y*z) +// +// round, roundf, roundl: +// * round(cnst) -> cnst' +// +// signbit: +// * signbit(cnst) -> cnst' +// * signbit(nncst) -> 0 (if pstv is a non-negative constant) +// +// sqrt, sqrtf, sqrtl: +// * sqrt(expN(x)) -> expN(x*0.5) +// * sqrt(Nroot(x)) -> pow(x,1/(2*N)) +// * sqrt(pow(x,y)) -> pow(|x|,y*0.5) +// +// stpcpy: +// * stpcpy(str, "literal") -> +// llvm.memcpy(str,"literal",strlen("literal")+1,1) +// +// tan, tanf, tanl: +// * tan(atan(x)) -> x +// +// trunc, truncf, truncl: +// * trunc(cnst) 
-> cnst'
//
//
diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
new file mode 100644
index 0000000..705f442
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -0,0 +1,274 @@
//===-- Sink.cpp - Code Sinking -------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass moves instructions into successor blocks, when possible, so that
// they aren't executed on paths where their results aren't needed.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "sink"
#include "llvm/Transforms/Scalar.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

STATISTIC(NumSunk, "Number of instructions sunk");

namespace {
  /// Sinking - Function pass that moves instructions into successor blocks
  /// when their results are only needed there.
  class Sinking : public FunctionPass {
    DominatorTree *DT;  // Used to check that sinking preserves dominance.
    LoopInfo *LI;       // Used to avoid sinking into loop headers.
    AliasAnalysis *AA;  // Used to check loads against later stores.

  public:
    static char ID; // Pass identification
    Sinking() : FunctionPass(ID) {
      initializeSinkingPass(*PassRegistry::getPassRegistry());
    }

    virtual bool runOnFunction(Function &F);

    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.setPreservesCFG();
      FunctionPass::getAnalysisUsage(AU);
      AU.addRequired<AliasAnalysis>();
      AU.addRequired<DominatorTree>();
      AU.addRequired<LoopInfo>();
      AU.addPreserved<DominatorTree>();
      AU.addPreserved<LoopInfo>();
    }
  private:
    bool ProcessBlock(BasicBlock &BB);
    bool SinkInstruction(Instruction *I, SmallPtrSet<Instruction *, 8> &Stores);
    bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const;
  };
} // end anonymous namespace

char Sinking::ID = 0;
INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false)

FunctionPass *llvm::createSinkingPass() { return new Sinking(); }

/// AllUsesDominatedByBlock - Return true if all uses of the specified value
/// occur in blocks dominated by the specified block.
bool Sinking::AllUsesDominatedByBlock(Instruction *Inst,
                                      BasicBlock *BB) const {
  // Ignoring debug uses is necessary so debug info doesn't affect the code.
  // This may leave a referencing dbg_value in the original block, before
  // the definition of the vreg.  Dwarf generator handles this although the
  // user might not get the right info at runtime.
  for (Value::use_iterator I = Inst->use_begin(),
       E = Inst->use_end(); I != E; ++I) {
    // Determine the block of the use.
    Instruction *UseInst = cast<Instruction>(*I);
    BasicBlock *UseBlock = UseInst->getParent();
    if (PHINode *PN = dyn_cast<PHINode>(UseInst)) {
      // PHI nodes use the operand in the predecessor block, not the block with
      // the PHI.
      unsigned Num = PHINode::getIncomingValueNumForOperand(I.getOperandNo());
      UseBlock = PN->getIncomingBlock(Num);
    }
    // Check that it dominates.
    if (!DT->dominates(BB, UseBlock))
      return false;
  }
  return true;
}

/// runOnFunction - Iterate over every block in F, sinking what can be sunk,
/// and repeat until a whole sweep makes no change (sinking one instruction
/// can enable sinking of its operands on the next sweep).
bool Sinking::runOnFunction(Function &F) {
  DT = &getAnalysis<DominatorTree>();
  LI = &getAnalysis<LoopInfo>();
  AA = &getAnalysis<AliasAnalysis>();

  bool EverMadeChange = false;

  while (1) {
    bool MadeChange = false;

    // Process all basic blocks.
    for (Function::iterator I = F.begin(), E = F.end();
         I != E; ++I)
      MadeChange |= ProcessBlock(*I);

    // If this iteration over the code changed anything, keep iterating.
    if (!MadeChange) break;
    EverMadeChange = true;
  }
  return EverMadeChange;
}

/// ProcessBlock - Try to sink instructions out of BB, scanning bottom-up so
/// that a value's users are (potentially) sunk before the value itself.
bool Sinking::ProcessBlock(BasicBlock &BB) {
  // Can't sink anything out of a block that has less than two successors.
  if (BB.getTerminator()->getNumSuccessors() <= 1 || BB.empty()) return false;

  // Don't bother sinking code out of unreachable blocks. In addition to being
  // unprofitable, it can also lead to infinite looping, because in an
  // unreachable loop there may be nowhere to stop.
  if (!DT->isReachableFromEntry(&BB)) return false;

  bool MadeChange = false;

  // Walk the basic block bottom-up.  Remember if we saw a store.
  BasicBlock::iterator I = BB.end();
  --I;
  bool ProcessedBegin = false;
  SmallPtrSet<Instruction *, 8> Stores;
  do {
    Instruction *Inst = I;  // The instruction to sink.

    // Predecrement I (if it's not begin) so that it isn't invalidated by
    // sinking.
    ProcessedBegin = I == BB.begin();
    if (!ProcessedBegin)
      --I;

    if (isa<DbgInfoIntrinsic>(Inst))
      continue;

    if (SinkInstruction(Inst, Stores))
      ++NumSunk, MadeChange = true;

    // If we just processed the first instruction in the block, we're done.
  } while (!ProcessedBegin);

  return MadeChange;
}

/// isSafeToMove - Return true if Inst may be moved at all.  As a side effect,
/// collects every memory-writing instruction seen into Stores so later
/// (earlier-in-block) loads can be checked against them.
static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
                         SmallPtrSet<Instruction *, 8> &Stores) {
  if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
    if (L->isVolatile()) return false;

    // Moving this load below any of the already-seen stores could change the
    // value it reads; check each one for a possible modification.
    AliasAnalysis::Location Loc = AA->getLocation(L);
    for (SmallPtrSet<Instruction *, 8>::iterator I = Stores.begin(),
         E = Stores.end(); I != E; ++I)
      if (AA->getModRefInfo(*I, Loc) & AliasAnalysis::Mod)
        return false;
  }

  if (Inst->mayWriteToMemory()) {
    Stores.insert(Inst);
    return false;
  }

  if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst))
    return false;

  return true;
}

/// SinkInstruction - Determine whether it is safe to sink the specified machine
/// instruction out of its current block into a successor.
bool Sinking::SinkInstruction(Instruction *Inst,
                              SmallPtrSet<Instruction *, 8> &Stores) {
  // Check if it's safe to move the instruction.
  if (!isSafeToMove(Inst, AA, Stores))
    return false;

  // FIXME: This should include support for sinking instructions within the
  // block they are currently in to shorten the live ranges.  We often get
  // instructions sunk into the top of a large block, but it would be better to
  // also sink them down before their first use in the block.  This xform has to
  // be careful not to *increase* register pressure though, e.g. sinking
  // "x = y + z" down if it kills y and z would increase the live ranges of y
  // and z and only shrink the live range of x.

  // Loop over all the operands of the specified instruction.  If there is
  // anything we can't handle, bail out.
  BasicBlock *ParentBlock = Inst->getParent();

  // SuccToSinkTo - This is the successor to sink this instruction to, once we
  // decide.
  BasicBlock *SuccToSinkTo = 0;

  // FIXME: This picks a successor to sink into based on having one
  // successor that dominates all the uses.  However, there are cases where
  // sinking can happen but where the sink point isn't a successor.  For
  // example:
  //   x = computation
  //   if () {} else {}
  //   use x
  // the instruction could be sunk over the whole diamond for the
  // if/then/else (or loop, etc), allowing it to be sunk into other blocks
  // after that.

  // Instructions can only be sunk if all their uses are in blocks
  // dominated by one of the successors.
  // Look at all the successors and decide which one
  // we should sink to.  The first qualifying successor wins.
  for (succ_iterator SI = succ_begin(ParentBlock),
       E = succ_end(ParentBlock); SI != E; ++SI) {
    if (AllUsesDominatedByBlock(Inst, *SI)) {
      SuccToSinkTo = *SI;
      break;
    }
  }

  // If we couldn't find a block to sink to, ignore this instruction.
  if (SuccToSinkTo == 0)
    return false;

  // It is not possible to sink an instruction into its own block.  This can
  // happen with loops.
  if (Inst->getParent() == SuccToSinkTo)
    return false;

  DEBUG(dbgs() << "Sink instr " << *Inst);
  DEBUG(dbgs() << "to block ";
        WriteAsOperand(dbgs(), SuccToSinkTo, false));

  // If the block has multiple predecessors, this would introduce computation
  // on a path that it doesn't already exist.  We could split the critical
  // edge, but for now we just punt.
  // FIXME: Split critical edges if not backedges.
  if (SuccToSinkTo->getUniquePredecessor() != ParentBlock) {
    // We cannot sink a load across a critical edge - there may be stores in
    // other code paths.
    if (!Inst->isSafeToSpeculativelyExecute()) {
      DEBUG(dbgs() << " *** PUNTING: Wont sink load along critical edge.\n");
      return false;
    }

    // We don't want to sink across a critical edge if we don't dominate the
    // successor.  We could be introducing calculations to new code paths.
    if (!DT->dominates(ParentBlock, SuccToSinkTo)) {
      DEBUG(dbgs() << " *** PUNTING: Critical edge found\n");
      return false;
    }

    // Don't sink instructions into a loop.
    if (LI->isLoopHeader(SuccToSinkTo)) {
      DEBUG(dbgs() << " *** PUNTING: Loop header found\n");
      return false;
    }

    // Otherwise we are OK with sinking along a critical edge.
    DEBUG(dbgs() << "Sinking along critical edge.\n");
  }

  // Determine where to insert into.  Skip phi nodes so the instruction lands
  // at the first "real" position in the successor.
  BasicBlock::iterator InsertPos = SuccToSinkTo->begin();
  while (InsertPos != SuccToSinkTo->end() && isa<PHINode>(InsertPos))
    ++InsertPos;

  // Move the instruction.
  Inst->moveBefore(InsertPos);
  return true;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailDuplication.cpp b/contrib/llvm/lib/Transforms/Scalar/TailDuplication.cpp
new file mode 100644
index 0000000..9dd83c0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/TailDuplication.cpp
@@ -0,0 +1,373 @@
//===- TailDuplication.cpp - Simplify CFG through tail duplication --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass performs a limited form of tail duplication, intended to simplify
// CFGs by removing some unconditional branches.  This pass is necessary to
// straighten out loops created by the C front-end, but also is capable of
// making other code nicer.  After this pass is run, the CFG simplify pass
// should be run to clean up the mess.
//
// This pass could be enhanced in the future to use profile information to be
// more aggressive.
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "tailduplicate" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Constant.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Type.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" +#include <map> +using namespace llvm; + +STATISTIC(NumEliminated, "Number of unconditional branches eliminated"); + +static cl::opt<unsigned> +TailDupThreshold("taildup-threshold", + cl::desc("Max block size to tail duplicate"), + cl::init(1), cl::Hidden); + +namespace { + class TailDup : public FunctionPass { + bool runOnFunction(Function &F); + public: + static char ID; // Pass identification, replacement for typeid + TailDup() : FunctionPass(ID) { + initializeTailDupPass(*PassRegistry::getPassRegistry()); + } + + private: + inline bool shouldEliminateUnconditionalBranch(TerminatorInst *, unsigned); + inline void eliminateUnconditionalBranch(BranchInst *BI); + SmallPtrSet<BasicBlock*, 4> CycleDetector; + }; +} + +char TailDup::ID = 0; +INITIALIZE_PASS(TailDup, "tailduplicate", "Tail Duplication", false, false) + +// Public interface to the Tail Duplication pass +FunctionPass *llvm::createTailDuplicationPass() { return new TailDup(); } + +/// runOnFunction - Top level algorithm - Loop over each unconditional branch in +/// the function, eliminating it if it looks attractive enough. CycleDetector +/// prevents infinite loops by checking that we aren't redirecting a branch to +/// a place it already pointed to earlier; see PR 2323. 
+bool TailDup::runOnFunction(Function &F) { + bool Changed = false; + CycleDetector.clear(); + for (Function::iterator I = F.begin(), E = F.end(); I != E; ) { + if (shouldEliminateUnconditionalBranch(I->getTerminator(), + TailDupThreshold)) { + eliminateUnconditionalBranch(cast<BranchInst>(I->getTerminator())); + Changed = true; + } else { + ++I; + CycleDetector.clear(); + } + } + return Changed; +} + +/// shouldEliminateUnconditionalBranch - Return true if this branch looks +/// attractive to eliminate. We eliminate the branch if the destination basic +/// block has <= 5 instructions in it, not counting PHI nodes. In practice, +/// since one of these is a terminator instruction, this means that we will add +/// up to 4 instructions to the new block. +/// +/// We don't count PHI nodes in the count since they will be removed when the +/// contents of the block are copied over. +/// +bool TailDup::shouldEliminateUnconditionalBranch(TerminatorInst *TI, + unsigned Threshold) { + BranchInst *BI = dyn_cast<BranchInst>(TI); + if (!BI || !BI->isUnconditional()) return false; // Not an uncond branch! + + BasicBlock *Dest = BI->getSuccessor(0); + if (Dest == BI->getParent()) return false; // Do not loop infinitely! + + // Do not inline a block if we will just get another branch to the same block! + TerminatorInst *DTI = Dest->getTerminator(); + if (BranchInst *DBI = dyn_cast<BranchInst>(DTI)) + if (DBI->isUnconditional() && DBI->getSuccessor(0) == Dest) + return false; // Do not loop infinitely! + + // FIXME: DemoteRegToStack cannot yet demote invoke instructions to the stack, + // because doing so would require breaking critical edges. This should be + // fixed eventually. + if (!DTI->use_empty()) + return false; + + // Do not bother with blocks with only a single predecessor: simplify + // CFG will fold these two blocks together! + pred_iterator PI = pred_begin(Dest), PE = pred_end(Dest); + ++PI; + if (PI == PE) return false; // Exactly one predecessor! 
+ + BasicBlock::iterator I = Dest->getFirstNonPHI(); + + for (unsigned Size = 0; I != Dest->end(); ++I) { + if (Size == Threshold) return false; // The block is too large. + + // Don't tail duplicate call instructions. They are very large compared to + // other instructions. + if (isa<CallInst>(I) || isa<InvokeInst>(I)) return false; + + // Also alloca and malloc. + if (isa<AllocaInst>(I)) return false; + + // Some vector instructions can expand into a number of instructions. + if (isa<ShuffleVectorInst>(I) || isa<ExtractElementInst>(I) || + isa<InsertElementInst>(I)) return false; + + // Only count instructions that are not debugger intrinsics. + if (!isa<DbgInfoIntrinsic>(I)) ++Size; + } + + // Do not tail duplicate a block that has thousands of successors into a block + // with a single successor if the block has many other predecessors. This can + // cause an N^2 explosion in CFG edges (and PHI node entries), as seen in + // cases that have a large number of indirect gotos. + unsigned NumSuccs = DTI->getNumSuccessors(); + if (NumSuccs > 8) { + unsigned TooMany = 128; + if (NumSuccs >= TooMany) return false; + TooMany = TooMany/NumSuccs; + for (; PI != PE; ++PI) + if (TooMany-- == 0) return false; + } + + // If this unconditional branch is a fall-through, be careful about + // tail duplicating it. In particular, we don't want to taildup it if the + // original block will still be there after taildup is completed: doing so + // would eliminate the fall-through, requiring unconditional branches. + Function::iterator DestI = Dest; + if (&*--DestI == BI->getParent()) { + // The uncond branch is a fall-through. Tail duplication of the block is + // will eliminate the fall-through-ness and end up cloning the terminator + // at the end of the Dest block. Since the original Dest block will + // continue to exist, this means that one or the other will not be able to + // fall through. 
One typical example that this helps with is code like: + // if (a) + // foo(); + // if (b) + // foo(); + // Cloning the 'if b' block into the end of the first foo block is messy. + + // The messy case is when the fall-through block falls through to other + // blocks. This is what we would be preventing if we cloned the block. + DestI = Dest; + if (++DestI != Dest->getParent()->end()) { + BasicBlock *DestSucc = DestI; + // If any of Dest's successors are fall-throughs, don't do this xform. + for (succ_iterator SI = succ_begin(Dest), SE = succ_end(Dest); + SI != SE; ++SI) + if (*SI == DestSucc) + return false; + } + } + + // Finally, check that we haven't redirected to this target block earlier; + // there are cases where we loop forever if we don't check this (PR 2323). + if (!CycleDetector.insert(Dest)) + return false; + + return true; +} + +/// FindObviousSharedDomOf - We know there is a branch from SrcBlock to +/// DestBlock, and that SrcBlock is not the only predecessor of DstBlock. If we +/// can find a predecessor of SrcBlock that is a dominator of both SrcBlock and +/// DstBlock, return it. +static BasicBlock *FindObviousSharedDomOf(BasicBlock *SrcBlock, + BasicBlock *DstBlock) { + // SrcBlock must have a single predecessor. + pred_iterator PI = pred_begin(SrcBlock), PE = pred_end(SrcBlock); + if (PI == PE || ++PI != PE) return 0; + + BasicBlock *SrcPred = *pred_begin(SrcBlock); + + // Look at the predecessors of DstBlock. One of them will be SrcBlock. If + // there is only one other pred, get it, otherwise we can't handle it. + PI = pred_begin(DstBlock); PE = pred_end(DstBlock); + BasicBlock *DstOtherPred = 0; + BasicBlock *P = *PI; + if (P == SrcBlock) { + if (++PI == PE) return 0; + DstOtherPred = *PI; + if (++PI != PE) return 0; + } else { + DstOtherPred = P; + if (++PI == PE || *PI != SrcBlock || ++PI != PE) return 0; + } + + // We can handle two situations here: "if then" and "if then else" blocks. 
An + // 'if then' situation is just where DstOtherPred == SrcPred. + if (DstOtherPred == SrcPred) + return SrcPred; + + // Check to see if we have an "if then else" situation, which means that + // DstOtherPred will have a single predecessor and it will be SrcPred. + PI = pred_begin(DstOtherPred); PE = pred_end(DstOtherPred); + if (PI != PE && *PI == SrcPred) { + if (++PI != PE) return 0; // Not a single pred. + return SrcPred; // Otherwise, it's an "if then" situation. Return the if. + } + + // Otherwise, this is something we can't handle. + return 0; +} + + +/// eliminateUnconditionalBranch - Clone the instructions from the destination +/// block into the source block, eliminating the specified unconditional branch. +/// If the destination block defines values used by successors of the dest +/// block, we may need to insert PHI nodes. +/// +void TailDup::eliminateUnconditionalBranch(BranchInst *Branch) { + BasicBlock *SourceBlock = Branch->getParent(); + BasicBlock *DestBlock = Branch->getSuccessor(0); + assert(SourceBlock != DestBlock && "Our predicate is broken!"); + + DEBUG(dbgs() << "TailDuplication[" << SourceBlock->getParent()->getName() + << "]: Eliminating branch: " << *Branch); + + // See if we can avoid duplicating code by moving it up to a dominator of both + // blocks. + if (BasicBlock *DomBlock = FindObviousSharedDomOf(SourceBlock, DestBlock)) { + DEBUG(dbgs() << "Found shared dominator: " << DomBlock->getName() << "\n"); + + // If there are non-phi instructions in DestBlock that have no operands + // defined in DestBlock, and if the instruction has no side effects, we can + // move the instruction to DomBlock instead of duplicating it. 
+ BasicBlock::iterator BBI = DestBlock->getFirstNonPHI(); + while (!isa<TerminatorInst>(BBI)) { + Instruction *I = BBI++; + + bool CanHoist = I->isSafeToSpeculativelyExecute() && + !I->mayReadFromMemory(); + if (CanHoist) { + for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) + if (Instruction *OpI = dyn_cast<Instruction>(I->getOperand(op))) + if (OpI->getParent() == DestBlock || + (isa<InvokeInst>(OpI) && OpI->getParent() == DomBlock)) { + CanHoist = false; + break; + } + if (CanHoist) { + // Remove from DestBlock, move right before the term in DomBlock. + DestBlock->getInstList().remove(I); + DomBlock->getInstList().insert(DomBlock->getTerminator(), I); + DEBUG(dbgs() << "Hoisted: " << *I); + } + } + } + } + + // Tail duplication can not update SSA properties correctly if the values + // defined in the duplicated tail are used outside of the tail itself. For + // this reason, we spill all values that are used outside of the tail to the + // stack. + for (BasicBlock::iterator I = DestBlock->begin(); I != DestBlock->end(); ++I) + if (I->isUsedOutsideOfBlock(DestBlock)) { + // We found a use outside of the tail. Create a new stack slot to + // break this inter-block usage pattern. + DemoteRegToStack(*I); + } + + // We are going to have to map operands from the original block B to the new + // copy of the block B'. If there are PHI nodes in the DestBlock, these PHI + // nodes also define part of this mapping. Loop over these PHI nodes, adding + // them to our mapping. + // + std::map<Value*, Value*> ValueMapping; + + BasicBlock::iterator BI = DestBlock->begin(); + bool HadPHINodes = isa<PHINode>(BI); + for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) + ValueMapping[PN] = PN->getIncomingValueForBlock(SourceBlock); + + // Clone the non-phi instructions of the dest block into the source block, + // keeping track of the mapping... 
+ // + for (; BI != DestBlock->end(); ++BI) { + Instruction *New = BI->clone(); + New->setName(BI->getName()); + SourceBlock->getInstList().push_back(New); + ValueMapping[BI] = New; + } + + // Now that we have built the mapping information and cloned all of the + // instructions (giving us a new terminator, among other things), walk the new + // instructions, rewriting references of old instructions to use new + // instructions. + // + BI = Branch; ++BI; // Get an iterator to the first new instruction + for (; BI != SourceBlock->end(); ++BI) + for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { + std::map<Value*, Value*>::const_iterator I = + ValueMapping.find(BI->getOperand(i)); + if (I != ValueMapping.end()) + BI->setOperand(i, I->second); + } + + // Next we check to see if any of the successors of DestBlock had PHI nodes. + // If so, we need to add entries to the PHI nodes for SourceBlock now. + for (succ_iterator SI = succ_begin(DestBlock), SE = succ_end(DestBlock); + SI != SE; ++SI) { + BasicBlock *Succ = *SI; + for (BasicBlock::iterator PNI = Succ->begin(); isa<PHINode>(PNI); ++PNI) { + PHINode *PN = cast<PHINode>(PNI); + // Ok, we have a PHI node. Figure out what the incoming value was for the + // DestBlock. + Value *IV = PN->getIncomingValueForBlock(DestBlock); + + // Remap the value if necessary... + std::map<Value*, Value*>::const_iterator I = ValueMapping.find(IV); + if (I != ValueMapping.end()) + IV = I->second; + PN->addIncoming(IV, SourceBlock); + } + } + + // Next, remove the old branch instruction, and any PHI node entries that we + // had. + BI = Branch; ++BI; // Get an iterator to the first new instruction + DestBlock->removePredecessor(SourceBlock); // Remove entries in PHI nodes... + SourceBlock->getInstList().erase(Branch); // Destroy the uncond branch... 
+ + // Final step: now that we have finished everything up, walk the cloned + // instructions one last time, constant propagating and DCE'ing them, because + // they may not be needed anymore. + // + if (HadPHINodes) { + while (BI != SourceBlock->end()) { + Instruction *Inst = BI++; + if (isInstructionTriviallyDead(Inst)) + Inst->eraseFromParent(); + else if (Value *V = SimplifyInstruction(Inst)) { + Inst->replaceAllUsesWith(V); + Inst->eraseFromParent(); + } + } + } + + ++NumEliminated; // We just killed a branch! +} diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp new file mode 100644 index 0000000..5b6bc04 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -0,0 +1,630 @@ +//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file transforms calls of the current function (self recursion) followed +// by a return instruction with a branch to the entry of the function, creating +// a loop. This pass also implements the following extensions to the basic +// algorithm: +// +// 1. Trivial instructions between the call and return do not prevent the +// transformation from taking place, though currently the analysis cannot +// support moving any really useful instructions (only dead ones). +// 2. This pass transforms functions that are prevented from being tail +// recursive by an associative and commutative expression to use an +// accumulator variable, thus compiling the typical naive factorial or +// 'fib' implementation into efficient code. +// 3. 
TRE is performed if the function returns void, if the return +// returns the result returned by the call, or if the function returns a +// run-time constant on all exits from the function. It is possible, though +// unlikely, that the return returns something else (like constant 0), and +// can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in +// the function return the exact same value. +// 4. If it can prove that callees do not access their caller stack frame, +// they are marked as eligible for tail call elimination (by the code +// generator). +// +// There are several improvements that could be made: +// +// 1. If the function has any alloca instructions, these instructions will be +// moved out of the entry block of the function, causing them to be +// evaluated each time through the tail recursion. Safely keeping allocas +// in the entry block requires analysis to proves that the tail-called +// function does not read or write the stack object. +// 2. Tail recursion is only performed if the call immediately preceeds the +// return instruction. It's possible that there could be a jump between +// the call and the return. +// 3. There can be intervening operations between the call and the return that +// prevent the TRE from occurring. For example, there could be GEP's and +// stores to memory that will not be read or written by the call. This +// requires some substantial analysis (such as with DSA) to prove safe to +// move ahead of the call, but doing so could allow many more TREs to be +// performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark. +// 4. The algorithm we use to detect if callees access their caller stack +// frames is very primitive. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "tailcallelim" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +using namespace llvm; + +STATISTIC(NumEliminated, "Number of tail calls removed"); +STATISTIC(NumRetDuped, "Number of return duplicated"); +STATISTIC(NumAccumAdded, "Number of accumulators introduced"); + +namespace { + struct TailCallElim : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + TailCallElim() : FunctionPass(ID) { + initializeTailCallElimPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function &F); + + private: + CallInst *FindTRECandidate(Instruction *I, + bool CannotTailCallElimCallsMarkedTail); + bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, + BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail); + bool FoldReturnAndProcessPred(BasicBlock *BB, + ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail); + bool ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail); + bool 
CanMoveAboveCall(Instruction *I, CallInst *CI); + Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI); + }; +} + +char TailCallElim::ID = 0; +INITIALIZE_PASS(TailCallElim, "tailcallelim", + "Tail Call Elimination", false, false) + +// Public interface to the TailCallElimination pass +FunctionPass *llvm::createTailCallEliminationPass() { + return new TailCallElim(); +} + +/// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by +/// callees of this function. We only do very simple analysis right now, this +/// could be expanded in the future to use mod/ref information for particular +/// call sites if desired. +static bool AllocaMightEscapeToCalls(AllocaInst *AI) { + // FIXME: do simple 'address taken' analysis. + return true; +} + +/// CheckForEscapingAllocas - Scan the specified basic block for alloca +/// instructions. If it contains any that might be accessed by calls, return +/// true. +static bool CheckForEscapingAllocas(BasicBlock *BB, + bool &CannotTCETailMarkedCall) { + bool RetVal = false; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { + RetVal |= AllocaMightEscapeToCalls(AI); + + // If this alloca is in the body of the function, or if it is a variable + // sized allocation, we cannot tail call eliminate calls marked 'tail' + // with this mechanism. + if (BB != &BB->getParent()->getEntryBlock() || + !isa<ConstantInt>(AI->getArraySize())) + CannotTCETailMarkedCall = true; + } + return RetVal; +} + +bool TailCallElim::runOnFunction(Function &F) { + // If this function is a varargs function, we won't be able to PHI the args + // right, so don't even try to convert it... 
+ if (F.getFunctionType()->isVarArg()) return false; + + BasicBlock *OldEntry = 0; + bool TailCallsAreMarkedTail = false; + SmallVector<PHINode*, 8> ArgumentPHIs; + bool MadeChange = false; + bool FunctionContainsEscapingAllocas = false; + + // CannotTCETailMarkedCall - If true, we cannot perform TCE on tail calls + // marked with the 'tail' attribute, because doing so would cause the stack + // size to increase (real TCE would deallocate variable sized allocas, TCE + // doesn't). + bool CannotTCETailMarkedCall = false; + + // Loop over the function, looking for any returning blocks, and keeping track + // of whether this function has any non-trivially used allocas. + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (FunctionContainsEscapingAllocas && CannotTCETailMarkedCall) + break; + + FunctionContainsEscapingAllocas |= + CheckForEscapingAllocas(BB, CannotTCETailMarkedCall); + } + + /// FIXME: The code generator produces really bad code when an 'escaping + /// alloca' is changed from being a static alloca to being a dynamic alloca. + /// Until this is resolved, disable this transformation if that would ever + /// happen. This bug is PR962. + if (FunctionContainsEscapingAllocas) + return false; + + // Second pass, change any tail calls to loops. + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { + bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, + ArgumentPHIs,CannotTCETailMarkedCall); + if (!Change && BB->getFirstNonPHIOrDbg() == Ret) + Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, + TailCallsAreMarkedTail, ArgumentPHIs, + CannotTCETailMarkedCall); + MadeChange |= Change; + } + } + + // If we eliminated any tail recursions, it's possible that we inserted some + // silly PHI nodes which just merge an initial value (the incoming operand) + // with themselves. Check to see if we did and clean up our mess if so. 
This + // occurs when a function passes an argument straight through to its tail + // call. + if (!ArgumentPHIs.empty()) { + for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) { + PHINode *PN = ArgumentPHIs[i]; + + // If the PHI Node is a dynamic constant, replace it with the value it is. + if (Value *PNV = SimplifyInstruction(PN)) { + PN->replaceAllUsesWith(PNV); + PN->eraseFromParent(); + } + } + } + + // Finally, if this function contains no non-escaping allocas, mark all calls + // in the function as eligible for tail calls (there is no stack memory for + // them to access). + if (!FunctionContainsEscapingAllocas) + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + if (CallInst *CI = dyn_cast<CallInst>(I)) { + CI->setTailCall(); + MadeChange = true; + } + + return MadeChange; +} + + +/// CanMoveAboveCall - Return true if it is safe to move the specified +/// instruction from after the call to before the call, assuming that all +/// instructions between the call and this instruction are movable. +/// +bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) { + // FIXME: We can move load/store/call/free instructions above the call if the + // call does not mod/ref the memory location being processed. + if (I->mayHaveSideEffects()) // This also handles volatile loads. + return false; + + if (LoadInst *L = dyn_cast<LoadInst>(I)) { + // Loads may always be moved above calls without side effects. + if (CI->mayHaveSideEffects()) { + // Non-volatile loads may be moved above a call with side effects if it + // does not write to memory and the load provably won't trap. + // FIXME: Writes to memory only matter if they may alias the pointer + // being loaded from. 
+ if (CI->mayWriteToMemory() || + !isSafeToLoadUnconditionally(L->getPointerOperand(), L, + L->getAlignment())) + return false; + } + } + + // Otherwise, if this is a side-effect free instruction, check to make sure + // that it does not use the return value of the call. If it doesn't use the + // return value of the call, it must only use things that are defined before + // the call, or movable instructions between the call and the instruction + // itself. + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) + if (I->getOperand(i) == CI) + return false; + return true; +} + +// isDynamicConstant - Return true if the specified value is the same when the +// return would exit as it was when the initial iteration of the recursive +// function was executed. +// +// We currently handle static constants and arguments that are not modified as +// part of the recursion. +// +static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) { + if (isa<Constant>(V)) return true; // Static constants are always dyn consts + + // Check to see if this is an immutable argument, if so, the value + // will be available to initialize the accumulator. + if (Argument *Arg = dyn_cast<Argument>(V)) { + // Figure out which argument number this is... + unsigned ArgNo = 0; + Function *F = CI->getParent()->getParent(); + for (Function::arg_iterator AI = F->arg_begin(); &*AI != Arg; ++AI) + ++ArgNo; + + // If we are passing this argument into call as the corresponding + // argument operand, then the argument is dynamically constant. + // Otherwise, we cannot transform this function safely. + if (CI->getArgOperand(ArgNo) == Arg) + return true; + } + + // Switch cases are always constant integers. If the value is being switched + // on and the return is only reachable from one of its cases, it's + // effectively constant. 
+ if (BasicBlock *UniquePred = RI->getParent()->getUniquePredecessor()) + if (SwitchInst *SI = dyn_cast<SwitchInst>(UniquePred->getTerminator())) + if (SI->getCondition() == V) + return SI->getDefaultDest() != RI->getParent(); + + // Not a constant or immutable argument, we can't safely transform. + return false; +} + +// getCommonReturnValue - Check to see if the function containing the specified +// tail call consistently returns the same runtime-constant value at all exit +// points except for IgnoreRI. If so, return the returned value. +// +static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { + Function *F = CI->getParent()->getParent(); + Value *ReturnedValue = 0; + + for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) { + ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()); + if (RI == 0 || RI == IgnoreRI) continue; + + // We can only perform this transformation if the value returned is + // evaluatable at the start of the initial invocation of the function, + // instead of at the end of the evaluation. + // + Value *RetOp = RI->getOperand(0); + if (!isDynamicConstant(RetOp, CI, RI)) + return 0; + + if (ReturnedValue && RetOp != ReturnedValue) + return 0; // Cannot transform if differing values are returned. + ReturnedValue = RetOp; + } + return ReturnedValue; +} + +/// CanTransformAccumulatorRecursion - If the specified instruction can be +/// transformed using accumulator recursion elimination, return the constant +/// which is the start of the accumulator value. Otherwise return null. +/// +Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I, + CallInst *CI) { + if (!I->isAssociative() || !I->isCommutative()) return 0; + assert(I->getNumOperands() == 2 && + "Associative/commutative operations should have 2 args!"); + + // Exactly one operand should be the result of the call instruction. 
+ if ((I->getOperand(0) == CI && I->getOperand(1) == CI) || + (I->getOperand(0) != CI && I->getOperand(1) != CI)) + return 0; + + // The only user of this instruction we allow is a single return instruction. + if (!I->hasOneUse() || !isa<ReturnInst>(I->use_back())) + return 0; + + // Ok, now we have to check all of the other return instructions in this + // function. If they return non-constants or differing values, then we cannot + // transform the function safely. + return getCommonReturnValue(cast<ReturnInst>(I->use_back()), CI); +} + +static Instruction *FirstNonDbg(BasicBlock::iterator I) { + while (isa<DbgInfoIntrinsic>(I)) + ++I; + return &*I; +} + +CallInst* +TailCallElim::FindTRECandidate(Instruction *TI, + bool CannotTailCallElimCallsMarkedTail) { + BasicBlock *BB = TI->getParent(); + Function *F = BB->getParent(); + + if (&BB->front() == TI) // Make sure there is something before the terminator. + return 0; + + // Scan backwards from the return, checking to see if there is a tail call in + // this block. If so, set CI to it. + CallInst *CI = 0; + BasicBlock::iterator BBI = TI; + while (true) { + CI = dyn_cast<CallInst>(BBI); + if (CI && CI->getCalledFunction() == F) + break; + + if (BBI == BB->begin()) + return 0; // Didn't find a potential tail call. + --BBI; + } + + // If this call is marked as a tail call, and if there are dynamic allocas in + // the function, we cannot perform this optimization. + if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail) + return 0; + + // As a special case, detect code like this: + // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call + // and disable this xform in this case, because the code generator will + // lower the call to fabs into inline code. + if (BB == &F->getEntryBlock() && + FirstNonDbg(BB->front()) == CI && + FirstNonDbg(llvm::next(BB->begin())) == TI && + callIsSmall(F)) { + // A single-block function with just a call and a return. Check that + // the arguments match. 
+ CallSite::arg_iterator I = CallSite(CI).arg_begin(), + E = CallSite(CI).arg_end(); + Function::arg_iterator FI = F->arg_begin(), + FE = F->arg_end(); + for (; I != E && FI != FE; ++I, ++FI) + if (*I != &*FI) break; + if (I == E && FI == FE) + return 0; + } + + return CI; +} + +bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, + BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail) { + // If we are introducing accumulator recursion to eliminate operations after + // the call instruction that are both associative and commutative, the initial + // value for the accumulator is placed in this variable. If this value is set + // then we actually perform accumulator recursion elimination instead of + // simple tail recursion elimination. If the operation is an LLVM instruction + // (eg: "add") then it is recorded in AccumulatorRecursionInstr. If not, then + // we are handling the case when the return instruction returns a constant C + // which is different to the constant returned by other return instructions + // (which is recorded in AccumulatorRecursionEliminationInitVal). This is a + // special case of accumulator recursion, the operation being "return C". + Value *AccumulatorRecursionEliminationInitVal = 0; + Instruction *AccumulatorRecursionInstr = 0; + + // Ok, we found a potential tail call. We can currently only transform the + // tail call if all of the instructions between the call and the return are + // movable to above the call itself, leaving the call next to the return. + // Check that this is the case now. + BasicBlock::iterator BBI = CI; + for (++BBI; &*BBI != Ret; ++BBI) { + if (CanMoveAboveCall(BBI, CI)) continue; + + // If we can't move the instruction above the call, it might be because it + // is an associative and commutative operation that could be tranformed + // using accumulator recursion elimination. 
Check to see if this is the + // case, and if so, remember the initial accumulator value for later. + if ((AccumulatorRecursionEliminationInitVal = + CanTransformAccumulatorRecursion(BBI, CI))) { + // Yes, this is accumulator recursion. Remember which instruction + // accumulates. + AccumulatorRecursionInstr = BBI; + } else { + return false; // Otherwise, we cannot eliminate the tail recursion! + } + } + + // We can only transform call/return pairs that either ignore the return value + // of the call and return void, ignore the value of the call and return a + // constant, return the value returned by the tail call, or that are being + // accumulator recursion variable eliminated. + if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI && + !isa<UndefValue>(Ret->getReturnValue()) && + AccumulatorRecursionEliminationInitVal == 0 && + !getCommonReturnValue(0, CI)) { + // One case remains that we are able to handle: the current return + // instruction returns a constant, and all other return instructions + // return a different constant. + if (!isDynamicConstant(Ret->getReturnValue(), CI, Ret)) + return false; // Current return instruction does not return a constant. + // Check that all other return instructions return a common constant. If + // so, record it in AccumulatorRecursionEliminationInitVal. + AccumulatorRecursionEliminationInitVal = getCommonReturnValue(Ret, CI); + if (!AccumulatorRecursionEliminationInitVal) + return false; + } + + BasicBlock *BB = Ret->getParent(); + Function *F = BB->getParent(); + + // OK! We can transform this tail call. If this is the first one found, + // create the new entry block, allowing us to branch back to the old entry. 
+ if (OldEntry == 0) { + OldEntry = &F->getEntryBlock(); + BasicBlock *NewEntry = BasicBlock::Create(F->getContext(), "", F, OldEntry); + NewEntry->takeName(OldEntry); + OldEntry->setName("tailrecurse"); + BranchInst::Create(OldEntry, NewEntry); + + // If this tail call is marked 'tail' and if there are any allocas in the + // entry block, move them up to the new entry block. + TailCallsAreMarkedTail = CI->isTailCall(); + if (TailCallsAreMarkedTail) + // Move all fixed sized allocas from OldEntry to NewEntry. + for (BasicBlock::iterator OEBI = OldEntry->begin(), E = OldEntry->end(), + NEBI = NewEntry->begin(); OEBI != E; ) + if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++)) + if (isa<ConstantInt>(AI->getArraySize())) + AI->moveBefore(NEBI); + + // Now that we have created a new block, which jumps to the entry + // block, insert a PHI node for each argument of the function. + // For now, we initialize each PHI to only have the real arguments + // which are passed in. + Instruction *InsertPos = OldEntry->begin(); + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I) { + PHINode *PN = PHINode::Create(I->getType(), + I->getName() + ".tr", InsertPos); + I->replaceAllUsesWith(PN); // Everyone use the PHI node now! + PN->addIncoming(I, NewEntry); + ArgumentPHIs.push_back(PN); + } + } + + // If this function has self recursive calls in the tail position where some + // are marked tail and some are not, only transform one flavor or another. We + // have to choose whether we move allocas in the entry block to the new entry + // block or not, so we can't make a good choice for both. NOTE: We could do + // slightly better here in the case that the function has no entry block + // allocas. + if (TailCallsAreMarkedTail && !CI->isTailCall()) + return false; + + // Ok, now that we know we have a pseudo-entry block WITH all of the + // required PHI nodes, add entries into the PHI node for the actual + // parameters passed into the tail-recursive call. 
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) + ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB); + + // If we are introducing an accumulator variable to eliminate the recursion, + // do so now. Note that we _know_ that no subsequent tail recursion + // eliminations will happen on this function because of the way the + // accumulator recursion predicate is set up. + // + if (AccumulatorRecursionEliminationInitVal) { + Instruction *AccRecInstr = AccumulatorRecursionInstr; + // Start by inserting a new PHI node for the accumulator. + PHINode *AccPN = + PHINode::Create(AccumulatorRecursionEliminationInitVal->getType(), + "accumulator.tr", OldEntry->begin()); + + // Loop over all of the predecessors of the tail recursion block. For the + // real entry into the function we seed the PHI with the initial value, + // computed earlier. For any other existing branches to this block (due to + // other tail recursions eliminated) the accumulator is not modified. + // Because we haven't added the branch in the current block to OldEntry yet, + // it will not show up as a predecessor. + for (pred_iterator PI = pred_begin(OldEntry), PE = pred_end(OldEntry); + PI != PE; ++PI) { + BasicBlock *P = *PI; + if (P == &F->getEntryBlock()) + AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, P); + else + AccPN->addIncoming(AccPN, P); + } + + if (AccRecInstr) { + // Add an incoming argument for the current block, which is computed by + // our associative and commutative accumulator instruction. + AccPN->addIncoming(AccRecInstr, BB); + + // Next, rewrite the accumulator recursion instruction so that it does not + // use the result of the call anymore, instead, use the PHI node we just + // inserted. + AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN); + } else { + // Add an incoming argument for the current block, which is just the + // constant returned by the current return instruction. 
+ AccPN->addIncoming(Ret->getReturnValue(), BB); + } + + // Finally, rewrite any return instructions in the program to return the PHI + // node instead of the "initval" that they do currently. This loop will + // actually rewrite the return value we are destroying, but that's ok. + for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) + if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator())) + RI->setOperand(0, AccPN); + ++NumAccumAdded; + } + + // Now that all of the PHI nodes are in place, remove the call and + // ret instructions, replacing them with an unconditional branch. + BranchInst::Create(OldEntry, Ret); + BB->getInstList().erase(Ret); // Remove return. + BB->getInstList().erase(CI); // Remove call. + ++NumEliminated; + return true; +} + +bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB, + ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail) { + bool Change = false; + + // If the return block contains nothing but the return and PHI's, + // there might be an opportunity to duplicate the return in its + // predecessors and perform TRC there. Look for predecessors that end + // in unconditional branch and recursive call(s). 
+ SmallVector<BranchInst*, 8> UncondBranchPreds; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + BasicBlock *Pred = *PI; + TerminatorInst *PTI = Pred->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) + if (BI->isUnconditional()) + UncondBranchPreds.push_back(BI); + } + + while (!UncondBranchPreds.empty()) { + BranchInst *BI = UncondBranchPreds.pop_back_val(); + BasicBlock *Pred = BI->getParent(); + if (CallInst *CI = FindTRECandidate(BI, CannotTailCallElimCallsMarkedTail)){ + DEBUG(dbgs() << "FOLDING: " << *BB + << "INTO UNCOND BRANCH PRED: " << *Pred); + EliminateRecursiveTailCall(CI, FoldReturnIntoUncondBranch(Ret, BB, Pred), + OldEntry, TailCallsAreMarkedTail, ArgumentPHIs, + CannotTailCallElimCallsMarkedTail); + ++NumRetDuped; + Change = true; + } + } + + return Change; +} + +bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail) { + CallInst *CI = FindTRECandidate(Ret, CannotTailCallElimCallsMarkedTail); + if (!CI) + return false; + + return EliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail, + ArgumentPHIs, + CannotTailCallElimCallsMarkedTail); +} diff --git a/contrib/llvm/lib/Transforms/Utils/AddrModeMatcher.cpp b/contrib/llvm/lib/Transforms/Utils/AddrModeMatcher.cpp new file mode 100644 index 0000000..be7bed1 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/AddrModeMatcher.cpp @@ -0,0 +1,582 @@ +//===- AddrModeMatcher.cpp - Addressing mode matching facility --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements target addressing mode matcher class. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/AddrModeMatcher.h" +#include "llvm/DerivedTypes.h" +#include "llvm/GlobalValue.h" +#include "llvm/Instruction.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/PatternMatch.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/CallSite.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +void ExtAddrMode::print(raw_ostream &OS) const { + bool NeedPlus = false; + OS << "["; + if (BaseGV) { + OS << (NeedPlus ? " + " : "") + << "GV:"; + WriteAsOperand(OS, BaseGV, /*PrintType=*/false); + NeedPlus = true; + } + + if (BaseOffs) + OS << (NeedPlus ? " + " : "") << BaseOffs, NeedPlus = true; + + if (BaseReg) { + OS << (NeedPlus ? " + " : "") + << "Base:"; + WriteAsOperand(OS, BaseReg, /*PrintType=*/false); + NeedPlus = true; + } + if (Scale) { + OS << (NeedPlus ? " + " : "") + << Scale << "*"; + WriteAsOperand(OS, ScaledReg, /*PrintType=*/false); + NeedPlus = true; + } + + OS << ']'; +} + +void ExtAddrMode::dump() const { + print(dbgs()); + dbgs() << '\n'; +} + + +/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode. +/// Return true and update AddrMode if this addr mode is legal for the target, +/// false if not. +bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, + unsigned Depth) { + // If Scale is 1, then this is the same as adding ScaleReg to the addressing + // mode. Just process that directly. + if (Scale == 1) + return MatchAddr(ScaleReg, Depth); + + // If the scale is 0, it takes nothing to add this. + if (Scale == 0) + return true; + + // If we already have a scale of this value, we can add to it, otherwise, we + // need an available scale field. 
+ if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg) + return false; + + ExtAddrMode TestAddrMode = AddrMode; + + // Add scale to turn X*4+X*3 -> X*7. This could also do things like + // [A+B + A*7] -> [B+A*8]. + TestAddrMode.Scale += Scale; + TestAddrMode.ScaledReg = ScaleReg; + + // If the new address isn't legal, bail out. + if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) + return false; + + // It was legal, so commit it. + AddrMode = TestAddrMode; + + // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now + // to see if ScaleReg is actually X+C. If so, we can turn this into adding + // X*Scale + C*Scale to addr mode. + ConstantInt *CI = 0; Value *AddLHS = 0; + if (isa<Instruction>(ScaleReg) && // not a constant expr. + match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) { + TestAddrMode.ScaledReg = AddLHS; + TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale; + + // If this addressing mode is legal, commit it and remember that we folded + // this instruction. + if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) { + AddrModeInsts.push_back(cast<Instruction>(ScaleReg)); + AddrMode = TestAddrMode; + return true; + } + } + + // Otherwise, not (x+c)*scale, just return what we have. + return true; +} + +/// MightBeFoldableInst - This is a little filter, which returns true if an +/// addressing computation involving I might be folded into a load/store +/// accessing it. This doesn't need to be perfect, but needs to accept at least +/// the set of instructions that MatchOperationAddr can. +static bool MightBeFoldableInst(Instruction *I) { + switch (I->getOpcode()) { + case Instruction::BitCast: + // Don't touch identity bitcasts. + if (I->getType() == I->getOperand(0)->getType()) + return false; + return I->getType()->isPointerTy() || I->getType()->isIntegerTy(); + case Instruction::PtrToInt: + // PtrToInt is always a noop, as we know that the int type is pointer sized. 
+ return true; + case Instruction::IntToPtr: + // We know the input is intptr_t, so this is foldable. + return true; + case Instruction::Add: + return true; + case Instruction::Mul: + case Instruction::Shl: + // Can only handle X*C and X << C. + return isa<ConstantInt>(I->getOperand(1)); + case Instruction::GetElementPtr: + return true; + default: + return false; + } +} + + +/// MatchOperationAddr - Given an instruction or constant expr, see if we can +/// fold the operation into the addressing mode. If so, update the addressing +/// mode and return true, otherwise return false without modifying AddrMode. +bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, + unsigned Depth) { + // Avoid exponential behavior on extremely deep expression trees. + if (Depth >= 5) return false; + + switch (Opcode) { + case Instruction::PtrToInt: + // PtrToInt is always a noop, as we know that the int type is pointer sized. + return MatchAddr(AddrInst->getOperand(0), Depth); + case Instruction::IntToPtr: + // This inttoptr is a no-op if the integer type is pointer sized. + if (TLI.getValueType(AddrInst->getOperand(0)->getType()) == + TLI.getPointerTy()) + return MatchAddr(AddrInst->getOperand(0), Depth); + return false; + case Instruction::BitCast: + // BitCast is always a noop, and we can handle it as long as it is + // int->int or pointer->pointer (we don't want int<->fp or something). + if ((AddrInst->getOperand(0)->getType()->isPointerTy() || + AddrInst->getOperand(0)->getType()->isIntegerTy()) && + // Don't touch identity bitcasts. These were probably put here by LSR, + // and we don't want to mess around with them. Assume it knows what it + // is doing. + AddrInst->getOperand(0)->getType() != AddrInst->getType()) + return MatchAddr(AddrInst->getOperand(0), Depth); + return false; + case Instruction::Add: { + // Check to see if we can merge in the RHS then the LHS. If so, we win. 
+ ExtAddrMode BackupAddrMode = AddrMode; + unsigned OldSize = AddrModeInsts.size(); + if (MatchAddr(AddrInst->getOperand(1), Depth+1) && + MatchAddr(AddrInst->getOperand(0), Depth+1)) + return true; + + // Restore the old addr mode info. + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + + // Otherwise this was over-aggressive. Try merging in the LHS then the RHS. + if (MatchAddr(AddrInst->getOperand(0), Depth+1) && + MatchAddr(AddrInst->getOperand(1), Depth+1)) + return true; + + // Otherwise we definitely can't merge the ADD in. + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + break; + } + //case Instruction::Or: + // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD. + //break; + case Instruction::Mul: + case Instruction::Shl: { + // Can only handle X*C and X << C. + ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1)); + if (!RHS) return false; + int64_t Scale = RHS->getSExtValue(); + if (Opcode == Instruction::Shl) + Scale = 1LL << Scale; + + return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth); + } + case Instruction::GetElementPtr: { + // Scan the GEP. We check it if it contains constant offsets and at most + // one variable offset. + int VariableOperand = -1; + unsigned VariableScale = 0; + + int64_t ConstantOffset = 0; + const TargetData *TD = TLI.getTargetData(); + gep_type_iterator GTI = gep_type_begin(AddrInst); + for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) { + if (const StructType *STy = dyn_cast<StructType>(*GTI)) { + const StructLayout *SL = TD->getStructLayout(STy); + unsigned Idx = + cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue(); + ConstantOffset += SL->getElementOffset(Idx); + } else { + uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType()); + if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { + ConstantOffset += CI->getSExtValue()*TypeSize; + } else if (TypeSize) { // Scales of zero don't do anything. 
+ // We only allow one variable index at the moment. + if (VariableOperand != -1) + return false; + + // Remember the variable index. + VariableOperand = i; + VariableScale = TypeSize; + } + } + } + + // A common case is for the GEP to only do a constant offset. In this case, + // just add it to the disp field and check validity. + if (VariableOperand == -1) { + AddrMode.BaseOffs += ConstantOffset; + if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){ + // Check to see if we can fold the base pointer in too. + if (MatchAddr(AddrInst->getOperand(0), Depth+1)) + return true; + } + AddrMode.BaseOffs -= ConstantOffset; + return false; + } + + // Save the valid addressing mode in case we can't match. + ExtAddrMode BackupAddrMode = AddrMode; + unsigned OldSize = AddrModeInsts.size(); + + // See if the scale and offset amount is valid for this target. + AddrMode.BaseOffs += ConstantOffset; + + // Match the base operand of the GEP. + if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) { + // If it couldn't be matched, just stuff the value in a register. + if (AddrMode.HasBaseReg) { + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + return false; + } + AddrMode.HasBaseReg = true; + AddrMode.BaseReg = AddrInst->getOperand(0); + } + + // Match the remaining variable portion of the GEP. + if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale, + Depth)) { + // If it couldn't be matched, try stuffing the base into a register + // instead of matching it, and retrying the match of the scale. + AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + if (AddrMode.HasBaseReg) + return false; + AddrMode.HasBaseReg = true; + AddrMode.BaseReg = AddrInst->getOperand(0); + AddrMode.BaseOffs += ConstantOffset; + if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), + VariableScale, Depth)) { + // If even that didn't work, bail. 
+ AddrMode = BackupAddrMode; + AddrModeInsts.resize(OldSize); + return false; + } + } + + return true; + } + } + return false; +} + +/// MatchAddr - If we can, try to add the value of 'Addr' into the current +/// addressing mode. If Addr can't be added to AddrMode this returns false and +/// leaves AddrMode unmodified. This assumes that Addr is either a pointer type +/// or intptr_t for the target. +/// +bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { + if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) { + // Fold in immediates if legal for the target. + AddrMode.BaseOffs += CI->getSExtValue(); + if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) + return true; + AddrMode.BaseOffs -= CI->getSExtValue(); + } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) { + // If this is a global variable, try to fold it into the addressing mode. + if (AddrMode.BaseGV == 0) { + AddrMode.BaseGV = GV; + if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) + return true; + AddrMode.BaseGV = 0; + } + } else if (Instruction *I = dyn_cast<Instruction>(Addr)) { + ExtAddrMode BackupAddrMode = AddrMode; + unsigned OldSize = AddrModeInsts.size(); + + // Check to see if it is possible to fold this operation. + if (MatchOperationAddr(I, I->getOpcode(), Depth)) { + // Okay, it's possible to fold this. Check to see if it is actually + // *profitable* to do so. We use a simple cost model to avoid increasing + // register pressure too much. + if (I->hasOneUse() || + IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) { + AddrModeInsts.push_back(I); + return true; + } + + // It isn't profitable to do this, roll back. 
+      //cerr << "NOT FOLDING: " << *I;
+      AddrMode = BackupAddrMode;
+      AddrModeInsts.resize(OldSize);
+    }
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
+    if (MatchOperationAddr(CE, CE->getOpcode(), Depth))
+      return true;
+  } else if (isa<ConstantPointerNull>(Addr)) {
+    // Null pointer gets folded without affecting the addressing mode.
+    return true;
+  }
+
+  // Worst case, the target should support [reg] addressing modes. :)
+  if (!AddrMode.HasBaseReg) {
+    AddrMode.HasBaseReg = true;
+    AddrMode.BaseReg = Addr;
+    // Still check for legality in case the target supports [imm] but not [i+r].
+    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+      return true;
+    AddrMode.HasBaseReg = false;
+    AddrMode.BaseReg = 0;
+  }
+
+  // If the base register is already taken, see if we can do [r+r].
+  if (AddrMode.Scale == 0) {
+    AddrMode.Scale = 1;
+    AddrMode.ScaledReg = Addr;
+    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+      return true;
+    AddrMode.Scale = 0;
+    AddrMode.ScaledReg = 0;
+  }
+  // Couldn't match.
+  return false;
+}
+
+
+/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified
+/// inline asm call are due to memory operands.  If so, return true, otherwise
+/// return false.
+static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
+                                    const TargetLowering &TLI) {
+  TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI));
+  for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
+    TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
+
+    // Compute the constraint code and ConstraintType to use.
+    TLI.ComputeConstraintToUse(OpInfo, SDValue());
+
+    // If this asm operand is our Value*, and if it isn't an indirect memory
+    // operand, we can't fold it! 
+    if (OpInfo.CallOperandVal == OpVal &&
+        (OpInfo.ConstraintType != TargetLowering::C_Memory ||
+         !OpInfo.isIndirect))
+      return false;
+  }
+
+  return true;
+}
+
+
+/// FindAllMemoryUses - Recursively walk all the uses of I until we find a
+/// memory use.  If we find an obviously non-foldable instruction, return true.
+/// Add the ultimately found memory instructions to MemoryUses.
+static bool FindAllMemoryUses(Instruction *I,
+                SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses,
+                              SmallPtrSet<Instruction*, 16> &ConsideredInsts,
+                              const TargetLowering &TLI) {
+  // If we already considered this instruction, we're done.
+  if (!ConsideredInsts.insert(I))
+    return false;
+
+  // If this is an obviously unfoldable instruction, bail out.
+  if (!MightBeFoldableInst(I))
+    return true;
+
+  // Loop over all the uses, recursively processing them.
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+       UI != E; ++UI) {
+    User *U = *UI;
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo()));
+      continue;
+    }
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      unsigned opNo = UI.getOperandNo();
+      if (opNo == 0) return true; // Storing addr, not into addr.
+      MemoryUses.push_back(std::make_pair(SI, opNo));
+      continue;
+    }
+
+    if (CallInst *CI = dyn_cast<CallInst>(U)) {
+      InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
+      if (!IA) return true;
+
+      // If this is a memory operand, we're cool, otherwise bail out.
+      if (!IsOperandAMemoryOperand(CI, IA, I, TLI))
+        return true;
+      continue;
+    }
+
+    if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts,
+                          TLI))
+      return true;
+  }
+
+  return false;
+}
+
+
+/// ValueAlreadyLiveAtInst - Return true if Val is already known to be live at
+/// the use site that we're folding it into.  If so, there is no cost to
+/// include it in the addressing mode.  KnownLive1 and KnownLive2 are two values
+/// that we know are live at the instruction already. 
+bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, + Value *KnownLive2) { + // If Val is either of the known-live values, we know it is live! + if (Val == 0 || Val == KnownLive1 || Val == KnownLive2) + return true; + + // All values other than instructions and arguments (e.g. constants) are live. + if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true; + + // If Val is a constant sized alloca in the entry block, it is live, this is + // true because it is just a reference to the stack/frame pointer, which is + // live for the whole function. + if (AllocaInst *AI = dyn_cast<AllocaInst>(Val)) + if (AI->isStaticAlloca()) + return true; + + // Check to see if this value is already used in the memory instruction's + // block. If so, it's already live into the block at the very least, so we + // can reasonably fold it. + BasicBlock *MemBB = MemoryInst->getParent(); + for (Value::use_iterator UI = Val->use_begin(), E = Val->use_end(); + UI != E; ++UI) + // We know that uses of arguments and instructions have to be instructions. + if (cast<Instruction>(*UI)->getParent() == MemBB) + return true; + + return false; +} + + + +/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing +/// mode of the machine to fold the specified instruction into a load or store +/// that ultimately uses it. However, the specified instruction has multiple +/// uses. Given this, it may actually increase register pressure to fold it +/// into the load. For example, consider this code: +/// +/// X = ... +/// Y = X+1 +/// use(Y) -> nonload/store +/// Z = Y+1 +/// load Z +/// +/// In this case, Y has multiple uses, and can be folded into the load of Z +/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to +/// be live at the use(Y) line. If we don't fold Y into load Z, we use one +/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the +/// number of computations either. 
+/// +/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If +/// X was live across 'load Z' for other reasons, we actually *would* want to +/// fold the addressing mode in the Z case. This would make Y die earlier. +bool AddressingModeMatcher:: +IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, + ExtAddrMode &AMAfter) { + if (IgnoreProfitability) return true; + + // AMBefore is the addressing mode before this instruction was folded into it, + // and AMAfter is the addressing mode after the instruction was folded. Get + // the set of registers referenced by AMAfter and subtract out those + // referenced by AMBefore: this is the set of values which folding in this + // address extends the lifetime of. + // + // Note that there are only two potential values being referenced here, + // BaseReg and ScaleReg (global addresses are always available, as are any + // folded immediates). + Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg; + + // If the BaseReg or ScaledReg was referenced by the previous addrmode, their + // lifetime wasn't extended by adding this instruction. + if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) + BaseReg = 0; + if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg)) + ScaledReg = 0; + + // If folding this instruction (and it's subexprs) didn't extend any live + // ranges, we're ok with it. + if (BaseReg == 0 && ScaledReg == 0) + return true; + + // If all uses of this instruction are ultimately load/store/inlineasm's, + // check to see if their addressing modes will include this instruction. If + // so, we can fold it into all uses, so it doesn't matter if it has multiple + // uses. + SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses; + SmallPtrSet<Instruction*, 16> ConsideredInsts; + if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI)) + return false; // Has a non-memory, non-foldable use! 
+ + // Now that we know that all uses of this instruction are part of a chain of + // computation involving only operations that could theoretically be folded + // into a memory use, loop over each of these uses and see if they could + // *actually* fold the instruction. + SmallVector<Instruction*, 32> MatchedAddrModeInsts; + for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) { + Instruction *User = MemoryUses[i].first; + unsigned OpNo = MemoryUses[i].second; + + // Get the access type of this use. If the use isn't a pointer, we don't + // know what it accesses. + Value *Address = User->getOperand(OpNo); + if (!Address->getType()->isPointerTy()) + return false; + const Type *AddressAccessTy = + cast<PointerType>(Address->getType())->getElementType(); + + // Do a match against the root of this address, ignoring profitability. This + // will tell us if the addressing mode for the memory operation will + // *actually* cover the shared instruction. + ExtAddrMode Result; + AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy, + MemoryInst, Result); + Matcher.IgnoreProfitability = true; + bool Success = Matcher.MatchAddr(Address, 0); + (void)Success; assert(Success && "Couldn't select *anything*?"); + + // If the match didn't cover I, then it won't be shared by it. + if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(), + I) == MatchedAddrModeInsts.end()) + return false; + + MatchedAddrModeInsts.clear(); + } + + return true; +} diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp new file mode 100644 index 0000000..acaea19 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -0,0 +1,540 @@ +//===-- BasicBlockUtils.cpp - BasicBlock Utilities -------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This family of functions perform manipulations on basic blocks, and +// instructions contained within basic blocks. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Constant.h" +#include "llvm/Type.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/ValueHandle.h" +#include <algorithm> +using namespace llvm; + +/// DeleteDeadBlock - Delete the specified block, which must have no +/// predecessors. +void llvm::DeleteDeadBlock(BasicBlock *BB) { + assert((pred_begin(BB) == pred_end(BB) || + // Can delete self loop. + BB->getSinglePredecessor() == BB) && "Block is not dead!"); + TerminatorInst *BBTerm = BB->getTerminator(); + + // Loop through all of our successors and make sure they know that one + // of their predecessors is going away. + for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) + BBTerm->getSuccessor(i)->removePredecessor(BB); + + // Zap all the instructions in the block. + while (!BB->empty()) { + Instruction &I = BB->back(); + // If this instruction is used, replace uses with an arbitrary value. + // Because control flow can't get here, we don't care what we replace the + // value with. Note that since this block is unreachable, and all values + // contained within it must dominate their uses, that all uses will + // eventually be removed (they are themselves dead). 
    if (!I.use_empty())
      I.replaceAllUsesWith(UndefValue::get(I.getType()));
    BB->getInstList().pop_back();
  }

  // Zap the block!
  BB->eraseFromParent();
}

/// FoldSingleEntryPHINodes - We know that BB has one predecessor. If there are
/// any single-entry PHI nodes in it, fold them away. This handles the case
/// when all entries to the PHI nodes in a block are guaranteed equal, such as
/// when the block has exactly one predecessor.
///
/// P, if non-null, is used to look up AliasAnalysis and
/// MemoryDependenceAnalysis so they can be kept in sync as PHIs are deleted.
void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P) {
  if (!isa<PHINode>(BB->begin())) return;

  AliasAnalysis *AA = 0;
  MemoryDependenceAnalysis *MemDep = 0;
  if (P) {
    AA = P->getAnalysisIfAvailable<AliasAnalysis>();
    MemDep = P->getAnalysisIfAvailable<MemoryDependenceAnalysis>();
  }

  while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
    if (PN->getIncomingValue(0) != PN)
      PN->replaceAllUsesWith(PN->getIncomingValue(0));
    else
      // A single-entry PHI that references only itself can never hold a
      // meaningful value; its uses become undef.
      PN->replaceAllUsesWith(UndefValue::get(PN->getType()));

    if (MemDep)
      MemDep->removeInstruction(PN);  // Memdep updates AA itself.
    else if (AA && isa<PointerType>(PN->getType()))
      AA->deleteValue(PN);

    PN->eraseFromParent();
  }
}


/// DeleteDeadPHIs - Examine each PHI in the given block and delete it if it
/// is dead. Also recursively delete any operands that become dead as
/// a result. This includes tracing the def-use list from the PHI to see if
/// it is ultimately unused or if it reaches an unused cycle.
///
/// Returns true if any PHI (or transitively-dead operand) was deleted.
bool llvm::DeleteDeadPHIs(BasicBlock *BB) {
  // Recursively deleting a PHI may cause multiple PHIs to be deleted
  // or RAUW'd undef, so use an array of WeakVH for the PHIs to delete.
  SmallVector<WeakVH, 8> PHIs;
  for (BasicBlock::iterator I = BB->begin();
       PHINode *PN = dyn_cast<PHINode>(I); ++I)
    PHIs.push_back(PN);

  bool Changed = false;
  for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
    // The WeakVH may have been nulled (or retargeted) by an earlier deletion;
    // only process entries that are still live PHI nodes.
    if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*()))
      Changed |= RecursivelyDeleteDeadPHINode(PN);

  return Changed;
}

/// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor,
/// if possible. The return value indicates success or failure.
bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) {
  // Don't merge away blocks who have their address taken.
  if (BB->hasAddressTaken()) return false;

  // Can't merge if there are multiple predecessors, or no predecessors.
  BasicBlock *PredBB = BB->getUniquePredecessor();
  if (!PredBB) return false;

  // Don't break self-loops.
  if (PredBB == BB) return false;
  // Don't break invokes.
  if (isa<InvokeInst>(PredBB->getTerminator())) return false;

  succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB));
  BasicBlock *OnlySucc = BB;
  for (; SI != SE; ++SI)
    if (*SI != OnlySucc) {
      OnlySucc = 0;     // There are multiple distinct successors!
      break;
    }

  // Can't merge if there are multiple successors.
  if (!OnlySucc) return false;

  // Can't merge if there is PHI loop.
  for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
    if (PHINode *PN = dyn_cast<PHINode>(BI)) {
      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
        if (PN->getIncomingValue(i) == PN)
          return false;
    } else
      break;
  }

  // Begin by getting rid of unneeded PHIs.
  if (isa<PHINode>(BB->front()))
    FoldSingleEntryPHINodes(BB, P);

  // Delete the unconditional branch from the predecessor...
  PredBB->getInstList().pop_back();

  // Move all definitions in the successor to the predecessor...
  PredBB->getInstList().splice(PredBB->end(), BB->getInstList());

  // Make all PHI nodes that referred to BB now refer to Pred as their
  // source...
  BB->replaceAllUsesWith(PredBB);

  // Inherit predecessors name if it exists.
  if (!PredBB->hasName())
    PredBB->takeName(BB);

  // Finally, erase the old block and update dominator info.
  if (P) {
    if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>()) {
      if (DomTreeNode *DTN = DT->getNode(BB)) {
        DomTreeNode *PredDTN = DT->getNode(PredBB);
        SmallVector<DomTreeNode*, 8> Children(DTN->begin(), DTN->end());
        for (SmallVector<DomTreeNode*, 8>::iterator DI = Children.begin(),
             DE = Children.end(); DI != DE; ++DI)
          DT->changeImmediateDominator(*DI, PredDTN);

        DT->eraseNode(BB);
      }

      // NOTE(review): the LoopInfo and MemoryDependence updates below are
      // nested inside the DominatorTree conditional, so they only run when a
      // DominatorTree is available. That looks unintended -- confirm against
      // upstream before relying on it.
      if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>())
        LI->removeBlock(BB);

      if (MemoryDependenceAnalysis *MD =
            P->getAnalysisIfAvailable<MemoryDependenceAnalysis>())
        MD->invalidateCachedPredecessors();
    }
  }

  BB->eraseFromParent();
  return true;
}

/// ReplaceInstWithValue - Replace all uses of an instruction (specified by BI)
/// with a value, then remove and delete the original instruction.
///
void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL,
                                BasicBlock::iterator &BI, Value *V) {
  Instruction &I = *BI;
  // Replaces all of the uses of the instruction with uses of the value
  I.replaceAllUsesWith(V);

  // Make sure to propagate a name if there is one already.
  if (I.hasName() && !V->hasName())
    V->takeName(&I);

  // Delete the unnecessary instruction now...
  BI = BIL.erase(BI);
}


/// ReplaceInstWithInst - Replace the instruction specified by BI with the
/// instruction specified by I. The original instruction is deleted and BI is
/// updated to point to the new instruction.
+/// +void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL, + BasicBlock::iterator &BI, Instruction *I) { + assert(I->getParent() == 0 && + "ReplaceInstWithInst: Instruction already inserted into basic block!"); + + // Insert the new instruction into the basic block... + BasicBlock::iterator New = BIL.insert(BI, I); + + // Replace all uses of the old instruction, and delete it. + ReplaceInstWithValue(BIL, BI, I); + + // Move BI back to point to the newly inserted instruction + BI = New; +} + +/// ReplaceInstWithInst - Replace the instruction specified by From with the +/// instruction specified by To. +/// +void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) { + BasicBlock::iterator BI(From); + ReplaceInstWithInst(From->getParent()->getInstList(), BI, To); +} + +/// GetSuccessorNumber - Search for the specified successor of basic block BB +/// and return its position in the terminator instruction's list of +/// successors. It is an error to call this with a block that is not a +/// successor. +unsigned llvm::GetSuccessorNumber(BasicBlock *BB, BasicBlock *Succ) { + TerminatorInst *Term = BB->getTerminator(); +#ifndef NDEBUG + unsigned e = Term->getNumSuccessors(); +#endif + for (unsigned i = 0; ; ++i) { + assert(i != e && "Didn't find edge?"); + if (Term->getSuccessor(i) == Succ) + return i; + } + return 0; +} + +/// SplitEdge - Split the edge connecting specified block. Pass P must +/// not be NULL. +BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) { + unsigned SuccNum = GetSuccessorNumber(BB, Succ); + + // If this is a critical edge, let SplitCriticalEdge do it. + TerminatorInst *LatchTerm = BB->getTerminator(); + if (SplitCriticalEdge(LatchTerm, SuccNum, P)) + return LatchTerm->getSuccessor(SuccNum); + + // If the edge isn't critical, then BB has a single successor or Succ has a + // single pred. Split the block. 
+ BasicBlock::iterator SplitPoint; + if (BasicBlock *SP = Succ->getSinglePredecessor()) { + // If the successor only has a single pred, split the top of the successor + // block. + assert(SP == BB && "CFG broken"); + SP = NULL; + return SplitBlock(Succ, Succ->begin(), P); + } + + // Otherwise, if BB has a single successor, split it at the bottom of the + // block. + assert(BB->getTerminator()->getNumSuccessors() == 1 && + "Should have a single succ!"); + return SplitBlock(BB, BB->getTerminator(), P); +} + +/// SplitBlock - Split the specified block at the specified instruction - every +/// thing before SplitPt stays in Old and everything starting with SplitPt moves +/// to a new block. The two blocks are joined by an unconditional branch and +/// the loop info is updated. +/// +BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) { + BasicBlock::iterator SplitIt = SplitPt; + while (isa<PHINode>(SplitIt)) + ++SplitIt; + BasicBlock *New = Old->splitBasicBlock(SplitIt, Old->getName()+".split"); + + // The new block lives in whichever loop the old one did. This preserves + // LCSSA as well, because we force the split point to be after any PHI nodes. + if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>()) + if (Loop *L = LI->getLoopFor(Old)) + L->addBasicBlockToLoop(New, LI->getBase()); + + if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>()) { + // Old dominates New. New node dominates all other nodes dominated by Old. 
+ DomTreeNode *OldNode = DT->getNode(Old); + std::vector<DomTreeNode *> Children; + for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end(); + I != E; ++I) + Children.push_back(*I); + + DomTreeNode *NewNode = DT->addNewBlock(New,Old); + for (std::vector<DomTreeNode *>::iterator I = Children.begin(), + E = Children.end(); I != E; ++I) + DT->changeImmediateDominator(*I, NewNode); + } + + return New; +} + + +/// SplitBlockPredecessors - This method transforms BB by introducing a new +/// basic block into the function, and moving some of the predecessors of BB to +/// be predecessors of the new block. The new predecessors are indicated by the +/// Preds array, which has NumPreds elements in it. The new block is given a +/// suffix of 'Suffix'. +/// +/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree, +/// LoopInfo, and LCCSA but no other analyses. In particular, it does not +/// preserve LoopSimplify (because it's complicated to handle the case where one +/// of the edges being split is an exit of a loop with other exits). +/// +BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB, + BasicBlock *const *Preds, + unsigned NumPreds, const char *Suffix, + Pass *P) { + // Create new basic block, insert right before the original block. + BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+Suffix, + BB->getParent(), BB); + + // The new block unconditionally branches to the old block. + BranchInst *BI = BranchInst::Create(BB, NewBB); + + LoopInfo *LI = P ? P->getAnalysisIfAvailable<LoopInfo>() : 0; + Loop *L = LI ? LI->getLoopFor(BB) : 0; + bool PreserveLCSSA = P->mustPreserveAnalysisID(LCSSAID); + + // Move the edges from Preds to point to NewBB instead of BB. + // While here, if we need to preserve loop analyses, collect + // some information about how this split will affect loops. 
+ bool HasLoopExit = false; + bool IsLoopEntry = !!L; + bool SplitMakesNewLoopHeader = false; + for (unsigned i = 0; i != NumPreds; ++i) { + // This is slightly more strict than necessary; the minimum requirement + // is that there be no more than one indirectbr branching to BB. And + // all BlockAddress uses would need to be updated. + assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) && + "Cannot split an edge from an IndirectBrInst"); + + Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB); + + if (LI) { + // If we need to preserve LCSSA, determine if any of + // the preds is a loop exit. + if (PreserveLCSSA) + if (Loop *PL = LI->getLoopFor(Preds[i])) + if (!PL->contains(BB)) + HasLoopExit = true; + // If we need to preserve LoopInfo, note whether any of the + // preds crosses an interesting loop boundary. + if (L) { + if (L->contains(Preds[i])) + IsLoopEntry = false; + else + SplitMakesNewLoopHeader = true; + } + } + } + + // Update dominator tree if available. + DominatorTree *DT = P ? P->getAnalysisIfAvailable<DominatorTree>() : 0; + if (DT) + DT->splitBlock(NewBB); + + // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI + // node becomes an incoming value for BB's phi node. However, if the Preds + // list is empty, we need to insert dummy entries into the PHI nodes in BB to + // account for the newly created predecessor. + if (NumPreds == 0) { + // Insert dummy values as the incoming value. + for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I) + cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB); + return NewBB; + } + + AliasAnalysis *AA = P ? P->getAnalysisIfAvailable<AliasAnalysis>() : 0; + + if (L) { + if (IsLoopEntry) { + // Add the new block to the nearest enclosing loop (and not an + // adjacent loop). To find this, examine each of the predecessors and + // determine which loops enclose them, and select the most-nested loop + // which contains the loop containing the block being split. 
+ Loop *InnermostPredLoop = 0; + for (unsigned i = 0; i != NumPreds; ++i) + if (Loop *PredLoop = LI->getLoopFor(Preds[i])) { + // Seek a loop which actually contains the block being split (to + // avoid adjacent loops). + while (PredLoop && !PredLoop->contains(BB)) + PredLoop = PredLoop->getParentLoop(); + // Select the most-nested of these loops which contains the block. + if (PredLoop && + PredLoop->contains(BB) && + (!InnermostPredLoop || + InnermostPredLoop->getLoopDepth() < PredLoop->getLoopDepth())) + InnermostPredLoop = PredLoop; + } + if (InnermostPredLoop) + InnermostPredLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + } else { + L->addBasicBlockToLoop(NewBB, LI->getBase()); + if (SplitMakesNewLoopHeader) + L->moveToHeader(NewBB); + } + } + + // Otherwise, create a new PHI node in NewBB for each PHI node in BB. + for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ) { + PHINode *PN = cast<PHINode>(I++); + + // Check to see if all of the values coming in are the same. If so, we + // don't need to create a new PHI node, unless it's needed for LCSSA. + Value *InVal = 0; + if (!HasLoopExit) { + InVal = PN->getIncomingValueForBlock(Preds[0]); + for (unsigned i = 1; i != NumPreds; ++i) + if (InVal != PN->getIncomingValueForBlock(Preds[i])) { + InVal = 0; + break; + } + } + + if (InVal) { + // If all incoming values for the new PHI would be the same, just don't + // make a new PHI. Instead, just remove the incoming values from the old + // PHI. + for (unsigned i = 0; i != NumPreds; ++i) + PN->removeIncomingValue(Preds[i], false); + } else { + // If the values coming into the block are not the same, we need a PHI. + // Create the new PHI node, insert it into NewBB at the end of the block + PHINode *NewPHI = + PHINode::Create(PN->getType(), PN->getName()+".ph", BI); + if (AA) AA->copyValue(PN, NewPHI); + + // Move all of the PHI values for 'Preds' to the new PHI. 
+ for (unsigned i = 0; i != NumPreds; ++i) { + Value *V = PN->removeIncomingValue(Preds[i], false); + NewPHI->addIncoming(V, Preds[i]); + } + InVal = NewPHI; + } + + // Add an incoming value to the PHI node in the loop for the preheader + // edge. + PN->addIncoming(InVal, NewBB); + } + + return NewBB; +} + +/// FindFunctionBackedges - Analyze the specified function to find all of the +/// loop backedges in the function and return them. This is a relatively cheap +/// (compared to computing dominators and loop info) analysis. +/// +/// The output is added to Result, as pairs of <from,to> edge info. +void llvm::FindFunctionBackedges(const Function &F, + SmallVectorImpl<std::pair<const BasicBlock*,const BasicBlock*> > &Result) { + const BasicBlock *BB = &F.getEntryBlock(); + if (succ_begin(BB) == succ_end(BB)) + return; + + SmallPtrSet<const BasicBlock*, 8> Visited; + SmallVector<std::pair<const BasicBlock*, succ_const_iterator>, 8> VisitStack; + SmallPtrSet<const BasicBlock*, 8> InStack; + + Visited.insert(BB); + VisitStack.push_back(std::make_pair(BB, succ_begin(BB))); + InStack.insert(BB); + do { + std::pair<const BasicBlock*, succ_const_iterator> &Top = VisitStack.back(); + const BasicBlock *ParentBB = Top.first; + succ_const_iterator &I = Top.second; + + bool FoundNew = false; + while (I != succ_end(ParentBB)) { + BB = *I++; + if (Visited.insert(BB)) { + FoundNew = true; + break; + } + // Successor is in VisitStack, it's a back edge. + if (InStack.count(BB)) + Result.push_back(std::make_pair(ParentBB, BB)); + } + + if (FoundNew) { + // Go down one level if there is a unvisited successor. + InStack.insert(BB); + VisitStack.push_back(std::make_pair(BB, succ_begin(BB))); + } else { + // Go up one level. + InStack.erase(VisitStack.pop_back_val().first); + } + } while (!VisitStack.empty()); +} + +/// FoldReturnIntoUncondBranch - This method duplicates the specified return +/// instruction into a predecessor which ends in an unconditional branch. 
If +/// the return instruction returns a value defined by a PHI, propagate the +/// right value into the return. It returns the new return instruction in the +/// predecessor. +ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, + BasicBlock *Pred) { + Instruction *UncondBranch = Pred->getTerminator(); + // Clone the return and add it to the end of the predecessor. + Instruction *NewRet = RI->clone(); + Pred->getInstList().push_back(NewRet); + + // If the return instruction returns a value, and if the value was a + // PHI node in "BB", propagate the right value into the return. + for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end(); + i != e; ++i) + if (PHINode *PN = dyn_cast<PHINode>(*i)) + if (PN->getParent() == BB) + *i = PN->getIncomingValueForBlock(Pred); + + // Update any PHI nodes in the returning block to realize that we no + // longer branch to them. + BB->removePredecessor(Pred); + UncondBranch->eraseFromParent(); + return cast<ReturnInst>(NewRet); +} diff --git a/contrib/llvm/lib/Transforms/Utils/BasicInliner.cpp b/contrib/llvm/lib/Transforms/Utils/BasicInliner.cpp new file mode 100644 index 0000000..23a30cc5 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/BasicInliner.cpp @@ -0,0 +1,182 @@ +//===- BasicInliner.cpp - Basic function level inliner --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a simple function based inliner that does not use +// call graph information. 
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "basicinliner"
#include "llvm/Module.h"
#include "llvm/Function.h"
#include "llvm/Transforms/Utils/BasicInliner.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/SmallPtrSet.h"
#include <vector>

using namespace llvm;

static cl::opt<unsigned>
BasicInlineThreshold("basic-inline-threshold", cl::Hidden, cl::init(200),
   cl::desc("Control the amount of basic inlining to perform (default = 200)"));

namespace llvm {

  /// BasicInlinerImpl - BasicInliner implementation class. This hides
  /// container info, used by basic inliner, from public interface.
  struct BasicInlinerImpl {

    BasicInlinerImpl(const BasicInlinerImpl&); // DO NOT IMPLEMENT
    void operator=(const BasicInlinerImpl&); // DO NOT IMPLEMENT
  public:
    BasicInlinerImpl(TargetData *T) : TD(T) {}

    /// addFunction - Add function into the list of functions to process.
    /// All functions must be inserted using this interface before invoking
    /// inlineFunctions().
    void addFunction(Function *F) {
      Functions.push_back(F);
    }

    /// neverInlineFunction - Sometimes a function is never to be inlined
    /// because of one or other reason.
    void neverInlineFunction(Function *F) {
      NeverInline.insert(F);
    }

    /// inlineFunctions - Walk all call sites in all functions supplied by
    /// client. Inline as many call sites as possible. Delete completely
    /// inlined functions.
    void inlineFunctions();

  private:
    TargetData *TD;                             // May be null; forwarded to the inline cost model.
    std::vector<Function *> Functions;          // Functions registered via addFunction().
    SmallPtrSet<const Function *, 16> NeverInline;
    SmallPtrSet<Function *, 8> DeadFunctions;   // Fully-inlined functions to remove.
    InlineCostAnalyzer CA;
  };

/// inlineFunctions - Walk all call sites in all functions supplied by
/// client. Inline as many call sites as possible. Delete completely
/// inlined functions.
void BasicInlinerImpl::inlineFunctions() {

  // Scan through and identify all call sites ahead of time so that we only
  // inline call sites in the original functions, not call sites that result
  // from inlining other functions.
  std::vector<CallSite> CallSites;

  for (std::vector<Function *>::iterator FI = Functions.begin(),
         FE = Functions.end(); FI != FE; ++FI) {
    Function *F = *FI;
    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
      for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
        // CallSite's constructor yields a null call site for non-call
        // instructions, so this filters to calls/invokes of defined functions.
        CallSite CS(cast<Value>(I));
        if (CS && CS.getCalledFunction()
            && !CS.getCalledFunction()->isDeclaration())
          CallSites.push_back(CS);
      }
  }

  DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n");

  // Inline call sites. Iterate to a fixed point: inlining one call may make
  // other call sites profitable.
  bool Changed = false;
  do {
    Changed = false;
    for (unsigned index = 0; index != CallSites.size() && !CallSites.empty();
         ++index) {
      CallSite CS = CallSites[index];
      if (Function *Callee = CS.getCalledFunction()) {

        // Eliminate calls that are never inlinable.
        if (Callee->isDeclaration() ||
            CS.getInstruction()->getParent()->getParent() == Callee) {
          // Erase and compensate the loop index so the next entry isn't
          // skipped (unsigned wraparound at index 0 is corrected by ++index).
          CallSites.erase(CallSites.begin() + index);
          --index;
          continue;
        }
        InlineCost IC = CA.getInlineCost(CS, NeverInline);
        if (IC.isAlways()) {
          DEBUG(dbgs() << "  Inlining: cost=always"
                <<", call: " << *CS.getInstruction());
        } else if (IC.isNever()) {
          DEBUG(dbgs() << "  NOT Inlining: cost=never"
                <<", call: " << *CS.getInstruction());
          continue;
        } else {
          int Cost = IC.getValue();

          if (Cost >= (int) BasicInlineThreshold) {
            DEBUG(dbgs() << "  NOT Inlining: cost = " << Cost
                  << ", call: " << *CS.getInstruction());
            continue;
          } else {
            DEBUG(dbgs() << "  Inlining: cost = " << Cost
                  << ", call: " << *CS.getInstruction());
          }
        }

        // Inline
        InlineFunctionInfo IFI(0, TD);
        if (InlineFunction(CS, IFI)) {
          // A local (or available_externally) function with no remaining
          // uses can be dropped from the module entirely.
          if (Callee->use_empty() && (Callee->hasLocalLinkage() ||
                                      Callee->hasAvailableExternallyLinkage()))
            DeadFunctions.insert(Callee);
          Changed = true;
          CallSites.erase(CallSites.begin() + index);
          --index;
        }
      }
    }
  } while (Changed);

  // Remove completely inlined functions from module.
  for(SmallPtrSet<Function *, 8>::iterator I = DeadFunctions.begin(),
        E = DeadFunctions.end(); I != E; ++I) {
    Function *D = *I;
    Module *M = D->getParent();
    M->getFunctionList().remove(D);
  }
}

BasicInliner::BasicInliner(TargetData *TD) {
  Impl = new BasicInlinerImpl(TD);
}

BasicInliner::~BasicInliner() {
  delete Impl;
}

/// addFunction - Add function into the list of functions to process.
/// All functions must be inserted using this interface before invoking
/// inlineFunctions().
void BasicInliner::addFunction(Function *F) {
  Impl->addFunction(F);
}

/// neverInlineFunction - Sometimes a function is never to be inlined because
/// of one or other reason.
void BasicInliner::neverInlineFunction(Function *F) {
  Impl->neverInlineFunction(F);
}

/// inlineFunctions - Walk all call sites in all functions supplied by
/// client. Inline as many call sites as possible. Delete completely
/// inlined functions.
void BasicInliner::inlineFunctions() {
  Impl->inlineFunctions();
}

}
diff --git a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
new file mode 100644
index 0000000..616b066
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -0,0 +1,407 @@
//===- BreakCriticalEdges.cpp - Critical Edge Elimination Pass ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// BreakCriticalEdges pass - Break all of the critical edges in the CFG by
// inserting a dummy basic block. This pass may be "required" by passes that
// cannot deal with critical edges. For this usage, the structure type is
// forward declared. This pass obviously invalidates the CFG, but can update
// dominator trees.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "break-crit-edges"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileInfo.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Type.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;

STATISTIC(NumBroken, "Number of blocks inserted");

namespace {
  /// BreakCriticalEdges - Function pass wrapper around SplitCriticalEdge;
  /// splits every critical edge in the function (except those leaving an
  /// indirectbr, which cannot be split).
  struct BreakCriticalEdges : public FunctionPass {
    static char ID; // Pass identification, replacement for typeid
    BreakCriticalEdges() : FunctionPass(ID) {
      initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry());
    }

    virtual bool runOnFunction(Function &F);

    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.addPreserved<DominatorTree>();
      AU.addPreserved<LoopInfo>();
      AU.addPreserved<ProfileInfo>();

      // No loop canonicalization guarantees are broken by this pass.
      AU.addPreservedID(LoopSimplifyID);
    }
  };
}

char BreakCriticalEdges::ID = 0;
INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges",
                "Break critical edges in CFG", false, false)

// Publicly exposed interface to pass...
char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID;
FunctionPass *llvm::createBreakCriticalEdgesPass() {
  return new BreakCriticalEdges();
}

// runOnFunction - Loop over all of the edges in the CFG, breaking critical
// edges as they are found.
//
bool BreakCriticalEdges::runOnFunction(Function &F) {
  bool Changed = false;
  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
    TerminatorInst *TI = I->getTerminator();
    if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI))
      for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
        if (SplitCriticalEdge(TI, i, this)) {
          ++NumBroken;
          Changed = true;
        }
  }

  return Changed;
}

//===----------------------------------------------------------------------===//
//    Implementation of the external critical edge manipulation functions
//===----------------------------------------------------------------------===//

// isCriticalEdge - Return true if the specified edge is a critical edge.
// Critical edges are edges from a block with multiple successors to a block
// with multiple predecessors.
//
bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
                          bool AllowIdenticalEdges) {
  assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!");
  if (TI->getNumSuccessors() == 1) return false;

  const BasicBlock *Dest = TI->getSuccessor(SuccNum);
  const_pred_iterator I = pred_begin(Dest), E = pred_end(Dest);

  // If there is more than one predecessor, this is a critical edge...
  assert(I != E && "No preds, but we have an edge to the block?");
  const BasicBlock *FirstPred = *I;
  ++I;        // Skip one edge due to the incoming arc from TI.
  if (!AllowIdenticalEdges)
    return I != E;

  // If AllowIdenticalEdges is true, then we allow this edge to be considered
  // non-critical iff all preds come from TI's block.
  while (I != E) {
    const BasicBlock *P = *I;
    if (P != FirstPred)
      return true;
    // Note: leave this as is until no one ever compiles with either gcc 4.0.1
    // or Xcode 2. This seems to work around the pred_iterator assert in PR 2207
    E = pred_end(P);
    ++I;
  }
  return false;
}

/// CreatePHIsForSplitLoopExit - When a loop exit edge is split, LCSSA form
/// may require new PHIs in the new exit block. This function inserts the
/// new PHIs, as needed. Preds is a list of preds inside the loop, SplitBB
/// is the new loop exit block, and DestBB is the old loop exit, now the
/// successor of SplitBB.
static void CreatePHIsForSplitLoopExit(SmallVectorImpl<BasicBlock *> &Preds,
                                       BasicBlock *SplitBB,
                                       BasicBlock *DestBB) {
  // SplitBB shouldn't have anything non-trivial in it yet.
  assert(SplitBB->getFirstNonPHI() == SplitBB->getTerminator() &&
         "SplitBB has non-PHI nodes!");

  // For each PHI in the destination block...
  for (BasicBlock::iterator I = DestBB->begin();
       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
    unsigned Idx = PN->getBasicBlockIndex(SplitBB);
    Value *V = PN->getIncomingValue(Idx);
    // If the input is a PHI which already satisfies LCSSA, don't create
    // a new one.
    if (const PHINode *VP = dyn_cast<PHINode>(V))
      if (VP->getParent() == SplitBB)
        continue;
    // Otherwise a new PHI is needed. Create one and populate it.
    PHINode *NewPN = PHINode::Create(PN->getType(), "split",
                                     SplitBB->getTerminator());
    for (unsigned i = 0, e = Preds.size(); i != e; ++i)
      NewPN->addIncoming(V, Preds[i]);
    // Update the original PHI.
    PN->setIncomingValue(Idx, NewPN);
  }
}

/// SplitCriticalEdge - If this edge is a critical edge, insert a new node to
/// split the critical edge. This will update DominatorTree information if it
/// is available, thus calling this pass will not invalidate either of them.
/// This returns the new block if the edge was split, null otherwise.
///
/// If MergeIdenticalEdges is true (not the default), *all* edges from TI to the
/// specified successor will be merged into the same critical edge block.
/// This is most commonly interesting with switch instructions, which may
/// have many edges to any one destination. This ensures that all edges to that
/// dest go to one block instead of each going to a different block, but isn't
/// the standard definition of a "critical edge".
///
/// It is invalid to call this function on a critical edge that starts at an
/// IndirectBrInst. Splitting these edges will almost always create an invalid
/// program because the address of the new block won't be the one that is jumped
/// to.
///
BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
                                    Pass *P, bool MergeIdenticalEdges) {
  if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return 0;

  assert(!isa<IndirectBrInst>(TI) &&
         "Cannot split critical edge from IndirectBrInst");

  BasicBlock *TIBB = TI->getParent();
  BasicBlock *DestBB = TI->getSuccessor(SuccNum);

  // Create a new basic block, linking it into the CFG.
  BasicBlock *NewBB = BasicBlock::Create(TI->getContext(),
                      TIBB->getName() + "." + DestBB->getName() + "_crit_edge");
  // Create our unconditional branch.
  BranchInst::Create(DestBB, NewBB);

  // Branch to the new block, breaking the edge.
  TI->setSuccessor(SuccNum, NewBB);

  // Insert the block into the function... right after the block TI lives in.
  Function &F = *TIBB->getParent();
  Function::iterator FBBI = TIBB;
  F.getBasicBlockList().insert(++FBBI, NewBB);

  // If there are any PHI nodes in DestBB, we need to update them so that they
  // merge incoming values from NewBB instead of from TIBB.
  if (PHINode *APHI = dyn_cast<PHINode>(DestBB->begin())) {
    // This conceptually does:
    //  foreach (PHINode *PN in DestBB)
    //    PN->setIncomingBlock(PN->getIncomingBlock(TIBB), NewBB);
    // but is optimized for two cases.

    if (APHI->getNumIncomingValues() <= 8) {  // Small # preds case.
      unsigned BBIdx = 0;
      for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
        // We no longer enter through TIBB, now we come in through NewBB.
        // Revector exactly one entry in the PHI node that used to come from
        // TIBB to come from NewBB.
        PHINode *PN = cast<PHINode>(I);

        // Reuse the previous value of BBIdx if it lines up. In cases where we
        // have multiple phi nodes with *lots* of predecessors, this is a speed
        // win because we don't have to scan the PHI looking for TIBB. This
        // happens because the BB list of PHI nodes are usually in the same
        // order.
        if (PN->getIncomingBlock(BBIdx) != TIBB)
          BBIdx = PN->getBasicBlockIndex(TIBB);
        PN->setIncomingBlock(BBIdx, NewBB);
      }
    } else {
      // However, the foreach loop is slow for blocks with lots of predecessors
      // because PHINode::getIncomingBlock is O(n) in # preds. Instead, walk
      // the user list of TIBB to find the PHI nodes.
      SmallPtrSet<PHINode*, 16> UpdatedPHIs;

      for (Value::use_iterator UI = TIBB->use_begin(), E = TIBB->use_end();
           UI != E; ) {
        Value::use_iterator Use = UI++;
        if (PHINode *PN = dyn_cast<PHINode>(*Use)) {
          // Remove one entry from each PHI.
          if (PN->getParent() == DestBB && UpdatedPHIs.insert(PN))
            PN->setOperand(Use.getOperandNo(), NewBB);
        }
      }
    }
  }

  // If there are any other edges from TIBB to DestBB, update those to go
  // through the split block, making those edges non-critical as well (and
  // reducing the number of phi entries in the DestBB if relevant).
  if (MergeIdenticalEdges) {
    for (unsigned i = SuccNum+1, e = TI->getNumSuccessors(); i != e; ++i) {
      if (TI->getSuccessor(i) != DestBB) continue;

      // Remove an entry for TIBB from DestBB phi nodes.
      DestBB->removePredecessor(TIBB);

      // We found another edge to DestBB, go to NewBB instead.
      TI->setSuccessor(i, NewBB);
    }
  }



  // If we don't have a pass object, we can't update anything...
  if (P == 0) return NewBB;

  DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>();
  LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>();
  ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>();

  // If we have nothing to update, just return.
  if (DT == 0 && LI == 0 && PI == 0)
    return NewBB;

  // Now update analysis information. Since the only predecessor of NewBB is
  // the TIBB, TIBB clearly dominates NewBB. TIBB usually doesn't dominate
  // anything, as there are other successors of DestBB. However, if all other
  // predecessors of DestBB are already dominated by DestBB (e.g. DestBB is a
  // loop header) then NewBB dominates DestBB.
  SmallVector<BasicBlock*, 8> OtherPreds;

  // If there is a PHI in the block, loop over predecessors with it, which is
  // faster than iterating pred_begin/end.
  if (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
      if (PN->getIncomingBlock(i) != NewBB)
        OtherPreds.push_back(PN->getIncomingBlock(i));
  } else {
    for (pred_iterator I = pred_begin(DestBB), E = pred_end(DestBB);
         I != E; ++I) {
      BasicBlock *P = *I;
      if (P != NewBB)
        OtherPreds.push_back(P);
    }
  }

  bool NewBBDominatesDestBB = true;

  // Should we update DominatorTree information?
  if (DT) {
    DomTreeNode *TINode = DT->getNode(TIBB);

    // The new block is not the immediate dominator for any other nodes, but
    // TINode is the immediate dominator for the new node.
    //
    if (TINode) {       // Don't break unreachable code!
      DomTreeNode *NewBBNode = DT->addNewBlock(NewBB, TIBB);
      DomTreeNode *DestBBNode = 0;

      // If NewBBDominatesDestBB hasn't been computed yet, do so with DT.
      if (!OtherPreds.empty()) {
        DestBBNode = DT->getNode(DestBB);
        while (!OtherPreds.empty() && NewBBDominatesDestBB) {
          if (DomTreeNode *OPNode = DT->getNode(OtherPreds.back()))
            NewBBDominatesDestBB = DT->dominates(DestBBNode, OPNode);
          OtherPreds.pop_back();
        }
        OtherPreds.clear();
      }

      // If NewBBDominatesDestBB, then NewBB dominates DestBB, otherwise it
      // doesn't dominate anything.
      if (NewBBDominatesDestBB) {
        if (!DestBBNode) DestBBNode = DT->getNode(DestBB);
        DT->changeImmediateDominator(DestBBNode, NewBBNode);
      }
    }
  }

  // Update LoopInfo if it is around.
  if (LI) {
    if (Loop *TIL = LI->getLoopFor(TIBB)) {
      // If one or the other blocks were not in a loop, the new block is not
      // either, and thus LI doesn't need to be updated.
      if (Loop *DestLoop = LI->getLoopFor(DestBB)) {
        if (TIL == DestLoop) {
          // Both in the same loop, the NewBB joins loop.
          DestLoop->addBasicBlockToLoop(NewBB, LI->getBase());
        } else if (TIL->contains(DestLoop)) {
          // Edge from an outer loop to an inner loop. Add to the outer loop.
          TIL->addBasicBlockToLoop(NewBB, LI->getBase());
        } else if (DestLoop->contains(TIL)) {
          // Edge from an inner loop to an outer loop. Add to the outer loop.
          DestLoop->addBasicBlockToLoop(NewBB, LI->getBase());
        } else {
          // Edge from two loops with no containment relation. Because these
          // are natural loops, we know that the destination block must be the
          // header of its loop (adding a branch into a loop elsewhere would
          // create an irreducible loop).
          assert(DestLoop->getHeader() == DestBB &&
                 "Should not create irreducible loops!");
          if (Loop *P = DestLoop->getParentLoop())
            P->addBasicBlockToLoop(NewBB, LI->getBase());
        }
      }
      // If TIBB is in a loop and DestBB is outside of that loop, split the
      // other exit blocks of the loop that also have predecessors outside
      // the loop, to maintain a LoopSimplify guarantee.
      if (!TIL->contains(DestBB) &&
          P->mustPreserveAnalysisID(LoopSimplifyID)) {
        assert(!TIL->contains(NewBB) &&
               "Split point for loop exit is contained in loop!");

        // Update LCSSA form in the newly created exit block.
        if (P->mustPreserveAnalysisID(LCSSAID)) {
          SmallVector<BasicBlock *, 1> OrigPred;
          OrigPred.push_back(TIBB);
          CreatePHIsForSplitLoopExit(OrigPred, NewBB, DestBB);
        }

        // For each unique exit block...
        SmallVector<BasicBlock *, 4> ExitBlocks;
        TIL->getExitBlocks(ExitBlocks);
        for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
          // Collect all the preds that are inside the loop, and note
          // whether there are any preds outside the loop.
          SmallVector<BasicBlock *, 4> Preds;
          bool HasPredOutsideOfLoop = false;
          BasicBlock *Exit = ExitBlocks[i];
          for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit);
               I != E; ++I) {
            BasicBlock *P = *I;
            if (TIL->contains(P))
              Preds.push_back(P);
            else
              HasPredOutsideOfLoop = true;
          }
          // If there are any preds not in the loop, we'll need to split
          // the edges. The Preds.empty() check is needed because a block
          // may appear multiple times in the list. We can't use
          // getUniqueExitBlocks above because that depends on LoopSimplify
          // form, which we're in the process of restoring!
          if (!Preds.empty() && HasPredOutsideOfLoop) {
            BasicBlock *NewExitBB =
              SplitBlockPredecessors(Exit, Preds.data(), Preds.size(),
                                     "split", P);
            if (P->mustPreserveAnalysisID(LCSSAID))
              CreatePHIsForSplitLoopExit(Preds, NewExitBB, Exit);
          }
        }
      }
      // LCSSA form was updated above for the case where LoopSimplify is
      // available, which means that all predecessors of loop exit blocks
      // are within the loop. Without LoopSimplify form, it would be
      // necessary to insert a new phi.
      assert((!P->mustPreserveAnalysisID(LCSSAID) ||
              P->mustPreserveAnalysisID(LoopSimplifyID)) &&
             "SplitCriticalEdge doesn't know how to update LCCSA form "
             "without LoopSimplify!");
    }
  }

  // Update ProfileInfo if it is around.
  if (PI)
    PI->splitEdge(TIBB, DestBB, NewBB, MergeIdenticalEdges);

  return NewBB;
}
diff --git a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
new file mode 100644
index 0000000..4a90751
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -0,0 +1,483 @@
//===- BuildLibCalls.cpp - Utility builder for libcalls -------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements some functions that will create standard C libcalls.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Type.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
#include "llvm/Module.h"
#include "llvm/Support/IRBuilder.h"
#include "llvm/Target/TargetData.h"
#include "llvm/LLVMContext.h"
#include "llvm/Intrinsics.h"

using namespace llvm;

/// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*.
Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) {
  // CreateBitCast is a no-op when V already has type i8*.
  return B.CreateBitCast(V, B.getInt8PtrTy(), "cstr");
}

/// EmitStrLen - Emit a call to the strlen function to the builder, for the
/// specified pointer. This always returns an integer value of size intptr_t.
+Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI[2]; + AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(~0u, Attribute::ReadOnly | + Attribute::NoUnwind); + + LLVMContext &Context = B.GetInsertBlock()->getContext(); + Constant *StrLen = M->getOrInsertFunction("strlen", AttrListPtr::get(AWI, 2), + TD->getIntPtrType(Context), + B.getInt8PtrTy(), + NULL); + CallInst *CI = B.CreateCall(StrLen, CastToCStr(Ptr, B), "strlen"); + if (const Function *F = dyn_cast<Function>(StrLen->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + + return CI; +} + +/// EmitStrChr - Emit a call to the strchr function to the builder, for the +/// specified pointer and character. Ptr is required to be some pointer type, +/// and the return value has 'i8*' type. +Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, + const TargetData *TD) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI = + AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind); + + const Type *I8Ptr = B.getInt8PtrTy(); + const Type *I32Ty = B.getInt32Ty(); + Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(&AWI, 1), + I8Ptr, I8Ptr, I32Ty, NULL); + CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B), + ConstantInt::get(I32Ty, C), "strchr"); + if (const Function *F = dyn_cast<Function>(StrChr->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + return CI; +} + +/// EmitStrNCmp - Emit a call to the strncmp function to the builder. 
+Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, + IRBuilder<> &B, const TargetData *TD) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI[3]; + AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture); + AWI[2] = AttributeWithIndex::get(~0u, Attribute::ReadOnly | + Attribute::NoUnwind); + + LLVMContext &Context = B.GetInsertBlock()->getContext(); + Value *StrNCmp = M->getOrInsertFunction("strncmp", AttrListPtr::get(AWI, 3), + B.getInt32Ty(), + B.getInt8PtrTy(), + B.getInt8PtrTy(), + TD->getIntPtrType(Context), NULL); + CallInst *CI = B.CreateCall3(StrNCmp, CastToCStr(Ptr1, B), + CastToCStr(Ptr2, B), Len, "strncmp"); + + if (const Function *F = dyn_cast<Function>(StrNCmp->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + + return CI; +} + +/// EmitStrCpy - Emit a call to the strcpy function to the builder, for the +/// specified pointer arguments. +Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, + const TargetData *TD, StringRef Name) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI[2]; + AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + const Type *I8Ptr = B.getInt8PtrTy(); + Value *StrCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 2), + I8Ptr, I8Ptr, I8Ptr, NULL); + CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B), + Name); + if (const Function *F = dyn_cast<Function>(StrCpy->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + return CI; +} + +/// EmitStrNCpy - Emit a call to the strncpy function to the builder, for the +/// specified pointer arguments. 
+Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, + IRBuilder<> &B, const TargetData *TD, StringRef Name) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI[2]; + AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + const Type *I8Ptr = B.getInt8PtrTy(); + Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 2), + I8Ptr, I8Ptr, I8Ptr, + Len->getType(), NULL); + CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B), + Len, "strncpy"); + if (const Function *F = dyn_cast<Function>(StrNCpy->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + return CI; +} + +/// EmitMemCpyChk - Emit a call to the __memcpy_chk function to the builder. +/// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src +/// are pointers. +Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, + IRBuilder<> &B, const TargetData *TD) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI; + AWI = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + LLVMContext &Context = B.GetInsertBlock()->getContext(); + Value *MemCpy = M->getOrInsertFunction("__memcpy_chk", + AttrListPtr::get(&AWI, 1), + B.getInt8PtrTy(), + B.getInt8PtrTy(), + B.getInt8PtrTy(), + TD->getIntPtrType(Context), + TD->getIntPtrType(Context), NULL); + Dst = CastToCStr(Dst, B); + Src = CastToCStr(Src, B); + CallInst *CI = B.CreateCall4(MemCpy, Dst, Src, Len, ObjSize); + if (const Function *F = dyn_cast<Function>(MemCpy->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + return CI; +} + +/// EmitMemChr - Emit a call to the memchr function. This assumes that Ptr is +/// a pointer, Val is an i32 value, and Len is an 'intptr_t' value. 
+Value *llvm::EmitMemChr(Value *Ptr, Value *Val, + Value *Len, IRBuilder<> &B, const TargetData *TD) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI; + AWI = AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind); + LLVMContext &Context = B.GetInsertBlock()->getContext(); + Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(&AWI, 1), + B.getInt8PtrTy(), + B.getInt8PtrTy(), + B.getInt32Ty(), + TD->getIntPtrType(Context), + NULL); + CallInst *CI = B.CreateCall3(MemChr, CastToCStr(Ptr, B), Val, Len, "memchr"); + + if (const Function *F = dyn_cast<Function>(MemChr->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + + return CI; +} + +/// EmitMemCmp - Emit a call to the memcmp function. +Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, + Value *Len, IRBuilder<> &B, const TargetData *TD) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI[3]; + AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture); + AWI[2] = AttributeWithIndex::get(~0u, Attribute::ReadOnly | + Attribute::NoUnwind); + + LLVMContext &Context = B.GetInsertBlock()->getContext(); + Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI, 3), + B.getInt32Ty(), + B.getInt8PtrTy(), + B.getInt8PtrTy(), + TD->getIntPtrType(Context), NULL); + CallInst *CI = B.CreateCall3(MemCmp, CastToCStr(Ptr1, B), CastToCStr(Ptr2, B), + Len, "memcmp"); + + if (const Function *F = dyn_cast<Function>(MemCmp->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + + return CI; +} + +/// EmitUnaryFloatFnCall - Emit a call to the unary function named 'Name' (e.g. +/// 'floor'). This function is known to take a single of type matching 'Op' and +/// returns one value with the same type. If 'Op' is a long double, 'l' is +/// added as the suffix of name, if 'Op' is a float, we add a 'f' suffix. 
+Value *llvm::EmitUnaryFloatFnCall(Value *Op, const char *Name, + IRBuilder<> &B, const AttrListPtr &Attrs) { + char NameBuffer[20]; + if (!Op->getType()->isDoubleTy()) { + // If we need to add a suffix, copy into NameBuffer. + unsigned NameLen = strlen(Name); + assert(NameLen < sizeof(NameBuffer)-2); + memcpy(NameBuffer, Name, NameLen); + if (Op->getType()->isFloatTy()) + NameBuffer[NameLen] = 'f'; // floorf + else + NameBuffer[NameLen] = 'l'; // floorl + NameBuffer[NameLen+1] = 0; + Name = NameBuffer; + } + + Module *M = B.GetInsertBlock()->getParent()->getParent(); + Value *Callee = M->getOrInsertFunction(Name, Op->getType(), + Op->getType(), NULL); + CallInst *CI = B.CreateCall(Callee, Op, Name); + CI->setAttributes(Attrs); + if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + + return CI; +} + +/// EmitPutChar - Emit a call to the putchar function. This assumes that Char +/// is an integer. +Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(), + B.getInt32Ty(), NULL); + CallInst *CI = B.CreateCall(PutChar, + B.CreateIntCast(Char, + B.getInt32Ty(), + /*isSigned*/true, + "chari"), + "putchar"); + + if (const Function *F = dyn_cast<Function>(PutChar->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + return CI; +} + +/// EmitPutS - Emit a call to the puts function. This assumes that Str is +/// some pointer. 
+void llvm::EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI[2]; + AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + + Value *PutS = M->getOrInsertFunction("puts", AttrListPtr::get(AWI, 2), + B.getInt32Ty(), + B.getInt8PtrTy(), + NULL); + CallInst *CI = B.CreateCall(PutS, CastToCStr(Str, B), "puts"); + if (const Function *F = dyn_cast<Function>(PutS->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + +} + +/// EmitFPutC - Emit a call to the fputc function. This assumes that Char is +/// an integer and File is a pointer to FILE. +void llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, + const TargetData *TD) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI[2]; + AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + Constant *F; + if (File->getType()->isPointerTy()) + F = M->getOrInsertFunction("fputc", AttrListPtr::get(AWI, 2), + B.getInt32Ty(), + B.getInt32Ty(), File->getType(), + NULL); + else + F = M->getOrInsertFunction("fputc", + B.getInt32Ty(), + B.getInt32Ty(), + File->getType(), NULL); + Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true, + "chari"); + CallInst *CI = B.CreateCall2(F, Char, File, "fputc"); + + if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) + CI->setCallingConv(Fn->getCallingConv()); +} + +/// EmitFPutS - Emit a call to the puts function. Str is required to be a +/// pointer and File is a pointer to FILE. 
+void llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, + const TargetData *TD) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI[3]; + AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture); + AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + Constant *F; + if (File->getType()->isPointerTy()) + F = M->getOrInsertFunction("fputs", AttrListPtr::get(AWI, 3), + B.getInt32Ty(), + B.getInt8PtrTy(), + File->getType(), NULL); + else + F = M->getOrInsertFunction("fputs", B.getInt32Ty(), + B.getInt8PtrTy(), + File->getType(), NULL); + CallInst *CI = B.CreateCall2(F, CastToCStr(Str, B), File, "fputs"); + + if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) + CI->setCallingConv(Fn->getCallingConv()); +} + +/// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is +/// a pointer, Size is an 'intptr_t', and File is a pointer to FILE. +void llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, + IRBuilder<> &B, const TargetData *TD) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI[3]; + AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(4, Attribute::NoCapture); + AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + LLVMContext &Context = B.GetInsertBlock()->getContext(); + Constant *F; + if (File->getType()->isPointerTy()) + F = M->getOrInsertFunction("fwrite", AttrListPtr::get(AWI, 3), + TD->getIntPtrType(Context), + B.getInt8PtrTy(), + TD->getIntPtrType(Context), + TD->getIntPtrType(Context), + File->getType(), NULL); + else + F = M->getOrInsertFunction("fwrite", TD->getIntPtrType(Context), + B.getInt8PtrTy(), + TD->getIntPtrType(Context), + TD->getIntPtrType(Context), + File->getType(), NULL); + CallInst *CI = B.CreateCall4(F, CastToCStr(Ptr, B), Size, + ConstantInt::get(TD->getIntPtrType(Context), 1), File); + + if (const 
Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) + CI->setCallingConv(Fn->getCallingConv()); +} + +SimplifyFortifiedLibCalls::~SimplifyFortifiedLibCalls() { } + +bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { + // We really need TargetData for later. + if (!TD) return false; + + this->CI = CI; + Function *Callee = CI->getCalledFunction(); + StringRef Name = Callee->getName(); + const FunctionType *FT = Callee->getFunctionType(); + BasicBlock *BB = CI->getParent(); + LLVMContext &Context = CI->getParent()->getContext(); + IRBuilder<> B(Context); + + // Set the builder to the instruction after the call. + B.SetInsertPoint(BB, CI); + + if (Name == "__memcpy_chk") { + // Check if this has the right signature. + if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + FT->getParamType(2) != TD->getIntPtrType(Context) || + FT->getParamType(3) != TD->getIntPtrType(Context)) + return false; + + if (isFoldable(3, 2, false)) { + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1); + replaceCall(CI->getArgOperand(0)); + return true; + } + return false; + } + + // Should be similar to memcpy. + if (Name == "__mempcpy_chk") { + return false; + } + + if (Name == "__memmove_chk") { + // Check if this has the right signature. + if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + FT->getParamType(2) != TD->getIntPtrType(Context) || + FT->getParamType(3) != TD->getIntPtrType(Context)) + return false; + + if (isFoldable(3, 2, false)) { + B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1); + replaceCall(CI->getArgOperand(0)); + return true; + } + return false; + } + + if (Name == "__memset_chk") { + // Check if this has the right signature. 
+ if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isIntegerTy() || + FT->getParamType(2) != TD->getIntPtrType(Context) || + FT->getParamType(3) != TD->getIntPtrType(Context)) + return false; + + if (isFoldable(3, 2, false)) { + Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), + false); + B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); + replaceCall(CI->getArgOperand(0)); + return true; + } + return false; + } + + if (Name == "__strcpy_chk" || Name == "__stpcpy_chk") { + // Check if this has the right signature. + if (FT->getNumParams() != 3 || + FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != Type::getInt8PtrTy(Context) || + FT->getParamType(2) != TD->getIntPtrType(Context)) + return 0; + + + // If a) we don't have any length information, or b) we know this will + // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our + // st[rp]cpy_chk call which may fail at runtime if the size is too long. + // TODO: It might be nice to get a maximum length out of the possible + // string lengths for varying. + if (isFoldable(2, 1, true)) { + Value *Ret = EmitStrCpy(CI->getArgOperand(0), CI->getArgOperand(1), B, TD, + Name.substr(2, 6)); + replaceCall(Ret); + return true; + } + return false; + } + + if (Name == "__strncpy_chk" || Name == "__stpncpy_chk") { + // Check if this has the right signature. 
+ if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != Type::getInt8PtrTy(Context) || + !FT->getParamType(2)->isIntegerTy() || + FT->getParamType(3) != TD->getIntPtrType(Context)) + return false; + + if (isFoldable(3, 2, false)) { + Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TD, Name.substr(2, 7)); + replaceCall(Ret); + return true; + } + return false; + } + + if (Name == "__strcat_chk") { + return false; + } + + if (Name == "__strncat_chk") { + return false; + } + + return false; +} diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp new file mode 100644 index 0000000..d967ceb --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -0,0 +1,586 @@ +//===- CloneFunction.cpp - Clone a function into another function ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the CloneFunctionInto interface, which is used as the +// low-level function cloner. This is used by the CloneFunction and function +// inliner to do the dirty work of copying the body of a function around. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Function.h" +#include "llvm/LLVMContext.h" +#include "llvm/Metadata.h" +#include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/DebugInfo.h" +#include "llvm/ADT/SmallVector.h" +#include <map> +using namespace llvm; + +// CloneBasicBlock - See comments in Cloning.h +BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, + ValueToValueMapTy &VMap, + const Twine &NameSuffix, Function *F, + ClonedCodeInfo *CodeInfo) { + BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F); + if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix); + + bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false; + + // Loop over all instructions, and copy them over. + for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end(); + II != IE; ++II) { + Instruction *NewInst = II->clone(); + if (II->hasName()) + NewInst->setName(II->getName()+NameSuffix); + NewBB->getInstList().push_back(NewInst); + VMap[II] = NewInst; // Add instruction map to value. 
+ + hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); + if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { + if (isa<ConstantInt>(AI->getArraySize())) + hasStaticAllocas = true; + else + hasDynamicAllocas = true; + } + } + + if (CodeInfo) { + CodeInfo->ContainsCalls |= hasCalls; + CodeInfo->ContainsUnwinds |= isa<UnwindInst>(BB->getTerminator()); + CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas; + CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && + BB != &BB->getParent()->getEntryBlock(); + } + return NewBB; +} + +// Clone OldFunc into NewFunc, transforming the old arguments into references to +// VMap values. +// +void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, + ValueToValueMapTy &VMap, + bool ModuleLevelChanges, + SmallVectorImpl<ReturnInst*> &Returns, + const char *NameSuffix, ClonedCodeInfo *CodeInfo) { + assert(NameSuffix && "NameSuffix cannot be null!"); + +#ifndef NDEBUG + for (Function::const_arg_iterator I = OldFunc->arg_begin(), + E = OldFunc->arg_end(); I != E; ++I) + assert(VMap.count(I) && "No mapping from source argument specified!"); +#endif + + // Clone any attributes. + if (NewFunc->arg_size() == OldFunc->arg_size()) + NewFunc->copyAttributesFrom(OldFunc); + else { + //Some arguments were deleted with the VMap. Copy arguments one by one + for (Function::const_arg_iterator I = OldFunc->arg_begin(), + E = OldFunc->arg_end(); I != E; ++I) + if (Argument* Anew = dyn_cast<Argument>(VMap[I])) + Anew->addAttr( OldFunc->getAttributes() + .getParamAttributes(I->getArgNo() + 1)); + NewFunc->setAttributes(NewFunc->getAttributes() + .addAttr(0, OldFunc->getAttributes() + .getRetAttributes())); + NewFunc->setAttributes(NewFunc->getAttributes() + .addAttr(~0, OldFunc->getAttributes() + .getFnAttributes())); + + } + + // Loop over all of the basic blocks in the function, cloning them as + // appropriate. Note that we save BE this way in order to handle cloning of + // recursive functions into themselves. 
+ // + for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end(); + BI != BE; ++BI) { + const BasicBlock &BB = *BI; + + // Create a new basic block and copy instructions into it! + BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo); + VMap[&BB] = CBB; // Add basic block mapping. + + if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator())) + Returns.push_back(RI); + } + + // Loop over all of the instructions in the function, fixing up operand + // references as we go. This uses VMap to do all the hard work. + for (Function::iterator BB = cast<BasicBlock>(VMap[OldFunc->begin()]), + BE = NewFunc->end(); BB != BE; ++BB) + // Loop over all instructions, fixing each one as we find it... + for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II) + RemapInstruction(II, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); +} + +/// CloneFunction - Return a copy of the specified function, but without +/// embedding the function into another module. Also, any references specified +/// in the VMap are changed to refer to their mapped value instead of the +/// original one. If any of the arguments to the function are in the VMap, +/// the arguments are deleted from the resultant function. The VMap is +/// updated to include mappings from all of the instructions and basicblocks in +/// the function from their old to new values. +/// +Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap, + bool ModuleLevelChanges, + ClonedCodeInfo *CodeInfo) { + std::vector<const Type*> ArgTypes; + + // The user might be deleting arguments to the function by specifying them in + // the VMap. If so, we need to not add the arguments to the arg ty vector + // + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I) + if (VMap.count(I) == 0) // Haven't mapped the argument to anything yet? + ArgTypes.push_back(I->getType()); + + // Create a new function type... 
+ FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(), + ArgTypes, F->getFunctionType()->isVarArg()); + + // Create the new function... + Function *NewF = Function::Create(FTy, F->getLinkage(), F->getName()); + + // Loop over the arguments, copying the names of the mapped arguments over... + Function::arg_iterator DestI = NewF->arg_begin(); + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I) + if (VMap.count(I) == 0) { // Is this argument preserved? + DestI->setName(I->getName()); // Copy the name over... + VMap[I] = DestI++; // Add mapping to VMap + } + + SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned. + CloneFunctionInto(NewF, F, VMap, ModuleLevelChanges, Returns, "", CodeInfo); + return NewF; +} + + + +namespace { + /// PruningFunctionCloner - This class is a private class used to implement + /// the CloneAndPruneFunctionInto method. + struct PruningFunctionCloner { + Function *NewFunc; + const Function *OldFunc; + ValueToValueMapTy &VMap; + bool ModuleLevelChanges; + SmallVectorImpl<ReturnInst*> &Returns; + const char *NameSuffix; + ClonedCodeInfo *CodeInfo; + const TargetData *TD; + public: + PruningFunctionCloner(Function *newFunc, const Function *oldFunc, + ValueToValueMapTy &valueMap, + bool moduleLevelChanges, + SmallVectorImpl<ReturnInst*> &returns, + const char *nameSuffix, + ClonedCodeInfo *codeInfo, + const TargetData *td) + : NewFunc(newFunc), OldFunc(oldFunc), + VMap(valueMap), ModuleLevelChanges(moduleLevelChanges), + Returns(returns), NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) { + } + + /// CloneBlock - The specified block is found to be reachable, clone it and + /// anything that it can reach. + void CloneBlock(const BasicBlock *BB, + std::vector<const BasicBlock*> &ToClone); + + public: + /// ConstantFoldMappedInstruction - Constant fold the specified instruction, + /// mapping its operands through VMap if they are available. 
+ Constant *ConstantFoldMappedInstruction(const Instruction *I); + }; +} + +/// CloneBlock - The specified block is found to be reachable, clone it and +/// anything that it can reach. +void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, + std::vector<const BasicBlock*> &ToClone){ + TrackingVH<Value> &BBEntry = VMap[BB]; + + // Have we already cloned this block? + if (BBEntry) return; + + // Nope, clone it now. + BasicBlock *NewBB; + BBEntry = NewBB = BasicBlock::Create(BB->getContext()); + if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix); + + bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false; + + // Loop over all instructions, and copy them over, DCE'ing as we go. This + // loop doesn't include the terminator. + for (BasicBlock::const_iterator II = BB->begin(), IE = --BB->end(); + II != IE; ++II) { + // If this instruction constant folds, don't bother cloning the instruction, + // instead, just add the constant to the value map. + if (Constant *C = ConstantFoldMappedInstruction(II)) { + VMap[II] = C; + continue; + } + + Instruction *NewInst = II->clone(); + if (II->hasName()) + NewInst->setName(II->getName()+NameSuffix); + NewBB->getInstList().push_back(NewInst); + VMap[II] = NewInst; // Add instruction map to value. + + hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II)); + if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) { + if (isa<ConstantInt>(AI->getArraySize())) + hasStaticAllocas = true; + else + hasDynamicAllocas = true; + } + } + + // Finally, clone over the terminator. + const TerminatorInst *OldTI = BB->getTerminator(); + bool TerminatorDone = false; + if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) { + if (BI->isConditional()) { + // If the condition was a known constant in the callee... + ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition()); + // Or is a known constant in the caller... 
+ if (Cond == 0) { + Value *V = VMap[BI->getCondition()]; + Cond = dyn_cast_or_null<ConstantInt>(V); + } + + // Constant fold to uncond branch! + if (Cond) { + BasicBlock *Dest = BI->getSuccessor(!Cond->getZExtValue()); + VMap[OldTI] = BranchInst::Create(Dest, NewBB); + ToClone.push_back(Dest); + TerminatorDone = true; + } + } + } else if (const SwitchInst *SI = dyn_cast<SwitchInst>(OldTI)) { + // If switching on a value known constant in the caller. + ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition()); + if (Cond == 0) { // Or known constant after constant prop in the callee... + Value *V = VMap[SI->getCondition()]; + Cond = dyn_cast_or_null<ConstantInt>(V); + } + if (Cond) { // Constant fold to uncond branch! + BasicBlock *Dest = SI->getSuccessor(SI->findCaseValue(Cond)); + VMap[OldTI] = BranchInst::Create(Dest, NewBB); + ToClone.push_back(Dest); + TerminatorDone = true; + } + } + + if (!TerminatorDone) { + Instruction *NewInst = OldTI->clone(); + if (OldTI->hasName()) + NewInst->setName(OldTI->getName()+NameSuffix); + NewBB->getInstList().push_back(NewInst); + VMap[OldTI] = NewInst; // Add instruction map to value. + + // Recursively clone any reachable successor blocks. + const TerminatorInst *TI = BB->getTerminator(); + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) + ToClone.push_back(TI->getSuccessor(i)); + } + + if (CodeInfo) { + CodeInfo->ContainsCalls |= hasCalls; + CodeInfo->ContainsUnwinds |= isa<UnwindInst>(OldTI); + CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas; + CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && + BB != &BB->getParent()->front(); + } + + if (ReturnInst *RI = dyn_cast<ReturnInst>(NewBB->getTerminator())) + Returns.push_back(RI); +} + +/// ConstantFoldMappedInstruction - Constant fold the specified instruction, +/// mapping its operands through VMap if they are available. 
Constant *PruningFunctionCloner::
ConstantFoldMappedInstruction(const Instruction *I) {
  // Collect the operands after mapping through VMap; bail out as soon as any
  // operand fails to map to a Constant.
  SmallVector<Constant*, 8> Ops;
  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
    if (Constant *Op = dyn_cast_or_null<Constant>(MapValue(I->getOperand(i),
                                                           VMap,
                        ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges)))
      Ops.push_back(Op);
    else
      return 0;  // All operands not constant!

  // Compares need their predicate, so they have a dedicated folding entry
  // point.
  if (const CmpInst *CI = dyn_cast<CmpInst>(I))
    return ConstantFoldCompareInstOperands(CI->getPredicate(), Ops[0], Ops[1],
                                           TD);

  // Fold non-volatile loads through a constant GEP into a constant global
  // that has a known (definitive) initializer.
  if (const LoadInst *LI = dyn_cast<LoadInst>(I))
    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0]))
      if (!LI->isVolatile() && CE->getOpcode() == Instruction::GetElementPtr)
        if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
          if (GV->isConstant() && GV->hasDefinitiveInitializer())
            return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(),
                                                          CE);

  return ConstantFoldInstOperands(I->getOpcode(), I->getType(), &Ops[0],
                                  Ops.size(), TD);
}

/// UpdateInlinedAtInfo - Rebuild InsnDL so that the innermost frame of its
/// inlined-at chain refers to TheCallDL (the location of the call site being
/// inlined), recursing through any inlined-at locations InsnDL already has.
static DebugLoc
UpdateInlinedAtInfo(const DebugLoc &InsnDL, const DebugLoc &TheCallDL,
                    LLVMContext &Ctx) {
  DebugLoc NewLoc = TheCallDL;
  if (MDNode *IA = InsnDL.getInlinedAt(Ctx))
    // Recurse: splice TheCallDL onto the end of the existing chain.
    NewLoc = UpdateInlinedAtInfo(DebugLoc::getFromDILocation(IA), TheCallDL,
                                 Ctx);

  return DebugLoc::get(InsnDL.getLine(), InsnDL.getCol(),
                       InsnDL.getScope(Ctx), NewLoc.getAsMDNode(Ctx));
}

/// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto,
/// except that it does some simple constant prop and DCE on the fly.  The
/// effect of this is to copy significantly less code in cases where (for
/// example) a function call with constant arguments is inlined, and those
/// constant arguments cause a significant amount of code in the callee to be
/// dead.  Since this doesn't produce an exact copy of the input, it can't be
/// used for things like CloneFunction or CloneModule.
void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
                                     ValueToValueMapTy &VMap,
                                     bool ModuleLevelChanges,
                                     SmallVectorImpl<ReturnInst*> &Returns,
                                     const char *NameSuffix,
                                     ClonedCodeInfo *CodeInfo,
                                     const TargetData *TD,
                                     Instruction *TheCall) {
  assert(NameSuffix && "NameSuffix cannot be null!");

#ifndef NDEBUG
  // Every formal argument of the callee must already be mapped by the caller.
  for (Function::const_arg_iterator II = OldFunc->arg_begin(),
       E = OldFunc->arg_end(); II != E; ++II)
    assert(VMap.count(II) && "No mapping from source argument specified!");
#endif

  PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges,
                            Returns, NameSuffix, CodeInfo, TD);

  // Clone the entry block, and anything recursively reachable from it.
  std::vector<const BasicBlock*> CloneWorklist;
  CloneWorklist.push_back(&OldFunc->getEntryBlock());
  while (!CloneWorklist.empty()) {
    const BasicBlock *BB = CloneWorklist.back();
    CloneWorklist.pop_back();
    PFC.CloneBlock(BB, CloneWorklist);
  }

  // Loop over all of the basic blocks in the old function.  If the block was
  // reachable, we have cloned it and the old block is now in the value map:
  // insert it into the new function in the right order.  If not, ignore it.
  //
  // Defer PHI resolution until rest of function is resolved.
  SmallVector<const PHINode*, 16> PHIToResolve;
  for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end();
       BI != BE; ++BI) {
    Value *V = VMap[BI];
    BasicBlock *NewBB = cast_or_null<BasicBlock>(V);
    if (NewBB == 0) continue;  // Dead block.

    // Add the new block to the new function.
    NewFunc->getBasicBlockList().push_back(NewBB);

    // Loop over all of the instructions in the block, fixing up operand
    // references as we go.  This uses VMap to do all the hard work.
    //
    BasicBlock::iterator I = NewBB->begin();

    DebugLoc TheCallDL;
    if (TheCall)
      TheCallDL = TheCall->getDebugLoc();

    // Handle PHI nodes specially, as we have to remove references to dead
    // blocks.
    if (PHINode *PN = dyn_cast<PHINode>(I)) {
      // Skip over all PHI nodes, remembering them for later.
      BasicBlock::const_iterator OldI = BI->begin();
      for (; (PN = dyn_cast<PHINode>(I)); ++I, ++OldI) {
        if (I->hasMetadata()) {
          if (!TheCallDL.isUnknown()) {
            // Splice the call site's location onto this PHI's inlined-at
            // chain.
            DebugLoc IDL = I->getDebugLoc();
            if (!IDL.isUnknown()) {
              DebugLoc NewDL = UpdateInlinedAtInfo(IDL, TheCallDL,
                                                   I->getContext());
              I->setDebugLoc(NewDL);
            }
          } else {
            // The cloned instruction has dbg info but the call instruction
            // does not have dbg info. Remove dbg info from cloned instruction.
            I->setDebugLoc(DebugLoc());
          }
        }
        PHIToResolve.push_back(cast<PHINode>(OldI));
      }
    }

    // FIXME:
    // FIXME:
    // FIXME: Unclone all this metadata stuff.
    // FIXME:
    // FIXME:

    // Otherwise, remap the rest of the instructions normally.
    for (; I != NewBB->end(); ++I) {
      if (I->hasMetadata()) {
        if (!TheCallDL.isUnknown()) {
          DebugLoc IDL = I->getDebugLoc();
          if (!IDL.isUnknown()) {
            DebugLoc NewDL = UpdateInlinedAtInfo(IDL, TheCallDL,
                                                 I->getContext());
            I->setDebugLoc(NewDL);
          }
        } else {
          // The cloned instruction has dbg info but the call instruction
          // does not have dbg info. Remove dbg info from cloned instruction.
          I->setDebugLoc(DebugLoc());
        }
      }
      RemapInstruction(I, VMap,
                       ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
    }
  }

  // Defer PHI resolution until rest of function is resolved, PHI resolution
  // requires the CFG to be up-to-date.
  for (unsigned phino = 0, e = PHIToResolve.size(); phino != e; ) {
    const PHINode *OPN = PHIToResolve[phino];
    unsigned NumPreds = OPN->getNumIncomingValues();
    const BasicBlock *OldBB = OPN->getParent();
    BasicBlock *NewBB = cast<BasicBlock>(VMap[OldBB]);

    // Map operands for blocks that are live and remove operands for blocks
    // that are dead.  The inner loop consumes all PHIs belonging to OldBB.
    for (; phino != PHIToResolve.size() &&
         PHIToResolve[phino]->getParent() == OldBB; ++phino) {
      OPN = PHIToResolve[phino];
      PHINode *PN = cast<PHINode>(VMap[OPN]);
      for (unsigned pred = 0, e = NumPreds; pred != e; ++pred) {
        Value *V = VMap[PN->getIncomingBlock(pred)];
        if (BasicBlock *MappedBlock = cast_or_null<BasicBlock>(V)) {
          Value *InVal = MapValue(PN->getIncomingValue(pred),
                                  VMap,
                        ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
          assert(InVal && "Unknown input value?");
          PN->setIncomingValue(pred, InVal);
          PN->setIncomingBlock(pred, MappedBlock);
        } else {
          // Incoming block is dead (never cloned): drop this PHI entry.
          PN->removeIncomingValue(pred, false);
          --pred, --e;  // Revisit the next entry.
        }
      }
    }

    // The loop above has removed PHI entries for those blocks that are dead
    // and has updated others.  However, if a block is live (i.e. copied over)
    // but its terminator has been changed to not go to this block, then our
    // phi nodes will have invalid entries.  Update the PHI nodes in this
    // case.
    PHINode *PN = cast<PHINode>(NewBB->begin());
    NumPreds = std::distance(pred_begin(NewBB), pred_end(NewBB));
    if (NumPreds != PN->getNumIncomingValues()) {
      assert(NumPreds < PN->getNumIncomingValues());
      // Count how many times each predecessor comes to this block.
      std::map<BasicBlock*, unsigned> PredCount;
      for (pred_iterator PI = pred_begin(NewBB), E = pred_end(NewBB);
           PI != E; ++PI)
        --PredCount[*PI];

      // Figure out how many entries to remove from each PHI.
      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
        ++PredCount[PN->getIncomingBlock(i)];

      // At this point, the excess predecessor entries are positive in the
      // map.  Loop over all of the PHIs and remove excess predecessor
      // entries.
      BasicBlock::iterator I = NewBB->begin();
      for (; (PN = dyn_cast<PHINode>(I)); ++I) {
        for (std::map<BasicBlock*, unsigned>::iterator PCI =PredCount.begin(),
             E = PredCount.end(); PCI != E; ++PCI) {
          BasicBlock *Pred     = PCI->first;
          for (unsigned NumToRemove = PCI->second; NumToRemove; --NumToRemove)
            PN->removeIncomingValue(Pred, false);
        }
      }
    }

    // If the loops above have made these phi nodes have 0 or 1 operand,
    // replace them with undef or the input value.  We must do this for
    // correctness, because 0-operand phis are not valid.
    PN = cast<PHINode>(NewBB->begin());
    if (PN->getNumIncomingValues() == 0) {
      BasicBlock::iterator I = NewBB->begin();
      BasicBlock::const_iterator OldI = OldBB->begin();
      while ((PN = dyn_cast<PHINode>(I++))) {
        Value *NV = UndefValue::get(PN->getType());
        PN->replaceAllUsesWith(NV);
        assert(VMap[OldI] == PN && "VMap mismatch");
        // Keep VMap consistent: the old PHI now maps to undef.
        VMap[OldI] = NV;
        PN->eraseFromParent();
        ++OldI;
      }
    }
    // NOTE: We cannot eliminate single entry phi nodes here, because of
    // VMap.  Single entry phi nodes can have multiple VMap entries
    // pointing at them.  Thus, deleting one would require scanning the VMap
    // to update any entries in it that would require that.  This would be
    // really slow.
  }

  // Now that the inlined function body has been fully constructed, go through
  // and zap unconditional fall-through branches.  This happens all the time
  // when specializing code: code specialization turns conditional branches
  // into uncond branches, and this code folds them.
  Function::iterator I = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]);
  while (I != NewFunc->end()) {
    BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator());
    if (!BI || BI->isConditional()) { ++I; continue; }

    // Note that we can't eliminate uncond branches if the destination has
    // single-entry PHI nodes.  Eliminating the single-entry phi nodes would
    // require scanning the VMap to update any entries that point to the phi
    // node.
    BasicBlock *Dest = BI->getSuccessor(0);
    if (!Dest->getSinglePredecessor() || isa<PHINode>(Dest->begin())) {
      ++I; continue;
    }

    // We know all single-entry PHI nodes in the inlined function have been
    // removed, so we just need to splice the blocks.
    BI->eraseFromParent();

    // Move all the instructions in the succ to the pred.
    I->getInstList().splice(I->end(), Dest->getInstList());

    // Make all PHI nodes that referred to Dest now refer to I as their source.
    Dest->replaceAllUsesWith(I);

    // Remove the dest block.
    Dest->eraseFromParent();

    // Do not increment I, iteratively merge all things this block branches to.
  }
}
diff --git a/contrib/llvm/lib/Transforms/Utils/CloneLoop.cpp b/contrib/llvm/lib/Transforms/Utils/CloneLoop.cpp
new file mode 100644
index 0000000..87dd141
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/CloneLoop.cpp
@@ -0,0 +1,128 @@
//===- CloneLoop.cpp - Clone loop nest ------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the CloneLoop interface which makes a copy of a loop.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/BasicBlock.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/Dominators.h"


using namespace llvm;

/// CloneDominatorInfo - Clone a basic block's dominator tree. It is expected
/// that the basic block is already cloned.
static void CloneDominatorInfo(BasicBlock *BB,
                               ValueToValueMapTy &VMap,
                               DominatorTree *DT) {

  assert (DT && "DominatorTree is not available");
  ValueToValueMapTy::iterator BI = VMap.find(BB);
  assert (BI != VMap.end() && "BasicBlock clone is missing");
  BasicBlock *NewBB = cast<BasicBlock>(BI->second);

  // NewBB already got dominator info.
  if (DT->getNode(NewBB))
    return;

  assert (DT->getNode(BB) && "BasicBlock does not have dominator info");
  // Entry block is not expected here. Infinite loops are not to be cloned.
  assert (DT->getNode(BB)->getIDom() && "BasicBlock does not have immediate dominator");
  BasicBlock *BBDom = DT->getNode(BB)->getIDom()->getBlock();

  // NewBB's dominator is either BB's dominator or BB's dominator's clone.
  BasicBlock *NewBBDom = BBDom;
  ValueToValueMapTy::iterator BBDomI = VMap.find(BBDom);
  if (BBDomI != VMap.end()) {
    NewBBDom = cast<BasicBlock>(BBDomI->second);
    // Recurse so the dominator's clone gets its tree node first.
    if (!DT->getNode(NewBBDom))
      CloneDominatorInfo(BBDom, VMap, DT);
  }
  DT->addNewBlock(NewBB, NewBBDom);
}

/// CloneLoop - Clone Loop. Clone dominator info. Populate VMap
/// using old blocks to new blocks mapping.
Loop *llvm::CloneLoop(Loop *OrigL, LPPassManager *LPM, LoopInfo *LI,
                      ValueToValueMapTy &VMap, Pass *P) {

  DominatorTree *DT = NULL;
  if (P)
    DT = P->getAnalysisIfAvailable<DominatorTree>();

  SmallVector<BasicBlock *, 16> NewBlocks;

  // Populate loop nest.
  SmallVector<Loop *, 8> LoopNest;
  LoopNest.push_back(OrigL);


  Loop *NewParentLoop = NULL;
  do {
    Loop *L = LoopNest.pop_back_val();
    Loop *NewLoop = new Loop();

    // The first loop processed is the outermost one; its clone becomes the
    // returned parent loop.
    if (!NewParentLoop)
      NewParentLoop = NewLoop;

    LPM->insertLoop(NewLoop, L->getParentLoop());

    // Clone Basic Blocks.
    for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
         I != E; ++I) {
      BasicBlock *BB = *I;
      BasicBlock *NewBB = CloneBasicBlock(BB, VMap, ".clone");
      VMap[BB] = NewBB;
      if (P)
        LPM->cloneBasicBlockSimpleAnalysis(BB, NewBB, L);
      NewLoop->addBasicBlockToLoop(NewBB, LI->getBase());
      NewBlocks.push_back(NewBB);
    }

    // Clone dominator info.
    if (DT)
      for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
           I != E; ++I) {
        BasicBlock *BB = *I;
        CloneDominatorInfo(BB, VMap, DT);
      }

    // Process sub loops
    for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
      LoopNest.push_back(*I);
  } while (!LoopNest.empty());

  // Remap instructions to reference operands from VMap.
  for(SmallVector<BasicBlock *, 16>::iterator NBItr = NewBlocks.begin(),
      NBE = NewBlocks.end();  NBItr != NBE; ++NBItr) {
    BasicBlock *NB = *NBItr;
    for(BasicBlock::iterator BI = NB->begin(), BE = NB->end();
        BI != BE; ++BI) {
      Instruction *Insn = BI;
      for (unsigned index = 0, num_ops = Insn->getNumOperands();
           index != num_ops; ++index) {
        Value *Op = Insn->getOperand(index);
        ValueToValueMapTy::iterator OpItr = VMap.find(Op);
        if (OpItr != VMap.end())
          Insn->setOperand(index, OpItr->second);
      }
    }
  }

  // Insert the cloned blocks into the function just before the original
  // loop's header.
  BasicBlock *Latch = OrigL->getLoopLatch();
  Function *F = Latch->getParent();
  F->getBasicBlockList().insert(OrigL->getHeader(),
                                NewBlocks.begin(), NewBlocks.end());


  return NewParentLoop;
}
diff --git a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
new file mode 100644
index 0000000..1046c38
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
@@ -0,0 +1,137 @@
//===- CloneModule.cpp - Clone an entire module ---------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the CloneModule interface which makes a copy of an
// entire module.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Module.h"
#include "llvm/DerivedTypes.h"
#include "llvm/TypeSymbolTable.h"
#include "llvm/Constant.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;

/// CloneModule - Return an exact copy of the specified module.  This is not as
/// easy as it might seem because we have to worry about making copies of global
/// variables and functions, and making their (initializers and references,
/// respectively) refer to the right globals.
///
Module *llvm::CloneModule(const Module *M) {
  // Create the value map that maps things from the old module over to the new
  // module.
  ValueToValueMapTy VMap;
  return CloneModule(M, VMap);
}

/// CloneModule - Overload that exposes the old-value -> new-value map to the
/// caller.  Cloning happens in two phases: first declare all globals,
/// functions and aliases (so VMap covers everything a reference could name),
/// then fill in initializers, bodies and aliasees through VMap.
Module *llvm::CloneModule(const Module *M,
                          ValueToValueMapTy &VMap) {
  // First off, we need to create the new module...
  Module *New = new Module(M->getModuleIdentifier(), M->getContext());
  New->setDataLayout(M->getDataLayout());
  New->setTargetTriple(M->getTargetTriple());
  New->setModuleInlineAsm(M->getModuleInlineAsm());

  // Copy all of the type symbol table entries over.
  const TypeSymbolTable &TST = M->getTypeSymbolTable();
  for (TypeSymbolTable::const_iterator TI = TST.begin(), TE = TST.end();
       TI != TE; ++TI)
    New->addTypeName(TI->first, TI->second);

  // Copy all of the dependent libraries over.
  for (Module::lib_iterator I = M->lib_begin(), E = M->lib_end(); I != E; ++I)
    New->addLibrary(*I);

  // Loop over all of the global variables, making corresponding globals in the
  // new module.  Here we add them to the VMap and to the new Module.  We
  // don't worry about attributes or initializers, they will come later.
  //
  for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
       I != E; ++I) {
    GlobalVariable *GV = new GlobalVariable(*New,
                                            I->getType()->getElementType(),
                                            false,
                                            GlobalValue::ExternalLinkage, 0,
                                            I->getName());
    GV->setAlignment(I->getAlignment());
    VMap[I] = GV;
  }

  // Loop over the functions in the module, making external functions as before
  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
    Function *NF =
      Function::Create(cast<FunctionType>(I->getType()->getElementType()),
                       GlobalValue::ExternalLinkage, I->getName(), New);
    NF->copyAttributesFrom(I);
    VMap[I] = NF;
  }

  // Loop over the aliases in the module
  for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
       I != E; ++I)
    VMap[I] = new GlobalAlias(I->getType(), GlobalAlias::ExternalLinkage,
                              I->getName(), NULL, New);

  // Now that all of the things that global variable initializer can refer to
  // have been created, loop through and copy the global variable referrers
  // over...  We also set the attributes on the global now.
  //
  for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
       I != E; ++I) {
    GlobalVariable *GV = cast<GlobalVariable>(VMap[I]);
    if (I->hasInitializer())
      GV->setInitializer(cast<Constant>(MapValue(I->getInitializer(),
                                                 VMap, RF_None)));
    GV->setLinkage(I->getLinkage());
    GV->setThreadLocal(I->isThreadLocal());
    GV->setConstant(I->isConstant());
  }

  // Similarly, copy over function bodies now...
  //
  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
    Function *F = cast<Function>(VMap[I]);
    if (!I->isDeclaration()) {
      // Map each old formal argument to the corresponding new one before
      // cloning the body, as CloneFunctionInto requires.
      Function::arg_iterator DestI = F->arg_begin();
      for (Function::const_arg_iterator J = I->arg_begin(); J != I->arg_end();
           ++J) {
        DestI->setName(J->getName());
        VMap[J] = DestI++;
      }

      SmallVector<ReturnInst*, 8> Returns;  // Ignore returns cloned.
      CloneFunctionInto(F, I, VMap, /*ModuleLevelChanges=*/true, Returns);
    }

    F->setLinkage(I->getLinkage());
  }

  // And aliases
  for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
       I != E; ++I) {
    GlobalAlias *GA = cast<GlobalAlias>(VMap[I]);
    GA->setLinkage(I->getLinkage());
    if (const Constant* C = I->getAliasee())
      GA->setAliasee(cast<Constant>(MapValue(C, VMap, RF_None)));
  }

  // And named metadata....
  for (Module::const_named_metadata_iterator I = M->named_metadata_begin(),
       E = M->named_metadata_end(); I != E; ++I) {
    const NamedMDNode &NMD = *I;
    NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName());
    for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i)
      NewNMD->addOperand(cast<MDNode>(MapValue(NMD.getOperand(i), VMap,
                                               RF_None)));
  }

  return New;
}
diff --git a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
new file mode 100644
index 0000000..e633772
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -0,0 +1,795 @@
//===- CodeExtractor.cpp - Pull code region into a new function -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the interface to tear out a code region, such as an
// individual loop or a parallel section, into a new function, replacing it with
// a call to the new function.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/FunctionUtils.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/StringExtras.h"
#include <algorithm>
#include <set>
using namespace llvm;

// Provide a command-line option to aggregate function arguments into a struct
// for functions produced by the code extractor. This is useful when converting
// extracted functions to pthread-based code, as only one argument (void*) can
// be passed in to pthread_create().
static cl::opt<bool>
AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,
                 cl::desc("Aggregate arguments to code-extracted functions"));

namespace {
  /// CodeExtractor - Utility that rips a region of basic blocks out of its
  /// function into a brand-new function and rewrites the original code to
  /// call it.
  class CodeExtractor {
    typedef SetVector<Value*> Values;
    // The region being extracted.
    SetVector<BasicBlock*> BlocksToExtract;
    // Optional dominator tree; kept up to date when blocks are split.
    DominatorTree* DT;
    // Whether to pass inputs/outputs as a single struct pointer.
    bool AggregateArgs;
    // Number of blocks the region can exit to; determines the return type.
    unsigned NumExitBlocks;
    const Type *RetTy;
  public:
    CodeExtractor(DominatorTree* dt = 0, bool AggArgs = false)
      : DT(dt), AggregateArgs(AggArgs||AggregateArgsOpt), NumExitBlocks(~0U) {}

    Function *ExtractCodeRegion(const std::vector<BasicBlock*> &code);

    bool isEligible(const std::vector<BasicBlock*> &code);

  private:
    /// definedInRegion - Return true if the specified value is defined in the
    /// extracted region.
    bool definedInRegion(Value *V) const {
      if (Instruction *I = dyn_cast<Instruction>(V))
        if (BlocksToExtract.count(I->getParent()))
          return true;
      return false;
    }

    /// definedInCaller - Return true if the specified value is defined in the
    /// function being code extracted, but not in the region being extracted.
    /// These values must be passed in as live-ins to the function.
    bool definedInCaller(Value *V) const {
      if (isa<Argument>(V)) return true;
      if (Instruction *I = dyn_cast<Instruction>(V))
        if (!BlocksToExtract.count(I->getParent()))
          return true;
      return false;
    }

    void severSplitPHINodes(BasicBlock *&Header);
    void splitReturnBlocks();
    void findInputsOutputs(Values &inputs, Values &outputs);

    Function *constructFunction(const Values &inputs,
                                const Values &outputs,
                                BasicBlock *header,
                                BasicBlock *newRootNode, BasicBlock *newHeader,
                                Function *oldFunction, Module *M);

    void moveCodeToFunction(Function *newFunction);

    void emitCallAndSwitchStatement(Function *newFunction,
                                    BasicBlock *newHeader,
                                    Values &inputs,
                                    Values &outputs);

  };
}

/// severSplitPHINodes - If a PHI node has multiple inputs from outside of the
/// region, we need to split the entry block of the region so that the PHI node
/// is easier to deal with.
void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
  bool HasPredsFromRegion = false;
  unsigned NumPredsOutsideRegion = 0;

  if (Header != &Header->getParent()->getEntryBlock()) {
    PHINode *PN = dyn_cast<PHINode>(Header->begin());
    if (!PN) return;  // No PHI nodes.

    // If the header node contains any PHI nodes, check to see if there is more
    // than one entry from outside the region.  If so, we need to sever the
    // header block into two.
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
      if (BlocksToExtract.count(PN->getIncomingBlock(i)))
        HasPredsFromRegion = true;
      else
        ++NumPredsOutsideRegion;

    // If there is one (or fewer) predecessor from outside the region, we don't
    // need to do anything special.
    if (NumPredsOutsideRegion <= 1) return;
  }

  // Otherwise, we need to split the header block into two pieces: one
  // containing PHI nodes merging values from outside of the region, and a
  // second that contains all of the code for the block and merges back any
  // incoming values from inside of the region.
  BasicBlock::iterator AfterPHIs = Header->getFirstNonPHI();
  BasicBlock *NewBB = Header->splitBasicBlock(AfterPHIs,
                                              Header->getName()+".ce");

  // We only want to code extract the second block now, and it becomes the new
  // header of the region.
  BasicBlock *OldPred = Header;
  BlocksToExtract.remove(OldPred);
  BlocksToExtract.insert(NewBB);
  Header = NewBB;

  // Okay, update dominator sets. The blocks that dominate the new one are the
  // blocks that dominate TIBB plus the new block itself.
  if (DT)
    DT->splitBlock(NewBB);

  // Okay, now we need to adjust the PHI nodes and any branches from within the
  // region to go to the new header block instead of the old header block.
  if (HasPredsFromRegion) {
    PHINode *PN = cast<PHINode>(OldPred->begin());
    // Loop over all of the predecessors of OldPred that are in the region,
    // changing them to branch to NewBB instead.
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
      if (BlocksToExtract.count(PN->getIncomingBlock(i))) {
        TerminatorInst *TI = PN->getIncomingBlock(i)->getTerminator();
        TI->replaceUsesOfWith(OldPred, NewBB);
      }

    // Okay, everything within the region is now branching to the right block,
    // we just have to update the PHI nodes now, inserting PHI nodes into NewBB.
    for (AfterPHIs = OldPred->begin(); isa<PHINode>(AfterPHIs); ++AfterPHIs) {
      PHINode *PN = cast<PHINode>(AfterPHIs);
      // Create a new PHI node in the new region, which has an incoming value
      // from OldPred of PN.
      PHINode *NewPN = PHINode::Create(PN->getType(), PN->getName()+".ce",
                                       NewBB->begin());
      NewPN->addIncoming(PN, OldPred);

      // Loop over all of the incoming values in PN, moving them to NewPN if
      // they are from the extracted region.
      for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
        if (BlocksToExtract.count(PN->getIncomingBlock(i))) {
          NewPN->addIncoming(PN->getIncomingValue(i), PN->getIncomingBlock(i));
          PN->removeIncomingValue(i);
          --i;   // Entry was removed; re-examine this index.
        }
      }
    }
  }
}

/// splitReturnBlocks - Split each return instruction in the region into its
/// own block, updating the dominator tree if one is available.
void CodeExtractor::splitReturnBlocks() {
  for (SetVector<BasicBlock*>::iterator I = BlocksToExtract.begin(),
       E = BlocksToExtract.end(); I != E; ++I)
    if (ReturnInst *RI = dyn_cast<ReturnInst>((*I)->getTerminator())) {
      BasicBlock *New = (*I)->splitBasicBlock(RI, (*I)->getName()+".ret");
      if (DT) {
        // Old dominates New. New node dominates all other nodes dominated
        // by Old.
        DomTreeNode *OldNode = DT->getNode(*I);
        SmallVector<DomTreeNode*, 8> Children;
        for (DomTreeNode::iterator DI = OldNode->begin(), DE = OldNode->end();
             DI != DE; ++DI)
          Children.push_back(*DI);

        DomTreeNode *NewNode = DT->addNewBlock(New, *I);

        for (SmallVector<DomTreeNode*, 8>::iterator I = Children.begin(),
             E = Children.end(); I != E; ++I)
          DT->changeImmediateDominator(*I, NewNode);
      }
    }
}

// findInputsOutputs - Find inputs to, outputs from the code region.
//
void CodeExtractor::findInputsOutputs(Values &inputs, Values &outputs) {
  std::set<BasicBlock*> ExitBlocks;
  for (SetVector<BasicBlock*>::const_iterator ci = BlocksToExtract.begin(),
       ce = BlocksToExtract.end(); ci != ce; ++ci) {
    BasicBlock *BB = *ci;

    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
      // If a used value is defined outside the region, it's an input.  If an
      // instruction is used outside the region, it's an output.
      for (User::op_iterator O = I->op_begin(), E = I->op_end(); O != E; ++O)
        if (definedInCaller(*O))
          inputs.insert(*O);

      // Consider uses of this instruction (outputs).
      for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
           UI != E; ++UI)
        if (!definedInRegion(*UI)) {
          outputs.insert(I);
          break;
        }
    } // for: insts

    // Keep track of the exit blocks from the region.
    TerminatorInst *TI = BB->getTerminator();
    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
      if (!BlocksToExtract.count(TI->getSuccessor(i)))
        ExitBlocks.insert(TI->getSuccessor(i));
  } // for: basic blocks

  NumExitBlocks = ExitBlocks.size();
}

/// constructFunction - make a function based on inputs and outputs, as follows:
/// f(in0, ..., inN, out0, ..., outN)
///
Function *CodeExtractor::constructFunction(const Values &inputs,
                                           const Values &outputs,
                                           BasicBlock *header,
                                           BasicBlock *newRootNode,
                                           BasicBlock *newHeader,
                                           Function *oldFunction,
                                           Module *M) {
  DEBUG(dbgs() << "inputs: " << inputs.size() << "\n");
  DEBUG(dbgs() << "outputs: " << outputs.size() << "\n");

  // This function returns unsigned, outputs will go back by reference.
  // 0/1 exits need no selector; 2 exits fit in i1; more use i16.
  switch (NumExitBlocks) {
  case 0:
  case 1: RetTy = Type::getVoidTy(header->getContext()); break;
  case 2: RetTy = Type::getInt1Ty(header->getContext()); break;
  default: RetTy = Type::getInt16Ty(header->getContext()); break;
  }

  std::vector<const Type*> paramTy;

  // Add the types of the input values to the function's argument list
  for (Values::const_iterator i = inputs.begin(),
       e = inputs.end(); i != e; ++i) {
    const Value *value = *i;
    DEBUG(dbgs() << "value used in func: " << *value << "\n");
    paramTy.push_back(value->getType());
  }

  // Add the types of the output values to the function's argument list.
  // Outputs are returned through pointers unless aggregated into a struct.
  for (Values::const_iterator I = outputs.begin(), E = outputs.end();
       I != E; ++I) {
    DEBUG(dbgs() << "instr used in func: " << **I << "\n");
    if (AggregateArgs)
      paramTy.push_back((*I)->getType());
    else
      paramTy.push_back(PointerType::getUnqual((*I)->getType()));
  }

  DEBUG(dbgs() << "Function type: " << *RetTy << " f(");
  for (std::vector<const Type*>::iterator i = paramTy.begin(),
       e = paramTy.end(); i != e; ++i)
    DEBUG(dbgs() << **i << ", ");
  DEBUG(dbgs() << ")\n");

  // When aggregating, collapse the whole parameter list into one pointer to
  // a struct holding all inputs and outputs.
  if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
    PointerType *StructPtr =
      PointerType::getUnqual(StructType::get(M->getContext(), paramTy));
    paramTy.clear();
    paramTy.push_back(StructPtr);
  }
  const FunctionType *funcType =
    FunctionType::get(RetTy, paramTy, false);

  // Create the new function
  Function *newFunction = Function::Create(funcType,
                                           GlobalValue::InternalLinkage,
                                           oldFunction->getName() + "_" +
                                           header->getName(), M);
  // If the old function is no-throw, so is the new one.
  if (oldFunction->doesNotThrow())
    newFunction->setDoesNotThrow(true);

  newFunction->getBasicBlockList().push_back(newRootNode);

  // Create an iterator to name all of the arguments we inserted.
  Function::arg_iterator AI = newFunction->arg_begin();

  // Rewrite all users of the inputs in the extracted region to use the
  // arguments (or appropriate addressing into struct) instead.
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i) { + Value *RewriteVal; + if (AggregateArgs) { + Value *Idx[2]; + Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext())); + Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i); + TerminatorInst *TI = newFunction->begin()->getTerminator(); + GetElementPtrInst *GEP = + GetElementPtrInst::Create(AI, Idx, Idx+2, + "gep_" + inputs[i]->getName(), TI); + RewriteVal = new LoadInst(GEP, "loadgep_" + inputs[i]->getName(), TI); + } else + RewriteVal = AI++; + + std::vector<User*> Users(inputs[i]->use_begin(), inputs[i]->use_end()); + for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end(); + use != useE; ++use) + if (Instruction* inst = dyn_cast<Instruction>(*use)) + if (BlocksToExtract.count(inst->getParent())) + inst->replaceUsesOfWith(inputs[i], RewriteVal); + } + + // Set names for input and output arguments. + if (!AggregateArgs) { + AI = newFunction->arg_begin(); + for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++AI) + AI->setName(inputs[i]->getName()); + for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++AI) + AI->setName(outputs[i]->getName()+".out"); + } + + // Rewrite branches to basic blocks outside of the loop to new dummy blocks + // within the new function. This must be done before we lose track of which + // blocks were originally in the code region. 
+  std::vector<User*> Users(header->use_begin(), header->use_end());
+  for (unsigned i = 0, e = Users.size(); i != e; ++i)
+    // The BasicBlock which contains the branch is not in the region; modify
+    // the branch target to the new header block instead.
+    if (TerminatorInst *TI = dyn_cast<TerminatorInst>(Users[i]))
+      if (!BlocksToExtract.count(TI->getParent()) &&
+          TI->getParent()->getParent() == oldFunction)
+        TI->replaceUsesOfWith(header, newHeader);
+
+  return newFunction;
+}
+
+/// FindPhiPredForUseInBlock - Given a value and a basic block, find a PHI
+/// that uses the value within the basic block, and return the predecessor
+/// block associated with that use, or return 0 if none is found.
+/// Note: only the first PHI use encountered is considered; if several PHIs
+/// in BB use the value, the predecessor for the first one wins.
+static BasicBlock* FindPhiPredForUseInBlock(Value* Used, BasicBlock* BB) {
+  for (Value::use_iterator UI = Used->use_begin(),
+       UE = Used->use_end(); UI != UE; ++UI) {
+    PHINode *P = dyn_cast<PHINode>(*UI);
+    if (P && P->getParent() == BB)
+      return P->getIncomingBlock(UI);
+  }
+
+  return 0;
+}
+
+/// emitCallAndSwitchStatement - This method sets up the caller side by adding
+/// the call instruction, splitting any PHI nodes in the header block as
+/// necessary.
+void CodeExtractor::
+emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
+                           Values &inputs, Values &outputs) {
+  // Emit a call to the new function, passing in: *pointer to struct (if
+  // aggregating parameters), or plain inputs and allocated memory for outputs
+  std::vector<Value*> params, StructValues, ReloadOutputs, Reloads;
+
+  LLVMContext &Context = newFunction->getContext();
+
+  // Add inputs as params, or to be filled into the struct
+  for (Values::iterator i = inputs.begin(), e = inputs.end(); i != e; ++i)
+    if (AggregateArgs)
+      StructValues.push_back(*i);
+    else
+      params.push_back(*i);
+
+  // Create allocas for the outputs
+  for (Values::iterator i = outputs.begin(), e = outputs.end(); i != e; ++i) {
+    if (AggregateArgs) {
+      StructValues.push_back(*i);
+    } else {
+      AllocaInst *alloca =
+        new AllocaInst((*i)->getType(), 0, (*i)->getName()+".loc",
+                       codeReplacer->getParent()->begin()->begin());
+      ReloadOutputs.push_back(alloca);
+      params.push_back(alloca);
+    }
+  }
+
+  AllocaInst *Struct = 0;
+  if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+    std::vector<const Type*> ArgTypes;
+    for (Values::iterator v = StructValues.begin(),
+           ve = StructValues.end(); v != ve; ++v)
+      ArgTypes.push_back((*v)->getType());
+
+    // Allocate a struct at the beginning of this function
+    Type *StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
+    Struct =
+      new AllocaInst(StructArgTy, 0, "structArg",
+                     codeReplacer->getParent()->begin()->begin());
+    params.push_back(Struct);
+
+    // Store each aggregated value into its field of the struct argument.
+    for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+      Value *Idx[2];
+      Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+      Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i);
+      GetElementPtrInst *GEP =
+        GetElementPtrInst::Create(Struct, Idx, Idx + 2,
+                                  "gep_" + StructValues[i]->getName());
+      codeReplacer->getInstList().push_back(GEP);
+      StoreInst *SI = new StoreInst(StructValues[i], GEP);
+      codeReplacer->getInstList().push_back(SI);
+
} + } + + // Emit the call to the function + CallInst *call = CallInst::Create(newFunction, params.begin(), params.end(), + NumExitBlocks > 1 ? "targetBlock" : ""); + codeReplacer->getInstList().push_back(call); + + Function::arg_iterator OutputArgBegin = newFunction->arg_begin(); + unsigned FirstOut = inputs.size(); + if (!AggregateArgs) + std::advance(OutputArgBegin, inputs.size()); + + // Reload the outputs passed in by reference + for (unsigned i = 0, e = outputs.size(); i != e; ++i) { + Value *Output = 0; + if (AggregateArgs) { + Value *Idx[2]; + Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); + Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i); + GetElementPtrInst *GEP + = GetElementPtrInst::Create(Struct, Idx, Idx + 2, + "gep_reload_" + outputs[i]->getName()); + codeReplacer->getInstList().push_back(GEP); + Output = GEP; + } else { + Output = ReloadOutputs[i]; + } + LoadInst *load = new LoadInst(Output, outputs[i]->getName()+".reload"); + Reloads.push_back(load); + codeReplacer->getInstList().push_back(load); + std::vector<User*> Users(outputs[i]->use_begin(), outputs[i]->use_end()); + for (unsigned u = 0, e = Users.size(); u != e; ++u) { + Instruction *inst = cast<Instruction>(Users[u]); + if (!BlocksToExtract.count(inst->getParent())) + inst->replaceUsesOfWith(outputs[i], load); + } + } + + // Now we can emit a switch statement using the call as a value. + SwitchInst *TheSwitch = + SwitchInst::Create(Constant::getNullValue(Type::getInt16Ty(Context)), + codeReplacer, 0, codeReplacer); + + // Since there may be multiple exits from the original region, make the new + // function return an unsigned, switch on that number. This loop iterates + // over all of the blocks in the extracted region, updating any terminator + // instructions in the to-be-extracted region that branch to blocks that are + // not in the region to be extracted. 
+ std::map<BasicBlock*, BasicBlock*> ExitBlockMap; + + unsigned switchVal = 0; + for (SetVector<BasicBlock*>::const_iterator i = BlocksToExtract.begin(), + e = BlocksToExtract.end(); i != e; ++i) { + TerminatorInst *TI = (*i)->getTerminator(); + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) + if (!BlocksToExtract.count(TI->getSuccessor(i))) { + BasicBlock *OldTarget = TI->getSuccessor(i); + // add a new basic block which returns the appropriate value + BasicBlock *&NewTarget = ExitBlockMap[OldTarget]; + if (!NewTarget) { + // If we don't already have an exit stub for this non-extracted + // destination, create one now! + NewTarget = BasicBlock::Create(Context, + OldTarget->getName() + ".exitStub", + newFunction); + unsigned SuccNum = switchVal++; + + Value *brVal = 0; + switch (NumExitBlocks) { + case 0: + case 1: break; // No value needed. + case 2: // Conditional branch, return a bool + brVal = ConstantInt::get(Type::getInt1Ty(Context), !SuccNum); + break; + default: + brVal = ConstantInt::get(Type::getInt16Ty(Context), SuccNum); + break; + } + + ReturnInst *NTRet = ReturnInst::Create(Context, brVal, NewTarget); + + // Update the switch instruction. + TheSwitch->addCase(ConstantInt::get(Type::getInt16Ty(Context), + SuccNum), + OldTarget); + + // Restore values just before we exit + Function::arg_iterator OAI = OutputArgBegin; + for (unsigned out = 0, e = outputs.size(); out != e; ++out) { + // For an invoke, the normal destination is the only one that is + // dominated by the result of the invocation + BasicBlock *DefBlock = cast<Instruction>(outputs[out])->getParent(); + + bool DominatesDef = true; + + if (InvokeInst *Invoke = dyn_cast<InvokeInst>(outputs[out])) { + DefBlock = Invoke->getNormalDest(); + + // Make sure we are looking at the original successor block, not + // at a newly inserted exit block, which won't be in the dominator + // info. 
+ for (std::map<BasicBlock*, BasicBlock*>::iterator I = + ExitBlockMap.begin(), E = ExitBlockMap.end(); I != E; ++I) + if (DefBlock == I->second) { + DefBlock = I->first; + break; + } + + // In the extract block case, if the block we are extracting ends + // with an invoke instruction, make sure that we don't emit a + // store of the invoke value for the unwind block. + if (!DT && DefBlock != OldTarget) + DominatesDef = false; + } + + if (DT) { + DominatesDef = DT->dominates(DefBlock, OldTarget); + + // If the output value is used by a phi in the target block, + // then we need to test for dominance of the phi's predecessor + // instead. Unfortunately, this a little complicated since we + // have already rewritten uses of the value to uses of the reload. + BasicBlock* pred = FindPhiPredForUseInBlock(Reloads[out], + OldTarget); + if (pred && DT && DT->dominates(DefBlock, pred)) + DominatesDef = true; + } + + if (DominatesDef) { + if (AggregateArgs) { + Value *Idx[2]; + Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context)); + Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), + FirstOut+out); + GetElementPtrInst *GEP = + GetElementPtrInst::Create(OAI, Idx, Idx + 2, + "gep_" + outputs[out]->getName(), + NTRet); + new StoreInst(outputs[out], GEP, NTRet); + } else { + new StoreInst(outputs[out], OAI, NTRet); + } + } + // Advance output iterator even if we don't emit a store + if (!AggregateArgs) ++OAI; + } + } + + // rewrite the original branch instruction with this new target + TI->setSuccessor(i, NewTarget); + } + } + + // Now that we've done the deed, simplify the switch instruction. 
+ const Type *OldFnRetTy = TheSwitch->getParent()->getParent()->getReturnType(); + switch (NumExitBlocks) { + case 0: + // There are no successors (the block containing the switch itself), which + // means that previously this was the last part of the function, and hence + // this should be rewritten as a `ret' + + // Check if the function should return a value + if (OldFnRetTy->isVoidTy()) { + ReturnInst::Create(Context, 0, TheSwitch); // Return void + } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) { + // return what we have + ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch); + } else { + // Otherwise we must have code extracted an unwind or something, just + // return whatever we want. + ReturnInst::Create(Context, + Constant::getNullValue(OldFnRetTy), TheSwitch); + } + + TheSwitch->eraseFromParent(); + break; + case 1: + // Only a single destination, change the switch into an unconditional + // branch. + BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch); + TheSwitch->eraseFromParent(); + break; + case 2: + BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getSuccessor(2), + call, TheSwitch); + TheSwitch->eraseFromParent(); + break; + default: + // Otherwise, make the default destination of the switch instruction be one + // of the other successors. 
+ TheSwitch->setOperand(0, call); + TheSwitch->setSuccessor(0, TheSwitch->getSuccessor(NumExitBlocks)); + TheSwitch->removeCase(NumExitBlocks); // Remove redundant case + break; + } +} + +void CodeExtractor::moveCodeToFunction(Function *newFunction) { + Function *oldFunc = (*BlocksToExtract.begin())->getParent(); + Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList(); + Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList(); + + for (SetVector<BasicBlock*>::const_iterator i = BlocksToExtract.begin(), + e = BlocksToExtract.end(); i != e; ++i) { + // Delete the basic block from the old function, and the list of blocks + oldBlocks.remove(*i); + + // Insert this basic block into the new function + newBlocks.push_back(*i); + } +} + +/// ExtractRegion - Removes a loop from a function, replaces it with a call to +/// new function. Returns pointer to the new function. +/// +/// algorithm: +/// +/// find inputs and outputs for the region +/// +/// for inputs: add to function as args, map input instr* to arg# +/// for outputs: add allocas for scalars, +/// add to func as args, map output instr* to arg# +/// +/// rewrite func to use argument #s instead of instr* +/// +/// for each scalar output in the function: at every exit, store intermediate +/// computed result back into memory. +/// +Function *CodeExtractor:: +ExtractCodeRegion(const std::vector<BasicBlock*> &code) { + if (!isEligible(code)) + return 0; + + // 1) Find inputs, outputs + // 2) Construct new function + // * Add allocas for defs, pass as args by reference + // * Pass in uses as args + // 3) Move code region, add call instr to func + // + BlocksToExtract.insert(code.begin(), code.end()); + + Values inputs, outputs; + + // Assumption: this is a single-entry code region, and the header is the first + // block in the region. 
+ BasicBlock *header = code[0]; + + for (unsigned i = 1, e = code.size(); i != e; ++i) + for (pred_iterator PI = pred_begin(code[i]), E = pred_end(code[i]); + PI != E; ++PI) + assert(BlocksToExtract.count(*PI) && + "No blocks in this region may have entries from outside the region" + " except for the first block!"); + + // If we have to split PHI nodes or the entry block, do so now. + severSplitPHINodes(header); + + // If we have any return instructions in the region, split those blocks so + // that the return is not in the region. + splitReturnBlocks(); + + Function *oldFunction = header->getParent(); + + // This takes place of the original loop + BasicBlock *codeReplacer = BasicBlock::Create(header->getContext(), + "codeRepl", oldFunction, + header); + + // The new function needs a root node because other nodes can branch to the + // head of the region, but the entry node of a function cannot have preds. + BasicBlock *newFuncRoot = BasicBlock::Create(header->getContext(), + "newFuncRoot"); + newFuncRoot->getInstList().push_back(BranchInst::Create(header)); + + // Find inputs to, outputs from the code region. + findInputsOutputs(inputs, outputs); + + // Construct new function based on inputs/outputs & add allocas for all defs. + Function *newFunction = constructFunction(inputs, outputs, header, + newFuncRoot, + codeReplacer, oldFunction, + oldFunction->getParent()); + + emitCallAndSwitchStatement(newFunction, codeReplacer, inputs, outputs); + + moveCodeToFunction(newFunction); + + // Loop over all of the PHI nodes in the header block, and change any + // references to the old incoming edge to be the new incoming edge. + for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) { + PHINode *PN = cast<PHINode>(I); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (!BlocksToExtract.count(PN->getIncomingBlock(i))) + PN->setIncomingBlock(i, newFuncRoot); + } + + // Look at all successors of the codeReplacer block. 
If any of these blocks + // had PHI nodes in them, we need to update the "from" block to be the code + // replacer, not the original block in the extracted region. + std::vector<BasicBlock*> Succs(succ_begin(codeReplacer), + succ_end(codeReplacer)); + for (unsigned i = 0, e = Succs.size(); i != e; ++i) + for (BasicBlock::iterator I = Succs[i]->begin(); isa<PHINode>(I); ++I) { + PHINode *PN = cast<PHINode>(I); + std::set<BasicBlock*> ProcessedPreds; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (BlocksToExtract.count(PN->getIncomingBlock(i))) { + if (ProcessedPreds.insert(PN->getIncomingBlock(i)).second) + PN->setIncomingBlock(i, codeReplacer); + else { + // There were multiple entries in the PHI for this block, now there + // is only one, so remove the duplicated entries. + PN->removeIncomingValue(i, false); + --i; --e; + } + } + } + + //cerr << "NEW FUNCTION: " << *newFunction; + // verifyFunction(*newFunction); + + // cerr << "OLD FUNCTION: " << *oldFunction; + // verifyFunction(*oldFunction); + + DEBUG(if (verifyFunction(*newFunction)) + report_fatal_error("verifyFunction failed!")); + return newFunction; +} + +bool CodeExtractor::isEligible(const std::vector<BasicBlock*> &code) { + // Deny code region if it contains allocas or vastarts. 
+  for (std::vector<BasicBlock*>::const_iterator BB = code.begin(), e=code.end();
+       BB != e; ++BB)
+    for (BasicBlock::const_iterator I = (*BB)->begin(), Ie = (*BB)->end();
+         I != Ie; ++I)
+      if (isa<AllocaInst>(*I))
+        return false;
+      else if (const CallInst *CI = dyn_cast<CallInst>(I))
+        if (const Function *F = CI->getCalledFunction())
+          if (F->getIntrinsicID() == Intrinsic::vastart)
+            return false;
+  return true;
+}
+
+
+/// ExtractCodeRegion - slurp a sequence of basic blocks into a brand new
+/// function
+///
+Function* llvm::ExtractCodeRegion(DominatorTree &DT,
+                                  const std::vector<BasicBlock*> &code,
+                                  bool AggregateArgs) {
+  return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(code);
+}
+
+/// ExtractLoop - slurp a natural loop into a brand new function
+///
+Function* llvm::ExtractLoop(DominatorTree &DT, Loop *L, bool AggregateArgs) {
+  return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(L->getBlocks());
+}
+
+/// ExtractBasicBlock - slurp a basic block into a brand new function
+///
+Function* llvm::ExtractBasicBlock(BasicBlock *BB, bool AggregateArgs) {
+  std::vector<BasicBlock*> Blocks;
+  Blocks.push_back(BB);
+  return CodeExtractor(0, AggregateArgs).ExtractCodeRegion(Blocks);
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
new file mode 100644
index 0000000..8cc2649
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -0,0 +1,146 @@
+//===- DemoteRegToStack.cpp - Move a virtual register to the stack --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the function DemoteRegToStack().
This function takes a
+// virtual register computed by an Instruction and replaces it with a slot in
+// the stack frame, allocated via alloca. It returns the pointer to the
+// AllocaInst inserted. After this function is called on an instruction, we are
+// guaranteed that the only user of the instruction is a store that is
+// immediately after it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include <map>
+using namespace llvm;
+
+/// DemoteRegToStack - This function takes a virtual register computed by an
+/// Instruction and replaces it with a slot in the stack frame, allocated via
+/// alloca. This allows the CFG to be changed around without fear of
+/// invalidating the SSA information for the value. It returns the pointer to
+/// the alloca inserted to create a stack slot for I.
+///
+AllocaInst* llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
+                                   Instruction *AllocaPoint) {
+  if (I.use_empty()) {
+    I.eraseFromParent();
+    return 0;
+  }
+
+  // Create a stack slot to hold the value.
+  AllocaInst *Slot;
+  if (AllocaPoint) {
+    Slot = new AllocaInst(I.getType(), 0,
+                          I.getName()+".reg2mem", AllocaPoint);
+  } else {
+    Function *F = I.getParent()->getParent();
+    Slot = new AllocaInst(I.getType(), 0, I.getName()+".reg2mem",
+                          F->getEntryBlock().begin());
+  }
+
+  // Change all of the users of the instruction to read from the stack slot
+  // instead.
+  while (!I.use_empty()) {
+    Instruction *U = cast<Instruction>(I.use_back());
+    if (PHINode *PN = dyn_cast<PHINode>(U)) {
+      // If this is a PHI node, we can't insert a load of the value before the
+      // use. Instead, insert the load in the predecessor block corresponding
+      // to the incoming value.
+      //
+      // Note that if there are multiple edges from a basic block to this PHI
+      // node, we cannot insert multiple loads. The problem is that the
+      // resultant PHI node will have multiple values (from each load) coming
+      // in from the same block, which is illegal SSA form. For this reason,
+      // we keep track of and reuse the loads we insert.
+      std::map<BasicBlock*, Value*> Loads;
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        if (PN->getIncomingValue(i) == &I) {
+          Value *&V = Loads[PN->getIncomingBlock(i)];
+          if (V == 0) {
+            // Insert the load into the predecessor block
+            V = new LoadInst(Slot, I.getName()+".reload", VolatileLoads,
+                             PN->getIncomingBlock(i)->getTerminator());
+          }
+          PN->setIncomingValue(i, V);
+        }
+
+    } else {
+      // If this is a normal instruction, just insert a load.
+      Value *V = new LoadInst(Slot, I.getName()+".reload", VolatileLoads, U);
+      U->replaceUsesOfWith(&I, V);
+    }
+  }
+
+
+  // Insert stores of the computed value into the stack slot. We have to be
+  // careful if I is an invoke instruction though, because we can't insert the
+  // store AFTER the terminator instruction.
+  BasicBlock::iterator InsertPt;
+  if (!isa<TerminatorInst>(I)) {
+    InsertPt = &I;
+    ++InsertPt;
+  } else {
+    // We cannot demote invoke instructions to the stack if their normal edge
+    // is critical.
+    InvokeInst &II = cast<InvokeInst>(I);
+    assert(II.getNormalDest()->getSinglePredecessor() &&
+           "Cannot demote invoke with a critical successor!");
+    InsertPt = II.getNormalDest()->begin();
+  }
+
+  for (; isa<PHINode>(InsertPt); ++InsertPt)
+    /* empty */;   // Don't insert before any PHI nodes.
+  new StoreInst(&I, Slot, InsertPt);
+
+  return Slot;
+}
+
+
+/// DemotePHIToStack - This function takes a virtual register computed by a phi
+/// node and replaces it with a slot in the stack frame, allocated via alloca.
+/// The phi node is deleted and it returns the pointer to the alloca inserted.
+AllocaInst* llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
+  if (P->use_empty()) {
+    P->eraseFromParent();
+    return 0;
+  }
+
+  // Create a stack slot to hold the value.
+ AllocaInst *Slot; + if (AllocaPoint) { + Slot = new AllocaInst(P->getType(), 0, + P->getName()+".reg2mem", AllocaPoint); + } else { + Function *F = P->getParent()->getParent(); + Slot = new AllocaInst(P->getType(), 0, P->getName()+".reg2mem", + F->getEntryBlock().begin()); + } + + // Iterate over each operand, insert store in each predecessor. + for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) { + if (InvokeInst *II = dyn_cast<InvokeInst>(P->getIncomingValue(i))) { + assert(II->getParent() != P->getIncomingBlock(i) && + "Invoke edge not supported yet"); (void)II; + } + new StoreInst(P->getIncomingValue(i), Slot, + P->getIncomingBlock(i)->getTerminator()); + } + + // Insert load in place of the phi and replace all uses. + Value *V = new LoadInst(Slot, P->getName()+".reload", P); + P->replaceAllUsesWith(V); + + // Delete phi. + P->eraseFromParent(); + + return Slot; +} diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp new file mode 100644 index 0000000..c1faf24 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -0,0 +1,709 @@ +//===- InlineFunction.cpp - Code to perform function inlining -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements inlining of a function into a call site, resolving +// parameters and the return value as appropriate. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Module.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" +#include "llvm/Attributes.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/DebugInfo.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/CallSite.h" +using namespace llvm; + +bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI) { + return InlineFunction(CallSite(CI), IFI); +} +bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI) { + return InlineFunction(CallSite(II), IFI); +} + + +/// HandleCallsInBlockInlinedThroughInvoke - When we inline a basic block into +/// an invoke, we have to turn all of the calls that can throw into +/// invokes. This function analyze BB to see if there are any calls, and if so, +/// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI +/// nodes in that block with the values specified in InvokeDestPHIValues. +/// +static void HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB, + BasicBlock *InvokeDest, + const SmallVectorImpl<Value*> &InvokeDestPHIValues) { + for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { + Instruction *I = BBI++; + + // We only need to check for function calls: inlined invoke + // instructions require no special handling. + CallInst *CI = dyn_cast<CallInst>(I); + if (CI == 0) continue; + + // If this call cannot unwind, don't convert it to an invoke. + if (CI->doesNotThrow()) + continue; + + // Convert this function call into an invoke instruction. + // First, split the basic block. 
+ BasicBlock *Split = BB->splitBasicBlock(CI, CI->getName()+".noexc"); + + // Next, create the new invoke instruction, inserting it at the end + // of the old basic block. + ImmutableCallSite CS(CI); + SmallVector<Value*, 8> InvokeArgs(CS.arg_begin(), CS.arg_end()); + InvokeInst *II = + InvokeInst::Create(CI->getCalledValue(), Split, InvokeDest, + InvokeArgs.begin(), InvokeArgs.end(), + CI->getName(), BB->getTerminator()); + II->setCallingConv(CI->getCallingConv()); + II->setAttributes(CI->getAttributes()); + + // Make sure that anything using the call now uses the invoke! This also + // updates the CallGraph if present, because it uses a WeakVH. + CI->replaceAllUsesWith(II); + + // Delete the unconditional branch inserted by splitBasicBlock + BB->getInstList().pop_back(); + Split->getInstList().pop_front(); // Delete the original call + + // Update any PHI nodes in the exceptional block to indicate that + // there is now a new entry in them. + unsigned i = 0; + for (BasicBlock::iterator I = InvokeDest->begin(); + isa<PHINode>(I); ++I, ++i) + cast<PHINode>(I)->addIncoming(InvokeDestPHIValues[i], BB); + + // This basic block is now complete, the caller will continue scanning the + // next one. + return; + } +} + + +/// HandleInlinedInvoke - If we inlined an invoke site, we need to convert calls +/// in the body of the inlined function into invokes and turn unwind +/// instructions into branches to the invoke unwind dest. +/// +/// II is the invoke instruction being inlined. FirstNewBlock is the first +/// block of the inlined code (the last block is the end of the function), +/// and InlineCodeInfo is information about the code that got inlined. 
+static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, + ClonedCodeInfo &InlinedCodeInfo) { + BasicBlock *InvokeDest = II->getUnwindDest(); + SmallVector<Value*, 8> InvokeDestPHIValues; + + // If there are PHI nodes in the unwind destination block, we need to + // keep track of which values came into them from this invoke, then remove + // the entry for this block. + BasicBlock *InvokeBlock = II->getParent(); + for (BasicBlock::iterator I = InvokeDest->begin(); isa<PHINode>(I); ++I) { + PHINode *PN = cast<PHINode>(I); + // Save the value to use for this edge. + InvokeDestPHIValues.push_back(PN->getIncomingValueForBlock(InvokeBlock)); + } + + Function *Caller = FirstNewBlock->getParent(); + + // The inlined code is currently at the end of the function, scan from the + // start of the inlined code to its end, checking for stuff we need to + // rewrite. If the code doesn't have calls or unwinds, we know there is + // nothing to rewrite. + if (!InlinedCodeInfo.ContainsCalls && !InlinedCodeInfo.ContainsUnwinds) { + // Now that everything is happy, we have one final detail. The PHI nodes in + // the exception destination block still have entries due to the original + // invoke instruction. Eliminate these entries (which might even delete the + // PHI node) now. + InvokeDest->removePredecessor(II->getParent()); + return; + } + + for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB){ + if (InlinedCodeInfo.ContainsCalls) + HandleCallsInBlockInlinedThroughInvoke(BB, InvokeDest, + InvokeDestPHIValues); + + if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) { + // An UnwindInst requires special handling when it gets inlined into an + // invoke site. Once this happens, we know that the unwind would cause + // a control transfer to the invoke exception destination, so we can + // transform it into a direct branch to the exception destination. + BranchInst::Create(InvokeDest, UI); + + // Delete the unwind instruction! 
+ UI->eraseFromParent(); + + // Update any PHI nodes in the exceptional block to indicate that + // there is now a new entry in them. + unsigned i = 0; + for (BasicBlock::iterator I = InvokeDest->begin(); + isa<PHINode>(I); ++I, ++i) { + PHINode *PN = cast<PHINode>(I); + PN->addIncoming(InvokeDestPHIValues[i], BB); + } + } + } + + // Now that everything is happy, we have one final detail. The PHI nodes in + // the exception destination block still have entries due to the original + // invoke instruction. Eliminate these entries (which might even delete the + // PHI node) now. + InvokeDest->removePredecessor(II->getParent()); +} + +/// UpdateCallGraphAfterInlining - Once we have cloned code over from a callee +/// into the caller, update the specified callgraph to reflect the changes we +/// made. Note that it's possible that not all code was copied over, so only +/// some edges of the callgraph may remain. +static void UpdateCallGraphAfterInlining(CallSite CS, + Function::iterator FirstNewBlock, + ValueToValueMapTy &VMap, + InlineFunctionInfo &IFI) { + CallGraph &CG = *IFI.CG; + const Function *Caller = CS.getInstruction()->getParent()->getParent(); + const Function *Callee = CS.getCalledFunction(); + CallGraphNode *CalleeNode = CG[Callee]; + CallGraphNode *CallerNode = CG[Caller]; + + // Since we inlined some uninlined call sites in the callee into the caller, + // add edges from the caller to all of the callees of the callee. + CallGraphNode::iterator I = CalleeNode->begin(), E = CalleeNode->end(); + + // Consider the case where CalleeNode == CallerNode. + CallGraphNode::CalledFunctionsVector CallCache; + if (CalleeNode == CallerNode) { + CallCache.assign(I, E); + I = CallCache.begin(); + E = CallCache.end(); + } + + for (; I != E; ++I) { + const Value *OrigCall = I->first; + + ValueToValueMapTy::iterator VMI = VMap.find(OrigCall); + // Only copy the edge if the call was inlined! 
+    if (VMI == VMap.end() || VMI->second == 0)
+      continue;
+
+    // If the call was inlined, but then constant folded, there is no edge to
+    // add. Check for this case.
+    Instruction *NewCall = dyn_cast<Instruction>(VMI->second);
+    if (NewCall == 0) continue;
+
+    // Remember that this call site got inlined for the client of
+    // InlineFunction.
+    IFI.InlinedCalls.push_back(NewCall);
+
+    // It's possible that inlining the callsite will cause it to go from an
+    // indirect to a direct call by resolving a function pointer. If this
+    // happens, set the callee of the new call site to a more precise
+    // destination. This can also happen if the call graph node of the caller
+    // was just unnecessarily imprecise.
+    if (I->second->getFunction() == 0)
+      if (Function *F = CallSite(NewCall).getCalledFunction()) {
+        // Indirect call site resolved to direct call.
+        CallerNode->addCalledFunction(CallSite(NewCall), CG[F]);
+
+        continue;
+      }
+
+    CallerNode->addCalledFunction(CallSite(NewCall), I->second);
+  }
+
+  // Update the call graph by deleting the edge from Callee to Caller. We must
+  // do this after the loop above in case Caller and Callee are the same.
+  CallerNode->removeCallEdgeFor(CS);
+}
+
+/// HandleByValArgument - When inlining a call site that has a byval argument,
+/// we have to make the implicit memcpy explicit by adding it.
+static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
+                                  const Function *CalledFunc,
+                                  InlineFunctionInfo &IFI,
+                                  unsigned ByValAlignment) {
+  const Type *AggTy = cast<PointerType>(Arg->getType())->getElementType();
+
+  // If the called function is readonly, then it could not mutate the caller's
+  // copy of the byval'd memory. In this case, it is safe to elide the copy and
+  // temporary.
+  if (CalledFunc->onlyReadsMemory()) {
+    // If the byval argument has a specified alignment that is greater than the
+    // passed in pointer, then we either have to round up the input pointer or
+    // give up on this transformation.
+    if (ByValAlignment <= 1)  // 0 = unspecified, 1 = no particular alignment.
+      return Arg;
+
+    // If the pointer is already known to be sufficiently aligned, or if we can
+    // round it up to a larger alignment, then we don't need a temporary.
+    if (getOrEnforceKnownAlignment(Arg, ByValAlignment,
+                                   IFI.TD) >= ByValAlignment)
+      return Arg;
+
+    // Otherwise, we have to make a memcpy to get a safe alignment. This is bad
+    // for code quality, but rarely happens and is required for correctness.
+  }
+
+  LLVMContext &Context = Arg->getContext();
+
+  const Type *VoidPtrTy = Type::getInt8PtrTy(Context);
+
+  // Create the alloca. If we have TargetData, use nice alignment.
+  unsigned Align = 1;
+  if (IFI.TD)
+    Align = IFI.TD->getPrefTypeAlignment(AggTy);
+
+  // If the byval had an alignment specified, we *must* use at least that
+  // alignment, as it is required by the byval argument (and uses of the
+  // pointer inside the callee).
+  Align = std::max(Align, ByValAlignment);
+
+  Function *Caller = TheCall->getParent()->getParent();
+
+  Value *NewAlloca = new AllocaInst(AggTy, 0, Align, Arg->getName(),
+                                    &*Caller->begin()->begin());
+  // Emit a memcpy.
+  const Type *Tys[3] = {VoidPtrTy, VoidPtrTy, Type::getInt64Ty(Context)};
+  Function *MemCpyFn = Intrinsic::getDeclaration(Caller->getParent(),
+                                                 Intrinsic::memcpy,
+                                                 Tys, 3);
+  Value *DestCast = new BitCastInst(NewAlloca, VoidPtrTy, "tmp", TheCall);
+  Value *SrcCast = new BitCastInst(Arg, VoidPtrTy, "tmp", TheCall);
+
+  Value *Size;
+  if (IFI.TD == 0)
+    Size = ConstantExpr::getSizeOf(AggTy);
+  else
+    Size = ConstantInt::get(Type::getInt64Ty(Context),
+                            IFI.TD->getTypeStoreSize(AggTy));
+
+  // Always generate a memcpy of alignment 1 here because we don't know
+  // the alignment of the src pointer. Other optimizations can infer
+  // better alignment.
+  Value *CallArgs[] = {
+    DestCast, SrcCast, Size,
+    ConstantInt::get(Type::getInt32Ty(Context), 1),
+    ConstantInt::getFalse(Context) // isVolatile
+  };
+  CallInst *TheMemCpy =
+    CallInst::Create(MemCpyFn, CallArgs, CallArgs+5, "", TheCall);
+
+  // If we have a call graph, update it.
+  if (CallGraph *CG = IFI.CG) {
+    CallGraphNode *MemCpyCGN = CG->getOrInsertFunction(MemCpyFn);
+    CallGraphNode *CallerNode = (*CG)[Caller];
+    CallerNode->addCalledFunction(TheMemCpy, MemCpyCGN);
+  }
+
+  // Uses of the argument in the function should use our new alloca
+  // instead.
+  return NewAlloca;
+}
+
+// InlineFunction - This function inlines the called function into the basic
+// block of the caller. This returns false if it is not possible to inline this
+// call. The program is still in a well defined state if this occurs though.
+//
+// Note that this only does one level of inlining. For example, if the
+// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
+// exists in the instruction stream. Similarly this will inline a recursive
+// function by one level.
+//
+bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) {
+  Instruction *TheCall = CS.getInstruction();
+  LLVMContext &Context = TheCall->getContext();
+  assert(TheCall->getParent() && TheCall->getParent()->getParent() &&
+         "Instruction not in function!");
+
+  // If IFI has any state in it, zap it before we fill it in.
+  IFI.reset();
+
+  const Function *CalledFunc = CS.getCalledFunction();
+  if (CalledFunc == 0 ||          // Can't inline external function or indirect
+      CalledFunc->isDeclaration() || // call, or call to a vararg function!
+      CalledFunc->getFunctionType()->isVarArg()) return false;
+
+  // If the call to the callee is not a tail call, we must clear the 'tail'
+  // flags on any calls that we inline.
+  bool MustClearTailCallFlags =
+    !(isa<CallInst>(TheCall) && cast<CallInst>(TheCall)->isTailCall());
+
+  // If the call to the callee cannot throw, set the 'nounwind' flag on any
+  // calls that we inline.
+  bool MarkNoUnwind = CS.doesNotThrow();
+
+  BasicBlock *OrigBB = TheCall->getParent();
+  Function *Caller = OrigBB->getParent();
+
+  // GC poses two hazards to inlining, which only occur when the callee has GC:
+  // 1. If the caller has no GC, then the callee's GC must be propagated to the
+  //    caller.
+  // 2. If the caller has a differing GC, it is invalid to inline.
+  if (CalledFunc->hasGC()) {
+    if (!Caller->hasGC())
+      Caller->setGC(CalledFunc->getGC());
+    else if (CalledFunc->getGC() != Caller->getGC())
+      return false;
+  }
+
+  // Get an iterator to the last basic block in the function, which will have
+  // the new function inlined after it.
+  //
+  Function::iterator LastBlock = &Caller->back();
+
+  // Make sure to capture all of the return instructions from the cloned
+  // function.
+  SmallVector<ReturnInst*, 8> Returns;
+  ClonedCodeInfo InlinedFunctionInfo;
+  Function::iterator FirstNewBlock;
+
+  { // Scope to destroy VMap after cloning.
+    ValueToValueMapTy VMap;
+
+    assert(CalledFunc->arg_size() == CS.arg_size() &&
+           "No varargs calls can be inlined!");
+
+    // Calculate the vector of arguments to pass into the function cloner, which
+    // matches up the formal to the actual argument values.
+    CallSite::arg_iterator AI = CS.arg_begin();
+    unsigned ArgNo = 0;
+    for (Function::const_arg_iterator I = CalledFunc->arg_begin(),
+         E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) {
+      Value *ActualArg = *AI;
+
+      // When byval arguments are actually inlined, we need to make the copy
+      // implied by them explicit. However, we don't do this if the callee is
+      // readonly or readnone, because the copy would be unneeded: the callee
+      // doesn't modify the struct.
+      if (CalledFunc->paramHasAttr(ArgNo+1, Attribute::ByVal)) {
+        ActualArg = HandleByValArgument(ActualArg, TheCall, CalledFunc, IFI,
+                                        CalledFunc->getParamAlignment(ArgNo+1));
+
+        // Calls that we inline may use the new alloca, so we need to clear
+        // their 'tail' flags if HandleByValArgument introduced a new alloca and
+        // the callee has calls.
+        MustClearTailCallFlags |= ActualArg != *AI;
+      }
+
+      VMap[I] = ActualArg;
+    }
+
+    // We want the inliner to prune the code as it copies. We would LOVE to
+    // have no dead or constant instructions leftover after inlining occurs
+    // (which can happen, e.g., because an argument was constant), but we'll be
+    // happy with whatever the cloner can do.
+    CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
+                              /*ModuleLevelChanges=*/false, Returns, ".i",
+                              &InlinedFunctionInfo, IFI.TD, TheCall);
+
+    // Remember the first block that is newly cloned over.
+    FirstNewBlock = LastBlock; ++FirstNewBlock;
+
+    // Update the callgraph if requested.
+    if (IFI.CG)
+      UpdateCallGraphAfterInlining(CS, FirstNewBlock, VMap, IFI);
+  }
+
+  // If there are any alloca instructions in the block that used to be the entry
+  // block for the callee, move them to the entry block of the caller. First
+  // calculate which instruction they should be inserted before. We insert the
+  // instructions at the end of the current alloca list.
+  //
+  {
+    BasicBlock::iterator InsertPoint = Caller->begin()->begin();
+    for (BasicBlock::iterator I = FirstNewBlock->begin(),
+         E = FirstNewBlock->end(); I != E; ) {
+      AllocaInst *AI = dyn_cast<AllocaInst>(I++);
+      if (AI == 0) continue;
+
+      // If the alloca is now dead, remove it. This often occurs due to code
+      // specialization.
+      if (AI->use_empty()) {
+        AI->eraseFromParent();
+        continue;
+      }
+
+      if (!isa<Constant>(AI->getArraySize()))
+        continue;
+
+      // Keep track of the static allocas that we inline into the caller.
+      IFI.StaticAllocas.push_back(AI);
+
+      // Scan for the block of allocas that we can move over, and move them
+      // all at once.
+      while (isa<AllocaInst>(I) &&
+             isa<Constant>(cast<AllocaInst>(I)->getArraySize())) {
+        IFI.StaticAllocas.push_back(cast<AllocaInst>(I));
+        ++I;
+      }
+
+      // Transfer all of the allocas over in a block. Using splice means
+      // that the instructions aren't removed from the symbol table, then
+      // reinserted.
+      Caller->getEntryBlock().getInstList().splice(InsertPoint,
+                                                   FirstNewBlock->getInstList(),
+                                                   AI, I);
+    }
+  }
+
+  // If the inlined code contained dynamic alloca instructions, wrap the inlined
+  // code with llvm.stacksave/llvm.stackrestore intrinsics.
+  if (InlinedFunctionInfo.ContainsDynamicAllocas) {
+    Module *M = Caller->getParent();
+    // Get the two intrinsics we care about.
+    Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave);
+    Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore);
+
+    // If we are preserving the callgraph, add edges to the stacksave/restore
+    // functions for the calls we insert.
+    CallGraphNode *StackSaveCGN = 0, *StackRestoreCGN = 0, *CallerNode = 0;
+    if (CallGraph *CG = IFI.CG) {
+      StackSaveCGN = CG->getOrInsertFunction(StackSave);
+      StackRestoreCGN = CG->getOrInsertFunction(StackRestore);
+      CallerNode = (*CG)[Caller];
+    }
+
+    // Insert the llvm.stacksave.
+    CallInst *SavedPtr = CallInst::Create(StackSave, "savedstack",
+                                          FirstNewBlock->begin());
+    if (IFI.CG) CallerNode->addCalledFunction(SavedPtr, StackSaveCGN);
+
+    // Insert a call to llvm.stackrestore before any return instructions in the
+    // inlined function.
+    for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+      CallInst *CI = CallInst::Create(StackRestore, SavedPtr, "", Returns[i]);
+      if (IFI.CG) CallerNode->addCalledFunction(CI, StackRestoreCGN);
+    }
+
+    // Count the number of StackRestore calls we insert.
+    unsigned NumStackRestores = Returns.size();
+
+    // If we are inlining an invoke instruction, insert restores before each
+    // unwind. These unwinds will be rewritten into branches later.
+    if (InlinedFunctionInfo.ContainsUnwinds && isa<InvokeInst>(TheCall)) {
+      for (Function::iterator BB = FirstNewBlock, E = Caller->end();
+           BB != E; ++BB)
+        if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
+          CallInst *CI = CallInst::Create(StackRestore, SavedPtr, "", UI);
+          if (IFI.CG) CallerNode->addCalledFunction(CI, StackRestoreCGN);
+          ++NumStackRestores;
+        }
+    }
+  }
+
+  // If we are inlining a tail call instruction through a call site that isn't
+  // marked 'tail', we must remove the tail marker for any calls in the inlined
+  // code. Also, calls inlined through a 'nounwind' call site should be marked
+  // 'nounwind'.
+  if (InlinedFunctionInfo.ContainsCalls &&
+      (MustClearTailCallFlags || MarkNoUnwind)) {
+    for (Function::iterator BB = FirstNewBlock, E = Caller->end();
+         BB != E; ++BB)
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+        if (CallInst *CI = dyn_cast<CallInst>(I)) {
+          if (MustClearTailCallFlags)
+            CI->setTailCall(false);
+          if (MarkNoUnwind)
+            CI->setDoesNotThrow();
+        }
+  }
+
+  // If we are inlining through a 'nounwind' call site then any inlined 'unwind'
+  // instructions are unreachable.
+  if (InlinedFunctionInfo.ContainsUnwinds && MarkNoUnwind)
+    for (Function::iterator BB = FirstNewBlock, E = Caller->end();
+         BB != E; ++BB) {
+      TerminatorInst *Term = BB->getTerminator();
+      if (isa<UnwindInst>(Term)) {
+        new UnreachableInst(Context, Term);
+        BB->getInstList().erase(Term);
+      }
+    }
+
+  // If we are inlining for an invoke instruction, we must make sure to rewrite
+  // any inlined 'unwind' instructions into branches to the invoke exception
+  // destination, and call instructions into invoke instructions.
+  if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall))
+    HandleInlinedInvoke(II, FirstNewBlock, InlinedFunctionInfo);
+
+  // If we cloned in _exactly one_ basic block, and if that block ends in a
+  // return instruction, we splice the body of the inlined callee directly into
+  // the calling basic block.
+  if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) {
+    // Move all of the instructions right before the call.
+    OrigBB->getInstList().splice(TheCall, FirstNewBlock->getInstList(),
+                                 FirstNewBlock->begin(), FirstNewBlock->end());
+    // Remove the cloned basic block.
+    Caller->getBasicBlockList().pop_back();
+
+    // If the call site was an invoke instruction, add a branch to the normal
+    // destination.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall))
+      BranchInst::Create(II->getNormalDest(), TheCall);
+
+    // If the return instruction returned a value, replace uses of the call with
+    // uses of the returned value.
+    if (!TheCall->use_empty()) {
+      ReturnInst *R = Returns[0];
+      if (TheCall == R->getReturnValue())
+        TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+      else
+        TheCall->replaceAllUsesWith(R->getReturnValue());
+    }
+    // Since we are now done with the Call/Invoke, we can delete it.
+    TheCall->eraseFromParent();
+
+    // Since we are now done with the return instruction, delete it also.
+    Returns[0]->eraseFromParent();
+
+    // We are now done with the inlining.
+    return true;
+  }
+
+  // Otherwise, we have the normal case, of more than one block to inline or
+  // multiple return sites.
+
+  // We want to clone the entire callee function into the hole between the
+  // "starter" and "ender" blocks. How we accomplish this depends on whether
+  // this is an invoke instruction or a call instruction.
+  BasicBlock *AfterCallBB;
+  if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) {
+
+    // Add an unconditional branch to make this look like the CallInst case...
+    BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), TheCall);
+
+    // Split the basic block. This guarantees that no PHI nodes will have to be
+    // updated due to new incoming edges, and make the invoke case more
+    // symmetric to the call case.
+    AfterCallBB = OrigBB->splitBasicBlock(NewBr,
+                                          CalledFunc->getName()+".exit");
+
+  } else {  // It's a call
+    // If this is a call instruction, we need to split the basic block that
+    // the call lives in.
+    //
+    AfterCallBB = OrigBB->splitBasicBlock(TheCall,
+                                          CalledFunc->getName()+".exit");
+  }
+
+  // Change the branch that used to go to AfterCallBB to branch to the first
+  // basic block of the inlined function.
+  //
+  TerminatorInst *Br = OrigBB->getTerminator();
+  assert(Br && Br->getOpcode() == Instruction::Br &&
+         "splitBasicBlock broken!");
+  Br->setOperand(0, FirstNewBlock);
+
+
+  // Now that the function is correct, make it a little bit nicer. In
+  // particular, move the basic blocks inserted from the end of the function
+  // into the space made by splitting the source basic block.
+  Caller->getBasicBlockList().splice(AfterCallBB, Caller->getBasicBlockList(),
+                                     FirstNewBlock, Caller->end());
+
+  // Handle all of the return instructions that we just cloned in, and eliminate
+  // any users of the original call/invoke instruction.
+  const Type *RTy = CalledFunc->getReturnType();
+
+  PHINode *PHI = 0;
+  if (Returns.size() > 1) {
+    // The PHI node should go at the front of the new basic block to merge all
+    // possible incoming values.
+    if (!TheCall->use_empty()) {
+      PHI = PHINode::Create(RTy, TheCall->getName(),
+                            AfterCallBB->begin());
+      // Anything that used the result of the function call should now use the
+      // PHI node as their operand.
+      TheCall->replaceAllUsesWith(PHI);
+    }
+
+    // Loop over all of the return instructions adding entries to the PHI node
+    // as appropriate.
+    if (PHI) {
+      for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+        ReturnInst *RI = Returns[i];
+        assert(RI->getReturnValue()->getType() == PHI->getType() &&
+               "Ret value not consistent in function!");
+        PHI->addIncoming(RI->getReturnValue(), RI->getParent());
+      }
+    }
+
+
+    // Add a branch to the merge points and remove return instructions.
+    for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+      ReturnInst *RI = Returns[i];
+      BranchInst::Create(AfterCallBB, RI);
+      RI->eraseFromParent();
+    }
+  } else if (!Returns.empty()) {
+    // Otherwise, if there is exactly one return value, just replace anything
+    // using the return value of the call with the computed value.
+    if (!TheCall->use_empty()) {
+      if (TheCall == Returns[0]->getReturnValue())
+        TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+      else
+        TheCall->replaceAllUsesWith(Returns[0]->getReturnValue());
+    }
+
+    // Splice the code from the return block into the block that it will return
+    // to, which contains the code that was after the call.
+    BasicBlock *ReturnBB = Returns[0]->getParent();
+    AfterCallBB->getInstList().splice(AfterCallBB->begin(),
+                                      ReturnBB->getInstList());
+
+    // Update PHI nodes that use the ReturnBB to use the AfterCallBB.
+    ReturnBB->replaceAllUsesWith(AfterCallBB);
+
+    // Delete the return instruction now and empty ReturnBB now.
+    Returns[0]->eraseFromParent();
+    ReturnBB->eraseFromParent();
+  } else if (!TheCall->use_empty()) {
+    // No returns, but something is using the return value of the call. Just
+    // nuke the result.
+    TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+  }
+
+  // Since we are now done with the Call/Invoke, we can delete it.
+  TheCall->eraseFromParent();
+
+  // We should always be able to fold the entry block of the function into the
+  // single predecessor of the block...
+  assert(cast<BranchInst>(Br)->isUnconditional() && "splitBasicBlock broken!");
+  BasicBlock *CalleeEntry = cast<BranchInst>(Br)->getSuccessor(0);
+
+  // Splice the code entry block into calling block, right before the
+  // unconditional branch.
+  OrigBB->getInstList().splice(Br, CalleeEntry->getInstList());
+  CalleeEntry->replaceAllUsesWith(OrigBB);  // Update PHI nodes
+
+  // Remove the unconditional branch.
+  OrigBB->getInstList().erase(Br);
+
+  // Now we can remove the CalleeEntry block, which is now empty.
+  Caller->getBasicBlockList().erase(CalleeEntry);
+
+  // If we inserted a phi node, check to see if it has a single value (e.g. all
+  // the entries are the same or undef). If so, remove the PHI so it doesn't
+  // block other optimizations.
+  if (PHI)
+    if (Value *V = SimplifyInstruction(PHI, IFI.TD)) {
+      PHI->replaceAllUsesWith(V);
+      PHI->eraseFromParent();
+    }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp b/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
new file mode 100644
index 0000000..45c15de
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
@@ -0,0 +1,64 @@
+//===- InstructionNamer.cpp - Give anonymous instructions names -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a little utility pass that gives instructions names; this is mostly
+// useful when diffing the effect of an optimization because deleting an
+// unnamed instruction can change all other instruction numbering, making the
+// diff very noisy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+using namespace llvm;
+
+namespace {
+  struct InstNamer : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    InstNamer() : FunctionPass(ID) {
+      initializeInstNamerPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &Info) const {
+      Info.setPreservesAll();
+    }
+
+    bool runOnFunction(Function &F) {
+      for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end();
+           AI != AE; ++AI)
+        if (!AI->hasName() && !AI->getType()->isVoidTy())
+          AI->setName("arg");
+
+      for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+        if (!BB->hasName())
+          BB->setName("bb");
+
+        for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+          if (!I->hasName() && !I->getType()->isVoidTy())
+            I->setName("tmp");
+      }
+      return true;
+    }
+  };
+
+  char InstNamer::ID = 0;
+}
+
+INITIALIZE_PASS(InstNamer, "instnamer",
+                "Assign names to anonymous instructions", false, false)
+char &llvm::InstructionNamerID = InstNamer::ID;
+//===----------------------------------------------------------------------===//
+//
+// InstructionNamer - Give unnamed arguments, basic blocks, and non-void
+// instructions the names "arg", "bb", and "tmp", respectively.
+FunctionPass *llvm::createInstructionNamerPass() {
+  return new InstNamer();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
new file mode 100644
index 0000000..b2e5fa6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -0,0 +1,268 @@
+//===-- LCSSA.cpp - Convert loops into loop-closed SSA form ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops by placing phi nodes at the end of the loops for
+// all values that are live across the loop boundary. For example, it turns
+// the left into the right code:
+//
+// for (...)                for (...)
+//   if (c)                   if (c)
+//     X1 = ...                 X1 = ...
+//   else                     else
+//     X2 = ...                 X2 = ...
+//   X3 = phi(X1, X2)         X3 = phi(X1, X2)
+// ... = X3 + 4             X4 = phi(X3)
+//                          ... = X4 + 4
+//
+// This is still valid LLVM; the extra phi nodes are purely redundant, and will
+// be trivially eliminated by InstCombine. The major benefit of this
+// transformation is that it makes many other loop optimizations, such as
+// LoopUnswitching, simpler.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lcssa"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Pass.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/PredIteratorCache.h"
+using namespace llvm;
+
+STATISTIC(NumLCSSA, "Number of live out of a loop variables");
+
+namespace {
+  struct LCSSA : public LoopPass {
+    static char ID; // Pass identification, replacement for typeid
+    LCSSA() : LoopPass(ID) {
+      initializeLCSSAPass(*PassRegistry::getPassRegistry());
+    }
+
+    // Cached analysis information for the current function.
+    DominatorTree *DT;
+    std::vector<BasicBlock*> LoopBlocks;
+    PredIteratorCache PredCache;
+    Loop *L;
+
+    virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG. It maintains both of these,
+    /// as well as the CFG. It also requires dominator information.
+    ///
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<LoopInfo>();
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addPreserved<ScalarEvolution>();
+    }
+  private:
+    bool ProcessInstruction(Instruction *Inst,
+                            const SmallVectorImpl<BasicBlock*> &ExitBlocks);
+
+    /// verifyAnalysis() - Verify loop nest.
+    virtual void verifyAnalysis() const {
+      // Check the special guarantees that LCSSA makes.
+      assert(L->isLCSSAForm(*DT) && "LCSSA form not preserved!");
+    }
+
+    /// inLoop - returns true if the given block is within the current loop
+    bool inLoop(BasicBlock *B) const {
+      return std::binary_search(LoopBlocks.begin(), LoopBlocks.end(), B);
+    }
+  };
+}
+
+char LCSSA::ID = 0;
+INITIALIZE_PASS_BEGIN(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_END(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false)
+
+Pass *llvm::createLCSSAPass() { return new LCSSA(); }
+char &llvm::LCSSAID = LCSSA::ID;
+
+
+/// BlockDominatesAnExit - Return true if the specified block dominates at least
+/// one of the blocks in the specified list.
+static bool BlockDominatesAnExit(BasicBlock *BB,
+                                 const SmallVectorImpl<BasicBlock*> &ExitBlocks,
+                                 DominatorTree *DT) {
+  DomTreeNode *DomNode = DT->getNode(BB);
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+    if (DT->dominates(DomNode, DT->getNode(ExitBlocks[i])))
+      return true;
+
+  return false;
+}
+
+
+/// runOnLoop - Process all loops in the function, inner-most out.
+bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) {
+  L = TheLoop;
+
+  DT = &getAnalysis<DominatorTree>();
+
+  // Get the set of exit blocks.
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+
+  if (ExitBlocks.empty())
+    return false;
+
+  // Speed up queries by creating a sorted vector of blocks.
+  LoopBlocks.clear();
+  LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
+  array_pod_sort(LoopBlocks.begin(), LoopBlocks.end());
+
+  // Look at all the instructions in the loop, checking to see if they have uses
+  // outside the loop. If so, rewrite those uses.
+  bool MadeChange = false;
+
+  for (Loop::block_iterator BBI = L->block_begin(), E = L->block_end();
+       BBI != E; ++BBI) {
+    BasicBlock *BB = *BBI;
+
+    // For large loops, avoid use-scanning by using dominance information: In
+    // particular, if a block does not dominate any of the loop exits, then none
+    // of the values defined in the block could be used outside the loop.
+    if (!BlockDominatesAnExit(BB, ExitBlocks, DT))
+      continue;
+
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      // Reject two common cases fast: instructions with no uses (like stores)
+      // and instructions with one use that is in the same block as this.
+      if (I->use_empty() ||
+          (I->hasOneUse() && I->use_back()->getParent() == BB &&
+           !isa<PHINode>(I->use_back())))
+        continue;
+
+      MadeChange |= ProcessInstruction(I, ExitBlocks);
+    }
+  }
+
+  assert(L->isLCSSAForm(*DT));
+  PredCache.clear();
+
+  return MadeChange;
+}
+
+/// isExitBlock - Return true if the specified block is in the list.
+static bool isExitBlock(BasicBlock *BB,
+                        const SmallVectorImpl<BasicBlock*> &ExitBlocks) {
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+    if (ExitBlocks[i] == BB)
+      return true;
+  return false;
+}
+
+/// ProcessInstruction - Given an instruction in the loop, check to see if it
+/// has any uses that are outside the current loop. If so, insert LCSSA PHI
+/// nodes and rewrite the uses.
+bool LCSSA::ProcessInstruction(Instruction *Inst,
+                               const SmallVectorImpl<BasicBlock*> &ExitBlocks) {
+  SmallVector<Use*, 16> UsesToRewrite;
+
+  BasicBlock *InstBB = Inst->getParent();
+
+  for (Value::use_iterator UI = Inst->use_begin(), E = Inst->use_end();
+       UI != E; ++UI) {
+    User *U = *UI;
+    BasicBlock *UserBB = cast<Instruction>(U)->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(U))
+      UserBB = PN->getIncomingBlock(UI);
+
+    if (InstBB != UserBB && !inLoop(UserBB))
+      UsesToRewrite.push_back(&UI.getUse());
+  }
+
+  // If there are no uses outside the loop, exit with no change.
+  if (UsesToRewrite.empty()) return false;
+
+  ++NumLCSSA; // We are applying the transformation
+
+  // Invoke instructions are special in that their result value is not available
+  // along their unwind edge. The code below tests to see whether DomBB dominates
+  // the value, so adjust DomBB to the normal destination block, which is
+  // effectively where the value is first usable.
+  BasicBlock *DomBB = Inst->getParent();
+  if (InvokeInst *Inv = dyn_cast<InvokeInst>(Inst))
+    DomBB = Inv->getNormalDest();
+
+  DomTreeNode *DomNode = DT->getNode(DomBB);
+
+  SSAUpdater SSAUpdate;
+  SSAUpdate.Initialize(Inst->getType(), Inst->getName());
+
+  // Insert the LCSSA phi's into all of the exit blocks dominated by the
+  // value, and add them to the Phi's map.
+  for (SmallVectorImpl<BasicBlock*>::const_iterator BBI = ExitBlocks.begin(),
+       BBE = ExitBlocks.end(); BBI != BBE; ++BBI) {
+    BasicBlock *ExitBB = *BBI;
+    if (!DT->dominates(DomNode, DT->getNode(ExitBB))) continue;
+
+    // If we already inserted something for this BB, don't reprocess it.
+    if (SSAUpdate.HasValueForBlock(ExitBB)) continue;
+
+    PHINode *PN = PHINode::Create(Inst->getType(), Inst->getName()+".lcssa",
+                                  ExitBB->begin());
+    PN->reserveOperandSpace(PredCache.GetNumPreds(ExitBB));
+
+    // Add inputs from inside the loop for this PHI.
+    for (BasicBlock **PI = PredCache.GetPreds(ExitBB); *PI; ++PI) {
+      PN->addIncoming(Inst, *PI);
+
+      // If the exit block has a predecessor not within the loop, arrange for
+      // the incoming value use corresponding to that predecessor to be
+      // rewritten in terms of a different LCSSA PHI.
+      if (!inLoop(*PI))
+        UsesToRewrite.push_back(
+          &PN->getOperandUse(
+            PN->getOperandNumForIncomingValue(PN->getNumIncomingValues()-1)));
+    }
+
+    // Remember that this phi makes the value alive in this block.
+    SSAUpdate.AddAvailableValue(ExitBB, PN);
+  }
+
+  // Rewrite all uses outside the loop in terms of the new PHIs we just
+  // inserted.
+  for (unsigned i = 0, e = UsesToRewrite.size(); i != e; ++i) {
+    // If this use is in an exit block, rewrite to use the newly inserted PHI.
+    // This is required for correctness because SSAUpdate doesn't handle uses in
+    // the same block. It assumes the PHI we inserted is at the end of the
+    // block.
+    Instruction *User = cast<Instruction>(UsesToRewrite[i]->getUser());
+    BasicBlock *UserBB = User->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(User))
+      UserBB = PN->getIncomingBlock(*UsesToRewrite[i]);
+
+    if (isa<PHINode>(UserBB->begin()) &&
+        isExitBlock(UserBB, ExitBlocks)) {
+      UsesToRewrite[i]->set(UserBB->begin());
+      continue;
+    }
+
+    // Otherwise, do full PHI insertion.
+    SSAUpdate.RewriteUse(*UsesToRewrite[i]);
+  }
+
+  return true;
+}
+
diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp
new file mode 100644
index 0000000..063c76e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp
@@ -0,0 +1,761 @@
+//===-- Local.cpp - Functions to perform local transformations ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions performs various local transformations to the
+// program.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Constants.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Local constant propagation.
+//
+
+// ConstantFoldTerminator - If a terminator instruction is predicated on a
+// constant value, convert it into an unconditional branch to the constant
+// destination.
+//
+bool llvm::ConstantFoldTerminator(BasicBlock *BB) {
+  TerminatorInst *T = BB->getTerminator();
+
+  // Branch - See if we are conditionally jumping on a constant
+  if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+    if (BI->isUnconditional()) return false;  // Can't optimize uncond branch
+    BasicBlock *Dest1 = BI->getSuccessor(0);
+    BasicBlock *Dest2 = BI->getSuccessor(1);
+
+    if (ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition())) {
+      // Are we branching on constant?
+      // YES. Change to unconditional branch...
+      BasicBlock *Destination = Cond->getZExtValue() ? Dest1 : Dest2;
+      BasicBlock *OldDest = Cond->getZExtValue() ? Dest2 : Dest1;
+
+      //cerr << "Function: " << T->getParent()->getParent()
+      //     << "\nRemoving branch from " << T->getParent()
+      //     << "\n\nTo: " << OldDest << endl;
+
+      // Let the basic block know that we are letting go of it. Based on this,
+      // it will adjust its PHI nodes.
+      assert(BI->getParent() && "Terminator not inserted in block!");
+      OldDest->removePredecessor(BI->getParent());
+
+      // Replace the conditional branch with an unconditional one.
+      BranchInst::Create(Destination, BI);
+      BI->eraseFromParent();
+      return true;
+    }
+
+    if (Dest2 == Dest1) {       // Conditional branch to same location?
+      // This branch matches something like this:
+      //     br bool %cond, label %Dest, label %Dest
+      // and changes it into:  br label %Dest
+
+      // Let the basic block know that we are letting go of one copy of it.
+      assert(BI->getParent() && "Terminator not inserted in block!");
+      Dest1->removePredecessor(BI->getParent());
+
+      // Replace the conditional branch with an unconditional one.
+      BranchInst::Create(Dest1, BI);
+      BI->eraseFromParent();
+      return true;
+    }
+    return false;
+  }
+
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) {
+    // If we are switching on a constant, we can convert the switch into a
+    // single branch instruction!
+    ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition());
+    BasicBlock *TheOnlyDest = SI->getSuccessor(0);  // The default dest
+    BasicBlock *DefaultDest = TheOnlyDest;
+    assert(TheOnlyDest == SI->getDefaultDest() &&
+           "Default destination is not successor #0?");
+
+    // Figure out which case it goes to.
+    for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i) {
+      // Found case matching a constant operand?
+      if (SI->getSuccessorValue(i) == CI) {
+        TheOnlyDest = SI->getSuccessor(i);
+        break;
+      }
+
+      // Check to see if this branch is going to the same place as the default
+      // dest. If so, eliminate it as an explicit compare.
+      if (SI->getSuccessor(i) == DefaultDest) {
+        // Remove this entry.
+        DefaultDest->removePredecessor(SI->getParent());
+        SI->removeCase(i);
+        --i; --e;  // Don't skip an entry...
+        continue;
+      }
+
+      // Otherwise, check to see if the switch only branches to one destination.
+      // We do this by resetting "TheOnlyDest" to null when we find two non-equal
+      // destinations.
+      if (SI->getSuccessor(i) != TheOnlyDest) TheOnlyDest = 0;
+    }
+
+    if (CI && !TheOnlyDest) {
+      // Branching on a constant, but not any of the cases, go to the default
+      // successor.
+      TheOnlyDest = SI->getDefaultDest();
+    }
+
+    // If we found a single destination that we can fold the switch into, do so
+    // now.
+    if (TheOnlyDest) {
+      // Insert the new branch.
+      BranchInst::Create(TheOnlyDest, SI);
+      BasicBlock *BB = SI->getParent();
+
+      // Remove entries from PHI nodes which we no longer branch to...
+      for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+        // Found case matching a constant operand?
+        BasicBlock *Succ = SI->getSuccessor(i);
+        if (Succ == TheOnlyDest)
+          TheOnlyDest = 0;  // Don't modify the first branch to TheOnlyDest
+        else
+          Succ->removePredecessor(BB);
+      }
+
+      // Delete the old switch.
+      BB->getInstList().erase(SI);
+      return true;
+    }
+
+    if (SI->getNumSuccessors() == 2) {
+      // Otherwise, we can fold this switch into a conditional branch
+      // instruction if it has only one non-default destination.
+      Value *Cond = new ICmpInst(SI, ICmpInst::ICMP_EQ, SI->getCondition(),
+                                 SI->getSuccessorValue(1), "cond");
+      // Insert the new branch.
+      BranchInst::Create(SI->getSuccessor(1), SI->getSuccessor(0), Cond, SI);
+
+      // Delete the old switch.
+      SI->eraseFromParent();
+      return true;
+    }
+    return false;
+  }
+
+  if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(T)) {
+    // indirectbr blockaddress(@F, @BB) -> br label @BB
+    if (BlockAddress *BA =
+          dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) {
+      BasicBlock *TheOnlyDest = BA->getBasicBlock();
+      // Insert the new branch.
+      BranchInst::Create(TheOnlyDest, IBI);
+
+      for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
+        if (IBI->getDestination(i) == TheOnlyDest)
+          TheOnlyDest = 0;
+        else
+          IBI->getDestination(i)->removePredecessor(IBI->getParent());
+      }
+      IBI->eraseFromParent();
+
+      // If we didn't find our destination in the IBI successor list, then we
+      // have undefined behavior. Replace the unconditional branch with an
+      // 'unreachable' instruction.
+      if (TheOnlyDest) {
+        BB->getTerminator()->eraseFromParent();
+        new UnreachableInst(BB->getContext(), BB);
+      }
+
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Local dead code elimination.
+//
+
+/// isInstructionTriviallyDead - Return true if the result produced by the
+/// instruction is not used, and the instruction has no side effects.
+///
+bool llvm::isInstructionTriviallyDead(Instruction *I) {
+  if (!I->use_empty() || isa<TerminatorInst>(I)) return false;
+
+  // We don't want debug info removed by anything this general.
+  if (isa<DbgInfoIntrinsic>(I)) return false;
+
+  if (!I->mayHaveSideEffects()) return true;
+
+  // Special case intrinsics that "may have side effects" but can be deleted
+  // when dead.
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+    // Safe to delete llvm.stacksave if dead.
+    if (II->getIntrinsicID() == Intrinsic::stacksave)
+      return true;
+  return false;
+}
+
+/// RecursivelyDeleteTriviallyDeadInstructions - If the specified value is a
+/// trivially dead instruction, delete it.
If that makes any of its operands +/// trivially dead, delete them too, recursively. Return true if any +/// instructions were deleted. +bool llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V) { + Instruction *I = dyn_cast<Instruction>(V); + if (!I || !I->use_empty() || !isInstructionTriviallyDead(I)) + return false; + + SmallVector<Instruction*, 16> DeadInsts; + DeadInsts.push_back(I); + + do { + I = DeadInsts.pop_back_val(); + + // Null out all of the instruction's operands to see if any operand becomes + // dead as we go. + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + Value *OpV = I->getOperand(i); + I->setOperand(i, 0); + + if (!OpV->use_empty()) continue; + + // If the operand is an instruction that became dead as we nulled out the + // operand, and if it is 'trivially' dead, delete it in a future loop + // iteration. + if (Instruction *OpI = dyn_cast<Instruction>(OpV)) + if (isInstructionTriviallyDead(OpI)) + DeadInsts.push_back(OpI); + } + + I->eraseFromParent(); + } while (!DeadInsts.empty()); + + return true; +} + +/// areAllUsesEqual - Check whether the uses of a value are all the same. +/// This is similar to Instruction::hasOneUse() except this will also return +/// true when there are multiple uses that all refer to the same value. +static bool areAllUsesEqual(Instruction *I) { + Value::use_iterator UI = I->use_begin(); + Value::use_iterator UE = I->use_end(); + if (UI == UE) + return false; + + User *TheUse = *UI; + for (++UI; UI != UE; ++UI) { + if (*UI != TheUse) + return false; + } + return true; +} + +/// RecursivelyDeleteDeadPHINode - If the specified value is an effectively +/// dead PHI node, due to being a def-use chain of single-use nodes that +/// either forms a cycle or is terminated by a trivially dead instruction, +/// delete it. If that makes any of its operands trivially dead, delete them +/// too, recursively. Return true if the PHI node is actually deleted. 
+bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN) {
+  // We can remove a PHI if it is on a cycle in the def-use graph
+  // where each node in the cycle has degree one, i.e. only one use,
+  // and is an instruction with no side effects.
+  if (!areAllUsesEqual(PN))
+    return false;
+
+  bool Changed = false;
+  SmallPtrSet<PHINode *, 4> PHIs;
+  PHIs.insert(PN);
+  // Walk the single-use chain starting at PN.  The walk ends either by
+  // leaving the chain (a multi-use or side-effecting user) or by revisiting
+  // a PHI already in the set, which proves we are on a dead cycle.
+  for (Instruction *J = cast<Instruction>(*PN->use_begin());
+       areAllUsesEqual(J) && !J->mayHaveSideEffects();
+       J = cast<Instruction>(*J->use_begin()))
+    // If we find a PHI more than once, we're on a cycle that
+    // won't prove fruitful.
+    if (PHINode *JP = dyn_cast<PHINode>(J))
+      if (!PHIs.insert(JP)) {
+        // Break the cycle and delete the PHI and its operands.
+        JP->replaceAllUsesWith(UndefValue::get(JP->getType()));
+        (void)RecursivelyDeleteTriviallyDeadInstructions(JP);
+        Changed = true;
+        break;
+      }
+  return Changed;
+}
+
+/// SimplifyInstructionsInBlock - Scan the specified basic block and try to
+/// simplify any instructions in it and recursively delete dead instructions.
+///
+/// This returns true if it changed the code, note that it can delete
+/// instructions in other blocks as well in this block.
+bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD) {
+  bool MadeChange = false;
+  for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+    Instruction *Inst = BI++;
+
+    if (Value *V = SimplifyInstruction(Inst, TD)) {
+      // WeakVH nulls/retargets itself if the instruction it tracks is
+      // deleted; comparing it to BI afterwards detects that our iterator
+      // was invalidated by the recursive simplification.
+      WeakVH BIHandle(BI);
+      ReplaceAndSimplifyAllUses(Inst, V, TD);
+      MadeChange = true;
+      if (BIHandle != BI)
+        BI = BB->begin();
+      continue;
+    }
+
+    MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst);
+  }
+  return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Graph Restructuring.
+//
+
+
+/// RemovePredecessorAndSimplify - Like BasicBlock::removePredecessor, this
+/// method is called when we're about to delete Pred as a predecessor of BB.  If
+/// BB contains any PHI nodes, this drops the entries in the PHI nodes for Pred.
+///
+/// Unlike the removePredecessor method, this attempts to simplify uses of PHI
+/// nodes that collapse into identity values.  For example, if we have:
+///   x = phi(1, 0, 0, 0)
+///   y = and x, z
+///
+/// .. and delete the predecessor corresponding to the '1', this will attempt to
+/// recursively fold the and to 0.
+void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
+                                        TargetData *TD) {
+  // This only adjusts blocks with PHI nodes.
+  if (!isa<PHINode>(BB->begin()))
+    return;
+
+  // Remove the entries for Pred from the PHI nodes in BB, but do not simplify
+  // them down.  This will leave us with single entry phi nodes and other phis
+  // that can be removed.
+  BB->removePredecessor(Pred, true);
+
+  // PhiIt is a WeakVH so it survives (and signals) deletion of the tracked
+  // instruction during the recursive simplification below.
+  WeakVH PhiIt = &BB->front();
+  while (PHINode *PN = dyn_cast<PHINode>(PhiIt)) {
+    PhiIt = &*++BasicBlock::iterator(cast<Instruction>(PhiIt));
+
+    Value *PNV = SimplifyInstruction(PN, TD);
+    if (PNV == 0) continue;
+
+    // If we're able to simplify the phi to a single value, substitute the new
+    // value into all of its uses.
+    assert(PNV != PN && "SimplifyInstruction broken!");
+
+    Value *OldPhiIt = PhiIt;
+    ReplaceAndSimplifyAllUses(PN, PNV, TD);
+
+    // If recursive simplification ended up deleting the next PHI node we would
+    // iterate to, then our iterator is invalid, restart scanning from the top
+    // of the block.
+    if (PhiIt != OldPhiIt) PhiIt = &BB->front();
+  }
+}
+
+
+/// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and its
+/// predecessor is known to have one successor (DestBB!).  Eliminate the edge
+/// between them, moving the instructions in the predecessor into DestBB and
+/// deleting the predecessor block.  Updates DominatorTree and ProfileInfo if
+/// the optional Pass makes them available.
+///
+void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) {
+  // If BB has single-entry PHI nodes, fold them.
+  while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
+    Value *NewVal = PN->getIncomingValue(0);
+    // Replace self referencing PHI with undef, it must be dead.
+    if (NewVal == PN) NewVal = UndefValue::get(PN->getType());
+    PN->replaceAllUsesWith(NewVal);
+    PN->eraseFromParent();
+  }
+
+  BasicBlock *PredBB = DestBB->getSinglePredecessor();
+  assert(PredBB && "Block doesn't have a single predecessor!");
+
+  // Splice all the instructions from PredBB to DestBB.
+  PredBB->getTerminator()->eraseFromParent();
+  DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
+
+  // Zap anything that took the address of DestBB.  Not doing this will give the
+  // address an invalid value.
+  if (DestBB->hasAddressTaken()) {
+    BlockAddress *BA = BlockAddress::get(DestBB);
+    Constant *Replacement =
+      ConstantInt::get(llvm::Type::getInt32Ty(BA->getContext()), 1);
+    BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(Replacement,
+                                                     BA->getType()));
+    BA->destroyConstant();
+  }
+
+  // Anything that branched to PredBB now branches to DestBB.
+  PredBB->replaceAllUsesWith(DestBB);
+
+  if (P) {
+    DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>();
+    if (DT) {
+      BasicBlock *PredBBIDom = DT->getNode(PredBB)->getIDom()->getBlock();
+      DT->changeImmediateDominator(DestBB, PredBBIDom);
+      DT->eraseNode(PredBB);
+    }
+    ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>();
+    if (PI) {
+      PI->replaceAllUses(PredBB, DestBB);
+      PI->removeEdge(ProfileInfo::getEdge(PredBB, DestBB));
+    }
+  }
+  // Nuke BB.
+  PredBB->eraseFromParent();
+}
+
+/// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an
+/// almost-empty BB ending in an unconditional branch to Succ, into succ.
+///
+/// Assumption: Succ is the single successor for BB.
+///
+static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
+  assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
+
+  DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into "
+        << Succ->getName() << "\n");
+  // Shortcut, if there is only a single predecessor it must be BB and merging
+  // is always safe
+  if (Succ->getSinglePredecessor()) return true;
+
+  // Make a list of the predecessors of BB
+  typedef SmallPtrSet<BasicBlock*, 16> BlockSet;
+  BlockSet BBPreds(pred_begin(BB), pred_end(BB));
+
+  // Use that list to make another list of common predecessors of BB and Succ
+  BlockSet CommonPreds;
+  for (pred_iterator PI = pred_begin(Succ), PE = pred_end(Succ);
+       PI != PE; ++PI) {
+    BasicBlock *P = *PI;
+    if (BBPreds.count(P))
+      CommonPreds.insert(P);
+  }
+
+  // Shortcut, if there are no common predecessors, merging is always safe
+  if (CommonPreds.empty())
+    return true;
+
+  // Look at all the phi nodes in Succ, to see if they present a conflict when
+  // merging these blocks
+  for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+
+    // If the incoming value from BB is again a PHINode in
+    // BB which has the same incoming value for *PI as PN does, we can
+    // merge the phi nodes and then the blocks can still be merged
+    PHINode *BBPN = dyn_cast<PHINode>(PN->getIncomingValueForBlock(BB));
+    if (BBPN && BBPN->getParent() == BB) {
+      for (BlockSet::iterator PI = CommonPreds.begin(), PE = CommonPreds.end();
+           PI != PE; PI++) {
+        if (BBPN->getIncomingValueForBlock(*PI)
+            != PN->getIncomingValueForBlock(*PI)) {
+          DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in "
+                << Succ->getName() << " is conflicting with "
+                << BBPN->getName() << " with regard to common predecessor "
+                << (*PI)->getName() << "\n");
+          return false;
+        }
+      }
+    } else {
+      Value* Val = PN->getIncomingValueForBlock(BB);
+      for (BlockSet::iterator PI = CommonPreds.begin(), PE = CommonPreds.end();
+           PI != PE; PI++) {
+        // See if the incoming value for the common predecessor is equal to the
+        // one for BB, in which case this phi node will not prevent the merging
+        // of the block.
+        if (Val != PN->getIncomingValueForBlock(*PI)) {
+          DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in "
+                << Succ->getName() << " is conflicting with regard to common "
+                << "predecessor " << (*PI)->getName() << "\n");
+          return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+/// TryToSimplifyUncondBranchFromEmptyBlock - BB is known to contain an
+/// unconditional branch, and contains no instructions other than PHI nodes,
+/// potential debug intrinsics and the branch.  If possible, eliminate BB by
+/// rewriting all the predecessors to branch to the successor block and return
+/// true.  If we can't transform, return false.
+bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
+  assert(BB != &BB->getParent()->getEntryBlock() &&
+         "TryToSimplifyUncondBranchFromEmptyBlock called on entry block!");
+
+  // We can't eliminate infinite loops.
+  BasicBlock *Succ = cast<BranchInst>(BB->getTerminator())->getSuccessor(0);
+  if (BB == Succ) return false;
+
+  // Check to see if merging these blocks would cause conflicts for any of the
+  // phi nodes in BB or Succ.  If not, we can safely merge.
+  if (!CanPropagatePredecessorsForPHIs(BB, Succ)) return false;
+
+  // Check for cases where Succ has multiple predecessors and a PHI node in BB
+  // has uses which will not disappear when the PHI nodes are merged.  It is
+  // possible to handle such cases, but difficult: it requires checking whether
+  // BB dominates Succ, which is non-trivial to calculate in the case where
+  // Succ has multiple predecessors.  Also, it requires checking whether
+  // constructing the necessary self-referential PHI node doesn't introduce any
+  // conflicts; this isn't too difficult, but the previous code for doing this
+  // was incorrect.
+  //
+  // Note that if this check finds a live use, BB dominates Succ, so BB is
+  // something like a loop pre-header (or rarely, a part of an irreducible CFG);
+  // folding the branch isn't profitable in that case anyway.
+  if (!Succ->getSinglePredecessor()) {
+    BasicBlock::iterator BBI = BB->begin();
+    while (isa<PHINode>(*BBI)) {
+      // The only tolerated use of a PHI in BB is as an incoming value of a
+      // PHI in Succ coming from some block other than BB.
+      for (Value::use_iterator UI = BBI->use_begin(), E = BBI->use_end();
+           UI != E; ++UI) {
+        if (PHINode* PN = dyn_cast<PHINode>(*UI)) {
+          if (PN->getIncomingBlock(UI) != BB)
+            return false;
+        } else {
+          return false;
+        }
+      }
+      ++BBI;
+    }
+  }
+
+  DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB);
+
+  if (isa<PHINode>(Succ->begin())) {
+    // If there is more than one pred of succ, and there are PHI nodes in
+    // the successor, then we need to add incoming edges for the PHI nodes
+    //
+    const SmallVector<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB));
+
+    // Loop over all of the PHI nodes in the successor of BB.
+    for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+      PHINode *PN = cast<PHINode>(I);
+      Value *OldVal = PN->removeIncomingValue(BB, false);
+      assert(OldVal && "No entry in PHI for Pred BB!");
+
+      // If this incoming value is one of the PHI nodes in BB, the new entries
+      // in the PHI node are the entries from the old PHI.
+      if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) {
+        PHINode *OldValPN = cast<PHINode>(OldVal);
+        for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i)
+          // Note that, since we are merging phi nodes and BB and Succ might
+          // have common predecessors, we could end up with a phi node with
+          // identical incoming branches. This will be cleaned up later (and
+          // will trigger asserts if we try to clean it up now, without also
+          // simplifying the corresponding conditional branch).
+          PN->addIncoming(OldValPN->getIncomingValue(i),
+                          OldValPN->getIncomingBlock(i));
+      } else {
+        // Add an incoming value for each of the new incoming values.
+        for (unsigned i = 0, e = BBPreds.size(); i != e; ++i)
+          PN->addIncoming(OldVal, BBPreds[i]);
+      }
+    }
+  }
+
+  while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
+    if (Succ->getSinglePredecessor()) {
+      // BB is the only predecessor of Succ, so Succ will end up with exactly
+      // the same predecessors BB had.
+      Succ->getInstList().splice(Succ->begin(),
+                                 BB->getInstList(), BB->begin());
+    } else {
+      // We explicitly check for such uses in CanPropagatePredecessorsForPHIs.
+      assert(PN->use_empty() && "There shouldn't be any uses here!");
+      PN->eraseFromParent();
+    }
+  }
+
+  // Everything that jumped to BB now goes to Succ.
+  BB->replaceAllUsesWith(Succ);
+  if (!Succ->hasName()) Succ->takeName(BB);
+  BB->eraseFromParent();              // Delete the old basic block.
+  return true;
+}
+
+/// EliminateDuplicatePHINodes - Check for and eliminate duplicate PHI
+/// nodes in this block. This doesn't try to be clever about PHI nodes
+/// which differ only in the order of the incoming values, but instcombine
+/// orders them so it usually won't matter.
+///
+bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
+  bool Changed = false;
+
+  // This implementation doesn't currently consider undef operands
+  // specially. Theoretically, two phis which are identical except for
+  // one having an undef where the other doesn't could be collapsed.
+
+  // Map from PHI hash values to PHI nodes. If multiple PHIs have
+  // the same hash value, the element is the first PHI in the
+  // linked list in CollisionMap.
+  DenseMap<uintptr_t, PHINode *> HashMap;
+
+  // Maintain linked lists of PHI nodes with common hash values.
+  DenseMap<PHINode *, PHINode *> CollisionMap;
+
+  // Examine each PHI.
+  for (BasicBlock::iterator I = BB->begin();
+       PHINode *PN = dyn_cast<PHINode>(I++); ) {
+    // Compute a hash value on the operands. Instcombine will likely have sorted
+    // them, which helps expose duplicates, but we have to check all the
+    // operands to be safe in case instcombine hasn't run.
+    // The hash is a rotate-and-xor over the operand pointer values.
+    uintptr_t Hash = 0;
+    for (User::op_iterator I = PN->op_begin(), E = PN->op_end(); I != E; ++I) {
+      // This hash algorithm is quite weak as hash functions go, but it seems
+      // to do a good enough job for this particular purpose, and is very quick.
+      Hash ^= reinterpret_cast<uintptr_t>(static_cast<Value *>(*I));
+      Hash = (Hash << 7) | (Hash >> (sizeof(uintptr_t) * CHAR_BIT - 7));
+    }
+    // If we've never seen this hash value before, it's a unique PHI.
+    std::pair<DenseMap<uintptr_t, PHINode *>::iterator, bool> Pair =
+      HashMap.insert(std::make_pair(Hash, PN));
+    if (Pair.second) continue;
+    // Otherwise it's either a duplicate or a hash collision.
+    for (PHINode *OtherPN = Pair.first->second; ; ) {
+      if (OtherPN->isIdenticalTo(PN)) {
+        // A duplicate. Replace this PHI with its duplicate.
+        PN->replaceAllUsesWith(OtherPN);
+        PN->eraseFromParent();
+        Changed = true;
+        break;
+      }
+      // A non-duplicate hash collision.
+      DenseMap<PHINode *, PHINode *>::iterator I = CollisionMap.find(OtherPN);
+      if (I == CollisionMap.end()) {
+        // Set this PHI to be the head of the linked list of colliding PHIs.
+        PHINode *Old = Pair.first->second;
+        Pair.first->second = PN;
+        CollisionMap[PN] = Old;
+        break;
+      }
+      // Proceed to the next PHI in the list.
+      OtherPN = I->second;
+    }
+  }
+
+  return Changed;
+}
+
+/// enforceKnownAlignment - If the specified pointer points to an object that
+/// we control, modify the object's alignment to PrefAlign. This isn't
+/// often possible though. If alignment is important, a more reliable approach
+/// is to simply align all global variables and allocation instructions to
+/// their preferred alignment from the beginning.
+///
+/// Returns the best alignment that can now be assumed for V (which may be
+/// the original Align if nothing could be changed).
+static unsigned enforceKnownAlignment(Value *V, unsigned Align,
+                                      unsigned PrefAlign) {
+
+  User *U = dyn_cast<User>(V);
+  if (!U) return Align;
+
+  switch (Operator::getOpcode(U)) {
+  default: break;
+  case Instruction::BitCast:
+    return enforceKnownAlignment(U->getOperand(0), Align, PrefAlign);
+  case Instruction::GetElementPtr: {
+    // If all indexes are zero, it is just the alignment of the base pointer.
+    bool AllZeroOperands = true;
+    for (User::op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; ++i)
+      if (!isa<Constant>(*i) ||
+          !cast<Constant>(*i)->isNullValue()) {
+        AllZeroOperands = false;
+        break;
+      }
+
+    if (AllZeroOperands) {
+      // Treat this like a bitcast.
+      return enforceKnownAlignment(U->getOperand(0), Align, PrefAlign);
+    }
+    return Align;
+  }
+  case Instruction::Alloca: {
+    AllocaInst *AI = cast<AllocaInst>(V);
+    // If there is a requested alignment and if this is an alloca, round up.
+    if (AI->getAlignment() >= PrefAlign)
+      return AI->getAlignment();
+    AI->setAlignment(PrefAlign);
+    return PrefAlign;
+  }
+  }
+
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // If there is a large requested alignment and we can, bump up the alignment
+    // of the global.  Declarations cannot be changed here; their alignment is
+    // fixed by the defining module.
+    if (GV->isDeclaration()) return Align;
+
+    if (GV->getAlignment() >= PrefAlign)
+      return GV->getAlignment();
+    // We can only increase the alignment of the global if it has no alignment
+    // specified or if it is not assigned a section.  If it is assigned a
+    // section, the global could be densely packed with other objects in the
+    // section, increasing the alignment could cause padding issues.
+    if (!GV->hasSection() || GV->getAlignment() == 0)
+      GV->setAlignment(PrefAlign);
+    return GV->getAlignment();
+  }
+
+  return Align;
+}
+
+/// getOrEnforceKnownAlignment - If the specified pointer has an alignment that
+/// we can determine, return it, otherwise return 0.  If PrefAlign is specified,
+/// and it is more than the alignment of the ultimate object, see if we can
+/// increase the alignment of the ultimate object, making this check succeed.
+unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
+                                          const TargetData *TD) {
+  assert(V->getType()->isPointerTy() &&
+         "getOrEnforceKnownAlignment expects a pointer!");
+  unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 64;
+  APInt Mask = APInt::getAllOnesValue(BitWidth);
+  APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+  ComputeMaskedBits(V, Mask, KnownZero, KnownOne, TD);
+  // Known-zero trailing bits give a power-of-two lower bound on alignment.
+  unsigned TrailZ = KnownZero.countTrailingOnes();
+
+  // Avoid trouble with ridiculously large TrailZ values, such as
+  // those computed from a null pointer.
+  TrailZ = std::min(TrailZ, unsigned(sizeof(unsigned) * CHAR_BIT - 1));
+
+  unsigned Align = 1u << std::min(BitWidth - 1, TrailZ);
+
+  // LLVM doesn't support alignments larger than this currently.
+  // The unary '+' forms an rvalue, so std::min's reference parameter does not
+  // ODR-use the static const member (avoiding a link-time definition).
+  Align = std::min(Align, +Value::MaximumAlignment);
+
+  if (PrefAlign > Align)
+    Align = enforceKnownAlignment(V, Align, PrefAlign);
+
+  // We don't need to make any adjustment.
+  return Align;
+}
+
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
new file mode 100644
index 0000000..2462630
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -0,0 +1,753 @@
+//===- LoopSimplify.cpp - Loop Canonicalization Pass ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License.  See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs several transformations to transform natural loops into a
+// simpler form, which makes subsequent analyses and transformations simpler and
+// more effective.
+//
+// Loop pre-header insertion guarantees that there is a single, non-critical
+// entry edge from outside of the loop to the loop header.  This simplifies a
+// number of analyses and transformations, such as LICM.
+//
+// Loop exit-block insertion guarantees that all exit blocks from the loop
+// (blocks which are outside of the loop that have predecessors inside of the
+// loop) only have predecessors from inside of the loop (and are thus dominated
+// by the loop header).  This simplifies transformations such as store-sinking
+// that are built into LICM.
+//
+// This pass also guarantees that loops will have exactly one backedge.
+//
+// Indirectbr instructions introduce several complications. If the loop
+// contains or is entered by an indirectbr instruction, it may not be possible
+// to transform the loop and make these guarantees. Client code should check
+// that these conditions are true before relying on them.
+//
+// Note that the simplifycfg pass will clean up blocks which are split out but
+// end up being unnecessary, so usage of this pass should not pessimize
+// generated code.
+//
+// This pass obviously modifies the CFG, but updates loop information and
+// dominator information.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-simplify"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Function.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Type.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+using namespace llvm;
+
+STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted");
+STATISTIC(NumNested  , "Number of nested loops split out");
+
+namespace {
+  struct LoopSimplify : public LoopPass {
+    static char ID; // Pass identification, replacement for typeid
+    LoopSimplify() : LoopPass(ID) {
+      initializeLoopSimplifyPass(*PassRegistry::getPassRegistry());
+    }
+
+    // AA - If we have an alias analysis object to update, this is it, otherwise
+    // this is null.
+    AliasAnalysis *AA;
+    LoopInfo *LI;
+    DominatorTree *DT;
+    ScalarEvolution *SE;  // Optional; null if not scheduled before this pass.
+    Loop *L;              // The loop currently being processed.
+    virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // We need loop information to identify the loops...
+      AU.addRequired<DominatorTree>();
+      AU.addPreserved<DominatorTree>();
+
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+
+      AU.addPreserved<AliasAnalysis>();
+      AU.addPreserved<ScalarEvolution>();
+      AU.addPreservedID(BreakCriticalEdgesID);  // No critical edges added.
+    }
+
+    /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees.
+    void verifyAnalysis() const;
+
+  private:
+    bool ProcessLoop(Loop *L, LPPassManager &LPM);
+    BasicBlock *RewriteLoopExitBlock(Loop *L, BasicBlock *Exit);
+    BasicBlock *InsertPreheaderForLoop(Loop *L);
+    Loop *SeparateNestedLoop(Loop *L, LPPassManager &LPM);
+    BasicBlock *InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader);
+    void PlaceSplitBlockCarefully(BasicBlock *NewBB,
+                                  SmallVectorImpl<BasicBlock*> &SplitPreds,
+                                  Loop *L);
+  };
+}
+
+char LoopSimplify::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
+                "Canonicalize natural loops", true, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_END(LoopSimplify, "loop-simplify",
+                "Canonicalize natural loops", true, false)
+
+// Publicly exposed interface to pass...
+char &llvm::LoopSimplifyID = LoopSimplify::ID;
+Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }
+
+/// runOnLoop - Run down all loops in the CFG (recursively, but we could do
+/// it in any convenient order) inserting preheaders...
+///
+bool LoopSimplify::runOnLoop(Loop *l, LPPassManager &LPM) {
+  L = l;
+  bool Changed = false;
+  LI = &getAnalysis<LoopInfo>();
+  AA = getAnalysisIfAvailable<AliasAnalysis>();
+  DT = &getAnalysis<DominatorTree>();
+  SE = getAnalysisIfAvailable<ScalarEvolution>();
+
+  Changed |= ProcessLoop(L, LPM);
+
+  return Changed;
+}
+
+/// ProcessLoop - Walk the loop structure in depth first order, ensuring that
+/// all loops have preheaders.
+///
+bool LoopSimplify::ProcessLoop(Loop *L, LPPassManager &LPM) {
+  bool Changed = false;
+// Target for the goto below: SeparateNestedLoop restructures the loop so
+// drastically that the whole canonicalization is restarted from scratch.
+ReprocessLoop:
+
+  // Check to see that no blocks (other than the header) in this loop have
+  // predecessors that are not in the loop.  This is not valid for natural
+  // loops, but can occur if the blocks are unreachable.  Since they are
+  // unreachable we can just shamelessly delete those CFG edges!
+  for (Loop::block_iterator BB = L->block_begin(), E = L->block_end();
+       BB != E; ++BB) {
+    if (*BB == L->getHeader()) continue;
+
+    SmallPtrSet<BasicBlock*, 4> BadPreds;
+    for (pred_iterator PI = pred_begin(*BB),
+         PE = pred_end(*BB); PI != PE; ++PI) {
+      BasicBlock *P = *PI;
+      if (!L->contains(P))
+        BadPreds.insert(P);
+    }
+
+    // Delete each unique out-of-loop (and thus dead) predecessor.
+    for (SmallPtrSet<BasicBlock*, 4>::iterator I = BadPreds.begin(),
+         E = BadPreds.end(); I != E; ++I) {
+
+      DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor "
+                   << (*I)->getName() << "\n");
+
+      // Inform each successor of each dead pred.
+      // NOTE(review): the local succ_iterator SE shadows the ScalarEvolution*
+      // member SE within this loop.
+      for (succ_iterator SI = succ_begin(*I), SE = succ_end(*I); SI != SE; ++SI)
+        (*SI)->removePredecessor(*I);
+      // Zap the dead pred's terminator and replace it with unreachable.
+      TerminatorInst *TI = (*I)->getTerminator();
+      TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+      (*I)->getTerminator()->eraseFromParent();
+      new UnreachableInst((*I)->getContext(), *I);
+      Changed = true;
+    }
+  }
+
+  // If there are exiting blocks with branches on undef, resolve the undef in
+  // the direction which will exit the loop. This will help simplify loop
+  // trip count computations.
+  SmallVector<BasicBlock*, 8> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+  for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
+       E = ExitingBlocks.end(); I != E; ++I)
+    if (BranchInst *BI = dyn_cast<BranchInst>((*I)->getTerminator()))
+      if (BI->isConditional()) {
+        if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) {
+
+          DEBUG(dbgs() << "LoopSimplify: Resolving \"br i1 undef\" to exit in "
+                       << (*I)->getName() << "\n");
+
+          // Pick the condition value that makes the branch leave the loop.
+          BI->setCondition(ConstantInt::get(Cond->getType(),
+                                            !L->contains(BI->getSuccessor(0))));
+          Changed = true;
+        }
+      }
+
+  // Does the loop already have a preheader?  If so, don't insert one.
+  BasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader) {
+    Preheader = InsertPreheaderForLoop(L);
+    if (Preheader) {
+      ++NumInserted;
+      Changed = true;
+    }
+  }
+
+  // Next, check to make sure that all exit nodes of the loop only have
+  // predecessors that are inside of the loop.  This check guarantees that the
+  // loop preheader/header will dominate the exit blocks.  If the exit block has
+  // predecessors from outside of the loop, split the edge now.
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+
+  SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(),
+                                               ExitBlocks.end());
+  for (SmallSetVector<BasicBlock *, 8>::iterator I = ExitBlockSet.begin(),
+       E = ExitBlockSet.end(); I != E; ++I) {
+    BasicBlock *ExitBlock = *I;
+    for (pred_iterator PI = pred_begin(ExitBlock), PE = pred_end(ExitBlock);
+         PI != PE; ++PI)
+      // Must be exactly this loop: no subloops, parent loops, or non-loop preds
+      // allowed.
+      if (!L->contains(*PI)) {
+        if (RewriteLoopExitBlock(L, ExitBlock)) {
+          ++NumInserted;
+          Changed = true;
+        }
+        break;
+      }
+  }
+
+  // If the header has more than two predecessors at this point (from the
+  // preheader and from multiple backedges), we must adjust the loop.
+  BasicBlock *LoopLatch = L->getLoopLatch();
+  if (!LoopLatch) {
+    // If this is really a nested loop, rip it out into a child loop.  Don't do
+    // this for loops with a giant number of backedges, just factor them into a
+    // common backedge instead.
+    if (L->getNumBackEdges() < 8) {
+      if (SeparateNestedLoop(L, LPM)) {
+        ++NumNested;
+        // This is a big restructuring change, reprocess the whole loop.
+        Changed = true;
+        // GCC doesn't tail recursion eliminate this.
+        goto ReprocessLoop;
+      }
+    }
+
+    // If we either couldn't, or didn't want to, identify nesting of the loops,
+    // insert a new block that all backedges target, then make it jump to the
+    // loop header.
+    LoopLatch = InsertUniqueBackedgeBlock(L, Preheader);
+    if (LoopLatch) {
+      ++NumInserted;
+      Changed = true;
+    }
+  }
+
+  // Scan over the PHI nodes in the loop header.  Since they now have only two
+  // incoming values (the loop is canonicalized), we may have simplified the PHI
+  // down to 'X = phi [X, Y]', which should be replaced with 'Y'.
+  PHINode *PN;
+  for (BasicBlock::iterator I = L->getHeader()->begin();
+       (PN = dyn_cast<PHINode>(I++)); )
+    if (Value *V = SimplifyInstruction(PN, 0, DT)) {
+      // Tell the optional analyses that PN is going away before erasing it.
+      if (AA) AA->deleteValue(PN);
+      if (SE) SE->forgetValue(PN);
+      PN->replaceAllUsesWith(V);
+      PN->eraseFromParent();
+    }
+
+  // If this loop has multiple exits and the exits all go to the same
+  // block, attempt to merge the exits. This helps several passes, such
+  // as LoopRotation, which do not support loops with multiple exits.
+  // SimplifyCFG also does this (and this code uses the same utility
+  // function), however this code is loop-aware, where SimplifyCFG is
+  // not. That gives it the advantage of being able to hoist
+  // loop-invariant instructions out of the way to open up more
+  // opportunities, and the disadvantage of having the responsibility
+  // to preserve dominator information.
+  bool UniqueExit = true;
+  if (!ExitBlocks.empty())
+    for (unsigned i = 1, e = ExitBlocks.size(); i != e; ++i)
+      if (ExitBlocks[i] != ExitBlocks[0]) {
+        UniqueExit = false;
+        break;
+      }
+  if (UniqueExit) {
+    for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+      BasicBlock *ExitingBlock = ExitingBlocks[i];
+      if (!ExitingBlock->getSinglePredecessor()) continue;
+      BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+      if (!BI || !BI->isConditional()) continue;
+      CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition());
+      if (!CI || CI->getParent() != ExitingBlock) continue;
+
+      // Attempt to hoist out all instructions except for the
+      // comparison and the branch.
+      bool AllInvariant = true;
+      for (BasicBlock::iterator I = ExitingBlock->begin(); &*I != BI; ) {
+        Instruction *Inst = I++;
+        // Skip debug info intrinsics.
+        if (isa<DbgInfoIntrinsic>(Inst))
+          continue;
+        if (Inst == CI)
+          continue;
+        if (!L->makeLoopInvariant(Inst, Changed,
+                                  Preheader ? Preheader->getTerminator() : 0)) {
+          AllInvariant = false;
+          break;
+        }
+      }
+      if (!AllInvariant) continue;
+
+      // The block has now been cleared of all instructions except for
+      // a comparison and a conditional branch. SimplifyCFG may be able
+      // to fold it now.
+      if (!FoldBranchToCommonDest(BI)) continue;
+
+      // Success. The block is now dead, so remove it from the loop,
+      // update the dominator tree and delete it.
+      DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block "
+                   << ExitingBlock->getName() << "\n");
+
+      assert(pred_begin(ExitingBlock) == pred_end(ExitingBlock));
+      Changed = true;
+      LI->removeBlock(ExitingBlock);
+
+      DomTreeNode *Node = DT->getNode(ExitingBlock);
+      const std::vector<DomTreeNodeBase<BasicBlock> *> &Children =
+        Node->getChildren();
+      // changeImmediateDominator removes Child from Node's children, so this
+      // loop drains the (live) child list and terminates.
+      while (!Children.empty()) {
+        DomTreeNode *Child = Children.front();
+        DT->changeImmediateDominator(Child, Node->getIDom());
+      }
+      DT->eraseNode(ExitingBlock);
+
+      BI->getSuccessor(0)->removePredecessor(ExitingBlock);
+      BI->getSuccessor(1)->removePredecessor(ExitingBlock);
+      ExitingBlock->eraseFromParent();
+    }
+  }
+
+  return Changed;
+}
+
+/// InsertPreheaderForLoop - Once we discover that a loop doesn't have a
+/// preheader, this method is called to insert one. This method has two phases:
+/// preheader insertion and analysis updating.
+///
+BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) {
+  BasicBlock *Header = L->getHeader();
+
+  // Compute the set of predecessors of the loop that are not in the loop.
+ SmallVector<BasicBlock*, 8> OutsideBlocks; + for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header); + PI != PE; ++PI) { + BasicBlock *P = *PI; + if (!L->contains(P)) { // Coming in from outside the loop? + // If the loop is branched to from an indirect branch, we won't + // be able to fully transform the loop, because it prohibits + // edge splitting. + if (isa<IndirectBrInst>(P->getTerminator())) return 0; + + // Keep track of it. + OutsideBlocks.push_back(P); + } + } + + // Split out the loop pre-header. + BasicBlock *NewBB = + SplitBlockPredecessors(Header, &OutsideBlocks[0], OutsideBlocks.size(), + ".preheader", this); + + DEBUG(dbgs() << "LoopSimplify: Creating pre-header " << NewBB->getName() + << "\n"); + + // Make sure that NewBB is put someplace intelligent, which doesn't mess up + // code layout too horribly. + PlaceSplitBlockCarefully(NewBB, OutsideBlocks, L); + + return NewBB; +} + +/// RewriteLoopExitBlock - Ensure that the loop preheader dominates all exit +/// blocks. This method is used to split exit blocks that have predecessors +/// outside of the loop. +BasicBlock *LoopSimplify::RewriteLoopExitBlock(Loop *L, BasicBlock *Exit) { + SmallVector<BasicBlock*, 8> LoopBlocks; + for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { + BasicBlock *P = *I; + if (L->contains(P)) { + // Don't do this if the loop is exited via an indirect branch. + if (isa<IndirectBrInst>(P->getTerminator())) return 0; + + LoopBlocks.push_back(P); + } + } + + assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?"); + BasicBlock *NewBB = SplitBlockPredecessors(Exit, &LoopBlocks[0], + LoopBlocks.size(), ".loopexit", + this); + + DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block " + << NewBB->getName() << "\n"); + return NewBB; +} + +/// AddBlockAndPredsToSet - Add the specified block, and all of its +/// predecessors, to the specified set, if it's not already in there. 
Stop +/// predecessor traversal when we reach StopBlock. +static void AddBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock, + std::set<BasicBlock*> &Blocks) { + std::vector<BasicBlock *> WorkList; + WorkList.push_back(InputBB); + do { + BasicBlock *BB = WorkList.back(); WorkList.pop_back(); + if (Blocks.insert(BB).second && BB != StopBlock) + // If BB is not already processed and it is not a stop block then + // insert its predecessor in the work list + for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) { + BasicBlock *WBB = *I; + WorkList.push_back(WBB); + } + } while(!WorkList.empty()); +} + +/// FindPHIToPartitionLoops - The first part of loop-nestification is to find a +/// PHI node that tells us how to partition the loops. +static PHINode *FindPHIToPartitionLoops(Loop *L, DominatorTree *DT, + AliasAnalysis *AA, LoopInfo *LI) { + for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) { + PHINode *PN = cast<PHINode>(I); + ++I; + if (Value *V = SimplifyInstruction(PN, 0, DT)) { + // This is a degenerate PHI already, don't modify it! + PN->replaceAllUsesWith(V); + if (AA) AA->deleteValue(PN); + PN->eraseFromParent(); + continue; + } + + // Scan this PHI node looking for a use of the PHI node by itself. + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (PN->getIncomingValue(i) == PN && + L->contains(PN->getIncomingBlock(i))) + // We found something tasty to remove. + return PN; + } + return 0; +} + +// PlaceSplitBlockCarefully - If the block isn't already, move the new block to +// right after some 'outside block' block. This prevents the preheader from +// being placed inside the loop body, e.g. when the loop hasn't been rotated. +void LoopSimplify::PlaceSplitBlockCarefully(BasicBlock *NewBB, + SmallVectorImpl<BasicBlock*> &SplitPreds, + Loop *L) { + // Check to see if NewBB is already well placed. 
+ Function::iterator BBI = NewBB; --BBI; + for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { + if (&*BBI == SplitPreds[i]) + return; + } + + // If it isn't already after an outside block, move it after one. This is + // always good as it makes the uncond branch from the outside block into a + // fall-through. + + // Figure out *which* outside block to put this after. Prefer an outside + // block that neighbors a BB actually in the loop. + BasicBlock *FoundBB = 0; + for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) { + Function::iterator BBI = SplitPreds[i]; + if (++BBI != NewBB->getParent()->end() && + L->contains(BBI)) { + FoundBB = SplitPreds[i]; + break; + } + } + + // If our heuristic for a *good* bb to place this after doesn't find + // anything, just pick something. It's likely better than leaving it within + // the loop. + if (!FoundBB) + FoundBB = SplitPreds[0]; + NewBB->moveAfter(FoundBB); +} + + +/// SeparateNestedLoop - If this loop has multiple backedges, try to pull one of +/// them out into a nested loop. This is important for code that looks like +/// this: +/// +/// Loop: +/// ... +/// br cond, Loop, Next +/// ... +/// br cond2, Loop, Out +/// +/// To identify this common case, we look at the PHI nodes in the header of the +/// loop. PHI nodes with unchanging values on one backedge correspond to values +/// that change in the "outer" loop, but not in the "inner" loop. +/// +/// If we are able to separate out a loop, return the new outer loop that was +/// created. +/// +Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM) { + PHINode *PN = FindPHIToPartitionLoops(L, DT, AA, LI); + if (PN == 0) return 0; // No known way to partition. + + // Pull out all predecessors that have varying values in the loop. This + // handles the case when a PHI node has multiple instances of itself as + // arguments. 
+ SmallVector<BasicBlock*, 8> OuterLoopPreds; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (PN->getIncomingValue(i) != PN || + !L->contains(PN->getIncomingBlock(i))) { + // We can't split indirectbr edges. + if (isa<IndirectBrInst>(PN->getIncomingBlock(i)->getTerminator())) + return 0; + + OuterLoopPreds.push_back(PN->getIncomingBlock(i)); + } + + DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n"); + + // If ScalarEvolution is around and knows anything about values in + // this loop, tell it to forget them, because we're about to + // substantially change it. + if (SE) + SE->forgetLoop(L); + + BasicBlock *Header = L->getHeader(); + BasicBlock *NewBB = SplitBlockPredecessors(Header, &OuterLoopPreds[0], + OuterLoopPreds.size(), + ".outer", this); + + // Make sure that NewBB is put someplace intelligent, which doesn't mess up + // code layout too horribly. + PlaceSplitBlockCarefully(NewBB, OuterLoopPreds, L); + + // Create the new outer loop. + Loop *NewOuter = new Loop(); + + // Change the parent loop to use the outer loop as its child now. + if (Loop *Parent = L->getParentLoop()) + Parent->replaceChildLoopWith(L, NewOuter); + else + LI->changeTopLevelLoop(L, NewOuter); + + // L is now a subloop of our outer loop. + NewOuter->addChildLoop(L); + + // Add the new loop to the pass manager queue. + LPM.insertLoopIntoQueue(NewOuter); + + for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); + I != E; ++I) + NewOuter->addBlockEntry(*I); + + // Now reset the header in L, which had been moved by + // SplitBlockPredecessors for the outer loop. + L->moveToHeader(Header); + + // Determine which blocks should stay in L and which should be moved out to + // the Outer loop now. 
+ std::set<BasicBlock*> BlocksInL; + for (pred_iterator PI=pred_begin(Header), E = pred_end(Header); PI!=E; ++PI) { + BasicBlock *P = *PI; + if (DT->dominates(Header, P)) + AddBlockAndPredsToSet(P, Header, BlocksInL); + } + + // Scan all of the loop children of L, moving them to OuterLoop if they are + // not part of the inner loop. + const std::vector<Loop*> &SubLoops = L->getSubLoops(); + for (size_t I = 0; I != SubLoops.size(); ) + if (BlocksInL.count(SubLoops[I]->getHeader())) + ++I; // Loop remains in L + else + NewOuter->addChildLoop(L->removeChildLoop(SubLoops.begin() + I)); + + // Now that we know which blocks are in L and which need to be moved to + // OuterLoop, move any blocks that need it. + for (unsigned i = 0; i != L->getBlocks().size(); ++i) { + BasicBlock *BB = L->getBlocks()[i]; + if (!BlocksInL.count(BB)) { + // Move this block to the parent, updating the exit blocks sets + L->removeBlockFromLoop(BB); + if ((*LI)[BB] == L) + LI->changeLoopFor(BB, NewOuter); + --i; + } + } + + return NewOuter; +} + + + +/// InsertUniqueBackedgeBlock - This method is called when the specified loop +/// has more than one backedge in it. If this occurs, revector all of these +/// backedges to target a new basic block and have that block branch to the loop +/// header. This ensures that loops have exactly one backedge. +/// +BasicBlock * +LoopSimplify::InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader) { + assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!"); + + // Get information about the loop + BasicBlock *Header = L->getHeader(); + Function *F = Header->getParent(); + + // Unique backedge insertion currently depends on having a preheader. + if (!Preheader) + return 0; + + // Figure out which basic blocks contain back-edges to the loop header. 
+ std::vector<BasicBlock*> BackedgeBlocks; + for (pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I){ + BasicBlock *P = *I; + + // Indirectbr edges cannot be split, so we must fail if we find one. + if (isa<IndirectBrInst>(P->getTerminator())) + return 0; + + if (P != Preheader) BackedgeBlocks.push_back(P); + } + + // Create and insert the new backedge block... + BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(), + Header->getName()+".backedge", F); + BranchInst *BETerminator = BranchInst::Create(Header, BEBlock); + + DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block " + << BEBlock->getName() << "\n"); + + // Move the new backedge block to right after the last backedge block. + Function::iterator InsertPos = BackedgeBlocks.back(); ++InsertPos; + F->getBasicBlockList().splice(InsertPos, F->getBasicBlockList(), BEBlock); + + // Now that the block has been inserted into the function, create PHI nodes in + // the backedge block which correspond to any PHI nodes in the header block. + for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { + PHINode *PN = cast<PHINode>(I); + PHINode *NewPN = PHINode::Create(PN->getType(), PN->getName()+".be", + BETerminator); + NewPN->reserveOperandSpace(BackedgeBlocks.size()); + if (AA) AA->copyValue(PN, NewPN); + + // Loop over the PHI node, moving all entries except the one for the + // preheader over to the new PHI node. 
+    unsigned PreheaderIdx = ~0U;
+    bool HasUniqueIncomingValue = true;
+    Value *UniqueValue = 0;
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      BasicBlock *IBB = PN->getIncomingBlock(i);
+      Value *IV = PN->getIncomingValue(i);
+      if (IBB == Preheader) {
+        PreheaderIdx = i;
+      } else {
+        NewPN->addIncoming(IV, IBB);
+        if (HasUniqueIncomingValue) {
+          if (UniqueValue == 0)
+            UniqueValue = IV;
+          else if (UniqueValue != IV)
+            HasUniqueIncomingValue = false;
+        }
+      }
+    }
+
+    // Delete all of the incoming values from the old PN except the preheader's
+    assert(PreheaderIdx != ~0U && "PHI has no preheader entry??");
+    if (PreheaderIdx != 0) {
+      PN->setIncomingValue(0, PN->getIncomingValue(PreheaderIdx));
+      PN->setIncomingBlock(0, PN->getIncomingBlock(PreheaderIdx));
+    }
+    // Nuke all entries except the zero'th.
+    for (unsigned i = 0, e = PN->getNumIncomingValues()-1; i != e; ++i)
+      PN->removeIncomingValue(e-i, false);
+
+    // Finally, add the newly constructed PHI node as the entry for the BEBlock.
+    PN->addIncoming(NewPN, BEBlock);
+
+    // As an optimization, if all incoming values in the new PhiNode (which is a
+    // subset of the incoming values of the old PHI node) have the same value,
+    // eliminate the PHI Node.
+    if (HasUniqueIncomingValue) {
+      NewPN->replaceAllUsesWith(UniqueValue);
+      if (AA) AA->deleteValue(NewPN);
+      BEBlock->getInstList().erase(NewPN);
+    }
+  }
+
+  // Now that all of the PHI nodes have been inserted and adjusted, modify the
+  // backedge blocks to jump to the BEBlock instead of the header.
+ for (unsigned i = 0, e = BackedgeBlocks.size(); i != e; ++i) { + TerminatorInst *TI = BackedgeBlocks[i]->getTerminator(); + for (unsigned Op = 0, e = TI->getNumSuccessors(); Op != e; ++Op) + if (TI->getSuccessor(Op) == Header) + TI->setSuccessor(Op, BEBlock); + } + + //===--- Update all analyses which we must preserve now -----------------===// + + // Update Loop Information - we know that this block is now in the current + // loop and all parent loops. + L->addBasicBlockToLoop(BEBlock, LI->getBase()); + + // Update dominator information + DT->splitBlock(BEBlock); + + return BEBlock; +} + +void LoopSimplify::verifyAnalysis() const { + // It used to be possible to just assert L->isLoopSimplifyForm(), however + // with the introduction of indirectbr, there are now cases where it's + // not possible to transform a loop as necessary. We can at least check + // that there is an indirectbr near any time there's trouble. + + // Indirectbr can interfere with preheader and unique backedge insertion. + if (!L->getLoopPreheader() || !L->getLoopLatch()) { + bool HasIndBrPred = false; + for (pred_iterator PI = pred_begin(L->getHeader()), + PE = pred_end(L->getHeader()); PI != PE; ++PI) + if (isa<IndirectBrInst>((*PI)->getTerminator())) { + HasIndBrPred = true; + break; + } + assert(HasIndBrPred && + "LoopSimplify has no excuse for missing loop header info!"); + } + + // Indirectbr can interfere with exit block canonicalization. 
+ if (!L->hasDedicatedExits()) { + bool HasIndBrExiting = false; + SmallVector<BasicBlock*, 8> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) + if (isa<IndirectBrInst>((ExitingBlocks[i])->getTerminator())) { + HasIndBrExiting = true; + break; + } + assert(HasIndBrExiting && + "LoopSimplify has no excuse for missing exit block info!"); + } +} diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp new file mode 100644 index 0000000..7da7271 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -0,0 +1,388 @@ +//===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements some loop unrolling utilities. It does not define any +// actual pass or policy, but provides a single function to perform loop +// unrolling. +// +// It works best when loops have been canonicalized by the -indvars pass, +// allowing it to determine the trip counts of loops easily. +// +// The process of unrolling can produce extraneous basic blocks linked with +// unconditional branches. This will be corrected in the future. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-unroll" +#include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/BasicBlock.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +using namespace llvm; + +// TODO: Should these be here or in LoopUnroll? +STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled"); +STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)"); + +/// RemapInstruction - Convert the instruction operands from referencing the +/// current values into those specified by VMap. +static inline void RemapInstruction(Instruction *I, + ValueToValueMapTy &VMap) { + for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) { + Value *Op = I->getOperand(op); + ValueToValueMapTy::iterator It = VMap.find(Op); + if (It != VMap.end()) + I->setOperand(op, It->second); + } +} + +/// FoldBlockIntoPredecessor - Folds a basic block into its predecessor if it +/// only has one predecessor, and that predecessor only has one successor. +/// The LoopInfo Analysis that is passed will be kept consistent. +/// Returns the new combined block. +static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI) { + // Merge basic blocks into their predecessor if there is only one distinct + // pred, and if there is only one distinct successor of the predecessor, and + // if there are no PHI nodes. 
+ BasicBlock *OnlyPred = BB->getSinglePredecessor(); + if (!OnlyPred) return 0; + + if (OnlyPred->getTerminator()->getNumSuccessors() != 1) + return 0; + + DEBUG(dbgs() << "Merging: " << *BB << "into: " << *OnlyPred); + + // Resolve any PHI nodes at the start of the block. They are all + // guaranteed to have exactly one entry if they exist, unless there are + // multiple duplicate (but guaranteed to be equal) entries for the + // incoming edges. This occurs when there are multiple edges from + // OnlyPred to OnlySucc. + FoldSingleEntryPHINodes(BB); + + // Delete the unconditional branch from the predecessor... + OnlyPred->getInstList().pop_back(); + + // Move all definitions in the successor to the predecessor... + OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList()); + + // Make all PHI nodes that referred to BB now refer to Pred as their + // source... + BB->replaceAllUsesWith(OnlyPred); + + std::string OldName = BB->getName(); + + // Erase basic block from the function... + LI->removeBlock(BB); + BB->eraseFromParent(); + + // Inherit predecessor's name if it exists... + if (!OldName.empty() && !OnlyPred->hasName()) + OnlyPred->setName(OldName); + + return OnlyPred; +} + +/// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true +/// if unrolling was successful, or false if the loop was unmodified. Unrolling +/// can only fail when the loop's latch block is not terminated by a conditional +/// branch instruction. However, if the trip count (and multiple) are not known, +/// loop unrolling will mostly produce more code that is no faster. +/// +/// The LoopInfo Analysis that is passed will be kept consistent. +/// +/// If a LoopPassManager is passed in, and the loop is fully removed, it will be +/// removed from the LoopPassManager as well. LPM can also be NULL. 
+bool llvm::UnrollLoop(Loop *L, unsigned Count, + LoopInfo *LI, LPPassManager *LPM) { + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) { + DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n"); + return false; + } + + BasicBlock *LatchBlock = L->getLoopLatch(); + if (!LatchBlock) { + DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n"); + return false; + } + + BasicBlock *Header = L->getHeader(); + BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator()); + + if (!BI || BI->isUnconditional()) { + // The loop-rotate pass can be helpful to avoid this in many cases. + DEBUG(dbgs() << + " Can't unroll; loop not terminated by a conditional branch.\n"); + return false; + } + + if (Header->hasAddressTaken()) { + // The loop-rotate pass can be helpful to avoid this in many cases. + DEBUG(dbgs() << + " Won't unroll loop: address of header block is taken.\n"); + return false; + } + + // Notify ScalarEvolution that the loop will be substantially changed, + // if not outright eliminated. + if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) + SE->forgetLoop(L); + + // Find trip count + unsigned TripCount = L->getSmallConstantTripCount(); + // Find trip multiple if count is not available + unsigned TripMultiple = 1; + if (TripCount == 0) + TripMultiple = L->getSmallConstantTripMultiple(); + + if (TripCount != 0) + DEBUG(dbgs() << " Trip Count = " << TripCount << "\n"); + if (TripMultiple != 1) + DEBUG(dbgs() << " Trip Multiple = " << TripMultiple << "\n"); + + // Effectively "DCE" unrolled iterations that are beyond the tripcount + // and will never be executed. + if (TripCount != 0 && Count > TripCount) + Count = TripCount; + + assert(Count > 0); + assert(TripMultiple > 0); + assert(TripCount == 0 || TripCount % TripMultiple == 0); + + // Are we eliminating the loop control altogether? + bool CompletelyUnroll = Count == TripCount; + + // If we know the trip count, we know the multiple... 
+ unsigned BreakoutTrip = 0; + if (TripCount != 0) { + BreakoutTrip = TripCount % Count; + TripMultiple = 0; + } else { + // Figure out what multiple to use. + BreakoutTrip = TripMultiple = + (unsigned)GreatestCommonDivisor64(Count, TripMultiple); + } + + if (CompletelyUnroll) { + DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName() + << " with trip count " << TripCount << "!\n"); + } else { + DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() + << " by " << Count); + if (TripMultiple == 0 || BreakoutTrip != TripMultiple) { + DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip); + } else if (TripMultiple != 1) { + DEBUG(dbgs() << " with " << TripMultiple << " trips per branch"); + } + DEBUG(dbgs() << "!\n"); + } + + std::vector<BasicBlock*> LoopBlocks = L->getBlocks(); + + bool ContinueOnTrue = L->contains(BI->getSuccessor(0)); + BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue); + + // For the first iteration of the loop, we should use the precloned values for + // PHI nodes. Insert associations now. + ValueToValueMapTy LastValueMap; + std::vector<PHINode*> OrigPHINode; + for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { + PHINode *PN = cast<PHINode>(I); + OrigPHINode.push_back(PN); + if (Instruction *I = + dyn_cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock))) + if (L->contains(I)) + LastValueMap[I] = I; + } + + std::vector<BasicBlock*> Headers; + std::vector<BasicBlock*> Latches; + Headers.push_back(Header); + Latches.push_back(LatchBlock); + + for (unsigned It = 1; It != Count; ++It) { + std::vector<BasicBlock*> NewBlocks; + + for (std::vector<BasicBlock*>::iterator BB = LoopBlocks.begin(), + E = LoopBlocks.end(); BB != E; ++BB) { + ValueToValueMapTy VMap; + BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It)); + Header->getParent()->getBasicBlockList().push_back(New); + + // Loop over all of the PHI nodes in the block, changing them to use the + // incoming values from the previous block. 
+ if (*BB == Header) + for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) { + PHINode *NewPHI = cast<PHINode>(VMap[OrigPHINode[i]]); + Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock); + if (Instruction *InValI = dyn_cast<Instruction>(InVal)) + if (It > 1 && L->contains(InValI)) + InVal = LastValueMap[InValI]; + VMap[OrigPHINode[i]] = InVal; + New->getInstList().erase(NewPHI); + } + + // Update our running map of newest clones + LastValueMap[*BB] = New; + for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); + VI != VE; ++VI) + LastValueMap[VI->first] = VI->second; + + L->addBasicBlockToLoop(New, LI->getBase()); + + // Add phi entries for newly created values to all exit blocks except + // the successor of the latch block. The successor of the exit block will + // be updated specially after unrolling all the way. + if (*BB != LatchBlock) + for (Value::use_iterator UI = (*BB)->use_begin(), UE = (*BB)->use_end(); + UI != UE;) { + Instruction *UseInst = cast<Instruction>(*UI); + ++UI; + if (isa<PHINode>(UseInst) && !L->contains(UseInst)) { + PHINode *phi = cast<PHINode>(UseInst); + Value *Incoming = phi->getIncomingValueForBlock(*BB); + phi->addIncoming(Incoming, New); + } + } + + // Keep track of new headers and latches as we create them, so that + // we can insert the proper branches later. + if (*BB == Header) + Headers.push_back(New); + if (*BB == LatchBlock) { + Latches.push_back(New); + + // Also, clear out the new latch's back edge so that it doesn't look + // like a new loop, so that it's amenable to being merged with adjacent + // blocks later on. 
+ TerminatorInst *Term = New->getTerminator(); + assert(L->contains(Term->getSuccessor(!ContinueOnTrue))); + assert(Term->getSuccessor(ContinueOnTrue) == LoopExit); + Term->setSuccessor(!ContinueOnTrue, NULL); + } + + NewBlocks.push_back(New); + } + + // Remap all instructions in the most recent iteration + for (unsigned i = 0; i < NewBlocks.size(); ++i) + for (BasicBlock::iterator I = NewBlocks[i]->begin(), + E = NewBlocks[i]->end(); I != E; ++I) + ::RemapInstruction(I, LastValueMap); + } + + // The latch block exits the loop. If there are any PHI nodes in the + // successor blocks, update them to use the appropriate values computed as the + // last iteration of the loop. + if (Count != 1) { + SmallPtrSet<PHINode*, 8> Users; + for (Value::use_iterator UI = LatchBlock->use_begin(), + UE = LatchBlock->use_end(); UI != UE; ++UI) + if (PHINode *phi = dyn_cast<PHINode>(*UI)) + Users.insert(phi); + + BasicBlock *LastIterationBB = cast<BasicBlock>(LastValueMap[LatchBlock]); + for (SmallPtrSet<PHINode*,8>::iterator SI = Users.begin(), SE = Users.end(); + SI != SE; ++SI) { + PHINode *PN = *SI; + Value *InVal = PN->removeIncomingValue(LatchBlock, false); + // If this value was defined in the loop, take the value defined by the + // last iteration of the loop. + if (Instruction *InValI = dyn_cast<Instruction>(InVal)) { + if (L->contains(InValI)) + InVal = LastValueMap[InVal]; + } + PN->addIncoming(InVal, LastIterationBB); + } + } + + // Now, if we're doing complete unrolling, loop over the PHI nodes in the + // original block, setting them to their incoming values. + if (CompletelyUnroll) { + BasicBlock *Preheader = L->getLoopPreheader(); + for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) { + PHINode *PN = OrigPHINode[i]; + PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader)); + Header->getInstList().erase(PN); + } + } + + // Now that all the basic blocks for the unrolled iterations are in place, + // set up the branches to connect them. 
+ for (unsigned i = 0, e = Latches.size(); i != e; ++i) { + // The original branch was replicated in each unrolled iteration. + BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator()); + + // The branch destination. + unsigned j = (i + 1) % e; + BasicBlock *Dest = Headers[j]; + bool NeedConditional = true; + + // For a complete unroll, make the last iteration end with a branch + // to the exit block. + if (CompletelyUnroll && j == 0) { + Dest = LoopExit; + NeedConditional = false; + } + + // If we know the trip count or a multiple of it, we can safely use an + // unconditional branch for some iterations. + if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) { + NeedConditional = false; + } + + if (NeedConditional) { + // Update the conditional branch's successor for the following + // iteration. + Term->setSuccessor(!ContinueOnTrue, Dest); + } else { + // Replace the conditional branch with an unconditional one. + BranchInst::Create(Dest, Term); + Term->eraseFromParent(); + // Merge adjacent basic blocks, if possible. + if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI)) { + std::replace(Latches.begin(), Latches.end(), Dest, Fold); + std::replace(Headers.begin(), Headers.end(), Dest, Fold); + } + } + } + + // At this point, the code is well formed. We now do a quick sweep over the + // inserted code, doing constant propagation and dead code elimination as we + // go. 
+ const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks(); + for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(), + BBE = NewLoopBlocks.end(); BB != BBE; ++BB) + for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) { + Instruction *Inst = I++; + + if (isInstructionTriviallyDead(Inst)) + (*BB)->getInstList().erase(Inst); + else if (Value *V = SimplifyInstruction(Inst)) + if (LI->replacementPreservesLCSSAForm(Inst, V)) { + Inst->replaceAllUsesWith(V); + (*BB)->getInstList().erase(Inst); + } + } + + NumCompletelyUnrolled += CompletelyUnroll; + ++NumUnrolled; + // Remove the loop from the LoopPassManager if it's completely removed. + if (CompletelyUnroll && LPM != NULL) + LPM->deleteLoopFromQueue(L); + + return true; +} diff --git a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp new file mode 100644 index 0000000..025ae0d --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp @@ -0,0 +1,613 @@ +//===- LowerInvoke.cpp - Eliminate Invoke & Unwind instructions -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This transformation is designed for use by code generators which do not yet +// support stack unwinding. This pass supports two models of exception handling +// lowering, the 'cheap' support and the 'expensive' support. +// +// 'Cheap' exception handling support gives the program the ability to execute +// any program which does not "throw an exception", by turning 'invoke' +// instructions into calls and by turning 'unwind' instructions into calls to +// abort(). If the program does dynamically use the unwind instruction, the +// program will print a message then abort. 
+//
+// 'Expensive' exception handling support gives the full exception handling
+// support to the program at the cost of making the 'invoke' instruction
+// really expensive. It basically inserts setjmp/longjmp calls to emulate the
+// exception handling as necessary.
+//
+// Because the 'expensive' support slows down programs a lot, and EH is only
+// used for a subset of the programs, it must be specifically enabled by an
+// option.
+//
+// Note that after this pass runs the CFG is not entirely accurate (exceptional
+// control flow edges are not correct anymore) so only very simple things should
+// be done after the lowerinvoke pass has run (like generation of native code).
+// This should not be used as a general purpose "my LLVM-to-LLVM pass doesn't
+// support the invoke instruction yet" lowering pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lowerinvoke"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLowering.h"
+#include <csetjmp>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumInvokes, "Number of invokes replaced");
+STATISTIC(NumUnwinds, "Number of unwinds replaced");
+STATISTIC(NumSpilled, "Number of registers live across unwind edges");
+
+static cl::opt<bool> ExpensiveEHSupport("enable-correct-eh-support",
+ cl::desc("Make the -lowerinvoke pass insert expensive, but correct, EH code"));
+
+namespace {
+  /// LowerInvoke - Function pass that removes 'invoke' and 'unwind'
+  /// instructions for code generators without unwinding support.  It has two
+  /// lowering models: the 'cheap' one (invoke -> call, unwind -> abort) and
+  /// the 'expensive' setjmp/longjmp-based one, selected by the
+  /// -enable-correct-eh-support option or by the explicit constructor flag.
+  class LowerInvoke : public FunctionPass {
+    // Used for both models.
+    Constant *AbortFn;
+
+    // Used for expensive EH support.
+    const Type *JBLinkTy;          // Linked-list node: { jmp_buf, next* }.
+    GlobalVariable *JBListHead;    // Global head of the jmp_buf list.
+    Constant *SetJmpFn, *LongJmpFn, *StackSaveFn, *StackRestoreFn;
+    bool useExpensiveEHSupport;
+
+    // We peek in TLI to grab the target's jmp_buf size and alignment
+    const TargetLowering *TLI;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit LowerInvoke(const TargetLowering *tli = NULL,
+                         bool useExpensiveEHSupport = ExpensiveEHSupport)
+      : FunctionPass(ID), useExpensiveEHSupport(useExpensiveEHSupport),
+        TLI(tli) {
+      initializeLowerInvokePass(*PassRegistry::getPassRegistry());
+    }
+    bool doInitialization(Module &M);
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // This is a cluster of orthogonal Transforms
+      AU.addPreserved("mem2reg");
+      AU.addPreservedID(LowerSwitchID);
+    }
+
+  private:
+    bool insertCheapEHSupport(Function &F);
+    void splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*>&Invokes);
+    void rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo,
+                                AllocaInst *InvokeNum, AllocaInst *StackPtr,
+                                SwitchInst *CatchSwitch);
+    bool insertExpensiveEHSupport(Function &F);
+  };
+}
+
+char LowerInvoke::ID = 0;
+INITIALIZE_PASS(LowerInvoke, "lowerinvoke",
+                "Lower invoke and unwind, for unwindless code generators",
+                false, false)
+
+char &llvm::LowerInvokePassID = LowerInvoke::ID;
+
+// Public Interface To the LowerInvoke pass.
+FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI) {
+  return new LowerInvoke(TLI, ExpensiveEHSupport);
+}
+FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI,
+                                          bool useExpensiveEHSupport) {
+  return new LowerInvoke(TLI, useExpensiveEHSupport);
+}
+
+// doInitialization - Make sure that there is a prototype for abort in the
+// current module.  In expensive-EH mode, also create the recursive jmp_buf
+// list-node type, the "llvm.sjljeh.jblist" global list head, and declarations
+// of the setjmp/longjmp/stacksave/stackrestore intrinsics.
+bool LowerInvoke::doInitialization(Module &M) {
+  const Type *VoidPtrTy =
+          Type::getInt8PtrTy(M.getContext());
+  if (useExpensiveEHSupport) {
+    // Insert a type for the linked list of jump buffers.
+    // Use the target's jmp_buf size when a TargetLowering is available;
+    // otherwise fall back to a conservative 200 pointers.
+    unsigned JBSize = TLI ? TLI->getJumpBufSize() : 0;
+    JBSize = JBSize ? JBSize : 200;
+    const Type *JmpBufTy = ArrayType::get(VoidPtrTy, JBSize);
+
+    { // The type is recursive, so use a type holder.
+      std::vector<const Type*> Elements;
+      Elements.push_back(JmpBufTy);
+      OpaqueType *OT = OpaqueType::get(M.getContext());
+      Elements.push_back(PointerType::getUnqual(OT));
+      PATypeHolder JBLType(StructType::get(M.getContext(), Elements));
+      OT->refineAbstractTypeTo(JBLType.get());  // Complete the cycle.
+      JBLinkTy = JBLType.get();
+      M.addTypeName("llvm.sjljeh.jmpbufty", JBLinkTy);
+    }
+
+    const Type *PtrJBList = PointerType::getUnqual(JBLinkTy);
+
+    // Now that we've done that, insert the jmpbuf list head global, unless it
+    // already exists.
+    if (!(JBListHead = M.getGlobalVariable("llvm.sjljeh.jblist", PtrJBList))) {
+      JBListHead = new GlobalVariable(M, PtrJBList, false,
+                                      GlobalValue::LinkOnceAnyLinkage,
+                                      Constant::getNullValue(PtrJBList),
+                                      "llvm.sjljeh.jblist");
+    }
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+    !defined(setjmp_undefined_for_msvc)
+#  pragma push_macro("setjmp")
+#  undef setjmp
+#  define setjmp_undefined_for_msvc
+#endif
+
+    SetJmpFn = Intrinsic::getDeclaration(&M, Intrinsic::setjmp);
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+   // let's return it to _setjmp state
+#  pragma pop_macro("setjmp")
+#  undef setjmp_undefined_for_msvc
+#endif
+
+    LongJmpFn = Intrinsic::getDeclaration(&M, Intrinsic::longjmp);
+    StackSaveFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
+    StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
+  }
+
+  // We need the 'write' and 'abort' functions for both models.
+  AbortFn = M.getOrInsertFunction("abort", Type::getVoidTy(M.getContext()),
+                                  (Type *)0);
+  return true;
+}
+
+// insertCheapEHSupport - The 'cheap' model: turn each invoke into a plain
+// call followed by an unconditional branch to the normal destination, and
+// turn each unwind into a tail call to abort() followed by a (unreachable)
+// return of the function's zero value.
+bool LowerInvoke::insertCheapEHSupport(Function &F) {
+  bool Changed = false;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+      // Drop the last three operands (callee + normal/unwind destinations).
+      SmallVector<Value*,16> CallArgs(II->op_begin(), II->op_end() - 3);
+      // Insert a normal call instruction...
+      CallInst *NewCall = CallInst::Create(II->getCalledValue(),
+                                           CallArgs.begin(), CallArgs.end(),
+                                           "",II);
+      NewCall->takeName(II);
+      NewCall->setCallingConv(II->getCallingConv());
+      NewCall->setAttributes(II->getAttributes());
+      NewCall->setDebugLoc(II->getDebugLoc());
+      II->replaceAllUsesWith(NewCall);
+
+      // Insert an unconditional branch to the normal destination.
+      BranchInst::Create(II->getNormalDest(), II);
+
+      // Remove any PHI node entries from the exception destination.
+      II->getUnwindDest()->removePredecessor(BB);
+
+      // Remove the invoke instruction now.
+      BB->getInstList().erase(II);
+
+      ++NumInvokes; Changed = true;
+    } else if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
+      // Insert a call to abort()
+      CallInst::Create(AbortFn, "", UI)->setTailCall();
+
+      // Insert a return instruction.  This really should be a "barrier", as it
+      // is unreachable.
+      ReturnInst::Create(F.getContext(),
+                         F.getReturnType()->isVoidTy() ?
+                          0 : Constant::getNullValue(F.getReturnType()), UI);
+
+      // Remove the unwind instruction now.
+      BB->getInstList().erase(UI);
+
+      ++NumUnwinds; Changed = true;
+    }
+  return Changed;
+}
+
+/// rewriteExpensiveInvoke - Insert code and hack the function to replace the
+/// specified invoke instruction with a call.
+void LowerInvoke::rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo,
+                                         AllocaInst *InvokeNum,
+                                         AllocaInst *StackPtr,
+                                         SwitchInst *CatchSwitch) {
+  ConstantInt *InvokeNoC = ConstantInt::get(Type::getInt32Ty(II->getContext()),
+                                            InvokeNo);
+
+  // If the unwind edge has phi nodes, split the edge.
+  if (isa<PHINode>(II->getUnwindDest()->begin())) {
+    SplitCriticalEdge(II, 1, this);
+
+    // If there are any phi nodes left, they must have a single predecessor.
+    while (PHINode *PN = dyn_cast<PHINode>(II->getUnwindDest()->begin())) {
+      PN->replaceAllUsesWith(PN->getIncomingValue(0));
+      PN->eraseFromParent();
+    }
+  }
+
+  // Insert a store of the invoke num before the invoke and store zero into the
+  // location afterward.
+  new StoreInst(InvokeNoC, InvokeNum, true, II);  // volatile
+
+  // Insert a store of the stack ptr before the invoke, so we can restore it
+  // later in the exception case.
+  CallInst* StackSaveRet = CallInst::Create(StackSaveFn, "ssret", II);
+  new StoreInst(StackSaveRet, StackPtr, true, II); // volatile
+
+  BasicBlock::iterator NI = II->getNormalDest()->getFirstNonPHI();
+  // nonvolatile.
+  new StoreInst(Constant::getNullValue(Type::getInt32Ty(II->getContext())),
+                InvokeNum, false, NI);
+
+  // On the unwind path, restore the stack pointer saved above.
+  Instruction* StackPtrLoad = new LoadInst(StackPtr, "stackptr.restore", true,
+                                           II->getUnwindDest()->getFirstNonPHI()
+                                           );
+  CallInst::Create(StackRestoreFn, StackPtrLoad, "")->insertAfter(StackPtrLoad);
+
+  // Add a switch case to our unwind block.
+  CatchSwitch->addCase(InvokeNoC, II->getUnwindDest());
+
+  // Insert a normal call instruction.
+  // Drop the last three operands (callee + normal/unwind destinations).
+  SmallVector<Value*,16> CallArgs(II->op_begin(), II->op_end() - 3);
+  CallInst *NewCall = CallInst::Create(II->getCalledValue(),
+                                       CallArgs.begin(), CallArgs.end(), "",
+                                       II);
+  NewCall->takeName(II);
+  NewCall->setCallingConv(II->getCallingConv());
+  NewCall->setAttributes(II->getAttributes());
+  NewCall->setDebugLoc(II->getDebugLoc());
+  II->replaceAllUsesWith(NewCall);
+
+  // Replace the invoke with an uncond branch.
+  BranchInst::Create(II->getNormalDest(), NewCall->getParent());
+  II->eraseFromParent();
+}
+
+/// MarkBlocksLiveIn - Insert BB and all of its predecessors into LiveBBs until
+/// we reach blocks we've already seen.
+static void MarkBlocksLiveIn(BasicBlock *BB, std::set<BasicBlock*> &LiveBBs) {
+  if (!LiveBBs.insert(BB).second) return; // already been here.
+
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+    MarkBlocksLiveIn(*PI, LiveBBs);
+}
+
+// First thing we need to do is scan the whole function for values that are
+// live across unwind edges.  Each value that is live across an unwind edge
+// we spill into a stack location, guaranteeing that there is nothing live
+// across the unwind edge.  This process also splits all critical edges
+// coming out of invoke's.
+void LowerInvoke::
+splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*> &Invokes) {
+  // First step, split all critical edges from invoke instructions.
+  for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
+    InvokeInst *II = Invokes[i];
+    SplitCriticalEdge(II, 0, this);
+    SplitCriticalEdge(II, 1, this);
+    assert(!isa<PHINode>(II->getNormalDest()) &&
+           !isa<PHINode>(II->getUnwindDest()) &&
+           "critical edge splitting left single entry phi nodes?");
+  }
+
+  Function *F = Invokes.back()->getParent()->getParent();
+
+  // To avoid having to handle incoming arguments specially, we lower each arg
+  // to a copy instruction in the entry block.  This ensures that the argument
+  // value itself cannot be live across the entry block.
+  // Skip past the leading static allocas so the copies land after them.
+  BasicBlock::iterator AfterAllocaInsertPt = F->begin()->begin();
+  while (isa<AllocaInst>(AfterAllocaInsertPt) &&
+         isa<ConstantInt>(cast<AllocaInst>(AfterAllocaInsertPt)->getArraySize()))
+    ++AfterAllocaInsertPt;
+  for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+       AI != E; ++AI) {
+    const Type *Ty = AI->getType();
+    // Aggregate types can't be cast, but are legal argument types, so we have
+    // to handle them differently. We use an extract/insert pair as a
+    // lightweight method to achieve the same goal.
+    if (isa<StructType>(Ty) || isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
+      Instruction *EI = ExtractValueInst::Create(AI, 0, "",AfterAllocaInsertPt);
+      Instruction *NI = InsertValueInst::Create(AI, EI, 0);
+      NI->insertAfter(EI);
+      AI->replaceAllUsesWith(NI);
+      // Set the operand of the instructions back to the AllocaInst.
+      EI->setOperand(0, AI);
+      NI->setOperand(0, AI);
+    } else {
+      // This is always a no-op cast because we're casting AI to AI->getType()
+      // so src and destination types are identical. BitCast is the only
+      // possibility.
+      CastInst *NC = new BitCastInst(
+        AI, AI->getType(), AI->getName()+".tmp", AfterAllocaInsertPt);
+      AI->replaceAllUsesWith(NC);
+      // Set the operand of the cast instruction back to the AllocaInst.
+      // Normally it's forbidden to replace a CastInst's operand because it
+      // could cause the opcode to reflect an illegal conversion. However,
+      // we're replacing it here with the same value it was constructed with.
+      // We do this because the above replaceAllUsesWith() clobbered the
+      // operand, but we want this one to remain.
+      NC->setOperand(0, AI);
+    }
+  }
+
+  // Finally, scan the code looking for instructions with bad live ranges.
+  for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) {
+      // Ignore obvious cases we don't have to handle.  In particular, most
+      // instructions either have no uses or only have a single use inside the
+      // current block. Ignore them quickly.
+      Instruction *Inst = II;
+      if (Inst->use_empty()) continue;
+      if (Inst->hasOneUse() &&
+          cast<Instruction>(Inst->use_back())->getParent() == BB &&
+          !isa<PHINode>(Inst->use_back())) continue;
+
+      // If this is an alloca in the entry block, it's not a real register
+      // value.
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
+        if (isa<ConstantInt>(AI->getArraySize()) && BB == F->begin())
+          continue;
+
+      // Avoid iterator invalidation by copying users to a temporary vector.
+      // Only users outside this block (or PHI users) can extend the live
+      // range across an unwind edge, so only those are collected.
+      SmallVector<Instruction*,16> Users;
+      for (Value::use_iterator UI = Inst->use_begin(), E = Inst->use_end();
+           UI != E; ++UI) {
+        Instruction *User = cast<Instruction>(*UI);
+        if (User->getParent() != BB || isa<PHINode>(User))
+          Users.push_back(User);
+      }
+
+      // Scan all of the uses and see if the live range is live across an unwind
+      // edge.  If we find a use live across an invoke edge, create an alloca
+      // and spill the value.
+      std::set<InvokeInst*> InvokesWithStoreInserted;
+
+      // Find all of the blocks that this value is live in.
+      std::set<BasicBlock*> LiveBBs;
+      LiveBBs.insert(Inst->getParent());
+      while (!Users.empty()) {
+        Instruction *U = Users.back();
+        Users.pop_back();
+
+        if (!isa<PHINode>(U)) {
+          MarkBlocksLiveIn(U->getParent(), LiveBBs);
+        } else {
+          // Uses for a PHI node occur in their predecessor block.
+          PHINode *PN = cast<PHINode>(U);
+          for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+            if (PN->getIncomingValue(i) == Inst)
+              MarkBlocksLiveIn(PN->getIncomingBlock(i), LiveBBs);
+        }
+      }
+
+      // Now that we know all of the blocks that this thing is live in, see if
+      // it includes any of the unwind locations.
+      bool NeedsSpill = false;
+      for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
+        BasicBlock *UnwindBlock = Invokes[i]->getUnwindDest();
+        if (UnwindBlock != BB && LiveBBs.count(UnwindBlock)) {
+          NeedsSpill = true;
+        }
+      }
+
+      // If we decided we need a spill, do it.
+      if (NeedsSpill) {
+        ++NumSpilled;
+        DemoteRegToStack(*Inst, true);
+      }
+    }
+}
+
+// insertExpensiveEHSupport - The 'expensive' model: emulate invoke/unwind
+// with explicit setjmp/longjmp calls threaded through the global
+// llvm.sjljeh.jblist list of jump buffers created in doInitialization.
+bool LowerInvoke::insertExpensiveEHSupport(Function &F) {
+  SmallVector<ReturnInst*,16> Returns;
+  SmallVector<UnwindInst*,16> Unwinds;
+  SmallVector<InvokeInst*,16> Invokes;
+
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+      // Remember all return instructions in case we insert an invoke into this
+      // function.
+      Returns.push_back(RI);
+    } else if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+      Invokes.push_back(II);
+    } else if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) {
+      Unwinds.push_back(UI);
+    }
+
+  if (Unwinds.empty() && Invokes.empty()) return false;
+
+  NumInvokes += Invokes.size();
+  NumUnwinds += Unwinds.size();
+
+  // TODO: This is not an optimal way to do this.  In particular, this always
+  // inserts setjmp calls into the entries of functions with invoke instructions
+  // even though there are possibly paths through the function that do not
+  // execute any invokes.  In particular, for functions with early exits, e.g.
+  // the 'addMove' method in hexxagon, it would be nice to not have to do the
+  // setjmp stuff on the early exit path.  This requires a bit of dataflow, but
+  // would not be too hard to do.
+
+  // If we have an invoke instruction, insert a setjmp that dominates all
+  // invokes.  After the setjmp, use a cond branch that goes to the original
+  // code path on zero, and to a designated 'catch' block of nonzero.
+  Value *OldJmpBufPtr = 0;
+  if (!Invokes.empty()) {
+    // First thing we need to do is scan the whole function for values that are
+    // live across unwind edges.  Each value that is live across an unwind edge
+    // we spill into a stack location, guaranteeing that there is nothing live
+    // across the unwind edge.  This process also splits all critical edges
+    // coming out of invoke's.
+    splitLiveRangesLiveAcrossInvokes(Invokes);
+
+    BasicBlock *EntryBB = F.begin();
+
+    // Create an alloca for the incoming jump buffer ptr and the new jump buffer
+    // that needs to be restored on all exits from the function.  This is an
+    // alloca because the value needs to be live across invokes.
+    unsigned Align = TLI ? TLI->getJumpBufAlignment() : 0;
+    AllocaInst *JmpBuf =
+      new AllocaInst(JBLinkTy, 0, Align,
+                     "jblink", F.begin()->begin());
+
+    // GEP to field 1 of the node: the 'next' pointer of the list link.
+    Value *Idx[] = { Constant::getNullValue(Type::getInt32Ty(F.getContext())),
+                     ConstantInt::get(Type::getInt32Ty(F.getContext()), 1) };
+    OldJmpBufPtr = GetElementPtrInst::Create(JmpBuf, &Idx[0], &Idx[2],
+                                             "OldBuf",
+                                             EntryBB->getTerminator());
+
+    // Copy the JBListHead to the alloca.
+    Value *OldBuf = new LoadInst(JBListHead, "oldjmpbufptr", true,
+                                 EntryBB->getTerminator());
+    new StoreInst(OldBuf, OldJmpBufPtr, true, EntryBB->getTerminator());
+
+    // Add the new jumpbuf to the list.
+    new StoreInst(JmpBuf, JBListHead, true, EntryBB->getTerminator());
+
+    // Create the catch block.  The catch block is basically a big switch
+    // statement that goes to all of the invoke catch blocks.
+    BasicBlock *CatchBB =
+            BasicBlock::Create(F.getContext(), "setjmp.catch", &F);
+
+    // Create an alloca which keeps track of the stack pointer before every
+    // invoke, this allows us to properly restore the stack pointer after
+    // long jumping.
+    AllocaInst *StackPtr = new AllocaInst(Type::getInt8PtrTy(F.getContext()), 0,
+                                          "stackptr", EntryBB->begin());
+
+    // Create an alloca which keeps track of which invoke is currently
+    // executing.  For normal calls it contains zero.
+    AllocaInst *InvokeNum = new AllocaInst(Type::getInt32Ty(F.getContext()), 0,
+                                           "invokenum",EntryBB->begin());
+    new StoreInst(ConstantInt::get(Type::getInt32Ty(F.getContext()), 0),
+                  InvokeNum, true, EntryBB->getTerminator());
+
+    // Insert a load in the Catch block, and a switch on its value.  By default,
+    // we go to a block that just does an unwind (which is the correct action
+    // for a standard call).
+    BasicBlock *UnwindBB = BasicBlock::Create(F.getContext(), "unwindbb", &F);
+    Unwinds.push_back(new UnwindInst(F.getContext(), UnwindBB));
+
+    Value *CatchLoad = new LoadInst(InvokeNum, "invoke.num", true, CatchBB);
+    SwitchInst *CatchSwitch =
+      SwitchInst::Create(CatchLoad, UnwindBB, Invokes.size(), CatchBB);
+
+    // Now that things are set up, insert the setjmp call itself.
+
+    // Split the entry block to insert the conditional branch for the setjmp.
+    BasicBlock *ContBlock = EntryBB->splitBasicBlock(EntryBB->getTerminator(),
+                                                     "setjmp.cont");
+
+    // GEP to field 0 of the node: the jmp_buf array itself.
+    Idx[1] = ConstantInt::get(Type::getInt32Ty(F.getContext()), 0);
+    Value *JmpBufPtr = GetElementPtrInst::Create(JmpBuf, &Idx[0], &Idx[2],
+                                                 "TheJmpBuf",
+                                                 EntryBB->getTerminator());
+    JmpBufPtr = new BitCastInst(JmpBufPtr,
+                                Type::getInt8PtrTy(F.getContext()),
+                                "tmp", EntryBB->getTerminator());
+    Value *SJRet = CallInst::Create(SetJmpFn, JmpBufPtr, "sjret",
+                                    EntryBB->getTerminator());
+
+    // Compare the return value to zero.
+    Value *IsNormal = new ICmpInst(EntryBB->getTerminator(),
+                                   ICmpInst::ICMP_EQ, SJRet,
+                                   Constant::getNullValue(SJRet->getType()),
+                                   "notunwind");
+    // Nuke the uncond branch.
+    EntryBB->getTerminator()->eraseFromParent();
+
+    // Put in a new condbranch in its place.
+    BranchInst::Create(ContBlock, CatchBB, IsNormal, EntryBB);
+
+    // At this point, we are all set up, rewrite each invoke instruction.
+    // Invoke numbers start at 1; zero means "no invoke executing".
+    for (unsigned i = 0, e = Invokes.size(); i != e; ++i)
+      rewriteExpensiveInvoke(Invokes[i], i+1, InvokeNum, StackPtr, CatchSwitch);
+  }
+
+  // We know that there is at least one unwind.
+
+  // Create three new blocks, the block to load the jmpbuf ptr and compare
+  // against null, the block to do the longjmp, and the error block for if it
+  // is null.  Add them at the end of the function because they are not hot.
+  BasicBlock *UnwindHandler = BasicBlock::Create(F.getContext(),
+                                                 "dounwind", &F);
+  BasicBlock *UnwindBlock = BasicBlock::Create(F.getContext(), "unwind", &F);
+  BasicBlock *TermBlock = BasicBlock::Create(F.getContext(), "unwinderror", &F);
+
+  // If this function contains an invoke, restore the old jumpbuf ptr.
+  Value *BufPtr;
+  if (OldJmpBufPtr) {
+    // Before the return, insert a copy from the saved value to the new value.
+    BufPtr = new LoadInst(OldJmpBufPtr, "oldjmpbufptr", UnwindHandler);
+    new StoreInst(BufPtr, JBListHead, UnwindHandler);
+  } else {
+    BufPtr = new LoadInst(JBListHead, "ehlist", UnwindHandler);
+  }
+
+  // Load the JBList, if it's null, then there was no catch!
+  Value *NotNull = new ICmpInst(*UnwindHandler, ICmpInst::ICMP_NE, BufPtr,
+                                Constant::getNullValue(BufPtr->getType()),
+                                "notnull");
+  BranchInst::Create(UnwindBlock, TermBlock, NotNull, UnwindHandler);
+
+  // Create the block to do the longjmp.
+  // Get a pointer to the jmpbuf and longjmp.
+  Value *Idx[] = { Constant::getNullValue(Type::getInt32Ty(F.getContext())),
+                   ConstantInt::get(Type::getInt32Ty(F.getContext()), 0) };
+  Idx[0] = GetElementPtrInst::Create(BufPtr, &Idx[0], &Idx[2], "JmpBuf",
+                                     UnwindBlock);
+  Idx[0] = new BitCastInst(Idx[0],
+                           Type::getInt8PtrTy(F.getContext()),
+                           "tmp", UnwindBlock);
+  Idx[1] = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
+  CallInst::Create(LongJmpFn, &Idx[0], &Idx[2], "", UnwindBlock);
+  new UnreachableInst(F.getContext(), UnwindBlock);
+
+  // Set up the term block ("throw without a catch").
+  new UnreachableInst(F.getContext(), TermBlock);
+
+  // Insert a call to abort()
+  CallInst::Create(AbortFn, "",
+                   TermBlock->getTerminator())->setTailCall();
+
+
+  // Replace all unwinds with a branch to the unwind handler.
+  for (unsigned i = 0, e = Unwinds.size(); i != e; ++i) {
+    BranchInst::Create(UnwindHandler, Unwinds[i]);
+    Unwinds[i]->eraseFromParent();
+  }
+
+  // Finally, for any returns from this function, if this function contains an
+  // invoke, restore the old jmpbuf pointer to its input value.
+  if (OldJmpBufPtr) {
+    for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+      ReturnInst *R = Returns[i];
+
+      // Before the return, insert a copy from the saved value to the new value.
+      Value *OldBuf = new LoadInst(OldJmpBufPtr, "oldjmpbufptr", true, R);
+      new StoreInst(OldBuf, JBListHead, true, R);
+    }
+  }
+
+  return true;
+}
+
+// runOnFunction - Dispatch to the lowering model selected at construction.
+bool LowerInvoke::runOnFunction(Function &F) {
+  if (useExpensiveEHSupport)
+    return insertExpensiveEHSupport(F);
+  else
+    return insertCheapEHSupport(F);
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
new file mode 100644
index 0000000..914a439
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
@@ -0,0 +1,323 @@
+//===- LowerSwitch.cpp - Eliminate Switch instructions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LowerSwitch transformation rewrites switch instructions with a sequence
+// of branches, which allows targets to get away with not implementing the
+// switch instruction until it is convenient.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+namespace {
+  /// LowerSwitch Pass - Replace all SwitchInst instructions with chained branch
+  /// instructions.
+  class LowerSwitch : public FunctionPass {
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    LowerSwitch() : FunctionPass(ID) {
+      initializeLowerSwitchPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // This is a cluster of orthogonal Transforms
+      AU.addPreserved<UnifyFunctionExitNodes>();
+      AU.addPreserved("mem2reg");
+      AU.addPreservedID(LowerInvokePassID);
+    }
+
+    /// CaseRange - A run of case values [Low, High] that all branch to the
+    /// same successor BB; built by Clusterify from adjacent switch cases.
+    struct CaseRange {
+      Constant* Low;
+      Constant* High;
+      BasicBlock* BB;
+
+      CaseRange(Constant *low = 0, Constant *high = 0, BasicBlock *bb = 0) :
+        Low(low), High(high), BB(bb) { }
+    };
+
+    typedef std::vector<CaseRange>           CaseVector;
+    typedef std::vector<CaseRange>::iterator CaseItr;
+  private:
+    void processSwitchInst(SwitchInst *SI);
+
+    BasicBlock* switchConvert(CaseItr Begin, CaseItr End, Value* Val,
+                              BasicBlock* OrigBlock, BasicBlock* Default);
+    BasicBlock* newLeafBlock(CaseRange& Leaf, Value* Val,
+                             BasicBlock* OrigBlock, BasicBlock* Default);
+    unsigned Clusterify(CaseVector& Cases, SwitchInst *SI);
+  };
+
+  /// The comparison function for sorting the switch case values in the vector.
+  /// WARNING: Case ranges should be disjoint!
+  struct CaseCmp {
+    bool operator () (const LowerSwitch::CaseRange& C1,
+                      const LowerSwitch::CaseRange& C2) {
+
+      // Comparing C1.Low against C2.High is sound only because case ranges
+      // are disjoint (see the WARNING on the declaration above).
+      const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
+      const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
+      return CI1->getValue().slt(CI2->getValue());
+    }
+  };
+}
+
+char LowerSwitch::ID = 0;
+INITIALIZE_PASS(LowerSwitch, "lowerswitch",
+                "Lower SwitchInst's to branches", false, false)
+
+// Publicly exposed interface to pass...
+char &llvm::LowerSwitchID = LowerSwitch::ID;
+// createLowerSwitchPass - Interface to this file...
+FunctionPass *llvm::createLowerSwitchPass() {
+  return new LowerSwitch();
+}
+
+// runOnFunction - Lower every switch terminator in the function; returns true
+// if any switch was rewritten.
+bool LowerSwitch::runOnFunction(Function &F) {
+  bool Changed = false;
+
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
+    BasicBlock *Cur = I++; // Advance over block so we don't traverse new blocks
+
+    if (SwitchInst *SI = dyn_cast<SwitchInst>(Cur->getTerminator())) {
+      Changed = true;
+      processSwitchInst(SI);
+    }
+  }
+
+  return Changed;
+}
+
+// operator<< - Used for debugging purposes.
+//
+static raw_ostream& operator<<(raw_ostream &O,
+                               const LowerSwitch::CaseVector &C)
+    LLVM_ATTRIBUTE_USED;
+static raw_ostream& operator<<(raw_ostream &O,
+                               const LowerSwitch::CaseVector &C) {
+  O << "[";
+
+  for (LowerSwitch::CaseVector::const_iterator B = C.begin(),
+         E = C.end(); B != E; ) {
+    O << *B->Low << " -" << *B->High;
+    if (++B != E) O << ", ";
+  }
+
+  return O << "]";
+}
+
+// switchConvert - Convert the switch statement into a binary lookup of
+// the case values. The function recursively builds this tree.
+//
+// [Begin, End) must be a non-empty run of sorted, disjoint case ranges (as
+// produced by Clusterify); the value being tested is Val.  Each recursion
+// splits the run at the midpoint and emits a signed less-than comparison
+// against the pivot's Low bound.
+BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
+                                       Value* Val, BasicBlock* OrigBlock,
+                                       BasicBlock* Default)
+{
+  unsigned Size = End - Begin;
+
+  if (Size == 1)
+    return newLeafBlock(*Begin, Val, OrigBlock, Default);
+
+  unsigned Mid = Size / 2;
+  std::vector<CaseRange> LHS(Begin, Begin + Mid);
+  DEBUG(dbgs() << "LHS: " << LHS << "\n");
+  std::vector<CaseRange> RHS(Begin + Mid, End);
+  DEBUG(dbgs() << "RHS: " << RHS << "\n");
+
+  CaseRange& Pivot = *(Begin + Mid);
+  DEBUG(dbgs() << "Pivot ==> "
+               << cast<ConstantInt>(Pivot.Low)->getValue() << " -"
+               << cast<ConstantInt>(Pivot.High)->getValue() << "\n");
+
+  BasicBlock* LBranch = switchConvert(LHS.begin(), LHS.end(), Val,
+                                      OrigBlock, Default);
+  BasicBlock* RBranch = switchConvert(RHS.begin(), RHS.end(), Val,
+                                      OrigBlock, Default);
+
+  // Create a new node that checks if the value is < pivot. Go to the
+  // left branch if it is and right branch if not.
+  Function* F = OrigBlock->getParent();
+  BasicBlock* NewNode = BasicBlock::Create(Val->getContext(), "NodeBlock");
+  Function::iterator FI = OrigBlock;
+  F->getBasicBlockList().insert(++FI, NewNode);
+
+  ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT,
+                                Val, Pivot.Low, "Pivot");
+  NewNode->getInstList().push_back(Comp);
+  BranchInst::Create(LBranch, RBranch, Comp, NewNode);
+  return NewNode;
+}
+
+// newLeafBlock - Create a new leaf block for the binary lookup tree. It
+// checks if the switch's value == the case's value. If not, then it
+// jumps to the default branch. At this point in the tree, the value
+// can't be another valid case value, so the jump to the "default" branch
+// is warranted.
+//
+BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
+                                      BasicBlock* OrigBlock,
+                                      BasicBlock* Default)
+{
+  Function* F = OrigBlock->getParent();
+  BasicBlock* NewLeaf = BasicBlock::Create(Val->getContext(), "LeafBlock");
+  Function::iterator FI = OrigBlock;
+  F->getBasicBlockList().insert(++FI, NewLeaf);
+
+  // Emit comparison
+  ICmpInst* Comp = NULL;
+  if (Leaf.Low == Leaf.High) {
+    // Make the seteq instruction...
+    Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_EQ, Val,
+                        Leaf.Low, "SwitchLeaf");
+  } else {
+    // Make range comparison
+    if (cast<ConstantInt>(Leaf.Low)->isMinValue(true /*isSigned*/)) {
+      // Val >= Min && Val <= Hi --> Val <= Hi
+      Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_SLE, Val, Leaf.High,
+                          "SwitchLeaf");
+    } else if (cast<ConstantInt>(Leaf.Low)->isZero()) {
+      // Val >= 0 && Val <= Hi --> Val <=u Hi
+      Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High,
+                          "SwitchLeaf");
+    } else {
+      // Emit V-Lo <=u Hi-Lo
+      // The subtract-then-unsigned-compare trick folds the two-sided range
+      // check into a single compare.
+      Constant* NegLo = ConstantExpr::getNeg(Leaf.Low);
+      Instruction* Add = BinaryOperator::CreateAdd(Val, NegLo,
+                                                   Val->getName()+".off",
+                                                   NewLeaf);
+      Constant *UpperBound = ConstantExpr::getAdd(NegLo, Leaf.High);
+      Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Add, UpperBound,
+                          "SwitchLeaf");
+    }
+  }
+
+  // Make the conditional branch...
+  BasicBlock* Succ = Leaf.BB;
+  BranchInst::Create(Succ, Default, Comp, NewLeaf);
+
+  // If there were any PHI nodes in this successor, rewrite one entry
+  // from OrigBlock to come from NewLeaf.
+  for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+    PHINode* PN = cast<PHINode>(I);
+    // Remove all but one incoming entries from the cluster
+    // (a leaf covering Range+1 case values contributed Range+1 entries
+    // from OrigBlock; the one that remains is retargeted below).
+    uint64_t Range = cast<ConstantInt>(Leaf.High)->getSExtValue() -
+                     cast<ConstantInt>(Leaf.Low)->getSExtValue();
+    for (uint64_t j = 0; j < Range; ++j) {
+      PN->removeIncomingValue(OrigBlock);
+    }
+
+    int BlockIdx = PN->getBasicBlockIndex(OrigBlock);
+    assert(BlockIdx != -1 && "Switch didn't go to this successor??");
+    PN->setIncomingBlock((unsigned)BlockIdx, NewLeaf);
+  }
+
+  return NewLeaf;
+}
+
+// Clusterify - Transform simple list of Cases into list of CaseRange's.
+// Returns the number of compares the lowered form will need: one per
+// singleton cluster, two per multi-value range.
+unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
+  unsigned numCmps = 0;
+
+  // Start with "simple" cases (successor 0 is the default destination).
+  for (unsigned i = 1; i < SI->getNumSuccessors(); ++i)
+    Cases.push_back(CaseRange(SI->getSuccessorValue(i),
+                              SI->getSuccessorValue(i),
+                              SI->getSuccessor(i)));
+  std::sort(Cases.begin(), Cases.end(), CaseCmp());
+
+  // Merge case into clusters
+  if (Cases.size()>=2)
+    for (CaseItr I=Cases.begin(), J=llvm::next(Cases.begin()); J!=Cases.end(); ) {
+      int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue();
+      int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue();
+      BasicBlock* nextBB = J->BB;
+      BasicBlock* currentBB = I->BB;
+
+      // If the two neighboring cases go to the same destination, merge them
+      // into a single case.
+      if ((nextValue-currentValue==1) && (currentBB == nextBB)) {
+        I->High = J->High;
+        J = Cases.erase(J);
+      } else {
+        I = J++;
+      }
+    }
+
+  for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
+    if (I->Low != I->High)
+      // A range counts double, since it requires two compares.
+      ++numCmps;
+  }
+
+  return numCmps;
+}
+
+// processSwitchInst - Replace the specified switch instruction with a sequence
+// of chained if-then insts in a balanced binary search.
+// +void LowerSwitch::processSwitchInst(SwitchInst *SI) { + BasicBlock *CurBlock = SI->getParent(); + BasicBlock *OrigBlock = CurBlock; + Function *F = CurBlock->getParent(); + Value *Val = SI->getOperand(0); // The value we are switching on... + BasicBlock* Default = SI->getDefaultDest(); + + // If there is only the default destination, don't bother with the code below. + if (SI->getNumOperands() == 2) { + BranchInst::Create(SI->getDefaultDest(), CurBlock); + CurBlock->getInstList().erase(SI); + return; + } + + // Create a new, empty default block so that the new hierarchy of + // if-then statements go to this and the PHI nodes are happy. + BasicBlock* NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault"); + F->getBasicBlockList().insert(Default, NewDefault); + + BranchInst::Create(Default, NewDefault); + + // If there is an entry in any PHI nodes for the default edge, make sure + // to update them as well. + for (BasicBlock::iterator I = Default->begin(); isa<PHINode>(I); ++I) { + PHINode *PN = cast<PHINode>(I); + int BlockIdx = PN->getBasicBlockIndex(OrigBlock); + assert(BlockIdx != -1 && "Switch didn't go to this successor??"); + PN->setIncomingBlock((unsigned)BlockIdx, NewDefault); + } + + // Prepare cases vector. + CaseVector Cases; + unsigned numCmps = Clusterify(Cases, SI); + + DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size() + << ". Total compares: " << numCmps << "\n"); + DEBUG(dbgs() << "Cases: " << Cases << "\n"); + (void)numCmps; + + BasicBlock* SwitchBlock = switchConvert(Cases.begin(), Cases.end(), Val, + OrigBlock, NewDefault); + + // Branch to our shiny new if-then stuff... + BranchInst::Create(SwitchBlock, OrigBlock); + + // We are now done with the switch instruction, delete it. 
+ CurBlock->getInstList().erase(SI); +} diff --git a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp new file mode 100644 index 0000000..f4ca81a --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp @@ -0,0 +1,90 @@ +//===- Mem2Reg.cpp - The -mem2reg pass, a wrapper around the Utils lib ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass is a simple pass wrapper around the PromoteMemToReg function call +// exposed by the Utils library. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mem2reg" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Instructions.h" +#include "llvm/Function.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumPromoted, "Number of alloca's promoted"); + +namespace { + struct PromotePass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + PromotePass() : FunctionPass(ID) { + initializePromotePassPass(*PassRegistry::getPassRegistry()); + } + + // runOnFunction - To run this pass, first we calculate the alloca + // instructions that are safe for promotion, then we promote each one. 
+ // + virtual bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTree>(); + AU.setPreservesCFG(); + // This is a cluster of orthogonal Transforms + AU.addPreserved<UnifyFunctionExitNodes>(); + AU.addPreservedID(LowerSwitchID); + AU.addPreservedID(LowerInvokePassID); + } + }; +} // end of anonymous namespace + +char PromotePass::ID = 0; +INITIALIZE_PASS_BEGIN(PromotePass, "mem2reg", "Promote Memory to Register", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_END(PromotePass, "mem2reg", "Promote Memory to Register", + false, false) + +bool PromotePass::runOnFunction(Function &F) { + std::vector<AllocaInst*> Allocas; + + BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function + + bool Changed = false; + + DominatorTree &DT = getAnalysis<DominatorTree>(); + + while (1) { + Allocas.clear(); + + // Find allocas that are safe to promote, by looking at all instructions in + // the entry node + for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca? + if (isAllocaPromotable(AI)) + Allocas.push_back(AI); + + if (Allocas.empty()) break; + + PromoteMemToReg(Allocas, DT); + NumPromoted += Allocas.size(); + Changed = true; + } + + return Changed; +} + +// createPromoteMemoryToRegister - Provide an entry point to create this pass. 
+// +FunctionPass *llvm::createPromoteMemoryToRegisterPass() { + return new PromotePass(); +} diff --git a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp new file mode 100644 index 0000000..e6a4373 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -0,0 +1,1118 @@ +//===- PromoteMemoryToRegister.cpp - Convert allocas to registers ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file promotes memory references to be register references. It promotes +// alloca instructions which only have loads and stores as uses. An alloca is +// transformed by using iterated dominator frontiers to place PHI nodes, then +// traversing the function in depth-first order to rewrite loads and stores as +// appropriate. +// +// The algorithm used here is based on: +// +// Sreedhar and Gao. A linear time algorithm for placing phi-nodes. +// In Proceedings of the 22nd ACM SIGPLAN-SIGACT Symposium on Principles of +// Programming Languages +// POPL '95. ACM, New York, NY, 62-73. +// +// It has been modified to not explicitly use the DJ graph data structure and to +// directly compute pruned SSA using per-variable liveness information. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mem2reg" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Metadata.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/DebugInfo.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/CFG.h" +#include <algorithm> +#include <map> +#include <queue> +using namespace llvm; + +STATISTIC(NumLocalPromoted, "Number of alloca's promoted within one block"); +STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store"); +STATISTIC(NumDeadAlloca, "Number of dead alloca's removed"); +STATISTIC(NumPHIInsert, "Number of PHI nodes inserted"); + +namespace llvm { +template<> +struct DenseMapInfo<std::pair<BasicBlock*, unsigned> > { + typedef std::pair<BasicBlock*, unsigned> EltTy; + static inline EltTy getEmptyKey() { + return EltTy(reinterpret_cast<BasicBlock*>(-1), ~0U); + } + static inline EltTy getTombstoneKey() { + return EltTy(reinterpret_cast<BasicBlock*>(-2), 0U); + } + static unsigned getHashValue(const std::pair<BasicBlock*, unsigned> &Val) { + return DenseMapInfo<void*>::getHashValue(Val.first) + Val.second*2; + } + static bool isEqual(const EltTy &LHS, const EltTy &RHS) { + return LHS == RHS; + } +}; +} + +/// isAllocaPromotable - Return true if this alloca is legal for promotion. +/// This is true if there are only loads and stores to the alloca. +/// +bool llvm::isAllocaPromotable(const AllocaInst *AI) { + // FIXME: If the memory unit is of pointer or integer type, we can permit + // assignments to subsections of the memory unit. 
+ + // Only allow direct and non-volatile loads and stores... + for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end(); + UI != UE; ++UI) { // Loop over all of the uses of the alloca + const User *U = *UI; + if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { + if (LI->isVolatile()) + return false; + } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (SI->getOperand(0) == AI) + return false; // Don't allow a store OF the AI, only INTO the AI. + if (SI->isVolatile()) + return false; + } else { + return false; + } + } + + return true; +} + +/// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic describing the +/// alloca 'V', if any. +static DbgDeclareInst *FindAllocaDbgDeclare(Value *V) { + if (MDNode *DebugNode = MDNode::getIfExists(V->getContext(), &V, 1)) + for (Value::use_iterator UI = DebugNode->use_begin(), + E = DebugNode->use_end(); UI != E; ++UI) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) + return DDI; + + return 0; +} + +namespace { + struct AllocaInfo; + + // Data package used by RenamePass() + class RenamePassData { + public: + typedef std::vector<Value *> ValVector; + + RenamePassData() : BB(NULL), Pred(NULL), Values() {} + RenamePassData(BasicBlock *B, BasicBlock *P, + const ValVector &V) : BB(B), Pred(P), Values(V) {} + BasicBlock *BB; + BasicBlock *Pred; + ValVector Values; + + void swap(RenamePassData &RHS) { + std::swap(BB, RHS.BB); + std::swap(Pred, RHS.Pred); + Values.swap(RHS.Values); + } + }; + + /// LargeBlockInfo - This assigns and keeps a per-bb relative ordering of + /// load/store instructions in the block that directly load or store an alloca. + /// + /// This functionality is important because it avoids scanning large basic + /// blocks multiple times when promoting many allocas in the same block. + class LargeBlockInfo { + /// InstNumbers - For each instruction that we track, keep the index of the + /// instruction. 
The index starts out as the number of the instruction from
+    /// the start of the block.
+    DenseMap<const Instruction *, unsigned> InstNumbers;
+  public:
+
+    /// isInterestingInstruction - This code only looks at accesses to allocas.
+    static bool isInterestingInstruction(const Instruction *I) {
+      return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
+             (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
+    }
+
+    /// getInstructionIndex - Get or calculate the index of the specified
+    /// instruction.
+    unsigned getInstructionIndex(const Instruction *I) {
+      assert(isInterestingInstruction(I) &&
+             "Not a load/store to/from an alloca?");
+
+      // If we already have this instruction number, return it.
+      DenseMap<const Instruction *, unsigned>::iterator It = InstNumbers.find(I);
+      if (It != InstNumbers.end()) return It->second;
+
+      // Scan the whole block to get the instruction. This accumulates
+      // information for every interesting instruction in the block, in order to
+      // avoid gratuitous rescans.
+      const BasicBlock *BB = I->getParent();
+      unsigned InstNo = 0;
+      for (BasicBlock::const_iterator BBI = BB->begin(), E = BB->end();
+           BBI != E; ++BBI)
+        if (isInterestingInstruction(BBI))
+          InstNumbers[BBI] = InstNo++;
+      It = InstNumbers.find(I);
+
+      assert(It != InstNumbers.end() && "Didn't insert instruction?");
+      return It->second;
+    }
+
+    void deleteValue(const Instruction *I) {
+      InstNumbers.erase(I);
+    }
+
+    void clear() {
+      InstNumbers.clear();
+    }
+  };
+
+  struct PromoteMem2Reg {
+    /// Allocas - The alloca instructions being promoted.
+    ///
+    std::vector<AllocaInst*> Allocas;
+    DominatorTree &DT;
+    DIFactory *DIF;
+
+    /// AST - An AliasSetTracker object to update. If null, don't update it.
+    ///
+    AliasSetTracker *AST;
+
+    /// AllocaLookup - Reverse mapping of Allocas.
+    ///
+    DenseMap<AllocaInst*, unsigned> AllocaLookup;
+
+    /// NewPhiNodes - The PhiNodes we're adding.
+    ///
+    DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*> NewPhiNodes;
+
+    /// PhiToAllocaMap - For each PHI node, keep track of which entry in Allocas
+    /// it corresponds to.
+    DenseMap<PHINode*, unsigned> PhiToAllocaMap;
+
+    /// PointerAllocaValues - If we are updating an AliasSetTracker, then for
+    /// each alloca that is of pointer type, we keep track of what to copyValue
+    /// to the inserted PHI nodes here.
+    ///
+    std::vector<Value*> PointerAllocaValues;
+
+    /// AllocaDbgDeclares - For each alloca, we keep track of the dbg.declare
+    /// intrinsic that describes it, if any, so that we can convert it to a
+    /// dbg.value intrinsic if the alloca gets promoted.
+    SmallVector<DbgDeclareInst*, 8> AllocaDbgDeclares;
+
+    /// Visited - The set of basic blocks the renamer has already visited.
+    ///
+    SmallPtrSet<BasicBlock*, 16> Visited;
+
+    /// BBNumbers - Contains a stable numbering of basic blocks to avoid
+    /// non-deterministic behavior.
+    DenseMap<BasicBlock*, unsigned> BBNumbers;
+
+    /// DomLevels - Maps DomTreeNodes to their level in the dominator tree.
+    DenseMap<DomTreeNode*, unsigned> DomLevels;
+
+    /// BBNumPreds - Lazily compute the number of predecessors a block has.
+    DenseMap<const BasicBlock*, unsigned> BBNumPreds;
+  public:
+    PromoteMem2Reg(const std::vector<AllocaInst*> &A, DominatorTree &dt,
+                   AliasSetTracker *ast)
+      : Allocas(A), DT(dt), DIF(0), AST(ast) {}
+    ~PromoteMem2Reg() {
+      delete DIF;
+    }
+
+    void run();
+
+    /// dominates - Return true if BB1 dominates BB2 using the DominatorTree.
+ /// + bool dominates(BasicBlock *BB1, BasicBlock *BB2) const { + return DT.dominates(BB1, BB2); + } + + private: + void RemoveFromAllocasList(unsigned &AllocaIdx) { + Allocas[AllocaIdx] = Allocas.back(); + Allocas.pop_back(); + --AllocaIdx; + } + + unsigned getNumPreds(const BasicBlock *BB) { + unsigned &NP = BBNumPreds[BB]; + if (NP == 0) + NP = std::distance(pred_begin(BB), pred_end(BB))+1; + return NP-1; + } + + void DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum, + AllocaInfo &Info); + void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info, + const SmallPtrSet<BasicBlock*, 32> &DefBlocks, + SmallPtrSet<BasicBlock*, 32> &LiveInBlocks); + + void RewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, + LargeBlockInfo &LBI); + void PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info, + LargeBlockInfo &LBI); + void ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, StoreInst *SI); + + + void RenamePass(BasicBlock *BB, BasicBlock *Pred, + RenamePassData::ValVector &IncVals, + std::vector<RenamePassData> &Worklist); + bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version); + }; + + struct AllocaInfo { + SmallVector<BasicBlock*, 32> DefiningBlocks; + SmallVector<BasicBlock*, 32> UsingBlocks; + + StoreInst *OnlyStore; + BasicBlock *OnlyBlock; + bool OnlyUsedInOneBlock; + + Value *AllocaPointerVal; + DbgDeclareInst *DbgDeclare; + + void clear() { + DefiningBlocks.clear(); + UsingBlocks.clear(); + OnlyStore = 0; + OnlyBlock = 0; + OnlyUsedInOneBlock = true; + AllocaPointerVal = 0; + DbgDeclare = 0; + } + + /// AnalyzeAlloca - Scan the uses of the specified alloca, filling in our + /// ivars. + void AnalyzeAlloca(AllocaInst *AI) { + clear(); + + // As we scan the uses of the alloca instruction, keep track of stores, + // and decide whether all of the loads and stores to the alloca are within + // the same basic block. 
+ for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); + UI != E;) { + Instruction *User = cast<Instruction>(*UI++); + + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + // Remember the basic blocks which define new values for the alloca + DefiningBlocks.push_back(SI->getParent()); + AllocaPointerVal = SI->getOperand(0); + OnlyStore = SI; + } else { + LoadInst *LI = cast<LoadInst>(User); + // Otherwise it must be a load instruction, keep track of variable + // reads. + UsingBlocks.push_back(LI->getParent()); + AllocaPointerVal = LI; + } + + if (OnlyUsedInOneBlock) { + if (OnlyBlock == 0) + OnlyBlock = User->getParent(); + else if (OnlyBlock != User->getParent()) + OnlyUsedInOneBlock = false; + } + } + + DbgDeclare = FindAllocaDbgDeclare(AI); + } + }; + + typedef std::pair<DomTreeNode*, unsigned> DomTreeNodePair; + + struct DomTreeNodeCompare { + bool operator()(const DomTreeNodePair &LHS, const DomTreeNodePair &RHS) { + return LHS.second < RHS.second; + } + }; +} // end of anonymous namespace + + +void PromoteMem2Reg::run() { + Function &F = *DT.getRoot()->getParent(); + + if (AST) PointerAllocaValues.resize(Allocas.size()); + AllocaDbgDeclares.resize(Allocas.size()); + + AllocaInfo Info; + LargeBlockInfo LBI; + + for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) { + AllocaInst *AI = Allocas[AllocaNum]; + + assert(isAllocaPromotable(AI) && + "Cannot promote non-promotable alloca!"); + assert(AI->getParent()->getParent() == &F && + "All allocas should be in the same function, which is same as DF!"); + + if (AI->use_empty()) { + // If there are no uses of the alloca, just delete it now. + if (AST) AST->deleteValue(AI); + AI->eraseFromParent(); + + // Remove the alloca from the Allocas list, since it has been processed + RemoveFromAllocasList(AllocaNum); + ++NumDeadAlloca; + continue; + } + + // Calculate the set of read and write-locations for each alloca. 
This is + // analogous to finding the 'uses' and 'definitions' of each variable. + Info.AnalyzeAlloca(AI); + + // If there is only a single store to this value, replace any loads of + // it that are directly dominated by the definition with the value stored. + if (Info.DefiningBlocks.size() == 1) { + RewriteSingleStoreAlloca(AI, Info, LBI); + + // Finally, after the scan, check to see if the store is all that is left. + if (Info.UsingBlocks.empty()) { + // Record debuginfo for the store and remove the declaration's debuginfo. + if (DbgDeclareInst *DDI = Info.DbgDeclare) { + ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore); + DDI->eraseFromParent(); + } + // Remove the (now dead) store and alloca. + Info.OnlyStore->eraseFromParent(); + LBI.deleteValue(Info.OnlyStore); + + if (AST) AST->deleteValue(AI); + AI->eraseFromParent(); + LBI.deleteValue(AI); + + // The alloca has been processed, move on. + RemoveFromAllocasList(AllocaNum); + + ++NumSingleStore; + continue; + } + } + + // If the alloca is only read and written in one basic block, just perform a + // linear sweep over the block to eliminate it. + if (Info.OnlyUsedInOneBlock) { + PromoteSingleBlockAlloca(AI, Info, LBI); + + // Finally, after the scan, check to see if the stores are all that is + // left. + if (Info.UsingBlocks.empty()) { + + // Remove the (now dead) stores and alloca. + while (!AI->use_empty()) { + StoreInst *SI = cast<StoreInst>(AI->use_back()); + // Record debuginfo for the store before removing it. + if (DbgDeclareInst *DDI = Info.DbgDeclare) + ConvertDebugDeclareToDebugValue(DDI, SI); + SI->eraseFromParent(); + LBI.deleteValue(SI); + } + + if (AST) AST->deleteValue(AI); + AI->eraseFromParent(); + LBI.deleteValue(AI); + + // The alloca has been processed, move on. + RemoveFromAllocasList(AllocaNum); + + // The alloca's debuginfo can be removed as well. 
+ if (DbgDeclareInst *DDI = Info.DbgDeclare) + DDI->eraseFromParent(); + + ++NumLocalPromoted; + continue; + } + } + + // If we haven't computed dominator tree levels, do so now. + if (DomLevels.empty()) { + SmallVector<DomTreeNode*, 32> Worklist; + + DomTreeNode *Root = DT.getRootNode(); + DomLevels[Root] = 0; + Worklist.push_back(Root); + + while (!Worklist.empty()) { + DomTreeNode *Node = Worklist.pop_back_val(); + unsigned ChildLevel = DomLevels[Node] + 1; + for (DomTreeNode::iterator CI = Node->begin(), CE = Node->end(); + CI != CE; ++CI) { + DomLevels[*CI] = ChildLevel; + Worklist.push_back(*CI); + } + } + } + + // If we haven't computed a numbering for the BB's in the function, do so + // now. + if (BBNumbers.empty()) { + unsigned ID = 0; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) + BBNumbers[I] = ID++; + } + + // If we have an AST to keep updated, remember some pointer value that is + // stored into the alloca. + if (AST) + PointerAllocaValues[AllocaNum] = Info.AllocaPointerVal; + + // Remember the dbg.declare intrinsic describing this alloca, if any. + if (Info.DbgDeclare) AllocaDbgDeclares[AllocaNum] = Info.DbgDeclare; + + // Keep the reverse mapping of the 'Allocas' array for the rename pass. + AllocaLookup[Allocas[AllocaNum]] = AllocaNum; + + // At this point, we're committed to promoting the alloca using IDF's, and + // the standard SSA construction algorithm. Determine which blocks need PHI + // nodes and see if we can optimize out some work by avoiding insertion of + // dead phi nodes. + DetermineInsertionPoint(AI, AllocaNum, Info); + } + + if (Allocas.empty()) + return; // All of the allocas must have been trivial! + + LBI.clear(); + + + // Set the incoming values for the basic block to be null values for all of + // the alloca's. We do this in case there is a load of a value that has not + // been stored yet. In this case, it will get this null value. 
+  //
+  RenamePassData::ValVector Values(Allocas.size());
+  for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
+    Values[i] = UndefValue::get(Allocas[i]->getAllocatedType());
+
+  // Walks all basic blocks in the function performing the SSA rename algorithm
+  // and inserting the phi nodes we marked as necessary
+  //
+  std::vector<RenamePassData> RenamePassWorkList;
+  RenamePassWorkList.push_back(RenamePassData(F.begin(), 0, Values));
+  do {
+    RenamePassData RPD;
+    RPD.swap(RenamePassWorkList.back());
+    RenamePassWorkList.pop_back();
+    // RenamePass may add new worklist entries.
+    RenamePass(RPD.BB, RPD.Pred, RPD.Values, RenamePassWorkList);
+  } while (!RenamePassWorkList.empty());
+
+  // The renamer uses the Visited set to avoid infinite loops. Clear it now.
+  Visited.clear();
+
+  // Remove the allocas themselves from the function.
+  for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
+    Instruction *A = Allocas[i];
+
+    // If there are any uses of the alloca instructions left, they must be in
+    // unreachable basic blocks that were not processed by walking the dominator
+    // tree. Just delete the users now.
+    if (!A->use_empty())
+      A->replaceAllUsesWith(UndefValue::get(A->getType()));
+    if (AST) AST->deleteValue(A);
+    A->eraseFromParent();
+  }
+
+  // Remove alloca's dbg.declare intrinsics from the function.
+  for (unsigned i = 0, e = AllocaDbgDeclares.size(); i != e; ++i)
+    if (DbgDeclareInst *DDI = AllocaDbgDeclares[i])
+      DDI->eraseFromParent();
+
+  // Loop over all of the PHI nodes and see if there are any that we can get
+  // rid of because they merge all of the same incoming values. This can
+  // happen due to undef values coming into the PHI nodes. This process is
+  // iterative, because eliminating one PHI node can cause others to be removed.
+ bool EliminatedAPHI = true; + while (EliminatedAPHI) { + EliminatedAPHI = false; + + for (DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*>::iterator I = + NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E;) { + PHINode *PN = I->second; + + // If this PHI node merges one value and/or undefs, get the value. + if (Value *V = SimplifyInstruction(PN, 0, &DT)) { + if (AST && PN->getType()->isPointerTy()) + AST->deleteValue(PN); + PN->replaceAllUsesWith(V); + PN->eraseFromParent(); + NewPhiNodes.erase(I++); + EliminatedAPHI = true; + continue; + } + ++I; + } + } + + // At this point, the renamer has added entries to PHI nodes for all reachable + // code. Unfortunately, there may be unreachable blocks which the renamer + // hasn't traversed. If this is the case, the PHI nodes may not + // have incoming values for all predecessors. Loop over all PHI nodes we have + // created, inserting undef values if they are missing any incoming values. + // + for (DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*>::iterator I = + NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E; ++I) { + // We want to do this once per basic block. As such, only process a block + // when we find the PHI that is the first entry in the block. + PHINode *SomePHI = I->second; + BasicBlock *BB = SomePHI->getParent(); + if (&BB->front() != SomePHI) + continue; + + // Only do work here if there the PHI nodes are missing incoming values. We + // know that all PHI nodes that were inserted in a block will have the same + // number of incoming values, so we can just check any of them. + if (SomePHI->getNumIncomingValues() == getNumPreds(BB)) + continue; + + // Get the preds for BB. + SmallVector<BasicBlock*, 16> Preds(pred_begin(BB), pred_end(BB)); + + // Ok, now we know that all of the PHI nodes are missing entries for some + // basic blocks. Start by sorting the incoming predecessors for efficient + // access. 
+ std::sort(Preds.begin(), Preds.end()); + + // Now we loop through all BB's which have entries in SomePHI and remove + // them from the Preds list. + for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) { + // Do a log(n) search of the Preds list for the entry we want. + SmallVector<BasicBlock*, 16>::iterator EntIt = + std::lower_bound(Preds.begin(), Preds.end(), + SomePHI->getIncomingBlock(i)); + assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i)&& + "PHI node has entry for a block which is not a predecessor!"); + + // Remove the entry + Preds.erase(EntIt); + } + + // At this point, the blocks left in the preds list must have dummy + // entries inserted into every PHI nodes for the block. Update all the phi + // nodes in this block that we are inserting (there could be phis before + // mem2reg runs). + unsigned NumBadPreds = SomePHI->getNumIncomingValues(); + BasicBlock::iterator BBI = BB->begin(); + while ((SomePHI = dyn_cast<PHINode>(BBI++)) && + SomePHI->getNumIncomingValues() == NumBadPreds) { + Value *UndefVal = UndefValue::get(SomePHI->getType()); + for (unsigned pred = 0, e = Preds.size(); pred != e; ++pred) + SomePHI->addIncoming(UndefVal, Preds[pred]); + } + } + + NewPhiNodes.clear(); +} + + +/// ComputeLiveInBlocks - Determine which blocks the value is live in. These +/// are blocks which lead to uses. Knowing this allows us to avoid inserting +/// PHI nodes into blocks which don't lead to uses (thus, the inserted phi nodes +/// would be dead). +void PromoteMem2Reg:: +ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info, + const SmallPtrSet<BasicBlock*, 32> &DefBlocks, + SmallPtrSet<BasicBlock*, 32> &LiveInBlocks) { + + // To determine liveness, we must iterate through the predecessors of blocks + // where the def is live. Blocks are added to the worklist if we need to + // check their predecessors. Start with all the using blocks. 
+  SmallVector<BasicBlock*, 64> LiveInBlockWorklist(Info.UsingBlocks.begin(),
+                                                   Info.UsingBlocks.end());
+
+  // If any of the using blocks is also a definition block, check to see if the
+  // definition occurs before or after the use. If it happens before the use,
+  // the value isn't really live-in.
+  for (unsigned i = 0, e = LiveInBlockWorklist.size(); i != e; ++i) {
+    BasicBlock *BB = LiveInBlockWorklist[i];
+    if (!DefBlocks.count(BB)) continue;
+
+    // Okay, this is a block that both uses and defines the value. If the first
+    // reference to the alloca is a def (store), then we know it isn't live-in.
+    for (BasicBlock::iterator I = BB->begin(); ; ++I) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        if (SI->getOperand(1) != AI) continue;
+
+        // We found a store to the alloca before a load. The alloca is not
+        // actually live-in here.
+        LiveInBlockWorklist[i] = LiveInBlockWorklist.back();
+        LiveInBlockWorklist.pop_back();
+        --i, --e;
+        break;
+      }
+
+      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        if (LI->getOperand(0) != AI) continue;
+
+        // Okay, we found a load before a store to the alloca. It is actually
+        // live into this block.
+        break;
+      }
+    }
+  }
+
+  // Now that we have a set of blocks where the phi is live-in, recursively add
+  // their predecessors until we find the full region the value is live.
+  while (!LiveInBlockWorklist.empty()) {
+    BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
+
+    // The block really is live in here, insert it into the set. If already in
+    // the set, then it has already been processed.
+    if (!LiveInBlocks.insert(BB))
+      continue;
+
+    // Since the value is live into BB, it is either defined in a predecessor or
+    // live into it too. Add the preds to the worklist unless they are a
+    // defining block.
+    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+      BasicBlock *P = *PI;
+
+      // The value is not live into a predecessor if it defines the value.
+ if (DefBlocks.count(P)) + continue; + + // Otherwise it is, add to the worklist. + LiveInBlockWorklist.push_back(P); + } + } +} + +/// DetermineInsertionPoint - At this point, we're committed to promoting the +/// alloca using IDF's, and the standard SSA construction algorithm. Determine +/// which blocks need phi nodes and see if we can optimize out some work by +/// avoiding insertion of dead phi nodes. +void PromoteMem2Reg::DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum, + AllocaInfo &Info) { + // Unique the set of defining blocks for efficient lookup. + SmallPtrSet<BasicBlock*, 32> DefBlocks; + DefBlocks.insert(Info.DefiningBlocks.begin(), Info.DefiningBlocks.end()); + + // Determine which blocks the value is live in. These are blocks which lead + // to uses. + SmallPtrSet<BasicBlock*, 32> LiveInBlocks; + ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks); + + // Use a priority queue keyed on dominator tree level so that inserted nodes + // are handled from the bottom of the dominator tree upwards. + typedef std::priority_queue<DomTreeNodePair, SmallVector<DomTreeNodePair, 32>, + DomTreeNodeCompare> IDFPriorityQueue; + IDFPriorityQueue PQ; + + for (SmallPtrSet<BasicBlock*, 32>::const_iterator I = DefBlocks.begin(), + E = DefBlocks.end(); I != E; ++I) { + if (DomTreeNode *Node = DT.getNode(*I)) + PQ.push(std::make_pair(Node, DomLevels[Node])); + } + + SmallVector<std::pair<unsigned, BasicBlock*>, 32> DFBlocks; + SmallPtrSet<DomTreeNode*, 32> Visited; + SmallVector<DomTreeNode*, 32> Worklist; + while (!PQ.empty()) { + DomTreeNodePair RootPair = PQ.top(); + PQ.pop(); + DomTreeNode *Root = RootPair.first; + unsigned RootLevel = RootPair.second; + + // Walk all dominator tree children of Root, inspecting their CFG edges with + // targets elsewhere on the dominator tree. Only targets whose level is at + // most Root's level are added to the iterated dominance frontier of the + // definition set. 
+ + Worklist.clear(); + Worklist.push_back(Root); + + while (!Worklist.empty()) { + DomTreeNode *Node = Worklist.pop_back_val(); + BasicBlock *BB = Node->getBlock(); + + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; + ++SI) { + DomTreeNode *SuccNode = DT.getNode(*SI); + + // Quickly skip all CFG edges that are also dominator tree edges instead + // of catching them below. + if (SuccNode->getIDom() == Node) + continue; + + unsigned SuccLevel = DomLevels[SuccNode]; + if (SuccLevel > RootLevel) + continue; + + if (!Visited.insert(SuccNode)) + continue; + + BasicBlock *SuccBB = SuccNode->getBlock(); + if (!LiveInBlocks.count(SuccBB)) + continue; + + DFBlocks.push_back(std::make_pair(BBNumbers[SuccBB], SuccBB)); + if (!DefBlocks.count(SuccBB)) + PQ.push(std::make_pair(SuccNode, SuccLevel)); + } + + for (DomTreeNode::iterator CI = Node->begin(), CE = Node->end(); CI != CE; + ++CI) { + if (!Visited.count(*CI)) + Worklist.push_back(*CI); + } + } + } + + if (DFBlocks.size() > 1) + std::sort(DFBlocks.begin(), DFBlocks.end()); + + unsigned CurrentVersion = 0; + for (unsigned i = 0, e = DFBlocks.size(); i != e; ++i) + QueuePhiNode(DFBlocks[i].second, AllocaNum, CurrentVersion); +} + +/// RewriteSingleStoreAlloca - If there is only a single store to this value, +/// replace any loads of it that are directly dominated by the definition with +/// the value stored. +void PromoteMem2Reg::RewriteSingleStoreAlloca(AllocaInst *AI, + AllocaInfo &Info, + LargeBlockInfo &LBI) { + StoreInst *OnlyStore = Info.OnlyStore; + bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0)); + BasicBlock *StoreBB = OnlyStore->getParent(); + int StoreIndex = -1; + + // Clear out UsingBlocks. We will reconstruct it here if needed. 
+  Info.UsingBlocks.clear();
+
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; ) {
+    Instruction *UserInst = cast<Instruction>(*UI++);
+    if (!isa<LoadInst>(UserInst)) {
+      assert(UserInst == OnlyStore && "Should only have load/stores");
+      continue;
+    }
+    LoadInst *LI = cast<LoadInst>(UserInst);
+
+    // Okay, if we have a load from the alloca, we want to replace it with the
+    // only value stored to the alloca. We can do this if the value is
+    // dominated by the store. If not, we use the rest of the mem2reg machinery
+    // to insert the phi nodes as needed.
+    if (!StoringGlobalVal) { // Non-instructions are always dominated.
+      if (LI->getParent() == StoreBB) {
+        // If we have a use that is in the same block as the store, compare the
+        // indices of the two instructions to see which one came first. If the
+        // load came before the store, we can't handle it.
+        if (StoreIndex == -1)
+          StoreIndex = LBI.getInstructionIndex(OnlyStore);
+
+        if (unsigned(StoreIndex) > LBI.getInstructionIndex(LI)) {
+          // Can't handle this load, bail out.
+          Info.UsingBlocks.push_back(StoreBB);
+          continue;
+        }
+
+      } else if (LI->getParent() != StoreBB &&
+                 !dominates(StoreBB, LI->getParent())) {
+        // If the load and store are in different blocks, use BB dominance to
+        // check their relationships. If the store doesn't dom the use, bail
+        // out.
+        Info.UsingBlocks.push_back(LI->getParent());
+        continue;
+      }
+    }
+
+    // Otherwise, we *can* safely rewrite this load.
+    Value *ReplVal = OnlyStore->getOperand(0);
+    // If the replacement value is the load, this must occur in unreachable
+    // code.
+    if (ReplVal == LI)
+      ReplVal = UndefValue::get(LI->getType());
+    LI->replaceAllUsesWith(ReplVal);
+    if (AST && LI->getType()->isPointerTy())
+      AST->deleteValue(LI);
+    LI->eraseFromParent();
+    LBI.deleteValue(LI);
+  }
+}
+
+namespace {
+
+/// StoreIndexSearchPredicate - This is a helper predicate used to search by the
+/// first element of a pair.
+struct StoreIndexSearchPredicate {
+  // Strict weak ordering on the instruction index only; the StoreInst* half of
+  // the pair is ignored so an (index, NULL) probe can be used with lower_bound.
+  bool operator()(const std::pair<unsigned, StoreInst*> &LHS,
+                  const std::pair<unsigned, StoreInst*> &RHS) {
+    return LHS.first < RHS.first;
+  }
+};
+
+}
+
+/// PromoteSingleBlockAlloca - Many allocas are only used within a single basic
+/// block. If this is the case, avoid traversing the CFG and inserting a lot of
+/// potentially useless PHI nodes by just performing a single linear pass over
+/// the basic block using the Alloca.
+///
+/// If we cannot promote this alloca (because it is read before it is written),
+/// return true. This is necessary in cases where, due to control flow, the
+/// alloca is potentially undefined on some control flow paths. e.g. code like
+/// this is potentially correct:
+///
+///   for (...) { if (c) { A = undef; undef = B; } }
+///
+/// ... so long as A is not used before undef is set.
+///
+void PromoteMem2Reg::PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info,
+                                              LargeBlockInfo &LBI) {
+  // The trickiest case to handle is when we have large blocks. Because of this,
+  // this code is optimized assuming that large blocks happen. This does not
+  // significantly pessimize the small block case. This uses LargeBlockInfo to
+  // make it efficient to get the index of various operations in the block.
+
+  // Clear out UsingBlocks. We will reconstruct it here if needed.
+  Info.UsingBlocks.clear();
+
+  // Walk the use-def list of the alloca, getting the locations of all stores.
+  typedef SmallVector<std::pair<unsigned, StoreInst*>, 64> StoresByIndexTy;
+  StoresByIndexTy StoresByIndex;
+
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+       UI != E; ++UI)
+    if (StoreInst *SI = dyn_cast<StoreInst>(*UI))
+      StoresByIndex.push_back(std::make_pair(LBI.getInstructionIndex(SI), SI));
+
+  // If there are no stores to the alloca, just replace any loads with undef.
+  if (StoresByIndex.empty()) {
+    for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;)
+      if (LoadInst *LI = dyn_cast<LoadInst>(*UI++)) {
+        LI->replaceAllUsesWith(UndefValue::get(LI->getType()));
+        if (AST && LI->getType()->isPointerTy())
+          AST->deleteValue(LI);
+        LBI.deleteValue(LI);
+        LI->eraseFromParent();
+      }
+    return;
+  }
+
+  // Sort the stores by their index, making it efficient to do a lookup with a
+  // binary search.
+  std::sort(StoresByIndex.begin(), StoresByIndex.end());
+
+  // Walk all of the loads from this alloca, replacing them with the nearest
+  // store above them, if any.
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) {
+    LoadInst *LI = dyn_cast<LoadInst>(*UI++);
+    if (!LI) continue;
+
+    unsigned LoadIdx = LBI.getInstructionIndex(LI);
+
+    // Find the nearest store whose index is lower than this load's index.
+    StoresByIndexTy::iterator I =
+      std::lower_bound(StoresByIndex.begin(), StoresByIndex.end(),
+                       std::pair<unsigned, StoreInst*>(LoadIdx, static_cast<StoreInst*>(0)),
+                       StoreIndexSearchPredicate());
+
+    // If there is no store before this load, then we can't promote this load.
+    if (I == StoresByIndex.begin()) {
+      // Can't handle this load, bail out.
+      Info.UsingBlocks.push_back(LI->getParent());
+      continue;
+    }
+
+    // Otherwise, there was a store before this load, the load takes its value.
+    --I;
+    LI->replaceAllUsesWith(I->second->getOperand(0));
+    if (AST && LI->getType()->isPointerTy())
+      AST->deleteValue(LI);
+    LI->eraseFromParent();
+    LBI.deleteValue(LI);
+  }
+}
+
+// Inserts an llvm.dbg.value intrinsic before the stores to an alloca'd value
+// that has an associated llvm.dbg.declare intrinsic.
+void PromoteMem2Reg::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
+                                                     StoreInst *SI) {
+  DIVariable DIVar(DDI->getVariable());
+  if (!DIVar.Verify())
+    return;
+
+  // Lazily create the DIFactory the first time it is needed.
+  if (!DIF)
+    DIF = new DIFactory(*SI->getParent()->getParent()->getParent());
+  Instruction *DbgVal = DIF->InsertDbgValueIntrinsic(SI->getOperand(0), 0,
+                                                     DIVar, SI);
+
+  // Propagate any debug metadata from the store onto the dbg.value.
+  DebugLoc SIDL = SI->getDebugLoc();
+  if (!SIDL.isUnknown())
+    DbgVal->setDebugLoc(SIDL);
+  // Otherwise propagate debug metadata from dbg.declare.
+  else
+    DbgVal->setDebugLoc(DDI->getDebugLoc());
+}
+
+// QueuePhiNode - queues a phi-node to be added to a basic-block for a specific
+// Alloca; returns true if there wasn't already a phi-node for that variable.
+//
+bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
+                                  unsigned &Version) {
+  // Look up the basic-block in question.
+  PHINode *&PN = NewPhiNodes[std::make_pair(BB, AllocaNo)];
+
+  // If the BB already has a phi node added for the i'th alloca then we're done!
+  if (PN) return false;
+
+  // Create a PhiNode using the dereferenced type... and add the phi-node to the
+  // BasicBlock.
+  PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(),
+                       Allocas[AllocaNo]->getName() + "." + Twine(Version++),
+                       BB->begin());
+  ++NumPHIInsert;
+  PhiToAllocaMap[PN] = AllocaNo;
+  PN->reserveOperandSpace(getNumPreds(BB));
+
+  if (AST && PN->getType()->isPointerTy())
+    AST->copyValue(PointerAllocaValues[AllocaNo], PN);
+
+  return true;
+}
+
+// RenamePass - Recursively traverse the CFG of the function, renaming loads and
+// stores to the allocas which we are promoting. IncomingVals indicates what
+// value each Alloca contains on exit from the predecessor block Pred.
+//
+void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
+                                RenamePassData::ValVector &IncomingVals,
+                                std::vector<RenamePassData> &Worklist) {
+NextIteration:
+  // If we are inserting any phi nodes into this BB, they will already be in the
+  // block.
+  if (PHINode *APN = dyn_cast<PHINode>(BB->begin())) {
+    // If we have PHI nodes to update, compute the number of edges from Pred to
+    // BB.
+    if (PhiToAllocaMap.count(APN)) {
+      // We want to be able to distinguish between PHI nodes being inserted by
+      // this invocation of mem2reg from those phi nodes that already existed in
+      // the IR before mem2reg was run. We determine that APN is being inserted
+      // because it is missing incoming edges. All other PHI nodes being
+      // inserted by this pass of mem2reg will have the same number of incoming
+      // operands so far. Remember this count.
+      unsigned NewPHINumOperands = APN->getNumOperands();
+
+      unsigned NumEdges = 0;
+      for (succ_iterator I = succ_begin(Pred), E = succ_end(Pred); I != E; ++I)
+        if (*I == BB)
+          ++NumEdges;
+      assert(NumEdges && "Must be at least one edge from Pred to BB!");
+
+      // Add entries for all the phis.
+      BasicBlock::iterator PNI = BB->begin();
+      do {
+        unsigned AllocaNo = PhiToAllocaMap[APN];
+
+        // Add N incoming values to the PHI node.
+        for (unsigned i = 0; i != NumEdges; ++i)
+          APN->addIncoming(IncomingVals[AllocaNo], Pred);
+
+        // The currently active variable for this block is now the PHI.
+        IncomingVals[AllocaNo] = APN;
+
+        // Get the next phi node.
+        ++PNI;
+        APN = dyn_cast<PHINode>(PNI);
+        if (APN == 0) break;
+
+        // Verify that it is missing entries. If not, it is not being inserted
+        // by this mem2reg invocation so we want to ignore it.
+      } while (APN->getNumOperands() == NewPHINumOperands);
+    }
+  }
+
+  // Don't revisit blocks.
+  if (!Visited.insert(BB)) return;
+
+  for (BasicBlock::iterator II = BB->begin(); !isa<TerminatorInst>(II); ) {
+    Instruction *I = II++; // get the instruction, increment iterator
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand());
+      if (!Src) continue;
+
+      DenseMap<AllocaInst*, unsigned>::iterator AI = AllocaLookup.find(Src);
+      if (AI == AllocaLookup.end()) continue;
+
+      Value *V = IncomingVals[AI->second];
+
+      // Anything using the load now uses the current value.
+      LI->replaceAllUsesWith(V);
+      if (AST && LI->getType()->isPointerTy())
+        AST->deleteValue(LI);
+      BB->getInstList().erase(LI);
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      // Delete this instruction and mark the name as the current holder of the
+      // value
+      AllocaInst *Dest = dyn_cast<AllocaInst>(SI->getPointerOperand());
+      if (!Dest) continue;
+
+      DenseMap<AllocaInst *, unsigned>::iterator ai = AllocaLookup.find(Dest);
+      if (ai == AllocaLookup.end())
+        continue;
+
+      // what value were we writing?
+      IncomingVals[ai->second] = SI->getOperand(0);
+      // Record debuginfo for the store before removing it.
+      if (DbgDeclareInst *DDI = AllocaDbgDeclares[ai->second])
+        ConvertDebugDeclareToDebugValue(DDI, SI);
+      BB->getInstList().erase(SI);
+    }
+  }
+
+  // 'Recurse' to our successors.
+  succ_iterator I = succ_begin(BB), E = succ_end(BB);
+  if (I == E) return;
+
+  // Keep track of the successors so we don't visit the same successor twice
+  SmallPtrSet<BasicBlock*, 8> VisitedSuccs;
+
+  // Handle the first successor without using the worklist.
+  VisitedSuccs.insert(*I);
+  Pred = BB;
+  BB = *I;
+  ++I;
+
+  for (; I != E; ++I)
+    if (VisitedSuccs.insert(*I))
+      Worklist.push_back(RenamePassData(*I, Pred, IncomingVals));
+
+  // Hand-rolled tail call: loop on the first successor instead of recursing,
+  // which avoids deep native stacks on long CFG chains.
+  goto NextIteration;
+}
+
+/// PromoteMemToReg - Promote the specified list of alloca instructions into
+/// scalar registers, inserting PHI nodes as appropriate.
 This function does
+/// not modify the CFG of the function at all. All allocas must be from the
+/// same function.
+///
+/// If AST is specified, the specified tracker is updated to reflect changes
+/// made to the IR.
+///
+void llvm::PromoteMemToReg(const std::vector<AllocaInst*> &Allocas,
+                           DominatorTree &DT, AliasSetTracker *AST) {
+  // If there is nothing to do, bail out...
+  if (Allocas.empty()) return;
+
+  PromoteMem2Reg(Allocas, DT, AST).run();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
new file mode 100644
index 0000000..3896d98
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -0,0 +1,511 @@
+//===- SSAUpdater.cpp - Unstructured SSA Update Tool ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SSAUpdater class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ssaupdater"
+#include "llvm/Instructions.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
+using namespace llvm;
+
+// The AvailableVals map is stored behind an opaque void* (AV) in SSAUpdater so
+// the DenseMap type stays out of the public header; this helper recovers it.
+typedef DenseMap<BasicBlock*, Value*> AvailableValsTy;
+static AvailableValsTy &getAvailableVals(void *AV) {
+  return *static_cast<AvailableValsTy*>(AV);
+}
+
+SSAUpdater::SSAUpdater(SmallVectorImpl<PHINode*> *NewPHI)
+  : AV(0), ProtoType(0), ProtoName(), InsertedPHIs(NewPHI) {}
+
+SSAUpdater::~SSAUpdater() {
+  delete &getAvailableVals(AV);
+}
+
+/// Initialize - Reset this object to get ready for a new set of SSA
+/// updates with type 'Ty'. PHI nodes get a name based on 'Name'.
+void SSAUpdater::Initialize(const Type *Ty, StringRef Name) {
+  if (AV == 0)
+    AV = new AvailableValsTy();
+  else
+    getAvailableVals(AV).clear();
+  ProtoType = Ty;
+  ProtoName = Name;
+}
+
+/// HasValueForBlock - Return true if the SSAUpdater already has a value for
+/// the specified block.
+bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const {
+  return getAvailableVals(AV).count(BB);
+}
+
+/// AddAvailableValue - Indicate that a rewritten value is available in the
+/// specified block with the specified value.
+void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) {
+  assert(ProtoType != 0 && "Need to initialize SSAUpdater");
+  assert(ProtoType == V->getType() &&
+         "All rewritten values must have the same type");
+  getAvailableVals(AV)[BB] = V;
+}
+
+/// IsEquivalentPHI - Check if PHI has the same incoming value as specified
+/// in ValueMapping for each predecessor block.
+static bool IsEquivalentPHI(PHINode *PHI,
+                            DenseMap<BasicBlock*, Value*> &ValueMapping) {
+  unsigned PHINumValues = PHI->getNumIncomingValues();
+  if (PHINumValues != ValueMapping.size())
+    return false;
+
+  // Scan the phi to see if it matches.
+  for (unsigned i = 0, e = PHINumValues; i != e; ++i)
+    if (ValueMapping[PHI->getIncomingBlock(i)] !=
+        PHI->getIncomingValue(i)) {
+      return false;
+    }
+
+  return true;
+}
+
+/// GetValueAtEndOfBlock - Construct SSA form, materializing a value that is
+/// live at the end of the specified block.
+Value *SSAUpdater::GetValueAtEndOfBlock(BasicBlock *BB) {
+  Value *Res = GetValueAtEndOfBlockInternal(BB);
+  return Res;
+}
+
+/// GetValueInMiddleOfBlock - Construct SSA form, materializing a value that
+/// is live in the middle of the specified block.
+///
+/// GetValueInMiddleOfBlock is the same as GetValueAtEndOfBlock except in one
+/// important case: if there is a definition of the rewritten value after the
+/// 'use' in BB. Consider code like this:
+///
+///      X1 = ...
+///   SomeBB:
+///      use(X)
+///      X2 = ...
+///      br Cond, SomeBB, OutBB
+///
+/// In this case, there are two values (X1 and X2) added to the AvailableVals
+/// set by the client of the rewriter, and those values are both live out of
+/// their respective blocks. However, the use of X happens in the *middle* of
+/// a block. Because of this, we need to insert a new PHI node in SomeBB to
+/// merge the appropriate values, and this value isn't live out of the block.
+///
+Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
+  // If there is no definition of the renamed variable in this block, just use
+  // GetValueAtEndOfBlock to do our work.
+  if (!HasValueForBlock(BB))
+    return GetValueAtEndOfBlock(BB);
+
+  // Otherwise, we have the hard case. Get the live-in values for each
+  // predecessor.
+  SmallVector<std::pair<BasicBlock*, Value*>, 8> PredValues;
+  Value *SingularValue = 0;
+
+  // We can get our predecessor info by walking the pred_iterator list, but it
+  // is relatively slow. If we already have PHI nodes in this block, walk one
+  // of them to get the predecessor list instead.
+  if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin())) {
+    for (unsigned i = 0, e = SomePhi->getNumIncomingValues(); i != e; ++i) {
+      BasicBlock *PredBB = SomePhi->getIncomingBlock(i);
+      Value *PredVal = GetValueAtEndOfBlock(PredBB);
+      PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+      // Compute SingularValue.
+      if (i == 0)
+        SingularValue = PredVal;
+      else if (PredVal != SingularValue)
+        SingularValue = 0;
+    }
+  } else {
+    bool isFirstPred = true;
+    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+      BasicBlock *PredBB = *PI;
+      Value *PredVal = GetValueAtEndOfBlock(PredBB);
+      PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+      // Compute SingularValue.
+      if (isFirstPred) {
+        SingularValue = PredVal;
+        isFirstPred = false;
+      } else if (PredVal != SingularValue)
+        SingularValue = 0;
+    }
+  }
+
+  // If there are no predecessors, just return undef.
+  if (PredValues.empty())
+    return UndefValue::get(ProtoType);
+
+  // Otherwise, if all the merged values are the same, just use it.
+  if (SingularValue != 0)
+    return SingularValue;
+
+  // Otherwise, we do need a PHI: check to see if we already have one available
+  // in this block that produces the right value.
+  if (isa<PHINode>(BB->begin())) {
+    DenseMap<BasicBlock*, Value*> ValueMapping(PredValues.begin(),
+                                               PredValues.end());
+    PHINode *SomePHI;
+    for (BasicBlock::iterator It = BB->begin();
+         (SomePHI = dyn_cast<PHINode>(It)); ++It) {
+      if (IsEquivalentPHI(SomePHI, ValueMapping))
+        return SomePHI;
+    }
+  }
+
+  // Ok, we have no way out, insert a new one now.
+  PHINode *InsertedPHI = PHINode::Create(ProtoType, ProtoName, &BB->front());
+  InsertedPHI->reserveOperandSpace(PredValues.size());
+
+  // Fill in all the predecessors of the PHI.
+  for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
+    InsertedPHI->addIncoming(PredValues[i].second, PredValues[i].first);
+
+  // See if the PHI node can be merged to a single value. This can happen in
+  // loop cases when we get a PHI of itself and one other value.
+  if (Value *V = SimplifyInstruction(InsertedPHI)) {
+    InsertedPHI->eraseFromParent();
+    return V;
+  }
+
+  // If the client wants to know about all new instructions, tell it.
+  if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
+
+  DEBUG(dbgs() << "  Inserted PHI: " << *InsertedPHI << "\n");
+  return InsertedPHI;
+}
+
+/// RewriteUse - Rewrite a use of the symbolic value. This handles PHI nodes,
+/// which use their value in the corresponding predecessor.
+void SSAUpdater::RewriteUse(Use &U) {
+  Instruction *User = cast<Instruction>(U.getUser());
+
+  Value *V;
+  if (PHINode *UserPN = dyn_cast<PHINode>(User))
+    V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
+  else
+    V = GetValueInMiddleOfBlock(User->getParent());
+
+  U.set(V);
+}
+
+/// RewriteUseAfterInsertions - Rewrite a use, just like RewriteUse. However,
+/// this version of the method can rewrite uses in the same block as a
+/// definition, because it assumes that all uses of a value are below any
+/// inserted values.
+void SSAUpdater::RewriteUseAfterInsertions(Use &U) {
+  Instruction *User = cast<Instruction>(U.getUser());
+
+  Value *V;
+  if (PHINode *UserPN = dyn_cast<PHINode>(User))
+    V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
+  else
+    V = GetValueAtEndOfBlock(User->getParent());
+
+  U.set(V);
+}
+
+/// PHIiter - Iterator for PHI operands. This is used for the PHI_iterator
+/// in the SSAUpdaterImpl template.
+namespace {
+  class PHIiter {
+  private:
+    PHINode *PHI;
+    unsigned idx;
+
+  public:
+    explicit PHIiter(PHINode *P) // begin iterator
+      : PHI(P), idx(0) {}
+    PHIiter(PHINode *P, bool) // end iterator
+      : PHI(P), idx(PHI->getNumIncomingValues()) {}
+
+    PHIiter &operator++() { ++idx; return *this; }
+    bool operator==(const PHIiter& x) const { return idx == x.idx; }
+    bool operator!=(const PHIiter& x) const { return !operator==(x); }
+    Value *getIncomingValue() { return PHI->getIncomingValue(idx); }
+    BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); }
+  };
+}
+
+/// SSAUpdaterTraits<SSAUpdater> - Traits for the SSAUpdaterImpl template,
+/// specialized for SSAUpdater.
+namespace llvm {
+template<>
+class SSAUpdaterTraits<SSAUpdater> {
+public:
+  typedef BasicBlock BlkT;
+  typedef Value *ValT;
+  typedef PHINode PhiT;
+
+  typedef succ_iterator BlkSucc_iterator;
+  static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return succ_begin(BB); }
+  static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return succ_end(BB); }
+
+  typedef PHIiter PHI_iterator;
+  static inline PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
+  static inline PHI_iterator PHI_end(PhiT *PHI) {
+    return PHI_iterator(PHI, true);
+  }
+
+  /// FindPredecessorBlocks - Put the predecessors of Info->BB into the Preds
+  /// vector, set Info->NumPreds, and allocate space in Info->Preds.
+  static void FindPredecessorBlocks(BasicBlock *BB,
+                                    SmallVectorImpl<BasicBlock*> *Preds) {
+    // We can get our predecessor info by walking the pred_iterator list,
+    // but it is relatively slow. If we already have PHI nodes in this
+    // block, walk one of them to get the predecessor list instead.
+    if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin())) {
+      for (unsigned PI = 0, E = SomePhi->getNumIncomingValues(); PI != E; ++PI)
+        Preds->push_back(SomePhi->getIncomingBlock(PI));
+    } else {
+      for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+        Preds->push_back(*PI);
+    }
+  }
+
+  /// GetUndefVal - Get an undefined value of the same type as the value
+  /// being handled.
+  static Value *GetUndefVal(BasicBlock *BB, SSAUpdater *Updater) {
+    return UndefValue::get(Updater->ProtoType);
+  }
+
+  /// CreateEmptyPHI - Create a new PHI instruction in the specified block.
+  /// Reserve space for the operands but do not fill them in yet.
+  static Value *CreateEmptyPHI(BasicBlock *BB, unsigned NumPreds,
+                               SSAUpdater *Updater) {
+    PHINode *PHI = PHINode::Create(Updater->ProtoType, Updater->ProtoName,
+                                   &BB->front());
+    PHI->reserveOperandSpace(NumPreds);
+    return PHI;
+  }
+
+  /// AddPHIOperand - Add the specified value as an operand of the PHI for
+  /// the specified predecessor block.
+  static void AddPHIOperand(PHINode *PHI, Value *Val, BasicBlock *Pred) {
+    PHI->addIncoming(Val, Pred);
+  }
+
+  /// InstrIsPHI - Check if an instruction is a PHI.
+  ///
+  static PHINode *InstrIsPHI(Instruction *I) {
+    return dyn_cast<PHINode>(I);
+  }
+
+  /// ValueIsPHI - Check if a value is a PHI.
+  ///
+  static PHINode *ValueIsPHI(Value *Val, SSAUpdater *Updater) {
+    return dyn_cast<PHINode>(Val);
+  }
+
+  /// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source
+  /// operands, i.e., it was just added.
+  static PHINode *ValueIsNewPHI(Value *Val, SSAUpdater *Updater) {
+    PHINode *PHI = ValueIsPHI(Val, Updater);
+    if (PHI && PHI->getNumIncomingValues() == 0)
+      return PHI;
+    return 0;
+  }
+
+  /// GetPHIValue - For the specified PHI instruction, return the value
+  /// that it defines.
+  static Value *GetPHIValue(PHINode *PHI) {
+    return PHI;
+  }
+};
+
+} // End llvm namespace
+
+/// GetValueAtEndOfBlockInternal - Check to see if AvailableVals has an entry
+/// for the specified BB and if so, return it. If not, construct SSA form by
+/// first calculating the required placement of PHIs and then inserting new
+/// PHIs where needed.
+Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
+  AvailableValsTy &AvailableVals = getAvailableVals(AV);
+  if (Value *V = AvailableVals[BB])
+    return V;
+
+  SSAUpdaterImpl<SSAUpdater> Impl(this, &AvailableVals, InsertedPHIs);
+  return Impl.GetValue(BB);
+}
+
+//===----------------------------------------------------------------------===//
+// LoadAndStorePromoter Implementation
+//===----------------------------------------------------------------------===//
+
+LoadAndStorePromoter::
+LoadAndStorePromoter(const SmallVectorImpl<Instruction*> &Insts,
+                     SSAUpdater &S, StringRef BaseName) : SSA(S) {
+  if (Insts.empty()) return;
+
+  // Pick a representative value to derive the promoted type (and name) from:
+  // a load is itself such a value, a store contributes its stored operand.
+  Value *SomeVal;
+  if (LoadInst *LI = dyn_cast<LoadInst>(Insts[0]))
+    SomeVal = LI;
+  else
+    SomeVal = cast<StoreInst>(Insts[0])->getOperand(0);
+
+  if (BaseName.empty())
+    BaseName = SomeVal->getName();
+  SSA.Initialize(SomeVal->getType(), BaseName);
+}
+
+
+void LoadAndStorePromoter::
+run(const SmallVectorImpl<Instruction*> &Insts) const {
+
+  // First step: bucket up uses of the alloca by the block they occur in.
+  // This is important because we have to handle multiple defs/uses in a block
+  // ourselves: SSAUpdater is purely for cross-block references.
+  // FIXME: Want a TinyVector<Instruction*> since there is often 0/1 element.
+  DenseMap<BasicBlock*, std::vector<Instruction*> > UsesByBlock;
+
+  for (unsigned i = 0, e = Insts.size(); i != e; ++i) {
+    Instruction *User = Insts[i];
+    UsesByBlock[User->getParent()].push_back(User);
+  }
+
+  // Okay, now we can iterate over all the blocks in the function with uses,
+  // processing them. Keep track of which loads are loading a live-in value.
+  // Walk the uses in the use-list order to be deterministic.
+  SmallVector<LoadInst*, 32> LiveInLoads;
+  DenseMap<Value*, Value*> ReplacedLoads;
+
+  for (unsigned i = 0, e = Insts.size(); i != e; ++i) {
+    Instruction *User = Insts[i];
+    BasicBlock *BB = User->getParent();
+    std::vector<Instruction*> &BlockUses = UsesByBlock[BB];
+
+    // If this block has already been processed, ignore this repeat use.
+    if (BlockUses.empty()) continue;
+
+    // Okay, this is the first use in the block. If this block just has a
+    // single user in it, we can rewrite it trivially.
+    if (BlockUses.size() == 1) {
+      // If it is a store, it is a trivial def of the value in the block.
+      if (StoreInst *SI = dyn_cast<StoreInst>(User))
+        SSA.AddAvailableValue(BB, SI->getOperand(0));
+      else
+        // Otherwise it is a load, queue it to rewrite as a live-in load.
+        LiveInLoads.push_back(cast<LoadInst>(User));
+      BlockUses.clear();
+      continue;
+    }
+
+    // Otherwise, check to see if this block is all loads.
+    bool HasStore = false;
+    for (unsigned i = 0, e = BlockUses.size(); i != e; ++i) {
+      if (isa<StoreInst>(BlockUses[i])) {
+        HasStore = true;
+        break;
+      }
+    }
+
+    // If so, we can queue them all as live in loads. We don't have an
+    // efficient way to tell which one is first in the block and don't want to
+    // scan large blocks, so just add all loads as live ins.
+    if (!HasStore) {
+      for (unsigned i = 0, e = BlockUses.size(); i != e; ++i)
+        LiveInLoads.push_back(cast<LoadInst>(BlockUses[i]));
+      BlockUses.clear();
+      continue;
+    }
+
+    // Otherwise, we have mixed loads and stores (or just a bunch of stores).
+    // Since SSAUpdater is purely for cross-block values, we need to determine
+    // the order of these instructions in the block. If the first use in the
+    // block is a load, then it uses the live in value. The last store defines
+    // the live out value. We handle this by doing a linear scan of the block.
+    Value *StoredValue = 0;
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) {
+      if (LoadInst *L = dyn_cast<LoadInst>(II)) {
+        // If this is a load from an unrelated pointer, ignore it.
+        if (!isInstInList(L, Insts)) continue;
+
+        // If we haven't seen a store yet, this is a live in use, otherwise
+        // use the stored value.
+        if (StoredValue) {
+          replaceLoadWithValue(L, StoredValue);
+          L->replaceAllUsesWith(StoredValue);
+          ReplacedLoads[L] = StoredValue;
+        } else {
+          LiveInLoads.push_back(L);
+        }
+        continue;
+      }
+
+      if (StoreInst *S = dyn_cast<StoreInst>(II)) {
+        // If this is a store to an unrelated pointer, ignore it.
+        if (!isInstInList(S, Insts)) continue;
+
+        // Remember that this is the active value in the block.
+        StoredValue = S->getOperand(0);
+      }
+    }
+
+    // The last stored value that happened is the live-out for the block.
+    assert(StoredValue && "Already checked that there is a store in block");
+    SSA.AddAvailableValue(BB, StoredValue);
+    BlockUses.clear();
+  }
+
+  // Okay, now we rewrite all loads that use live-in values in the loop,
+  // inserting PHI nodes as necessary.
+  for (unsigned i = 0, e = LiveInLoads.size(); i != e; ++i) {
+    LoadInst *ALoad = LiveInLoads[i];
+    Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent());
+    replaceLoadWithValue(ALoad, NewVal);
+
+    // Avoid assertions in unreachable code.
+    if (NewVal == ALoad) NewVal = UndefValue::get(NewVal->getType());
+    ALoad->replaceAllUsesWith(NewVal);
+    ReplacedLoads[ALoad] = NewVal;
+  }
+
+  // Allow the client to do stuff before we start nuking things.
+  doExtraRewritesBeforeFinalDeletion();
+
+  // Now that everything is rewritten, delete the old instructions from the
+  // function. They should all be dead now.
+  for (unsigned i = 0, e = Insts.size(); i != e; ++i) {
+    Instruction *User = Insts[i];
+
+    // If this is a load that still has uses, then the load must have been added
+    // as a live value in the SSAUpdate data structure for a block (e.g. because
+    // the loaded value was stored later). In this case, we need to recursively
+    // propagate the updates until we get to the real value.
+    if (!User->use_empty()) {
+      Value *NewVal = ReplacedLoads[User];
+      assert(NewVal && "not a replaced load?");
+
+      // Propagate down to the ultimate replacee. The intermediate loads
+      // could theoretically already have been deleted, so we don't want to
+      // dereference the Value*'s.
+      DenseMap<Value*, Value*>::iterator RLI = ReplacedLoads.find(NewVal);
+      while (RLI != ReplacedLoads.end()) {
+        NewVal = RLI->second;
+        RLI = ReplacedLoads.find(NewVal);
+      }
+
+      replaceLoadWithValue(cast<LoadInst>(User), NewVal);
+      User->replaceAllUsesWith(NewVal);
+    }
+
+    instructionDeleted(User);
+    User->eraseFromParent();
+  }
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
new file mode 100644
index 0000000..fb660db
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -0,0 +1,2536 @@
+//===- SimplifyCFG.cpp - Code to perform CFG simplification ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Peephole optimize the CFG.
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "simplifycfg" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Constants.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Type.h" +#include "llvm/DerivedTypes.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ConstantRange.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <set> +#include <map> +using namespace llvm; + +static cl::opt<bool> +DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false), + cl::desc("Duplicate return instructions into unconditional branches")); + +STATISTIC(NumSpeculations, "Number of speculative executed instructions"); + +namespace { +class SimplifyCFGOpt { + const TargetData *const TD; + + Value *isValueEqualityComparison(TerminatorInst *TI); + BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI, + std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases); + bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, + BasicBlock *Pred); + bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI); + + bool SimplifyReturn(ReturnInst *RI); + bool SimplifyUnwind(UnwindInst *UI); + bool SimplifyUnreachable(UnreachableInst *UI); + bool SimplifySwitch(SwitchInst *SI); + bool SimplifyIndirectBr(IndirectBrInst *IBI); + bool SimplifyUncondBranch(BranchInst *BI); + bool SimplifyCondBranch(BranchInst *BI); + +public: + explicit SimplifyCFGOpt(const TargetData *td) : TD(td) {} + bool run(BasicBlock *BB); +}; +} + +/// 
SafeToMergeTerminators - Return true if it is safe to merge these two
+/// terminator instructions together.
+///
+static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) {
+  if (SI1 == SI2) return false;  // Can't merge with self!
+
+  // It is not safe to merge these two switch instructions if they have a common
+  // successor, and if that successor has a PHI node, and if *that* PHI node has
+  // conflicting incoming values from the two switch blocks.  (Merging would
+  // leave the PHI with two different values required from what becomes a
+  // single predecessor edge.)
+  BasicBlock *SI1BB = SI1->getParent();
+  BasicBlock *SI2BB = SI2->getParent();
+  SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
+
+  for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I)
+    if (SI1Succs.count(*I))
+      for (BasicBlock::iterator BBI = (*I)->begin();
+           isa<PHINode>(BBI); ++BBI) {
+        PHINode *PN = cast<PHINode>(BBI);
+        if (PN->getIncomingValueForBlock(SI1BB) !=
+            PN->getIncomingValueForBlock(SI2BB))
+          return false;
+      }
+
+  return true;
+}
+
+/// AddPredecessorToBlock - Update PHI nodes in Succ to indicate that there will
+/// now be entries in it from the 'NewPred' block.  The values that will be
+/// flowing into the PHI nodes will be the same as those coming in from
+/// ExistPred, an existing predecessor of Succ.
+static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
+                                  BasicBlock *ExistPred) {
+  if (!isa<PHINode>(Succ->begin())) return; // Quick exit if nothing to do
+
+  PHINode *PN;
+  for (BasicBlock::iterator I = Succ->begin();
+       (PN = dyn_cast<PHINode>(I)); ++I)
+    PN->addIncoming(PN->getIncomingValueForBlock(ExistPred), NewPred);
+}
+
+
+/// GetIfCondition - Given a basic block (BB) with two predecessors (and at
+/// least one PHI node in it), check to see if the merge at this block is due
+/// to an "if condition".  If so, return the boolean condition that determines
+/// which entry into BB will be taken.  Also, return by references the block
+/// that will be entered from if the condition is true, and the block that will
+/// be entered if the condition is false.
+///
+/// This does no checking to see if the true/false blocks have large or unsavory
+/// instructions in them.
+static Value *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
+                             BasicBlock *&IfFalse) {
+  PHINode *SomePHI = cast<PHINode>(BB->begin());
+  assert(SomePHI->getNumIncomingValues() == 2 &&
+         "Function can only handle blocks with 2 predecessors!");
+  BasicBlock *Pred1 = SomePHI->getIncomingBlock(0);
+  BasicBlock *Pred2 = SomePHI->getIncomingBlock(1);
+
+  // We can only handle branches.  Other control flow will be lowered to
+  // branches if possible anyway.
+  BranchInst *Pred1Br = dyn_cast<BranchInst>(Pred1->getTerminator());
+  BranchInst *Pred2Br = dyn_cast<BranchInst>(Pred2->getTerminator());
+  if (Pred1Br == 0 || Pred2Br == 0)
+    return 0;
+
+  // Eliminate code duplication by ensuring that Pred1Br is conditional if
+  // either are.
+  if (Pred2Br->isConditional()) {
+    // If both branches are conditional, we don't have an "if statement".  In
+    // reality, we could transform this case, but since the condition will be
+    // required anyway, we stand no chance of eliminating it, so the xform is
+    // probably not profitable.
+    if (Pred1Br->isConditional())
+      return 0;
+
+    std::swap(Pred1, Pred2);
+    std::swap(Pred1Br, Pred2Br);
+  }
+
+  if (Pred1Br->isConditional()) {
+    // "Triangle" shape: Pred1 ends in a conditional branch and Pred2 is the
+    // conditionally-executed block.  The only thing we have to watch out for
+    // here is to make sure that Pred2 doesn't have incoming edges from other
+    // blocks.  If it does, the condition doesn't dominate BB.
+    if (Pred2->getSinglePredecessor() == 0)
+      return 0;
+
+    // If we found a conditional branch predecessor, make sure that it branches
+    // to BB and Pred2Br.  If it doesn't, this isn't an "if statement".
+    if (Pred1Br->getSuccessor(0) == BB &&
+        Pred1Br->getSuccessor(1) == Pred2) {
+      IfTrue = Pred1;
+      IfFalse = Pred2;
+    } else if (Pred1Br->getSuccessor(0) == Pred2 &&
+               Pred1Br->getSuccessor(1) == BB) {
+      IfTrue = Pred2;
+      IfFalse = Pred1;
+    } else {
+      // We know that one arm of the conditional goes to BB, so the other must
+      // go somewhere unrelated, and this must not be an "if statement".
+      return 0;
+    }
+
+    return Pred1Br->getCondition();
+  }
+
+  // Ok, if we got here, both predecessors end with an unconditional branch to
+  // BB ("diamond" shape).  Don't panic!  If both blocks only have a single
+  // (identical) predecessor, and THAT is a conditional branch, then we're all
+  // ok!
+  BasicBlock *CommonPred = Pred1->getSinglePredecessor();
+  if (CommonPred == 0 || CommonPred != Pred2->getSinglePredecessor())
+    return 0;
+
+  // Otherwise, if this is a conditional branch, then we can use it!
+  BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator());
+  if (BI == 0) return 0;
+
+  assert(BI->isConditional() && "Two successors but not conditional?");
+  if (BI->getSuccessor(0) == Pred1) {
+    IfTrue = Pred1;
+    IfFalse = Pred2;
+  } else {
+    IfTrue = Pred2;
+    IfFalse = Pred1;
+  }
+  return BI->getCondition();
+}
+
+/// DominatesMergePoint - If we have a merge point of an "if condition" as
+/// accepted above, return true if the specified value dominates the block.  We
+/// don't handle the true generality of domination here, just a special case
+/// which works well enough for us.
+///
+/// If AggressiveInsts is non-null, and if V does not dominate BB, we check to
+/// see if V (which must be an instruction) is cheap to compute and is
+/// non-trapping.  If both are true, the instruction is inserted into the set
+/// and true is returned.
+static bool DominatesMergePoint(Value *V, BasicBlock *BB,
+                                SmallPtrSet<Instruction*, 4> *AggressiveInsts) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) {
+    // Non-instructions all dominate instructions, but not all constantexprs
+    // can be executed unconditionally.
+    if (ConstantExpr *C = dyn_cast<ConstantExpr>(V))
+      if (C->canTrap())
+        return false;
+    return true;
+  }
+  BasicBlock *PBB = I->getParent();
+
+  // We don't want to allow weird loops that might have the "if condition" in
+  // the bottom of this block.
+  if (PBB == BB) return false;
+
+  // If this instruction is defined in a block that contains an unconditional
+  // branch to BB, then it must be in the 'conditional' part of the "if
+  // statement".  If not, it definitely dominates the region.
+  BranchInst *BI = dyn_cast<BranchInst>(PBB->getTerminator());
+  if (BI == 0 || BI->isConditional() || BI->getSuccessor(0) != BB)
+    return true;
+
+  // If we aren't allowing aggressive promotion anymore, then don't consider
+  // instructions in the 'if region'.
+  if (AggressiveInsts == 0) return false;
+
+  // Okay, it looks like the instruction IS in the "condition".  Check to
+  // see if it's a cheap instruction to unconditionally compute, and if it
+  // only uses stuff defined outside of the condition.  If so, hoist it out.
+  if (!I->isSafeToSpeculativelyExecute())
+    return false;
+
+  switch (I->getOpcode()) {
+  default: return false;  // Cannot hoist this out safely.
+  case Instruction::Load:
+    // We have to check to make sure there are no instructions before the
+    // load in its basic block, as we are going to hoist the load out to its
+    // predecessor.
+    if (PBB->getFirstNonPHIOrDbg() != I)
+      return false;
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::ICmp:
+    break;   // These are all cheap and non-trapping instructions.
+  }
+
+  // Okay, we can only really hoist these out if their operands are not
+  // defined in the conditional region.  Note the recursive call passes a
+  // null set: operands must dominate BB outright, not merely be hoistable.
+  for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
+    if (!DominatesMergePoint(*i, BB, 0))
+      return false;
+  // Okay, it's safe to do this!  Remember this instruction.
+  AggressiveInsts->insert(I);
+  return true;
+}
+
+/// GetConstantInt - Extract ConstantInt from value, looking through IntToPtr
+/// and PointerNullValue. Return NULL if value is not a constant int.
+static ConstantInt *GetConstantInt(Value *V, const TargetData *TD) {
+  // Normal constant int.
+  ConstantInt *CI = dyn_cast<ConstantInt>(V);
+  if (CI || !TD || !isa<Constant>(V) || !V->getType()->isPointerTy())
+    return CI;
+
+  // This is some kind of pointer constant. Turn it into a pointer-sized
+  // ConstantInt if possible.
+  const IntegerType *PtrTy = TD->getIntPtrType(V->getContext());
+
+  // Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*).
+  if (isa<ConstantPointerNull>(V))
+    return ConstantInt::get(PtrTy, 0);
+
+  // IntToPtr const int.
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    if (CE->getOpcode() == Instruction::IntToPtr)
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(CE->getOperand(0))) {
+        // The constant is very likely to have the right type already.
+        if (CI->getType() == PtrTy)
+          return CI;
+        else
+          return cast<ConstantInt>
+            (ConstantExpr::getIntegerCast(CI, PtrTy, /*isSigned=*/false));
+      }
+  return 0;
+}
+
+/// GatherConstantCompares - Given a potentially 'or'd or 'and'd together
+/// collection of icmp eq/ne instructions that compare a value against a
+/// constant, return the value being compared, and stick the constant into the
+/// Values vector.
+static Value *
+GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra,
+                       const TargetData *TD, bool isEQ, unsigned &UsedICmps) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (I == 0) return 0;
+
+  // If this is an icmp against a constant, handle this as one of the cases.
+  if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) {
+    if (ConstantInt *C = GetConstantInt(I->getOperand(1), TD)) {
+      if (ICI->getPredicate() == (isEQ ? ICmpInst::ICMP_EQ:ICmpInst::ICMP_NE)) {
+        UsedICmps++;
+        Vals.push_back(C);
+        return I->getOperand(0);
+      }
+
+      // If we have "x ult 3" comparison, for example, then we can add 0,1,2 to
+      // the set.
+      ConstantRange Span =
+        ConstantRange::makeICmpRegion(ICI->getPredicate(), C->getValue());
+
+      // If this is an and/!= check then we want to optimize "x ugt 2" into
+      // x != 0 && x != 1.
+      if (!isEQ)
+        Span = Span.inverse();
+
+      // If there are a ton of values, we don't want to make a ginormous switch.
+      if (Span.getSetSize().ugt(8) || Span.isEmptySet() ||
+          // We don't handle wrapped sets yet.
+          Span.isWrappedSet())
+        return 0;
+
+      // Enumerate every value in the (small, non-wrapped) range.
+      for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
+        Vals.push_back(ConstantInt::get(V->getContext(), Tmp));
+      UsedICmps++;
+      return I->getOperand(0);
+    }
+    return 0;
+  }
+
+  // Otherwise, we can only handle an | or &, depending on isEQ.
+  if (I->getOpcode() != (isEQ ? Instruction::Or : Instruction::And))
+    return 0;
+
+  // Remember the state before recursing so failed sub-matches can be undone.
+  unsigned NumValsBeforeLHS = Vals.size();
+  unsigned UsedICmpsBeforeLHS = UsedICmps;
+  if (Value *LHS = GatherConstantCompares(I->getOperand(0), Vals, Extra, TD,
+                                          isEQ, UsedICmps)) {
+    unsigned NumVals = Vals.size();
+    unsigned UsedICmpsBeforeRHS = UsedICmps;
+    if (Value *RHS = GatherConstantCompares(I->getOperand(1), Vals, Extra, TD,
+                                            isEQ, UsedICmps)) {
+      if (LHS == RHS)
+        return LHS;
+      // Both sides matched but against different values; drop the RHS cases.
+      Vals.resize(NumVals);
+      UsedICmps = UsedICmpsBeforeRHS;
+    }
+
+    // The RHS of the or/and can't be folded in and we haven't used "Extra" yet,
+    // set it and return success.
+    if (Extra == 0 || Extra == I->getOperand(1)) {
+      Extra = I->getOperand(1);
+      return LHS;
+    }
+
+    Vals.resize(NumValsBeforeLHS);
+    UsedICmps = UsedICmpsBeforeLHS;
+    return 0;
+  }
+
+  // If the LHS can't be folded in, but Extra is available and RHS can, try to
+  // use LHS as Extra.
+  if (Extra == 0 || Extra == I->getOperand(0)) {
+    Value *OldExtra = Extra;
+    Extra = I->getOperand(0);
+    if (Value *RHS = GatherConstantCompares(I->getOperand(1), Vals, Extra, TD,
+                                            isEQ, UsedICmps))
+      return RHS;
+    assert(Vals.size() == NumValsBeforeLHS);
+    Extra = OldExtra;
+  }
+
+  return 0;
+}
+
+/// EraseTerminatorInstAndDCECond - Remove TI from its block, then delete the
+/// value it branched/switched on if that instruction has become trivially
+/// dead as a result.
+static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) {
+  Instruction* Cond = 0;
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+    Cond = dyn_cast<Instruction>(SI->getCondition());
+  } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+    if (BI->isConditional())
+      Cond = dyn_cast<Instruction>(BI->getCondition());
+  } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(TI)) {
+    Cond = dyn_cast<Instruction>(IBI->getAddress());
+  }
+
+  TI->eraseFromParent();
+  if (Cond) RecursivelyDeleteTriviallyDeadInstructions(Cond);
+}
+
+/// isValueEqualityComparison - If the specified terminator checks to see if a
+/// value is equal to a constant integer value (a switch, or a conditional
+/// branch on an eq/ne icmp against a constant), return the value being
+/// compared; otherwise return null.
+Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
+  Value *CV = 0;
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+    // Do not permit merging of large switch instructions into their
+    // predecessors unless there is only one predecessor.
+    if (SI->getNumSuccessors()*std::distance(pred_begin(SI->getParent()),
+                                             pred_end(SI->getParent())) <= 128)
+      CV = SI->getCondition();
+  } else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
+    if (BI->isConditional() && BI->getCondition()->hasOneUse())
+      if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition()))
+        if ((ICI->getPredicate() == ICmpInst::ICMP_EQ ||
+             ICI->getPredicate() == ICmpInst::ICMP_NE) &&
+            GetConstantInt(ICI->getOperand(1), TD))
+          CV = ICI->getOperand(0);
+
+  // Unwrap any lossless ptrtoint cast.
+  if (TD && CV && CV->getType() == TD->getIntPtrType(CV->getContext()))
+    if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV))
+      CV = PTII->getOperand(0);
+  return CV;
+}
+
+/// GetValueEqualityComparisonCases - Given a value comparison instruction,
+/// decode all of the 'cases' that it represents and return the 'default' block.
+BasicBlock *SimplifyCFGOpt::
+GetValueEqualityComparisonCases(TerminatorInst *TI,
+                                std::vector<std::pair<ConstantInt*,
+                                                      BasicBlock*> > &Cases) {
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+    Cases.reserve(SI->getNumCases());
+    // Case slot 0 is the default destination in this SwitchInst API, hence
+    // the loop starts at 1.
+    for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i)
+      Cases.push_back(std::make_pair(SI->getCaseValue(i), SI->getSuccessor(i)));
+    return SI->getDefaultDest();
+  }
+
+  // A conditional branch on "x ==/!= c" is a one-case comparison: the taken
+  // successor for the matched value, and the other successor as the default.
+  BranchInst *BI = cast<BranchInst>(TI);
+  ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+  Cases.push_back(std::make_pair(GetConstantInt(ICI->getOperand(1), TD),
+                                 BI->getSuccessor(ICI->getPredicate() ==
+                                                  ICmpInst::ICMP_NE)));
+  return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ);
+}
+
+
+/// EliminateBlockCases - Given a vector of bb/value pairs, remove any entries
+/// in the list that match the specified block.
+static void EliminateBlockCases(BasicBlock *BB,
+               std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases) {
+  for (unsigned i = 0, e = Cases.size(); i != e; ++i)
+    if (Cases[i].second == BB) {
+      Cases.erase(Cases.begin()+i);
+      --i; --e;
+    }
+}
+
+/// ValuesOverlap - Return true if there are any keys in C1 that exist in C2 as
+/// well.
+///
+/// NOTE(review): this may sort (and therefore reorder) both input vectors as
+/// a side effect; the ordering compares ConstantInt pointers, which is
+/// consistent because ConstantInts are uniqued.  Also, the size()==1 fast
+/// path falls through to the sorting path when no match is found — an early
+/// "return false;" there would avoid the needless sort.
+static bool
+ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1,
+              std::vector<std::pair<ConstantInt*, BasicBlock*> > &C2) {
+  std::vector<std::pair<ConstantInt*, BasicBlock*> > *V1 = &C1, *V2 = &C2;
+
+  // Make V1 be smaller than V2.
+  if (V1->size() > V2->size())
+    std::swap(V1, V2);
+
+  if (V1->size() == 0) return false;
+  if (V1->size() == 1) {
+    // Just scan V2.
+    ConstantInt *TheVal = (*V1)[0].first;
+    for (unsigned i = 0, e = V2->size(); i != e; ++i)
+      if (TheVal == (*V2)[i].first)
+        return true;
+  }
+
+  // Otherwise, just sort both lists and compare element by element.
+  array_pod_sort(V1->begin(), V1->end());
+  array_pod_sort(V2->begin(), V2->end());
+  unsigned i1 = 0, i2 = 0, e1 = V1->size(), e2 = V2->size();
+  while (i1 != e1 && i2 != e2) {
+    if ((*V1)[i1].first == (*V2)[i2].first)
+      return true;
+    if ((*V1)[i1].first < (*V2)[i2].first)
+      ++i1;
+    else
+      ++i2;
+  }
+  return false;
+}
+
+/// SimplifyEqualityComparisonWithOnlyPredecessor - If TI is known to be a
+/// terminator instruction and its block is known to only have a single
+/// predecessor block, check to see if that predecessor is also a value
+/// comparison with the same value, and if that comparison determines the
+/// outcome of this comparison.  If so, simplify TI.  This does a very limited
+/// form of jump threading.
+bool SimplifyCFGOpt::
+SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
+                                              BasicBlock *Pred) {
+  Value *PredVal = isValueEqualityComparison(Pred->getTerminator());
+  if (!PredVal) return false;  // Not a value comparison in predecessor.
+
+  Value *ThisVal = isValueEqualityComparison(TI);
+  assert(ThisVal && "This isn't a value comparison!!");
+  if (ThisVal != PredVal) return false;  // Different predicates.
+
+  // Find out information about when control will move from Pred to TI's block.
+  std::vector<std::pair<ConstantInt*, BasicBlock*> > PredCases;
+  BasicBlock *PredDef = GetValueEqualityComparisonCases(Pred->getTerminator(),
+                                                        PredCases);
+  EliminateBlockCases(PredDef, PredCases);  // Remove default from cases.
+
+  // Find information about how control leaves this block.
+  std::vector<std::pair<ConstantInt*, BasicBlock*> > ThisCases;
+  BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases);
+  EliminateBlockCases(ThisDef, ThisCases);  // Remove default from cases.
+
+  // If TI's block is the default block from Pred's comparison, potentially
+  // simplify TI based on this knowledge.
+  if (PredDef == TI->getParent()) {
+    // If we are here, we know that the value is none of those cases listed in
+    // PredCases.  If there are any cases in ThisCases that are in PredCases, we
+    // can simplify TI.
+    if (!ValuesOverlap(PredCases, ThisCases))
+      return false;
+
+    if (isa<BranchInst>(TI)) {
+      // Okay, one of the successors of this condbr is dead.  Convert it to a
+      // uncond br.
+      assert(ThisCases.size() == 1 && "Branch can only have one case!");
+      // Insert the new branch.
+      Instruction *NI = BranchInst::Create(ThisDef, TI);
+      (void) NI;
+
+      // Remove PHI node entries for the dead edge.
+      ThisCases[0].second->removePredecessor(TI->getParent());
+
+      DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+            << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n");
+
+      EraseTerminatorInstAndDCECond(TI);
+      return true;
+    }
+
+    SwitchInst *SI = cast<SwitchInst>(TI);
+    // Okay, TI has cases that are statically dead, prune them away.
+    SmallPtrSet<Constant*, 16> DeadCases;
+    for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+      DeadCases.insert(PredCases[i].first);
+
+    DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+                 << "Through successor TI: " << *TI);
+
+    // Walk downward so removeCase doesn't shift unvisited entries; slot 0 is
+    // the default destination and is deliberately never removed.
+    for (unsigned i = SI->getNumCases()-1; i != 0; --i)
+      if (DeadCases.count(SI->getCaseValue(i))) {
+        SI->getSuccessor(i)->removePredecessor(TI->getParent());
+        SI->removeCase(i);
+      }
+
+    DEBUG(dbgs() << "Leaving: " << *TI << "\n");
+    return true;
+  }
+
+  // Otherwise, TI's block must correspond to some matched value.  Find out
+  // which value (or set of values) this is.
+  ConstantInt *TIV = 0;
+  BasicBlock *TIBB = TI->getParent();
+  for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+    if (PredCases[i].second == TIBB) {
+      if (TIV != 0)
+        return false;  // Cannot handle multiple values coming to this block.
+      TIV = PredCases[i].first;
+    }
+  assert(TIV && "No edge from pred to succ?");
+
+  // Okay, we found the one constant that our value can be if we get into TI's
+  // BB.  Find out which successor will unconditionally be branched to.
+  BasicBlock *TheRealDest = 0;
+  for (unsigned i = 0, e = ThisCases.size(); i != e; ++i)
+    if (ThisCases[i].first == TIV) {
+      TheRealDest = ThisCases[i].second;
+      break;
+    }
+
+  // If not handled by any explicit cases, it is handled by the default case.
+  if (TheRealDest == 0) TheRealDest = ThisDef;
+
+  // Remove PHI node entries for dead edges.  CheckEdge is nulled once the
+  // surviving edge has been seen so duplicate edges to it are also pruned.
+  BasicBlock *CheckEdge = TheRealDest;
+  for (succ_iterator SI = succ_begin(TIBB), e = succ_end(TIBB); SI != e; ++SI)
+    if (*SI != CheckEdge)
+      (*SI)->removePredecessor(TIBB);
+    else
+      CheckEdge = 0;
+
+  // Insert the new branch.
+  Instruction *NI = BranchInst::Create(TheRealDest, TI);
+  (void) NI;
+
+  DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+        << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n");
+
+  EraseTerminatorInstAndDCECond(TI);
+  return true;
+}
+
+namespace {
+  /// ConstantIntOrdering - This class implements a stable ordering of constant
+  /// integers that does not depend on their address.  This is important for
+  /// applications that sort ConstantInt's to ensure uniqueness.
+  struct ConstantIntOrdering {
+    bool operator()(const ConstantInt *LHS, const ConstantInt *RHS) const {
+      return LHS->getValue().ult(RHS->getValue());
+    }
+  };
+}
+
+/// ConstantIntSortPredicate - qsort-style comparator for ConstantInt*.
+/// NOTE(review): it returns 1 when LHS < RHS, i.e. it yields a *descending*
+/// order — verify callers expect that.
+static int ConstantIntSortPredicate(const void *P1, const void *P2) {
+  const ConstantInt *LHS = *(const ConstantInt**)P1;
+  const ConstantInt *RHS = *(const ConstantInt**)P2;
+  if (LHS->getValue().ult(RHS->getValue()))
+    return 1;
+  if (LHS->getValue() == RHS->getValue())
+    return 0;
+  return -1;
+}
+
+/// FoldValueComparisonIntoPredecessors - The specified terminator is a value
+/// equality comparison instruction (either a switch or a branch on "X == c").
+/// See if any of the predecessors of the terminator block are value comparisons
+/// on the same value.  If so, and if safe to do so, fold them together.
+bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI) {
+  BasicBlock *BB = TI->getParent();
+  Value *CV = isValueEqualityComparison(TI);  // CondVal
+  assert(CV && "Not a comparison?");
+  bool Changed = false;
+
+  SmallVector<BasicBlock*, 16> Preds(pred_begin(BB), pred_end(BB));
+  while (!Preds.empty()) {
+    BasicBlock *Pred = Preds.pop_back_val();
+
+    // See if the predecessor is a comparison with the same value.
+    TerminatorInst *PTI = Pred->getTerminator();
+    Value *PCV = isValueEqualityComparison(PTI);  // PredCondVal
+
+    if (PCV == CV && SafeToMergeTerminators(TI, PTI)) {
+      // Figure out which 'cases' to copy from SI to PSI.
+      std::vector<std::pair<ConstantInt*, BasicBlock*> > BBCases;
+      BasicBlock *BBDefault = GetValueEqualityComparisonCases(TI, BBCases);
+
+      std::vector<std::pair<ConstantInt*, BasicBlock*> > PredCases;
+      BasicBlock *PredDefault = GetValueEqualityComparisonCases(PTI, PredCases);
+
+      // Based on whether the default edge from PTI goes to BB or not, fill in
+      // PredCases and PredDefault with the new switch cases we would like to
+      // build.
+      SmallVector<BasicBlock*, 8> NewSuccessors;
+
+      if (PredDefault == BB) {
+        // If this is the default destination from PTI, only the edges in TI
+        // that don't occur in PTI, or that branch to BB will be activated.
+        std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
+        for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+          if (PredCases[i].second != BB)
+            PTIHandled.insert(PredCases[i].first);
+          else {
+            // The default destination is BB, we don't need explicit targets.
+            // (swap-and-pop erase; i/e are adjusted to revisit the slot.)
+            std::swap(PredCases[i], PredCases.back());
+            PredCases.pop_back();
+            --i; --e;
+          }
+
+        // Reconstruct the new switch statement we will be building.
+        if (PredDefault != BBDefault) {
+          PredDefault->removePredecessor(Pred);
+          PredDefault = BBDefault;
+          NewSuccessors.push_back(BBDefault);
+        }
+        for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
+          if (!PTIHandled.count(BBCases[i].first) &&
+              BBCases[i].second != BBDefault) {
+            PredCases.push_back(BBCases[i]);
+            NewSuccessors.push_back(BBCases[i].second);
+          }
+
+      } else {
+        // If this is not the default destination from PSI, only the edges
+        // in SI that occur in PSI with a destination of BB will be
+        // activated.
+        std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
+        for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+          if (PredCases[i].second == BB) {
+            PTIHandled.insert(PredCases[i].first);
+            std::swap(PredCases[i], PredCases.back());
+            PredCases.pop_back();
+            --i; --e;
+          }
+
+        // Okay, now we know which constants were sent to BB from the
+        // predecessor.  Figure out where they will all go now.
+        for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
+          if (PTIHandled.count(BBCases[i].first)) {
+            // If this is one we are capable of getting...
+            PredCases.push_back(BBCases[i]);
+            NewSuccessors.push_back(BBCases[i].second);
+            PTIHandled.erase(BBCases[i].first);// This constant is taken care of
+          }
+
+        // If there are any constants vectored to BB that TI doesn't handle,
+        // they must go to the default destination of TI.
+        for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I =
+                                    PTIHandled.begin(),
+               E = PTIHandled.end(); I != E; ++I) {
+          PredCases.push_back(std::make_pair(*I, BBDefault));
+          NewSuccessors.push_back(BBDefault);
+        }
+      }
+
+      // Okay, at this point, we know which new successor Pred will get.  Make
+      // sure we update the number of entries in the PHI nodes for these
+      // successors.
+      for (unsigned i = 0, e = NewSuccessors.size(); i != e; ++i)
+        AddPredecessorToBlock(NewSuccessors[i], Pred, BB);
+
+      // Convert pointer to int before we switch.
+      if (CV->getType()->isPointerTy()) {
+        assert(TD && "Cannot switch on pointer without TargetData");
+        CV = new PtrToIntInst(CV, TD->getIntPtrType(CV->getContext()),
+                              "magicptr", PTI);
+      }
+
+      // Now that the successors are updated, create the new Switch instruction.
+      SwitchInst *NewSI = SwitchInst::Create(CV, PredDefault,
+                                             PredCases.size(), PTI);
+      for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+        NewSI->addCase(PredCases[i].first, PredCases[i].second);
+
+      EraseTerminatorInstAndDCECond(PTI);
+
+      // Okay, last check.  If BB is still a successor of PSI, then we must
+      // have an infinite loop case.  If so, add an infinitely looping block
+      // to handle the case to preserve the behavior of the code.
+      BasicBlock *InfLoopBlock = 0;
+      for (unsigned i = 0, e = NewSI->getNumSuccessors(); i != e; ++i)
+        if (NewSI->getSuccessor(i) == BB) {
+          if (InfLoopBlock == 0) {
+            // Insert it at the end of the function, because it's either code,
+            // or it won't matter if it's hot. :)
+            InfLoopBlock = BasicBlock::Create(BB->getContext(),
+                                              "infloop", BB->getParent());
+            BranchInst::Create(InfLoopBlock, InfLoopBlock);
+          }
+          NewSI->setSuccessor(i, InfLoopBlock);
+        }
+
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+// isSafeToHoistInvoke - If we would need to insert a select that uses the
+// value of this invoke (comments in HoistThenElseCodeToIf explain why we
+// would need to do this), we can't hoist the invoke, as there is nowhere
+// to put the select in this case.
+static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2,
+                                Instruction *I1, Instruction *I2) {
+  for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) {
+    PHINode *PN;
+    for (BasicBlock::iterator BBI = SI->begin();
+         (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+      Value *BB1V = PN->getIncomingValueForBlock(BB1);
+      Value *BB2V = PN->getIncomingValueForBlock(BB2);
+      if (BB1V != BB2V && (BB1V==I1 || BB2V==I2)) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// HoistThenElseCodeToIf - Given a conditional branch that goes to BB1 and
+/// BB2, hoist any common code in the two blocks up into the branch block.  The
+/// caller of this function guarantees that BI's block dominates BB1 and BB2.
+static bool HoistThenElseCodeToIf(BranchInst *BI) {
+  // This does very trivial matching, with limited scanning, to find identical
+  // instructions in the two blocks.  In particular, we don't want to get into
+  // O(M*N) situations here where M and N are the sizes of BB1 and BB2.  As
+  // such, we currently just scan for obviously identical instructions in an
+  // identical order.
+  BasicBlock *BB1 = BI->getSuccessor(0);  // The true destination.
+  BasicBlock *BB2 = BI->getSuccessor(1);  // The false destination
+
+  BasicBlock::iterator BB1_Itr = BB1->begin();
+  BasicBlock::iterator BB2_Itr = BB2->begin();
+
+  // Advance both iterators to the first non-debug instruction in each block.
+  Instruction *I1 = BB1_Itr++, *I2 = BB2_Itr++;
+  while (isa<DbgInfoIntrinsic>(I1))
+    I1 = BB1_Itr++;
+  while (isa<DbgInfoIntrinsic>(I2))
+    I2 = BB2_Itr++;
+  if (I1->getOpcode() != I2->getOpcode() || isa<PHINode>(I1) ||
+      !I1->isIdenticalToWhenDefined(I2) ||
+      (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)))
+    return false;
+
+  // If we get here, we can hoist at least one instruction.
+  BasicBlock *BIParent = BI->getParent();
+
+  do {
+    // If we are hoisting the terminator instruction, don't move one (making a
+    // broken BB), instead clone it, and remove BI.
+    if (isa<TerminatorInst>(I1))
+      goto HoistTerminator;
+
+    // For a normal instruction, we just move one to right before the branch,
+    // then replace all uses of the other with the first.  Finally, we remove
+    // the now redundant second instruction.
+    BIParent->getInstList().splice(BI, BB1->getInstList(), I1);
+    if (!I2->use_empty())
+      I2->replaceAllUsesWith(I1);
+    I1->intersectOptionalDataWith(I2);
+    I2->eraseFromParent();
+
+    I1 = BB1_Itr++;
+    while (isa<DbgInfoIntrinsic>(I1))
+      I1 = BB1_Itr++;
+    I2 = BB2_Itr++;
+    while (isa<DbgInfoIntrinsic>(I2))
+      I2 = BB2_Itr++;
+  } while (I1->getOpcode() == I2->getOpcode() &&
+           I1->isIdenticalToWhenDefined(I2));
+
+  return true;
+
+HoistTerminator:
+  // It may not be possible to hoist an invoke.
+  if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))
+    return true;
+
+  // Okay, it is safe to hoist the terminator.
+  Instruction *NT = I1->clone();
+  BIParent->getInstList().insert(BI, NT);
+  if (!NT->getType()->isVoidTy()) {
+    I1->replaceAllUsesWith(NT);
+    I2->replaceAllUsesWith(NT);
+    NT->takeName(I1);
+  }
+
+  // Hoisting one of the terminators from our successor is a great thing.
+  // Unfortunately, the successors of the if/else blocks may have PHI nodes in
+  // them.  If they do, all PHI entries for BB1/BB2 must agree for all PHI
+  // nodes, so we insert select instruction to compute the final result.
+  std::map<std::pair<Value*,Value*>, SelectInst*> InsertedSelects;
+  for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) {
+    PHINode *PN;
+    for (BasicBlock::iterator BBI = SI->begin();
+         (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+      Value *BB1V = PN->getIncomingValueForBlock(BB1);
+      Value *BB2V = PN->getIncomingValueForBlock(BB2);
+      if (BB1V == BB2V) continue;
+
+      // These values do not agree.  Insert a select instruction before NT
+      // that determines the right value.
+      // NOTE(review): this inner 'SI' shadows the outer succ_iterator 'SI';
+      // the select is cached per (BB1V, BB2V) pair so identical pairs reuse
+      // one select.
+      SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
+      if (SI == 0)
+        SI = SelectInst::Create(BI->getCondition(), BB1V, BB2V,
+                                BB1V->getName()+"."+BB2V->getName(), NT);
+      // Make the PHI node use the select for all incoming values for BB1/BB2
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        if (PN->getIncomingBlock(i) == BB1 || PN->getIncomingBlock(i) == BB2)
+          PN->setIncomingValue(i, SI);
+    }
+  }
+
+  // Update any PHI nodes in our new successors.
+  for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI)
+    AddPredecessorToBlock(*SI, BIParent, BB1);
+
+  EraseTerminatorInstAndDCECond(BI);
+  return true;
+}
+
+/// SpeculativelyExecuteBB - Given a conditional branch that goes to BB1
+/// and an BB2 and the only successor of BB1 is BB2, hoist simple code
+/// (for now, restricted to a single instruction that's side effect free) from
+/// the BB1 into the branch block to speculatively execute it.
+static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) {
+  // Only speculatively execution a single instruction (not counting the
+  // terminator) for now.
+  Instruction *HInst = NULL;
+  Instruction *Term = BB1->getTerminator();
+  for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end();
+       BBI != BBE; ++BBI) {
+    Instruction *I = BBI;
+    // Skip debug info.
+    if (isa<DbgInfoIntrinsic>(I)) continue;
+    if (I == Term) break;
+
+    // More than one real instruction: give up.
+    if (HInst)
+      return false;
+    HInst = I;
+  }
+  if (!HInst)
+    return false;
+
+  // Be conservative for now. FP select instruction can often be expensive.
+  Value *BrCond = BI->getCondition();
+  if (isa<FCmpInst>(BrCond))
+    return false;
+
+  // If BB1 is actually on the false edge of the conditional branch, remember
+  // to swap the select operands later.
+  bool Invert = false;
+  if (BB1 != BI->getSuccessor(0)) {
+    assert(BB1 == BI->getSuccessor(1) && "No edge from 'if' block?");
+    Invert = true;
+  }
+
+  // Turn
+  // BB:
+  //     %t1 = icmp
+  //     br i1 %t1, label %BB1, label %BB2
+  // BB1:
+  //     %t3 = add %t2, c
+  //     br label BB2
+  // BB2:
+  // =>
+  // BB:
+  //     %t1 = icmp
+  //     %t4 = add %t2, c
+  //     %t3 = select i1 %t1, %t2, %t3
+  switch (HInst->getOpcode()) {
+  default: return false;  // Not safe / profitable to hoist.
+  case Instruction::Add:
+  case Instruction::Sub:
+    // Not worth doing for vector ops.
+    if (HInst->getType()->isVectorTy())
+      return false;
+    break;
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    // Don't mess with vector operations.
+    if (HInst->getType()->isVectorTy())
+      return false;
+    break;   // These are all cheap and non-trapping instructions.
+  }
+
+  // If the instruction is obviously dead, don't try to predicate it.
+  if (HInst->use_empty()) {
+    HInst->eraseFromParent();
+    return true;
+  }
+
+  // Can we speculatively execute the instruction? And what is the value
+  // if the condition is false? Consider the phi uses, if the incoming value
+  // from the "if" block are all the same V, then V is the value of the
+  // select if the condition is false.
+  BasicBlock *BIParent = BI->getParent();
+  SmallVector<PHINode*, 4> PHIUses;
+  Value *FalseV = NULL;
+
+  BasicBlock *BB2 = BB1->getTerminator()->getSuccessor(0);
+  for (Value::use_iterator UI = HInst->use_begin(), E = HInst->use_end();
+       UI != E; ++UI) {
+    // Ignore any user that is not a PHI node in BB2.  These can only occur in
+    // unreachable blocks, because they would not be dominated by the instr.
+    PHINode *PN = dyn_cast<PHINode>(*UI);
+    if (!PN || PN->getParent() != BB2)
+      return false;
+    PHIUses.push_back(PN);
+
+    Value *PHIV = PN->getIncomingValueForBlock(BIParent);
+    if (!FalseV)
+      FalseV = PHIV;
+    else if (FalseV != PHIV)
+      return false;  // Inconsistent value when condition is false.
+  }
+
+  assert(FalseV && "Must have at least one user, and it must be a PHI");
+
+  // Do not hoist the instruction if any of its operands are defined but not
+  // used in this BB. The transformation will prevent the operand from
+  // being sunk into the use block.
+  for (User::op_iterator i = HInst->op_begin(), e = HInst->op_end();
+       i != e; ++i) {
+    Instruction *OpI = dyn_cast<Instruction>(*i);
+    if (OpI && OpI->getParent() == BIParent &&
+        !OpI->isUsedInBasicBlock(BIParent))
+      return false;
+  }
+
+  // If we get here, we can hoist the instruction. Try to place it
+  // before the icmp instruction preceding the conditional branch.
+  BasicBlock::iterator InsertPos = BI;
+  if (InsertPos != BIParent->begin())
+    --InsertPos;
+  // Skip debug info between condition and branch.
+  while (InsertPos != BIParent->begin() && isa<DbgInfoIntrinsic>(InsertPos))
+    --InsertPos;
+  // NOTE(review): the iterator/Value* comparison below relies on the ilist
+  // iterator's implicit conversion to Instruction* — confirm against the
+  // in-tree ilist API if this is ever modernized.
+  if (InsertPos == BrCond && !isa<PHINode>(BrCond)) {
+    SmallPtrSet<Instruction *, 4> BB1Insns;
+    for(BasicBlock::iterator BB1I = BB1->begin(), BB1E = BB1->end();
+        BB1I != BB1E; ++BB1I)
+      BB1Insns.insert(BB1I);
+    for(Value::use_iterator UI = BrCond->use_begin(), UE = BrCond->use_end();
+        UI != UE; ++UI) {
+      Instruction *Use = cast<Instruction>(*UI);
+      if (!BB1Insns.count(Use)) continue;
+
+      // If BrCond uses the instruction that place it just before
+      // branch instruction.
+      InsertPos = BI;
+      break;
+    }
+  } else
+    InsertPos = BI;
+  BIParent->getInstList().splice(InsertPos, BB1->getInstList(), HInst);
+
+  // Create a select whose true value is the speculatively executed value and
+  // false value is the previously determined FalseV.
+  SelectInst *SI;
+  if (Invert)
+    SI = SelectInst::Create(BrCond, FalseV, HInst,
+                            FalseV->getName() + "." + HInst->getName(), BI);
+  else
+    SI = SelectInst::Create(BrCond, HInst, FalseV,
+                            HInst->getName() + "." + FalseV->getName(), BI);
+
+  // Make the PHI node use the select for all incoming values for "then" and
+  // "if" blocks.
+  for (unsigned i = 0, e = PHIUses.size(); i != e; ++i) {
+    PHINode *PN = PHIUses[i];
+    for (unsigned j = 0, ee = PN->getNumIncomingValues(); j != ee; ++j)
+      if (PN->getIncomingBlock(j) == BB1 || PN->getIncomingBlock(j) == BIParent)
+        PN->setIncomingValue(j, SI);
+  }
+
+  ++NumSpeculations;
+  return true;
+}
+
+/// BlockIsSimpleEnoughToThreadThrough - Return true if we can thread a branch
+/// across this block.
+static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
+  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
+  unsigned Size = 0;
+
+  for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
+    if (isa<DbgInfoIntrinsic>(BBI))
+      continue;
+    if (Size > 10) return false;  // Don't clone large BB's.
+ ++Size;
+
+ // We can only support instructions that do not define values that are
+ // live outside of the current basic block.
+ for (Value::use_iterator UI = BBI->use_begin(), E = BBI->use_end();
+ UI != E; ++UI) {
+ Instruction *U = cast<Instruction>(*UI);
+ if (U->getParent() != BB || isa<PHINode>(U)) return false;
+ }
+
+ // Looks ok, continue checking.
+ }
+
+ return true;
+}
+
+/// FoldCondBranchOnPHI - If we have a conditional branch on a PHI node value
+/// that is defined in the same block as the branch and if any PHI entries are
+/// constants, thread edges corresponding to that entry to be branches to their
+/// ultimate destination.
+static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) {
+ BasicBlock *BB = BI->getParent();
+ PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
+ // NOTE: we currently cannot transform this case if the PHI node is used
+ // outside of the block.
+ if (!PN || PN->getParent() != BB || !PN->hasOneUse())
+ return false;
+
+ // Degenerate case of a single entry PHI.
+ if (PN->getNumIncomingValues() == 1) {
+ FoldSingleEntryPHINodes(PN->getParent());
+ return true;
+ }
+
+ // Now we know that this block has multiple preds and two succs.
+ if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false;
+
+ // Okay, this is a simple enough basic block. See if any phi values are
+ // constants.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ ConstantInt *CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i));
+ if (CB == 0 || !CB->getType()->isIntegerTy(1)) continue;
+
+ // Okay, we now know that all edges from PredBB should be revectored to
+ // branch to RealDest.
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ // An incoming i1 of true (1) selects successor 0; false selects succ 1.
+ BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
+
+ if (RealDest == BB) continue; // Skip self loops.
+
+ // The dest block might have PHI nodes, other predecessors and other
+ // difficult cases. 
Instead of being smart about this, just insert a new
+ // block that jumps to the destination block, effectively splitting
+ // the edge we are about to create.
+ BasicBlock *EdgeBB = BasicBlock::Create(BB->getContext(),
+ RealDest->getName()+".critedge",
+ RealDest->getParent(), RealDest);
+ BranchInst::Create(RealDest, EdgeBB);
+
+ // Update PHI nodes.
+ AddPredecessorToBlock(RealDest, EdgeBB, BB);
+
+ // BB may have instructions that are being threaded over. Clone these
+ // instructions into EdgeBB. We know that there will be no uses of the
+ // cloned instructions outside of EdgeBB.
+ BasicBlock::iterator InsertPt = EdgeBB->begin();
+ DenseMap<Value*, Value*> TranslateMap; // Track translated values.
+ for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
+ if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
+ // PHIs translate to the value flowing in from PredBB.
+ TranslateMap[PN] = PN->getIncomingValueForBlock(PredBB);
+ continue;
+ }
+ // Clone the instruction.
+ Instruction *N = BBI->clone();
+ if (BBI->hasName()) N->setName(BBI->getName()+".c");
+
+ // Update operands due to translation.
+ for (User::op_iterator i = N->op_begin(), e = N->op_end();
+ i != e; ++i) {
+ DenseMap<Value*, Value*>::iterator PI = TranslateMap.find(*i);
+ if (PI != TranslateMap.end())
+ *i = PI->second;
+ }
+
+ // Check for trivial simplification.
+ if (Value *V = SimplifyInstruction(N, TD)) {
+ TranslateMap[BBI] = V;
+ delete N; // Instruction folded away, don't need actual inst
+ } else {
+ // Insert the new instruction into its new home.
+ EdgeBB->getInstList().insert(InsertPt, N);
+ if (!BBI->use_empty())
+ TranslateMap[BBI] = N;
+ }
+ }
+
+ // Loop over all of the edges from PredBB to BB, changing them to branch
+ // to EdgeBB instead.
+ TerminatorInst *PredBBTI = PredBB->getTerminator();
+ for (unsigned i = 0, e = PredBBTI->getNumSuccessors(); i != e; ++i)
+ if (PredBBTI->getSuccessor(i) == BB) {
+ BB->removePredecessor(PredBB);
+ PredBBTI->setSuccessor(i, EdgeBB);
+ }
+
+ // Recurse, simplifying any other constants. 
+ // Note: bitwise '|' (not '||') so the recursive call always runs; we
+ // already changed the CFG above, so we must report a change regardless.
+ return FoldCondBranchOnPHI(BI, TD) | true;
+ }
+
+ return false;
+}
+
+/// FoldTwoEntryPHINode - Given a BB that starts with the specified two-entry
+/// PHI node, see if we can eliminate it.
+static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
+ // Ok, this is a two entry PHI node. Check to see if this is a simple "if
+ // statement", which has a very simple dominance structure. Basically, we
+ // are trying to find the condition that is being branched on, which
+ // subsequently causes this merge to happen. We really want control
+ // dependence information for this check, but simplifycfg can't keep it up
+ // to date, and this catches most of the cases we care about anyway.
+ BasicBlock *BB = PN->getParent();
+ BasicBlock *IfTrue, *IfFalse;
+ Value *IfCond = GetIfCondition(BB, IfTrue, IfFalse);
+ if (!IfCond ||
+ // Don't bother if the branch will be constant folded trivially.
+ isa<ConstantInt>(IfCond))
+ return false;
+
+ // Okay, we found that we can merge this two-entry phi node into a select.
+ // Doing so would require us to fold *all* two entry phi nodes in this block.
+ // At some point this becomes non-profitable (particularly if the target
+ // doesn't support cmov's). Only do this transformation if there are two or
+ // fewer PHI nodes in this block.
+ unsigned NumPhis = 0;
+ for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++NumPhis, ++I)
+ if (NumPhis > 2)
+ return false;
+
+ // Loop over the PHI's seeing if we can promote them all to select
+ // instructions. While we are at it, keep track of the instructions
+ // that need to be moved to the dominating block. 
+ SmallPtrSet<Instruction*, 4> AggressiveInsts;
+
+ for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
+ PHINode *PN = cast<PHINode>(II++);
+ if (Value *V = SimplifyInstruction(PN, TD)) {
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
+ continue;
+ }
+
+ if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts) ||
+ !DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts))
+ return false;
+ }
+
+ // If we folded the first phi, PN dangles at this point. Refresh it. If
+ // we ran out of PHIs then we simplified them all.
+ PN = dyn_cast<PHINode>(BB->begin());
+ if (PN == 0) return true;
+
+ // Don't fold i1 branches on PHIs which contain binary operators. These can
+ // often be turned into switches and other things.
+ if (PN->getType()->isIntegerTy(1) &&
+ (isa<BinaryOperator>(PN->getIncomingValue(0)) ||
+ isa<BinaryOperator>(PN->getIncomingValue(1)) ||
+ isa<BinaryOperator>(IfCond)))
+ return false;
+
+ // If all PHI nodes are promotable, check to make sure that all
+ // instructions in the predecessor blocks can be promoted as well. If
+ // not, we won't be able to get rid of the control flow, so it's not
+ // worth promoting to select instructions.
+ BasicBlock *DomBlock = 0;
+ BasicBlock *IfBlock1 = PN->getIncomingBlock(0);
+ BasicBlock *IfBlock2 = PN->getIncomingBlock(1);
+ if (cast<BranchInst>(IfBlock1->getTerminator())->isConditional()) {
+ IfBlock1 = 0;
+ } else {
+ DomBlock = *pred_begin(IfBlock1);
+ for (BasicBlock::iterator I = IfBlock1->begin();!isa<TerminatorInst>(I);++I)
+ if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) {
+ // This is not an aggressive instruction that we can promote.
+ // Because of this, we won't be able to get rid of the control
+ // flow, so the xform is not worth it. 
+ return false;
+ }
+ }
+
+ if (cast<BranchInst>(IfBlock2->getTerminator())->isConditional()) {
+ IfBlock2 = 0;
+ } else {
+ DomBlock = *pred_begin(IfBlock2);
+ for (BasicBlock::iterator I = IfBlock2->begin();!isa<TerminatorInst>(I);++I)
+ if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) {
+ // This is not an aggressive instruction that we can promote.
+ // Because of this, we won't be able to get rid of the control
+ // flow, so the xform is not worth it.
+ return false;
+ }
+ }
+
+ DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond << " T: "
+ << IfTrue->getName() << " F: " << IfFalse->getName() << "\n");
+
+ // If we can still promote the PHI nodes after this gauntlet of tests,
+ // do all of the PHI's now.
+ Instruction *InsertPt = DomBlock->getTerminator();
+
+ // Move all 'aggressive' instructions, which are defined in the
+ // conditional parts of the if's up to the dominating block.
+ if (IfBlock1)
+ DomBlock->getInstList().splice(InsertPt,
+ IfBlock1->getInstList(), IfBlock1->begin(),
+ IfBlock1->getTerminator());
+ if (IfBlock2)
+ DomBlock->getInstList().splice(InsertPt,
+ IfBlock2->getInstList(), IfBlock2->begin(),
+ IfBlock2->getTerminator());
+
+ while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+ // Change the PHI node into a select instruction.
+ // The bool comparison (0 or 1) indexes the correct incoming value.
+ Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
+ Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);
+
+ Value *NV = SelectInst::Create(IfCond, TrueVal, FalseVal, "", InsertPt);
+ PN->replaceAllUsesWith(NV);
+ NV->takeName(PN);
+ PN->eraseFromParent();
+ }
+
+ // At this point, IfBlock1 and IfBlock2 are both empty, so our if statement
+ // has been flattened. Change DomBlock to jump directly to our new block to
+ // avoid other simplifycfg's kicking in on the diamond. 
+ TerminatorInst *OldTI = DomBlock->getTerminator();
+ BranchInst::Create(BB, OldTI);
+ OldTI->eraseFromParent();
+ return true;
+}
+
+/// SimplifyCondBranchToTwoReturns - If we found a conditional branch that goes
+/// to two returning blocks, try to merge them together into one return,
+/// introducing a select if the return values disagree.
+static bool SimplifyCondBranchToTwoReturns(BranchInst *BI) {
+ assert(BI->isConditional() && "Must be a conditional branch");
+ BasicBlock *TrueSucc = BI->getSuccessor(0);
+ BasicBlock *FalseSucc = BI->getSuccessor(1);
+ ReturnInst *TrueRet = cast<ReturnInst>(TrueSucc->getTerminator());
+ ReturnInst *FalseRet = cast<ReturnInst>(FalseSucc->getTerminator());
+
+ // Check to ensure both blocks are empty (just a return) or optionally empty
+ // with PHI nodes. If there are other instructions, merging would cause extra
+ // computation on one path or the other.
+ if (!TrueSucc->getFirstNonPHIOrDbg()->isTerminator())
+ return false;
+ if (!FalseSucc->getFirstNonPHIOrDbg()->isTerminator())
+ return false;
+
+ // Okay, we found a branch that is going to two return nodes. If
+ // there is no return value for this function, just change the
+ // branch into a return.
+ if (FalseRet->getNumOperands() == 0) {
+ TrueSucc->removePredecessor(BI->getParent());
+ FalseSucc->removePredecessor(BI->getParent());
+ ReturnInst::Create(BI->getContext(), 0, BI);
+ EraseTerminatorInstAndDCECond(BI);
+ return true;
+ }
+
+ // Otherwise, figure out what the true and false return values are
+ // so we can insert a new select instruction.
+ Value *TrueValue = TrueRet->getReturnValue();
+ Value *FalseValue = FalseRet->getReturnValue();
+
+ // Unwrap any PHI nodes in the return blocks. 
+ if (PHINode *TVPN = dyn_cast_or_null<PHINode>(TrueValue))
+ if (TVPN->getParent() == TrueSucc)
+ TrueValue = TVPN->getIncomingValueForBlock(BI->getParent());
+ if (PHINode *FVPN = dyn_cast_or_null<PHINode>(FalseValue))
+ if (FVPN->getParent() == FalseSucc)
+ FalseValue = FVPN->getIncomingValueForBlock(BI->getParent());
+
+ // In order for this transformation to be safe, we must be able to
+ // unconditionally execute both operands to the return. This is
+ // normally the case, but we could have a potentially-trapping
+ // constant expression that prevents this transformation from being
+ // safe.
+ if (ConstantExpr *TCV = dyn_cast_or_null<ConstantExpr>(TrueValue))
+ if (TCV->canTrap())
+ return false;
+ if (ConstantExpr *FCV = dyn_cast_or_null<ConstantExpr>(FalseValue))
+ if (FCV->canTrap())
+ return false;
+
+ // Okay, we collected all the mapped values and checked them for sanity, and
+ // decided to really do this transformation. First, update the CFG.
+ TrueSucc->removePredecessor(BI->getParent());
+ FalseSucc->removePredecessor(BI->getParent());
+
+ // Insert select instructions where needed.
+ Value *BrCond = BI->getCondition();
+ if (TrueValue) {
+ // Insert a select if the results differ.
+ if (TrueValue == FalseValue || isa<UndefValue>(FalseValue)) {
+ // Identical (or undef false) values: no select needed.
+ } else if (isa<UndefValue>(TrueValue)) {
+ TrueValue = FalseValue;
+ } else {
+ TrueValue = SelectInst::Create(BrCond, TrueValue,
+ FalseValue, "retval", BI);
+ }
+ }
+
+ Value *RI = !TrueValue ? 
+ ReturnInst::Create(BI->getContext(), BI) :
+ ReturnInst::Create(BI->getContext(), TrueValue, BI);
+ (void) RI;
+
+ DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
+ << "\n " << *BI << "NewRet = " << *RI
+ << "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: "<< *FalseSucc);
+
+ EraseTerminatorInstAndDCECond(BI);
+
+ return true;
+}
+
+/// FoldBranchToCommonDest - If this basic block is ONLY a setcc and a branch,
+/// and if a predecessor branches to us and one of our successors, fold the
+/// setcc into the predecessor and use logical operations to pick the right
+/// destination.
+bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
+ BasicBlock *BB = BI->getParent();
+ Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
+ if (Cond == 0 || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
+ Cond->getParent() != BB || !Cond->hasOneUse())
+ return false;
+
+ // Only allow this if the condition is a simple instruction that can be
+ // executed unconditionally. It must be in the same block as the branch, and
+ // must be at the front of the block.
+ BasicBlock::iterator FrontIt = BB->front();
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(FrontIt))
+ ++FrontIt;
+
+ // Allow a single instruction to be hoisted in addition to the compare
+ // that feeds the branch. We later ensure that any values that _it_ uses
+ // were also live in the predecessor, so that we don't unnecessarily create
+ // register pressure or inhibit out-of-order execution.
+ Instruction *BonusInst = 0;
+ if (&*FrontIt != Cond &&
+ FrontIt->hasOneUse() && *FrontIt->use_begin() == Cond &&
+ FrontIt->isSafeToSpeculativelyExecute()) {
+ BonusInst = &*FrontIt;
+ ++FrontIt;
+ }
+
+ // Only a single bonus inst is allowed.
+ if (&*FrontIt != Cond)
+ return false;
+
+ // Make sure the instruction after the condition is the cond branch.
+ BasicBlock::iterator CondIt = Cond; ++CondIt;
+ // Ignore dbg intrinsics. 
+ while(isa<DbgInfoIntrinsic>(CondIt))
+ ++CondIt;
+ if (&*CondIt != BI) {
+ assert (!isa<DbgInfoIntrinsic>(CondIt) && "Hey do not forget debug info!");
+ return false;
+ }
+
+ // Cond is known to be a compare or binary operator. Check to make sure that
+ // neither operand is a potentially-trapping constant expression.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))
+ if (CE->canTrap())
+ return false;
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(1)))
+ if (CE->canTrap())
+ return false;
+
+
+ // Finally, don't infinitely unroll conditional loops.
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = BI->getSuccessor(1);
+ if (TrueDest == BB || FalseDest == BB)
+ return false;
+
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *PredBlock = *PI;
+ BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());
+
+ // Check that we have two conditional branches. If there is a PHI node in
+ // the common successor, verify that the same value flows in from both
+ // blocks.
+ if (PBI == 0 || PBI->isUnconditional() ||
+ !SafeToMergeTerminators(BI, PBI))
+ continue;
+
+ // Ensure that any values used in the bonus instruction are also used
+ // by the terminator of the predecessor. This means that those values
+ // must already have been resolved, so we won't be inhibiting the
+ // out-of-order core by speculating them earlier.
+ if (BonusInst) {
+ // Collect the values used by the bonus inst
+ SmallPtrSet<Value*, 4> UsedValues;
+ for (Instruction::op_iterator OI = BonusInst->op_begin(),
+ OE = BonusInst->op_end(); OI != OE; ++OI) {
+ Value* V = *OI;
+ if (!isa<Constant>(V))
+ UsedValues.insert(V);
+ }
+
+ SmallVector<std::pair<Value*, unsigned>, 4> Worklist;
+ Worklist.push_back(std::make_pair(PBI->getOperand(0), 0));
+
+ // Walk up to four levels back up the use-def chain of the predecessor's
+ // terminator to see if all those values were used. 
The choice of four
+ // levels is arbitrary, to provide a compile-time-cost bound.
+ while (!Worklist.empty()) {
+ std::pair<Value*, unsigned> Pair = Worklist.back();
+ Worklist.pop_back();
+
+ if (Pair.second >= 4) continue;
+ UsedValues.erase(Pair.first);
+ if (UsedValues.empty()) break;
+
+ if (Instruction *I = dyn_cast<Instruction>(Pair.first)) {
+ for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
+ OI != OE; ++OI)
+ Worklist.push_back(std::make_pair(OI->get(), Pair.second+1));
+ }
+ }
+
+ if (!UsedValues.empty()) return false;
+ }
+
+ // Determine which logical op merges the two conditions, inverting the
+ // predecessor's condition if its successors are the "wrong way round".
+ Instruction::BinaryOps Opc;
+ bool InvertPredCond = false;
+
+ if (PBI->getSuccessor(0) == TrueDest)
+ Opc = Instruction::Or;
+ else if (PBI->getSuccessor(1) == FalseDest)
+ Opc = Instruction::And;
+ else if (PBI->getSuccessor(0) == FalseDest)
+ Opc = Instruction::And, InvertPredCond = true;
+ else if (PBI->getSuccessor(1) == TrueDest)
+ Opc = Instruction::Or, InvertPredCond = true;
+ else
+ continue;
+
+ DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB);
+
+ // If we need to invert the condition in the pred block to match, do so now.
+ if (InvertPredCond) {
+ Value *NewCond = PBI->getCondition();
+
+ if (NewCond->hasOneUse() && isa<CmpInst>(NewCond)) {
+ // Cheaper: flip the compare predicate in place.
+ CmpInst *CI = cast<CmpInst>(NewCond);
+ CI->setPredicate(CI->getInversePredicate());
+ } else {
+ NewCond = BinaryOperator::CreateNot(NewCond,
+ PBI->getCondition()->getName()+".not", PBI);
+ }
+
+ PBI->setCondition(NewCond);
+ BasicBlock *OldTrue = PBI->getSuccessor(0);
+ BasicBlock *OldFalse = PBI->getSuccessor(1);
+ PBI->setSuccessor(0, OldFalse);
+ PBI->setSuccessor(1, OldTrue);
+ }
+
+ // If we have a bonus inst, clone it into the predecessor block. 
+ Instruction *NewBonus = 0;
+ if (BonusInst) {
+ NewBonus = BonusInst->clone();
+ PredBlock->getInstList().insert(PBI, NewBonus);
+ NewBonus->takeName(BonusInst);
+ BonusInst->setName(BonusInst->getName()+".old");
+ }
+
+ // Clone Cond into the predecessor basic block, and or/and the
+ // two conditions together.
+ Instruction *New = Cond->clone();
+ if (BonusInst) New->replaceUsesOfWith(BonusInst, NewBonus);
+ PredBlock->getInstList().insert(PBI, New);
+ New->takeName(Cond);
+ Cond->setName(New->getName()+".old");
+
+ Value *NewCond = BinaryOperator::Create(Opc, PBI->getCondition(),
+ New, "or.cond", PBI);
+ PBI->setCondition(NewCond);
+ if (PBI->getSuccessor(0) == BB) {
+ AddPredecessorToBlock(TrueDest, PredBlock, BB);
+ PBI->setSuccessor(0, TrueDest);
+ }
+ if (PBI->getSuccessor(1) == BB) {
+ AddPredecessorToBlock(FalseDest, PredBlock, BB);
+ PBI->setSuccessor(1, FalseDest);
+ }
+ return true;
+ }
+ return false;
+}
+
+/// SimplifyCondBranchToCondBranch - If we have a conditional branch as a
+/// predecessor of another block, this function tries to simplify it. We know
+/// that PBI and BI are both conditional branches, and BI is in one of the
+/// successor blocks of PBI - PBI branches to BI.
+static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
+ assert(PBI->isConditional() && BI->isConditional());
+ BasicBlock *BB = BI->getParent();
+
+ // If this block ends with a branch instruction, and if there is a
+ // predecessor that ends on a branch of the same condition, make
+ // this conditional branch redundant.
+ if (PBI->getCondition() == BI->getCondition() &&
+ PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+ // Okay, the outcome of this conditional branch is statically
+ // knowable. If this block had a single pred, handle specially.
+ if (BB->getSinglePredecessor()) {
+ // Turn this into a branch on constant. 
+ bool CondIsTrue = PBI->getSuccessor(0) == BB;
+ BI->setCondition(ConstantInt::get(Type::getInt1Ty(BB->getContext()),
+ CondIsTrue));
+ return true; // Nuke the branch on constant.
+ }
+
+ // Otherwise, if there are multiple predecessors, insert a PHI that merges
+ // in the constant and simplify the block result. Subsequent passes of
+ // simplifycfg will thread the block.
+ if (BlockIsSimpleEnoughToThreadThrough(BB)) {
+ PHINode *NewPN = PHINode::Create(Type::getInt1Ty(BB->getContext()),
+ BI->getCondition()->getName() + ".pr",
+ BB->begin());
+ // Okay, we're going to insert the PHI node. Since PBI is not the only
+ // predecessor, compute the PHI'd conditional value for all of the preds.
+ // Any predecessor where the condition is not computable we keep symbolic.
+ // NOTE(review): the PBI parameter is deliberately clobbered in this
+ // loop; it is not used again after the loop.
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *P = *PI;
+ if ((PBI = dyn_cast<BranchInst>(P->getTerminator())) &&
+ PBI != BI && PBI->isConditional() &&
+ PBI->getCondition() == BI->getCondition() &&
+ PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+ bool CondIsTrue = PBI->getSuccessor(0) == BB;
+ NewPN->addIncoming(ConstantInt::get(Type::getInt1Ty(BB->getContext()),
+ CondIsTrue), P);
+ } else {
+ NewPN->addIncoming(BI->getCondition(), P);
+ }
+ }
+
+ BI->setCondition(NewPN);
+ return true;
+ }
+ }
+
+ // If this is a conditional branch in an empty block, and if any
+ // predecessor is a conditional branch to one of our destinations,
+ // fold the conditions into logical ops and one cond br.
+ BasicBlock::iterator BBI = BB->begin();
+ // Ignore dbg intrinsics. 
+ while (isa<DbgInfoIntrinsic>(BBI))
+ ++BBI;
+ if (&*BBI != BI)
+ return false;
+
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BI->getCondition()))
+ if (CE->canTrap())
+ return false;
+
+ // Find which successor of each branch is the shared (common) destination.
+ int PBIOp, BIOp;
+ if (PBI->getSuccessor(0) == BI->getSuccessor(0))
+ PBIOp = BIOp = 0;
+ else if (PBI->getSuccessor(0) == BI->getSuccessor(1))
+ PBIOp = 0, BIOp = 1;
+ else if (PBI->getSuccessor(1) == BI->getSuccessor(0))
+ PBIOp = 1, BIOp = 0;
+ else if (PBI->getSuccessor(1) == BI->getSuccessor(1))
+ PBIOp = BIOp = 1;
+ else
+ return false;
+
+ // Check to make sure that the other destination of this branch
+ // isn't BB itself. If so, this is an infinite loop that will
+ // keep getting unwound.
+ if (PBI->getSuccessor(PBIOp) == BB)
+ return false;
+
+ // Do not perform this transformation if it would require
+ // insertion of a large number of select instructions. For targets
+ // without predication/cmovs, this is a big pessimization.
+ BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
+
+ unsigned NumPhis = 0;
+ for (BasicBlock::iterator II = CommonDest->begin();
+ isa<PHINode>(II); ++II, ++NumPhis)
+ if (NumPhis > 2) // Disable this xform.
+ return false;
+
+ // Finally, if everything is ok, fold the branches to logical ops.
+ BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1);
+
+ DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
+ << "AND: " << *BI->getParent());
+
+
+ // If OtherDest *is* BB, then BB is a basic block with a single conditional
+ // branch in it, where one edge (OtherDest) goes back to itself but the other
+ // exits. We don't *know* that the program avoids the infinite loop
+ // (even though that seems likely). If we do this xform naively, we'll end up
+ // recursively unpeeling the loop. Since we know that (after the xform is
+ // done) that the block *is* infinite if reached, we just make it an obviously
+ // infinite loop with no cond branch. 
+ if (OtherDest == BB) {
+ // Insert it at the end of the function, because it's either code,
+ // or it won't matter if it's hot. :)
+ BasicBlock *InfLoopBlock = BasicBlock::Create(BB->getContext(),
+ "infloop", BB->getParent());
+ BranchInst::Create(InfLoopBlock, InfLoopBlock);
+ OtherDest = InfLoopBlock;
+ }
+
+ DEBUG(dbgs() << *PBI->getParent()->getParent());
+
+ // BI may have other predecessors. Because of this, we leave
+ // it alone, but modify PBI.
+
+ // Make sure we get to CommonDest on True&True directions.
+ Value *PBICond = PBI->getCondition();
+ if (PBIOp)
+ PBICond = BinaryOperator::CreateNot(PBICond,
+ PBICond->getName()+".not",
+ PBI);
+ Value *BICond = BI->getCondition();
+ if (BIOp)
+ BICond = BinaryOperator::CreateNot(BICond,
+ BICond->getName()+".not",
+ PBI);
+ // Merge the conditions.
+ Value *Cond = BinaryOperator::CreateOr(PBICond, BICond, "brmerge", PBI);
+
+ // Modify PBI to branch on the new condition to the new dests.
+ PBI->setCondition(Cond);
+ PBI->setSuccessor(0, CommonDest);
+ PBI->setSuccessor(1, OtherDest);
+
+ // OtherDest may have phi nodes. If so, add an entry from PBI's
+ // block that are identical to the entries for BI's block.
+ AddPredecessorToBlock(OtherDest, PBI->getParent(), BB);
+
+ // We know that the CommonDest already had an edge from PBI to
+ // it. If it has PHIs though, the PHIs may have different
+ // entries for BB and PBI's BB. If so, insert a select to make
+ // them agree.
+ PHINode *PN;
+ for (BasicBlock::iterator II = CommonDest->begin();
+ (PN = dyn_cast<PHINode>(II)); ++II) {
+ Value *BIV = PN->getIncomingValueForBlock(BB);
+ unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent());
+ Value *PBIV = PN->getIncomingValue(PBBIdx);
+ if (BIV != PBIV) {
+ // Insert a select in PBI to pick the right value. 
+ Value *NV = SelectInst::Create(PBICond, PBIV, BIV,
+ PBIV->getName()+".mux", PBI);
+ PN->setIncomingValue(PBBIdx, NV);
+ }
+ }
+
+ DEBUG(dbgs() << "INTO: " << *PBI->getParent());
+ DEBUG(dbgs() << *PBI->getParent()->getParent());
+
+ // This basic block is probably dead. We know it has at least
+ // one fewer predecessor.
+ return true;
+}
+
+// SimplifyTerminatorOnSelect - Simplifies a terminator by replacing it with a
+// branch to TrueBB if Cond is true or to FalseBB if Cond is false.
+// Takes care of updating the successors and removing the old terminator.
+// Also makes sure not to introduce new successors by assuming that edges to
+// non-successor TrueBBs and FalseBBs aren't reachable.
+static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
+ BasicBlock *TrueBB, BasicBlock *FalseBB){
+ // Remove any superfluous successor edges from the CFG.
+ // First, figure out which successors to preserve.
+ // If TrueBB and FalseBB are equal, only try to preserve one copy of that
+ // successor.
+ BasicBlock *KeepEdge1 = TrueBB;
+ BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : 0;
+
+ // Then remove the rest.
+ // KeepEdge1/KeepEdge2 are nulled out as they are found among the
+ // existing successors; any non-null remainder was never a successor.
+ for (unsigned I = 0, E = OldTerm->getNumSuccessors(); I != E; ++I) {
+ BasicBlock *Succ = OldTerm->getSuccessor(I);
+ // Make sure only to keep exactly one copy of each edge.
+ if (Succ == KeepEdge1)
+ KeepEdge1 = 0;
+ else if (Succ == KeepEdge2)
+ KeepEdge2 = 0;
+ else
+ Succ->removePredecessor(OldTerm->getParent());
+ }
+
+ // Insert an appropriate new terminator.
+ if ((KeepEdge1 == 0) && (KeepEdge2 == 0)) {
+ if (TrueBB == FalseBB)
+ // We were only looking for one successor, and it was present.
+ // Create an unconditional branch to it.
+ BranchInst::Create(TrueBB, OldTerm);
+ else
+ // We found both of the successors we were looking for.
+ // Create a conditional branch sharing the condition of the select. 
+ BranchInst::Create(TrueBB, FalseBB, Cond, OldTerm);
+ } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
+ // Neither of the selected blocks were successors, so this
+ // terminator must be unreachable.
+ new UnreachableInst(OldTerm->getContext(), OldTerm);
+ } else {
+ // One of the selected values was a successor, but the other wasn't.
+ // Insert an unconditional branch to the one that was found;
+ // the edge to the one that wasn't must be unreachable.
+ if (KeepEdge1 == 0)
+ // Only TrueBB was found.
+ BranchInst::Create(TrueBB, OldTerm);
+ else
+ // Only FalseBB was found.
+ BranchInst::Create(FalseBB, OldTerm);
+ }
+
+ EraseTerminatorInstAndDCECond(OldTerm);
+ return true;
+}
+
+// SimplifyIndirectBrOnSelect - Replaces
+// (indirectbr (select cond, blockaddress(@fn, BlockA),
+// blockaddress(@fn, BlockB)))
+// with
+// (br cond, BlockA, BlockB).
+static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
+ // Check that both operands of the select are block addresses.
+ BlockAddress *TBA = dyn_cast<BlockAddress>(SI->getTrueValue());
+ BlockAddress *FBA = dyn_cast<BlockAddress>(SI->getFalseValue());
+ if (!TBA || !FBA)
+ return false;
+
+ // Extract the actual blocks.
+ BasicBlock *TrueBB = TBA->getBasicBlock();
+ BasicBlock *FalseBB = FBA->getBasicBlock();
+
+ // Perform the actual simplification.
+ return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB);
+}
+
+/// TryToSimplifyUncondBranchWithICmpInIt - This is called when we find an icmp
+/// instruction (a seteq/setne with a constant) as the only instruction in a
+/// block that ends with an uncond branch. We are looking for a very specific
+/// pattern that occurs when "A == 1 || A == 2 || A == 3" gets simplified. 
In +/// this case, we merge the first two "or's of icmp" into a switch, but then the +/// default value goes to an uncond block with a seteq in it, we get something +/// like: +/// +/// switch i8 %A, label %DEFAULT [ i8 1, label %end i8 2, label %end ] +/// DEFAULT: +/// %tmp = icmp eq i8 %A, 92 +/// br label %end +/// end: +/// ... = phi i1 [ true, %entry ], [ %tmp, %DEFAULT ], [ true, %entry ] +/// +/// We prefer to split the edge to 'end' so that there is a true/false entry to +/// the PHI, merging the third icmp into the switch. +static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, + const TargetData *TD) { + BasicBlock *BB = ICI->getParent(); + // If the block has any PHIs in it or the icmp has multiple uses, it is too + // complex. + if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse()) return false; + + Value *V = ICI->getOperand(0); + ConstantInt *Cst = cast<ConstantInt>(ICI->getOperand(1)); + + // The pattern we're looking for is where our only predecessor is a switch on + // 'V' and this block is the default case for the switch. In this case we can + // fold the compared value into the switch to simplify things. + BasicBlock *Pred = BB->getSinglePredecessor(); + if (Pred == 0 || !isa<SwitchInst>(Pred->getTerminator())) return false; + + SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator()); + if (SI->getCondition() != V) + return false; + + // If BB is reachable on a non-default case, then we simply know the value of + // V in this block. Substitute it and constant fold the icmp instruction + // away. + if (SI->getDefaultDest() != BB) { + ConstantInt *VVal = SI->findCaseDest(BB); + assert(VVal && "Should have a unique destination value"); + ICI->setOperand(0, VVal); + + if (Value *V = SimplifyInstruction(ICI, TD)) { + ICI->replaceAllUsesWith(V); + ICI->eraseFromParent(); + } + // BB is now empty, so it is likely to simplify away. + return SimplifyCFG(BB) | true; + } + + // Ok, the block is reachable from the default dest. 
If the constant we're + // comparing exists in one of the other edges, then we can constant fold ICI + // and zap it. + if (SI->findCaseValue(Cst) != 0) { + Value *V; + if (ICI->getPredicate() == ICmpInst::ICMP_EQ) + V = ConstantInt::getFalse(BB->getContext()); + else + V = ConstantInt::getTrue(BB->getContext()); + + ICI->replaceAllUsesWith(V); + ICI->eraseFromParent(); + // BB is now empty, so it is likely to simplify away. + return SimplifyCFG(BB) | true; + } + + // The use of the icmp has to be in the 'end' block, by the only PHI node in + // the block. + BasicBlock *SuccBlock = BB->getTerminator()->getSuccessor(0); + PHINode *PHIUse = dyn_cast<PHINode>(ICI->use_back()); + if (PHIUse == 0 || PHIUse != &SuccBlock->front() || + isa<PHINode>(++BasicBlock::iterator(PHIUse))) + return false; + + // If the icmp is a SETEQ, then the default dest gets false, the new edge gets + // true in the PHI. + Constant *DefaultCst = ConstantInt::getTrue(BB->getContext()); + Constant *NewCst = ConstantInt::getFalse(BB->getContext()); + + if (ICI->getPredicate() == ICmpInst::ICMP_EQ) + std::swap(DefaultCst, NewCst); + + // Replace ICI (which is used by the PHI for the default value) with true or + // false depending on if it is EQ or NE. + ICI->replaceAllUsesWith(DefaultCst); + ICI->eraseFromParent(); + + // Okay, the switch goes to this block on a default value. Add an edge from + // the switch to the merge point on the compared value. + BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "switch.edge", + BB->getParent(), BB); + SI->addCase(Cst, NewBB); + + // NewBB branches to the phi block, add the uncond branch and the phi entry. + BranchInst::Create(SuccBlock, NewBB); + PHIUse->addIncoming(NewCst, NewBB); + return true; +} + +/// SimplifyBranchOnICmpChain - The specified branch is a conditional branch. +/// Check to see if it is branching on an or/and chain of icmp instructions, and +/// fold it into a switch instruction if so. 
+static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD) { + Instruction *Cond = dyn_cast<Instruction>(BI->getCondition()); + if (Cond == 0) return false; + + + // Change br (X == 0 | X == 1), T, F into a switch instruction. + // If this is a bunch of seteq's or'd together, or if it's a bunch of + // 'setne's and'ed together, collect them. + Value *CompVal = 0; + std::vector<ConstantInt*> Values; + bool TrueWhenEqual = true; + Value *ExtraCase = 0; + unsigned UsedICmps = 0; + + if (Cond->getOpcode() == Instruction::Or) { + CompVal = GatherConstantCompares(Cond, Values, ExtraCase, TD, true, + UsedICmps); + } else if (Cond->getOpcode() == Instruction::And) { + CompVal = GatherConstantCompares(Cond, Values, ExtraCase, TD, false, + UsedICmps); + TrueWhenEqual = false; + } + + // If we didn't have a multiply compared value, fail. + if (CompVal == 0) return false; + + // Avoid turning single icmps into a switch. + if (UsedICmps <= 1) + return false; + + // There might be duplicate constants in the list, which the switch + // instruction can't handle, remove them now. + array_pod_sort(Values.begin(), Values.end(), ConstantIntSortPredicate); + Values.erase(std::unique(Values.begin(), Values.end()), Values.end()); + + // If Extra was used, we require at least two switch values to do the + // transformation. A switch with one value is just an cond branch. + if (ExtraCase && Values.size() < 2) return false; + + // Figure out which block is which destination. + BasicBlock *DefaultBB = BI->getSuccessor(1); + BasicBlock *EdgeBB = BI->getSuccessor(0); + if (!TrueWhenEqual) std::swap(DefaultBB, EdgeBB); + + BasicBlock *BB = BI->getParent(); + + DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size() + << " cases into SWITCH. BB is:\n" << *BB); + + // If there are any extra values that couldn't be folded into the switch + // then we evaluate them with an explicit branch first. Split the block + // right before the condbr to handle it. 
+ if (ExtraCase) { + BasicBlock *NewBB = BB->splitBasicBlock(BI, "switch.early.test"); + // Remove the uncond branch added to the old block. + TerminatorInst *OldTI = BB->getTerminator(); + + if (TrueWhenEqual) + BranchInst::Create(EdgeBB, NewBB, ExtraCase, OldTI); + else + BranchInst::Create(NewBB, EdgeBB, ExtraCase, OldTI); + + OldTI->eraseFromParent(); + + // If there are PHI nodes in EdgeBB, then we need to add a new entry to them + // for the edge we just added. + AddPredecessorToBlock(EdgeBB, BB, NewBB); + + DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase + << "\nEXTRABB = " << *BB); + BB = NewBB; + } + + // Convert pointer to int before we switch. + if (CompVal->getType()->isPointerTy()) { + assert(TD && "Cannot switch on pointer without TargetData"); + CompVal = new PtrToIntInst(CompVal, + TD->getIntPtrType(CompVal->getContext()), + "magicptr", BI); + } + + // Create the new switch instruction now. + SwitchInst *New = SwitchInst::Create(CompVal, DefaultBB, Values.size(), BI); + + // Add all of the 'cases' to the switch instruction. + for (unsigned i = 0, e = Values.size(); i != e; ++i) + New->addCase(Values[i], EdgeBB); + + // We added edges from PI to the EdgeBB. As such, if there were any + // PHI nodes in EdgeBB, they need entries to be added corresponding to + // the number of edges added. + for (BasicBlock::iterator BBI = EdgeBB->begin(); + isa<PHINode>(BBI); ++BBI) { + PHINode *PN = cast<PHINode>(BBI); + Value *InVal = PN->getIncomingValueForBlock(BB); + for (unsigned i = 0, e = Values.size()-1; i != e; ++i) + PN->addIncoming(InVal, BB); + } + + // Erase the old branch instruction. + EraseTerminatorInstAndDCECond(BI); + + DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n'); + return true; +} + +bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI) { + BasicBlock *BB = RI->getParent(); + if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false; + + // Find predecessors that end with branches. 
+  // Partition the predecessors' terminators: unconditional branches can have
+  // the return folded into them directly; conditional branches may become a
+  // select + return pair.
+  SmallVector<BasicBlock*, 8> UncondBranchPreds;
+  SmallVector<BranchInst*, 8> CondBranchPreds;
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+    BasicBlock *P = *PI;
+    TerminatorInst *PTI = P->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) {
+      if (BI->isUnconditional())
+        UncondBranchPreds.push_back(P);
+      else
+        CondBranchPreds.push_back(BI);
+    }
+  }
+
+  // If we found some, do the transformation!
+  // DupRet gates return duplication; it is configured outside this function.
+  if (!UncondBranchPreds.empty() && DupRet) {
+    while (!UncondBranchPreds.empty()) {
+      BasicBlock *Pred = UncondBranchPreds.pop_back_val();
+      DEBUG(dbgs() << "FOLDING: " << *BB
+            << "INTO UNCOND BRANCH PRED: " << *Pred);
+      (void)FoldReturnIntoUncondBranch(RI, BB, Pred);
+    }
+
+    // If we eliminated all predecessors of the block, delete the block now.
+    if (pred_begin(BB) == pred_end(BB))
+      // We know there are no successors, so just nuke the block.
+      BB->eraseFromParent();
+
+    return true;
+  }
+
+  // Check out all of the conditional branches going to this return
+  // instruction.  If any of them just select between returns, change the
+  // branch itself into a select/return pair.
+  while (!CondBranchPreds.empty()) {
+    BranchInst *BI = CondBranchPreds.pop_back_val();
+
+    // Check to see if the non-BB successor is also a return block.
+    // Return at most one simplification per call; the caller re-runs
+    // SimplifyCFG after a change.
+    if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) &&
+        isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) &&
+        SimplifyCondBranchToTwoReturns(BI))
+      return true;
+  }
+  return false;
+}
+
+bool SimplifyCFGOpt::SimplifyUnwind(UnwindInst *UI) {
+  // Check to see if the first instruction in this block is just an unwind.
+  // If so, replace any invoke instructions which use this as an exception
+  // destination with call instructions.
+ BasicBlock *BB = UI->getParent(); + if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false; + + bool Changed = false; + SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB)); + while (!Preds.empty()) { + BasicBlock *Pred = Preds.back(); + InvokeInst *II = dyn_cast<InvokeInst>(Pred->getTerminator()); + if (II && II->getUnwindDest() == BB) { + // Insert a new branch instruction before the invoke, because this + // is now a fall through. + BranchInst *BI = BranchInst::Create(II->getNormalDest(), II); + Pred->getInstList().remove(II); // Take out of symbol table + + // Insert the call now. + SmallVector<Value*,8> Args(II->op_begin(), II->op_end()-3); + CallInst *CI = CallInst::Create(II->getCalledValue(), + Args.begin(), Args.end(), + II->getName(), BI); + CI->setCallingConv(II->getCallingConv()); + CI->setAttributes(II->getAttributes()); + // If the invoke produced a value, the Call now does instead. + II->replaceAllUsesWith(CI); + delete II; + Changed = true; + } + + Preds.pop_back(); + } + + // If this block is now dead (and isn't the entry block), remove it. + if (pred_begin(BB) == pred_end(BB) && + BB != &BB->getParent()->getEntryBlock()) { + // We know there are no successors, so just nuke the block. + BB->eraseFromParent(); + return true; + } + + return Changed; +} + +bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { + BasicBlock *BB = UI->getParent(); + + bool Changed = false; + + // If there are any instructions immediately before the unreachable that can + // be removed, do so. + while (UI != BB->begin()) { + BasicBlock::iterator BBI = UI; + --BBI; + // Do not delete instructions that can have side effects, like calls + // (which may never return) and volatile loads and stores. 
+ if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI)) break; + + if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) + if (SI->isVolatile()) + break; + + if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) + if (LI->isVolatile()) + break; + + // Delete this instruction + BBI->eraseFromParent(); + Changed = true; + } + + // If the unreachable instruction is the first in the block, take a gander + // at all of the predecessors of this instruction, and simplify them. + if (&BB->front() != UI) return Changed; + + SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB)); + for (unsigned i = 0, e = Preds.size(); i != e; ++i) { + TerminatorInst *TI = Preds[i]->getTerminator(); + + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (BI->isUnconditional()) { + if (BI->getSuccessor(0) == BB) { + new UnreachableInst(TI->getContext(), TI); + TI->eraseFromParent(); + Changed = true; + } + } else { + if (BI->getSuccessor(0) == BB) { + BranchInst::Create(BI->getSuccessor(1), BI); + EraseTerminatorInstAndDCECond(BI); + } else if (BI->getSuccessor(1) == BB) { + BranchInst::Create(BI->getSuccessor(0), BI); + EraseTerminatorInstAndDCECond(BI); + Changed = true; + } + } + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) + if (SI->getSuccessor(i) == BB) { + BB->removePredecessor(SI->getParent()); + SI->removeCase(i); + --i; --e; + Changed = true; + } + // If the default value is unreachable, figure out the most popular + // destination and make it the default. + if (SI->getSuccessor(0) == BB) { + std::map<BasicBlock*, unsigned> Popularity; + for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) + Popularity[SI->getSuccessor(i)]++; + + // Find the most popular block. 
+ unsigned MaxPop = 0; + BasicBlock *MaxBlock = 0; + for (std::map<BasicBlock*, unsigned>::iterator + I = Popularity.begin(), E = Popularity.end(); I != E; ++I) { + if (I->second > MaxPop) { + MaxPop = I->second; + MaxBlock = I->first; + } + } + if (MaxBlock) { + // Make this the new default, allowing us to delete any explicit + // edges to it. + SI->setSuccessor(0, MaxBlock); + Changed = true; + + // If MaxBlock has phinodes in it, remove MaxPop-1 entries from + // it. + if (isa<PHINode>(MaxBlock->begin())) + for (unsigned i = 0; i != MaxPop-1; ++i) + MaxBlock->removePredecessor(SI->getParent()); + + for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) + if (SI->getSuccessor(i) == MaxBlock) { + SI->removeCase(i); + --i; --e; + } + } + } + } else if (InvokeInst *II = dyn_cast<InvokeInst>(TI)) { + if (II->getUnwindDest() == BB) { + // Convert the invoke to a call instruction. This would be a good + // place to note that the call does not throw though. + BranchInst *BI = BranchInst::Create(II->getNormalDest(), II); + II->removeFromParent(); // Take out of symbol table + + // Insert the call now... + SmallVector<Value*, 8> Args(II->op_begin(), II->op_end()-3); + CallInst *CI = CallInst::Create(II->getCalledValue(), + Args.begin(), Args.end(), + II->getName(), BI); + CI->setCallingConv(II->getCallingConv()); + CI->setAttributes(II->getAttributes()); + // If the invoke produced a value, the call does now instead. + II->replaceAllUsesWith(CI); + delete II; + Changed = true; + } + } + } + + // If this block is now dead, remove it. + if (pred_begin(BB) == pred_end(BB) && + BB != &BB->getParent()->getEntryBlock()) { + // We know there are no successors, so just nuke the block. + BB->eraseFromParent(); + return true; + } + + return Changed; +} + +/// TurnSwitchRangeIntoICmp - Turns a switch with that contains only a +/// integer range comparison into a sub, an icmp and a branch. 
+static bool TurnSwitchRangeIntoICmp(SwitchInst *SI) {
+  // Case index 0 is the default destination, so a switch with > 2 "cases"
+  // has at least two explicit case values to consider.
+  assert(SI->getNumCases() > 2 && "Degenerate switch?");
+
+  // Make sure all cases point to the same destination and gather the values.
+  SmallVector<ConstantInt *, 16> Cases;
+  Cases.push_back(SI->getCaseValue(1));
+  for (unsigned I = 2, E = SI->getNumCases(); I != E; ++I) {
+    if (SI->getSuccessor(I-1) != SI->getSuccessor(I))
+      return false;
+    Cases.push_back(SI->getCaseValue(I));
+  }
+  assert(Cases.size() == SI->getNumCases()-1 && "Not all cases gathered");
+
+  // Sort the case values, then check if they form a range we can transform.
+  // NOTE(review): the adjacency test below expects Cases[I-1] == Cases[I]+1,
+  // i.e. it assumes ConstantIntSortPredicate produces a DESCENDING order
+  // (so Cases.back() is the smallest value) -- the predicate is defined
+  // elsewhere in this file; confirm before changing either side.
+  array_pod_sort(Cases.begin(), Cases.end(), ConstantIntSortPredicate);
+  for (unsigned I = 1, E = Cases.size(); I != E; ++I) {
+    if (Cases[I-1]->getValue() != Cases[I]->getValue()+1)
+      return false;
+  }
+
+  // Rebase the range at zero: (Cond - min) < NumCases, using an unsigned
+  // compare so values below the minimum wrap around and also fail.
+  Constant *Offset = ConstantExpr::getNeg(Cases.back());
+  Constant *NumCases = ConstantInt::get(Offset->getType(), SI->getNumCases()-1);
+
+  Value *Sub = SI->getCondition();
+  if (!Offset->isNullValue())
+    Sub = BinaryOperator::CreateAdd(Sub, Offset, Sub->getName()+".off", SI);
+  Value *Cmp = new ICmpInst(SI, ICmpInst::ICMP_ULT, Sub, NumCases, "switch");
+  BranchInst::Create(SI->getSuccessor(1), SI->getDefaultDest(), Cmp, SI);
+
+  // Prune obsolete incoming values off the successor's PHI nodes.
+  // The common destination previously had one PHI entry per case edge; the
+  // new conditional branch contributes only a single edge, so drop the
+  // NumCases-2 surplus entries from each PHI.
+  for (BasicBlock::iterator BBI = SI->getSuccessor(1)->begin();
+       isa<PHINode>(BBI); ++BBI) {
+    for (unsigned I = 0, E = SI->getNumCases()-2; I != E; ++I)
+      cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+  }
+  SI->eraseFromParent();
+
+  return true;
+}
+
+bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI) {
+  // If this switch is too complex to want to look at, ignore it.
+  if (!isValueEqualityComparison(SI))
+    return false;
+
+  BasicBlock *BB = SI->getParent();
+
+  // If we only have one predecessor, and if it is a branch on this value,
+  // see if that predecessor totally determines the outcome of this switch.
+ if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) + if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred)) + return SimplifyCFG(BB) | true; + + // If the block only contains the switch, see if we can fold the block + // away into any preds. + BasicBlock::iterator BBI = BB->begin(); + // Ignore dbg intrinsics. + while (isa<DbgInfoIntrinsic>(BBI)) + ++BBI; + if (SI == &*BBI) + if (FoldValueComparisonIntoPredecessors(SI)) + return SimplifyCFG(BB) | true; + + // Try to transform the switch into an icmp and a branch. + if (TurnSwitchRangeIntoICmp(SI)) + return SimplifyCFG(BB) | true; + + return false; +} + +bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) { + BasicBlock *BB = IBI->getParent(); + bool Changed = false; + + // Eliminate redundant destinations. + SmallPtrSet<Value *, 8> Succs; + for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) { + BasicBlock *Dest = IBI->getDestination(i); + if (!Dest->hasAddressTaken() || !Succs.insert(Dest)) { + Dest->removePredecessor(BB); + IBI->removeDestination(i); + --i; --e; + Changed = true; + } + } + + if (IBI->getNumDestinations() == 0) { + // If the indirectbr has no successors, change it to unreachable. + new UnreachableInst(IBI->getContext(), IBI); + EraseTerminatorInstAndDCECond(IBI); + return true; + } + + if (IBI->getNumDestinations() == 1) { + // If the indirectbr has one successor, change it to a direct branch. + BranchInst::Create(IBI->getDestination(0), IBI); + EraseTerminatorInstAndDCECond(IBI); + return true; + } + + if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) { + if (SimplifyIndirectBrOnSelect(IBI, SI)) + return SimplifyCFG(BB) | true; + } + return Changed; +} + +bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI) { + BasicBlock *BB = BI->getParent(); + + // If the Terminator is the only non-phi instruction, simplify the block. 
+ BasicBlock::iterator I = BB->getFirstNonPHIOrDbg(); + if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && + TryToSimplifyUncondBranchFromEmptyBlock(BB)) + return true; + + // If the only instruction in the block is a seteq/setne comparison + // against a constant, try to simplify the block. + if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) + if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) { + for (++I; isa<DbgInfoIntrinsic>(I); ++I) + ; + if (I->isTerminator() && TryToSimplifyUncondBranchWithICmpInIt(ICI, TD)) + return true; + } + + return false; +} + + +bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI) { + BasicBlock *BB = BI->getParent(); + + // Conditional branch + if (isValueEqualityComparison(BI)) { + // If we only have one predecessor, and if it is a branch on this value, + // see if that predecessor totally determines the outcome of this + // switch. + if (BasicBlock *OnlyPred = BB->getSinglePredecessor()) + if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred)) + return SimplifyCFG(BB) | true; + + // This block must be empty, except for the setcond inst, if it exists. + // Ignore dbg intrinsics. + BasicBlock::iterator I = BB->begin(); + // Ignore dbg intrinsics. + while (isa<DbgInfoIntrinsic>(I)) + ++I; + if (&*I == BI) { + if (FoldValueComparisonIntoPredecessors(BI)) + return SimplifyCFG(BB) | true; + } else if (&*I == cast<Instruction>(BI->getCondition())){ + ++I; + // Ignore dbg intrinsics. + while (isa<DbgInfoIntrinsic>(I)) + ++I; + if (&*I == BI && FoldValueComparisonIntoPredecessors(BI)) + return SimplifyCFG(BB) | true; + } + } + + // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction. + if (SimplifyBranchOnICmpChain(BI, TD)) + return true; + + // We have a conditional branch to two blocks that are only reachable + // from BI. We know that the condbr dominates the two blocks, so see if + // there is any identical code in the "then" and "else" blocks. 
If so, we + // can hoist it up to the branching block. + if (BI->getSuccessor(0)->getSinglePredecessor() != 0) { + if (BI->getSuccessor(1)->getSinglePredecessor() != 0) { + if (HoistThenElseCodeToIf(BI)) + return SimplifyCFG(BB) | true; + } else { + // If Successor #1 has multiple preds, we may be able to conditionally + // execute Successor #0 if it branches to successor #1. + TerminatorInst *Succ0TI = BI->getSuccessor(0)->getTerminator(); + if (Succ0TI->getNumSuccessors() == 1 && + Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) + if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0))) + return SimplifyCFG(BB) | true; + } + } else if (BI->getSuccessor(1)->getSinglePredecessor() != 0) { + // If Successor #0 has multiple preds, we may be able to conditionally + // execute Successor #1 if it branches to successor #0. + TerminatorInst *Succ1TI = BI->getSuccessor(1)->getTerminator(); + if (Succ1TI->getNumSuccessors() == 1 && + Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) + if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1))) + return SimplifyCFG(BB) | true; + } + + // If this is a branch on a phi node in the current block, thread control + // through this block if any PHI node entries are constants. + if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition())) + if (PN->getParent() == BI->getParent()) + if (FoldCondBranchOnPHI(BI, TD)) + return SimplifyCFG(BB) | true; + + // If this basic block is ONLY a setcc and a branch, and if a predecessor + // branches to us and one of our successors, fold the setcc into the + // predecessor and use logical operations to pick the right destination. + if (FoldBranchToCommonDest(BI)) + return SimplifyCFG(BB) | true; + + // Scan predecessor blocks for conditional branches. 
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) + if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) + if (PBI != BI && PBI->isConditional()) + if (SimplifyCondBranchToCondBranch(PBI, BI)) + return SimplifyCFG(BB) | true; + + return false; +} + +bool SimplifyCFGOpt::run(BasicBlock *BB) { + bool Changed = false; + + assert(BB && BB->getParent() && "Block not embedded in function!"); + assert(BB->getTerminator() && "Degenerate basic block encountered!"); + + // Remove basic blocks that have no predecessors (except the entry block)... + // or that just have themself as a predecessor. These are unreachable. + if ((pred_begin(BB) == pred_end(BB) && + BB != &BB->getParent()->getEntryBlock()) || + BB->getSinglePredecessor() == BB) { + DEBUG(dbgs() << "Removing BB: \n" << *BB); + DeleteDeadBlock(BB); + return true; + } + + // Check to see if we can constant propagate this terminator instruction + // away... + Changed |= ConstantFoldTerminator(BB); + + // Check for and eliminate duplicate PHI nodes in this block. + Changed |= EliminateDuplicatePHINodes(BB); + + // Merge basic blocks into their predecessor if there is only one distinct + // pred, and if there is only one distinct successor of the predecessor, and + // if there are no PHI nodes. + // + if (MergeBlockIntoPredecessor(BB)) + return true; + + // If there is a trivial two-entry PHI node in this basic block, and we can + // eliminate it, do so now. 
+ if (PHINode *PN = dyn_cast<PHINode>(BB->begin())) + if (PN->getNumIncomingValues() == 2) + Changed |= FoldTwoEntryPHINode(PN, TD); + + if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { + if (BI->isUnconditional()) { + if (SimplifyUncondBranch(BI)) return true; + } else { + if (SimplifyCondBranch(BI)) return true; + } + } else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) { + if (SimplifyReturn(RI)) return true; + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { + if (SimplifySwitch(SI)) return true; + } else if (UnreachableInst *UI = + dyn_cast<UnreachableInst>(BB->getTerminator())) { + if (SimplifyUnreachable(UI)) return true; + } else if (UnwindInst *UI = dyn_cast<UnwindInst>(BB->getTerminator())) { + if (SimplifyUnwind(UI)) return true; + } else if (IndirectBrInst *IBI = + dyn_cast<IndirectBrInst>(BB->getTerminator())) { + if (SimplifyIndirectBr(IBI)) return true; + } + + return Changed; +} + +/// SimplifyCFG - This function is used to do simplification of a CFG. For +/// example, it adjusts branches to branches to eliminate the extra hop, it +/// eliminates unreachable basic blocks, and does other "peephole" optimization +/// of the CFG. It returns true if a modification was made. +/// +bool llvm::SimplifyCFG(BasicBlock *BB, const TargetData *TD) { + return SimplifyCFGOpt(TD).run(BB); +} diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp new file mode 100644 index 0000000..ac005f9 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp @@ -0,0 +1,94 @@ +//===------ SimplifyInstructions.cpp - Remove redundant instructions ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This is a utility pass used for testing the InstructionSimplify analysis. +// The analysis is applied to every instruction, and if it simplifies then the +// instruction is replaced by the simplification. If you are looking for a pass +// that performs serious instruction folding, use the instcombine pass instead. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "instsimplify" +#include "llvm/Function.h" +#include "llvm/Pass.h" +#include "llvm/Type.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +using namespace llvm; + +STATISTIC(NumSimplified, "Number of redundant instructions removed"); + +namespace { + struct InstSimplifier : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + InstSimplifier() : FunctionPass(ID) { + initializeInstSimplifierPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + } + + /// runOnFunction - Remove instructions that simplify. + bool runOnFunction(Function &F) { + const DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>(); + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2; + bool Changed = false; + + do { + for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()), + DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) + for (BasicBlock::iterator BI = DI->begin(), BE = DI->end(); BI != BE;) { + Instruction *I = BI++; + // The first time through the loop ToSimplify is empty and we try to + // simplify all instructions. 
On later iterations ToSimplify is not + // empty and we only bother simplifying instructions that are in it. + if (!ToSimplify->empty() && !ToSimplify->count(I)) + continue; + // Don't waste time simplifying unused instructions. + if (!I->use_empty()) + if (Value *V = SimplifyInstruction(I, TD, DT)) { + // Mark all uses for resimplification next time round the loop. + for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); + UI != UE; ++UI) + Next->insert(cast<Instruction>(*UI)); + I->replaceAllUsesWith(V); + ++NumSimplified; + Changed = true; + } + Changed |= RecursivelyDeleteTriviallyDeadInstructions(I); + } + + // Place the list of instructions to simplify on the next loop iteration + // into ToSimplify. + std::swap(ToSimplify, Next); + Next->clear(); + } while (!ToSimplify->empty()); + + return Changed; + } + }; +} + +char InstSimplifier::ID = 0; +INITIALIZE_PASS(InstSimplifier, "instsimplify", "Remove redundant instructions", + false, false) +char &llvm::InstructionSimplifierID = InstSimplifier::ID; + +// Public interface to the simplify instructions pass. +FunctionPass *llvm::createInstructionSimplifierPass() { + return new InstSimplifier(); +} diff --git a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp new file mode 100644 index 0000000..ccb8287 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -0,0 +1,141 @@ +//===- UnifyFunctionExitNodes.cpp - Make all functions have a single exit -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass is used to ensure that functions have at most one return +// instruction in them. Additionally, it keeps track of which node is the new +// exit node of the CFG. 
If there are no exit nodes in the CFG, the getExitNode
+// method will return a null pointer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+char UnifyFunctionExitNodes::ID = 0;
+INITIALIZE_PASS(UnifyFunctionExitNodes, "mergereturn",
+                "Unify function exit nodes", false, false)
+
+Pass *llvm::createUnifyFunctionExitNodesPass() {
+  return new UnifyFunctionExitNodes();
+}
+
+void UnifyFunctionExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
+  // We preserve the non-critical-edgeness property
+  AU.addPreservedID(BreakCriticalEdgesID);
+  // This is a cluster of orthogonal Transforms
+  AU.addPreserved("mem2reg");
+  AU.addPreservedID(LowerSwitchID);
+}
+
+// UnifyAllExitNodes - Unify all exit nodes of the CFG by creating a new
+// BasicBlock, and converting all returns to unconditional branches to this
+// new basic block. The singular exit node is returned.
+//
+// If there are no return stmts in the Function, a null pointer is returned.
+//
+bool UnifyFunctionExitNodes::runOnFunction(Function &F) {
+  // Loop over all of the blocks in a function, tracking all of the blocks that
+  // return.
+  //
+  // Classify every block by its terminator: return, unwind, or unreachable.
+  // Blocks ending in a branch/switch/etc. are not exits and are ignored.
+  std::vector<BasicBlock*> ReturningBlocks;
+  std::vector<BasicBlock*> UnwindingBlocks;
+  std::vector<BasicBlock*> UnreachableBlocks;
+  for(Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+    if (isa<ReturnInst>(I->getTerminator()))
+      ReturningBlocks.push_back(I);
+    else if (isa<UnwindInst>(I->getTerminator()))
+      UnwindingBlocks.push_back(I);
+    else if (isa<UnreachableInst>(I->getTerminator()))
+      UnreachableBlocks.push_back(I);
+
+  // Handle unwinding blocks first.
+  // Zero or one unwinding blocks need no merging; otherwise funnel every
+  // unwinding block into a single UnifiedUnwindBlock via an unconditional
+  // branch.  The result is cached in the pass member UnwindBlock.
+  if (UnwindingBlocks.empty()) {
+    UnwindBlock = 0;
+  } else if (UnwindingBlocks.size() == 1) {
+    UnwindBlock = UnwindingBlocks.front();
+  } else {
+    UnwindBlock = BasicBlock::Create(F.getContext(), "UnifiedUnwindBlock", &F);
+    new UnwindInst(F.getContext(), UnwindBlock);
+
+    for (std::vector<BasicBlock*>::iterator I = UnwindingBlocks.begin(),
+           E = UnwindingBlocks.end(); I != E; ++I) {
+      BasicBlock *BB = *I;
+      BB->getInstList().pop_back(); // Remove the unwind insn
+      BranchInst::Create(UnwindBlock, BB);
+    }
+  }
+
+  // Then unreachable blocks.
+  // Same merging scheme as above, cached in the UnreachableBlock member.
+  if (UnreachableBlocks.empty()) {
+    UnreachableBlock = 0;
+  } else if (UnreachableBlocks.size() == 1) {
+    UnreachableBlock = UnreachableBlocks.front();
+  } else {
+    UnreachableBlock = BasicBlock::Create(F.getContext(),
+                                          "UnifiedUnreachableBlock", &F);
+    new UnreachableInst(F.getContext(), UnreachableBlock);
+
+    for (std::vector<BasicBlock*>::iterator I = UnreachableBlocks.begin(),
+           E = UnreachableBlocks.end(); I != E; ++I) {
+      BasicBlock *BB = *I;
+      BB->getInstList().pop_back(); // Remove the unreachable inst.
+      BranchInst::Create(UnreachableBlock, BB);
+    }
+  }
+
+  // Now handle return blocks.
+  // NOTE(review): 'false' is returned in the next two cases even when unwind
+  // or unreachable blocks were merged above — confirm that callers of this
+  // pass only depend on the return-block unification for invalidation.
+  if (ReturningBlocks.empty()) {
+    ReturnBlock = 0;
+    return false; // No blocks return
+  } else if (ReturningBlocks.size() == 1) {
+    ReturnBlock = ReturningBlocks.front(); // Already has a single return block
+    return false;
+  }
+
+  // Otherwise, we need to insert a new basic block into the function, add a PHI
+  // nodes (if the function returns values), and convert all of the return
+  // instructions into unconditional branches.
+  //
+  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(),
+                                               "UnifiedReturnBlock", &F);
+
+  PHINode *PN = 0;
+  if (F.getReturnType()->isVoidTy()) {
+    ReturnInst::Create(F.getContext(), NULL, NewRetBlock);
+  } else {
+    // If the function doesn't return void... add a PHI node to the block...
+    // The PHI merges the per-block return values into the single return.
+    PN = PHINode::Create(F.getReturnType(), "UnifiedRetVal");
+    NewRetBlock->getInstList().push_back(PN);
+    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+  }
+
+  // Loop over all of the blocks, replacing the return instruction with an
+  // unconditional branch.
+  //
+  for (std::vector<BasicBlock*>::iterator I = ReturningBlocks.begin(),
+         E = ReturningBlocks.end(); I != E; ++I) {
+    BasicBlock *BB = *I;
+
+    // Add an incoming element to the PHI node for every return instruction that
+    // is merging into this new block...
+    // (operand 0 of a non-void ReturnInst is the returned value)
+    if (PN)
+      PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
+
+    BB->getInstList().pop_back();  // Remove the return insn
+    BranchInst::Create(NewRetBlock, BB);
+  }
+  ReturnBlock = NewRetBlock;
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/Utils.cpp b/contrib/llvm/lib/Transforms/Utils/Utils.cpp
new file mode 100644
index 0000000..24e8c8f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/Utils.cpp
@@ -0,0 +1,37 @@
+//===-- Utils.cpp - TransformUtils Infrastructure -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the common initialization infrastructure for the
+// TransformUtils library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm-c/Initialization.h"
+
+using namespace llvm;
+
+/// initializeTransformUtils - Initialize all passes in the TransformUtils
+/// library.
+void llvm::initializeTransformUtils(PassRegistry &Registry) { + initializeBreakCriticalEdgesPass(Registry); + initializeInstNamerPass(Registry); + initializeLCSSAPass(Registry); + initializeLoopSimplifyPass(Registry); + initializeLowerInvokePass(Registry); + initializeLowerSwitchPass(Registry); + initializePromotePassPass(Registry); + initializeUnifyFunctionExitNodesPass(Registry); + initializeInstSimplifierPass(Registry); +} + +/// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses. +void LLVMInitializeTransformUtils(LLVMPassRegistryRef R) { + initializeTransformUtils(*unwrap(R)); +} diff --git a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp new file mode 100644 index 0000000..f5481d3 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -0,0 +1,141 @@ +//===- ValueMapper.cpp - Interface shared by lib/Transforms/Utils ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the MapValue function, which is shared by various parts of +// the lib/Transforms/Utils library. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Type.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/Metadata.h" +#include "llvm/ADT/SmallVector.h" +using namespace llvm; + +Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, + RemapFlags Flags) { + ValueToValueMapTy::iterator I = VM.find(V); + + // If the value already exists in the map, use it. + if (I != VM.end() && I->second) return I->second; + + // Global values do not need to be seeded into the VM if they + // are using the identity mapping. 
+ if (isa<GlobalValue>(V) || isa<InlineAsm>(V) || isa<MDString>(V)) + return VM[V] = const_cast<Value*>(V); + + if (const MDNode *MD = dyn_cast<MDNode>(V)) { + // If this is a module-level metadata and we know that nothing at the module + // level is changing, then use an identity mapping. + if (!MD->isFunctionLocal() && (Flags & RF_NoModuleLevelChanges)) + return VM[V] = const_cast<Value*>(V); + + // Create a dummy node in case we have a metadata cycle. + MDNode *Dummy = MDNode::getTemporary(V->getContext(), 0, 0); + VM[V] = Dummy; + + // Check all operands to see if any need to be remapped. + for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) { + Value *OP = MD->getOperand(i); + if (OP == 0 || MapValue(OP, VM, Flags) == OP) continue; + + // Ok, at least one operand needs remapping. + SmallVector<Value*, 4> Elts; + Elts.reserve(MD->getNumOperands()); + for (i = 0; i != e; ++i) { + Value *Op = MD->getOperand(i); + Elts.push_back(Op ? MapValue(Op, VM, Flags) : 0); + } + MDNode *NewMD = MDNode::get(V->getContext(), Elts.data(), Elts.size()); + Dummy->replaceAllUsesWith(NewMD); + VM[V] = NewMD; + MDNode::deleteTemporary(Dummy); + return NewMD; + } + + VM[V] = const_cast<Value*>(V); + MDNode::deleteTemporary(Dummy); + + // No operands needed remapping. Use an identity mapping. + return const_cast<Value*>(V); + } + + // Okay, this either must be a constant (which may or may not be mappable) or + // is something that is not in the mapping table. + Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V)); + if (C == 0) + return 0; + + if (BlockAddress *BA = dyn_cast<BlockAddress>(C)) { + Function *F = cast<Function>(MapValue(BA->getFunction(), VM, Flags)); + BasicBlock *BB = cast_or_null<BasicBlock>(MapValue(BA->getBasicBlock(), VM, + Flags)); + return VM[V] = BlockAddress::get(F, BB ? 
BB : BA->getBasicBlock()); + } + + for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) { + Value *Op = C->getOperand(i); + Value *Mapped = MapValue(Op, VM, Flags); + if (Mapped == C) continue; + + // Okay, the operands don't all match. We've already processed some or all + // of the operands, set them up now. + std::vector<Constant*> Ops; + Ops.reserve(C->getNumOperands()); + for (unsigned j = 0; j != i; ++j) + Ops.push_back(cast<Constant>(C->getOperand(i))); + Ops.push_back(cast<Constant>(Mapped)); + + // Map the rest of the operands that aren't processed yet. + for (++i; i != e; ++i) + Ops.push_back(cast<Constant>(MapValue(C->getOperand(i), VM, Flags))); + + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) + return VM[V] = CE->getWithOperands(Ops); + if (ConstantArray *CA = dyn_cast<ConstantArray>(C)) + return VM[V] = ConstantArray::get(CA->getType(), Ops); + if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C)) + return VM[V] = ConstantStruct::get(CS->getType(), Ops); + assert(isa<ConstantVector>(C) && "Unknown mapped constant type"); + return VM[V] = ConstantVector::get(Ops); + } + + // If we reach here, all of the operands of the constant match. + return VM[V] = C; +} + +/// RemapInstruction - Convert the instruction operands from referencing the +/// current values into those specified by VMap. +/// +void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap, + RemapFlags Flags) { + // Remap operands. + for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) { + Value *V = MapValue(*op, VMap, Flags); + // If we aren't ignoring missing entries, assert that something happened. + if (V != 0) + *op = V; + else + assert((Flags & RF_IgnoreMissingEntries) && + "Referenced value not in value map!"); + } + + // Remap attached metadata. 
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; + I->getAllMetadata(MDs); + for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator + MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI) { + Value *Old = MI->second; + Value *New = MapValue(Old, VMap, Flags); + if (New != Old) + I->setMetadata(MI->first, cast<MDNode>(New)); + } +} |