Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar')
32 files changed, 34299 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp new file mode 100644 index 0000000..b344952 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -0,0 +1,97 @@ +//===- DCE.cpp - Code to perform dead code elimination --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Aggressive Dead Code Elimination pass. This pass +// optimistically assumes that all instructions are dead until proven otherwise, +// allowing it to eliminate dead computations that other DCE passes do not +// catch, particularly involving loop computations. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "adce" +#include "llvm/Transforms/Scalar.h" +#include "llvm/BasicBlock.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumRemoved, "Number of instructions removed"); + +namespace { + struct ADCE : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + ADCE() : FunctionPass(ID) { + initializeADCEPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function& F); + + virtual void getAnalysisUsage(AnalysisUsage& AU) const { + AU.setPreservesCFG(); + } + + }; +} + +char ADCE::ID = 0; +INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false) + +bool ADCE::runOnFunction(Function& F) { + SmallPtrSet<Instruction*, 128> alive; + SmallVector<Instruction*, 128> worklist; + + // Collect the set of "root" instructions that are known live. + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) + if (isa<TerminatorInst>(I.getInstructionIterator()) || + isa<DbgInfoIntrinsic>(I.getInstructionIterator()) || + isa<LandingPadInst>(I.getInstructionIterator()) || + I->mayHaveSideEffects()) { + alive.insert(I.getInstructionIterator()); + worklist.push_back(I.getInstructionIterator()); + } + + // Propagate liveness backwards to operands. + while (!worklist.empty()) { + Instruction* curr = worklist.pop_back_val(); + for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end(); + OI != OE; ++OI) + if (Instruction* Inst = dyn_cast<Instruction>(OI)) + if (alive.insert(Inst)) + worklist.push_back(Inst); + } + + // The inverse of the live set is the dead set. These are those instructions + // which have no side effects and do not influence the control flow or return + // value of the function, and may therefore be deleted safely. + // NOTE: We reuse the worklist vector here for memory efficiency. 
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) + if (!alive.count(I.getInstructionIterator())) { + worklist.push_back(I.getInstructionIterator()); + I->dropAllReferences(); + } + + for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(), + E = worklist.end(); I != E; ++I) { + ++NumRemoved; + (*I)->eraseFromParent(); + } + + return !worklist.empty(); +} + +FunctionPass *llvm::createAggressiveDCEPass() { + return new ADCE(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp new file mode 100644 index 0000000..cee5502 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp @@ -0,0 +1,152 @@ +//===-- BasicBlockPlacement.cpp - Basic Block Code Layout optimization ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a very simple profile guided basic block placement +// algorithm. The idea is to put frequently executed blocks together at the +// start of the function, and hopefully increase the number of fall-through +// conditional branches. If there is no profile information for a particular +// function, this pass basically orders blocks in depth-first order +// +// The algorithm implemented here is basically "Algo1" from "Profile Guided Code +// Positioning" by Pettis and Hansen, except that it uses basic block counts +// instead of edge counts. This should be improved in many ways, but is very +// simple for now. +// +// Basically we "place" the entry block, then loop over all successors in a DFO, +// placing the most frequently executed successor until we run out of blocks. I +// told you this was _extremely_ simplistic. :) This is also much slower than it +// could be. When it becomes important, this pass will be rewritten to use a +// better algorithm, and then we can worry about efficiency. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "block-placement" +#include "llvm/Analysis/ProfileInfo.h" +#include "llvm/Function.h" +#include "llvm/Pass.h" +#include "llvm/Support/CFG.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Transforms/Scalar.h" +#include <set> +using namespace llvm; + +STATISTIC(NumMoved, "Number of basic blocks moved"); + +namespace { + struct BlockPlacement : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + BlockPlacement() : FunctionPass(ID) { + initializeBlockPlacementPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<ProfileInfo>(); + //AU.addPreserved<ProfileInfo>(); // Does this work? + } + private: + /// PI - The profile information that is guiding us. + /// + ProfileInfo *PI; + + /// NumMovedBlocks - Every time we move a block, increment this counter. + /// + unsigned NumMovedBlocks; + + /// PlacedBlocks - Every time we place a block, remember it so we don't get + /// into infinite loops. + std::set<BasicBlock*> PlacedBlocks; + + /// InsertPos - This an iterator to the next place we want to insert a + /// block. + Function::iterator InsertPos; + + /// PlaceBlocks - Recursively place the specified blocks and any unplaced + /// successors. 
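// Aside: a minimal, self-contained sketch of the optimistic mark-and-sweep that
// ADCE::runOnFunction above performs, written against a hypothetical toy
// instruction graph rather than LLVM's Instruction/IR classes (ToyInst,
// sweepDead and the HasSideEffects flag are illustrative names only, not part
// of the committed file).
#include <cstddef>
#include <memory>
#include <unordered_set>
#include <vector>

struct ToyInst {
  bool HasSideEffects = false;      // roots: terminators, calls, stores, ...
  std::vector<ToyInst *> Operands;  // def-use edges walked backwards
};

// Returns how many instructions an ADCE-style pass would delete.
static std::size_t sweepDead(const std::vector<std::unique_ptr<ToyInst>> &Body) {
  std::unordered_set<ToyInst *> Alive;
  std::vector<ToyInst *> Worklist;

  // 1. Seed the live set with the known-live roots.
  for (const auto &I : Body)
    if (I->HasSideEffects) {
      Alive.insert(I.get());
      Worklist.push_back(I.get());
    }

  // 2. Propagate liveness backwards through operands.
  while (!Worklist.empty()) {
    ToyInst *Cur = Worklist.back();
    Worklist.pop_back();
    for (ToyInst *Op : Cur->Operands)
      if (Alive.insert(Op).second)
        Worklist.push_back(Op);
  }

  // 3. Everything never marked alive is dead and may be removed.
  std::size_t NumDead = 0;
  for (const auto &I : Body)
    if (!Alive.count(I.get()))
      ++NumDead;
  return NumDead;
}
// (end of sketch)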
+ void PlaceBlocks(BasicBlock *BB); + }; +} + +char BlockPlacement::ID = 0; +INITIALIZE_PASS_BEGIN(BlockPlacement, "block-placement", + "Profile Guided Basic Block Placement", false, false) +INITIALIZE_AG_DEPENDENCY(ProfileInfo) +INITIALIZE_PASS_END(BlockPlacement, "block-placement", + "Profile Guided Basic Block Placement", false, false) + +FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); } + +bool BlockPlacement::runOnFunction(Function &F) { + PI = &getAnalysis<ProfileInfo>(); + + NumMovedBlocks = 0; + InsertPos = F.begin(); + + // Recursively place all blocks. + PlaceBlocks(F.begin()); + + PlacedBlocks.clear(); + NumMoved += NumMovedBlocks; + return NumMovedBlocks != 0; +} + + +/// PlaceBlocks - Recursively place the specified blocks and any unplaced +/// successors. +void BlockPlacement::PlaceBlocks(BasicBlock *BB) { + assert(!PlacedBlocks.count(BB) && "Already placed this block!"); + PlacedBlocks.insert(BB); + + // Place the specified block. + if (&*InsertPos != BB) { + // Use splice to move the block into the right place. This avoids having to + // remove the block from the function then readd it, which causes a bunch of + // symbol table traffic that is entirely pointless. + Function::BasicBlockListType &Blocks = BB->getParent()->getBasicBlockList(); + Blocks.splice(InsertPos, Blocks, BB); + + ++NumMovedBlocks; + } else { + // This block is already in the right place, we don't have to do anything. + ++InsertPos; + } + + // Keep placing successors until we run out of ones to place. Note that this + // loop is very inefficient (N^2) for blocks with many successors, like switch + // statements. FIXME! + while (1) { + // Okay, now place any unplaced successors. + succ_iterator SI = succ_begin(BB), E = succ_end(BB); + + // Scan for the first unplaced successor. + for (; SI != E && PlacedBlocks.count(*SI); ++SI) + /*empty*/; + if (SI == E) return; // No more successors to place. + + double MaxExecutionCount = PI->getExecutionCount(*SI); + BasicBlock *MaxSuccessor = *SI; + + // Scan for more frequently executed successors + for (; SI != E; ++SI) + if (!PlacedBlocks.count(*SI)) { + double Count = PI->getExecutionCount(*SI); + if (Count > MaxExecutionCount || + // Prefer to not disturb the code. + (Count == MaxExecutionCount && *SI == &*InsertPos)) { + MaxExecutionCount = Count; + MaxSuccessor = *SI; + } + } + + // Now that we picked the maximally executed successor, place it. + PlaceBlocks(MaxSuccessor); + } +} diff --git a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp new file mode 100644 index 0000000..a3c426a --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -0,0 +1,1340 @@ +//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass munges the code in the input function to better prepare it for +// SelectionDAG-based code generation. This works around limitations in it's +// basic-block-at-a-time approach. It should eventually be removed. 
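// Aside: the PlaceBlocks recursion above boils down to this greedy loop on a
// toy CFG — place a block, then keep placing its hottest not-yet-placed
// successor. A hedged sketch only: ToyBlock and placeFrom are hypothetical
// names, and the profile counts are supplied directly instead of via
// ProfileInfo.
#include <set>
#include <vector>

struct ToyBlock {
  double Count = 0;                 // stand-in for the block execution count
  std::vector<ToyBlock *> Succs;
};

static void placeFrom(ToyBlock *BB, std::set<ToyBlock *> &Placed,
                      std::vector<ToyBlock *> &Layout) {
  Placed.insert(BB);
  Layout.push_back(BB);             // "splice" the block to the insert point
  for (;;) {
    ToyBlock *Best = nullptr;
    for (ToyBlock *S : BB->Succs)   // scan for the hottest unplaced successor
      if (!Placed.count(S) && (!Best || S->Count > Best->Count))
        Best = S;
    if (!Best)
      return;                       // no more successors to place
    placeFrom(Best, Placed, Layout);
  }
}
// (end of sketch)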
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "codegenprepare" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/InlineAsm.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/ProfileInfo.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/PatternMatch.h" +#include "llvm/Support/ValueHandle.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Transforms/Utils/AddrModeMatcher.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/Transforms/Utils/Local.h" +using namespace llvm; +using namespace llvm::PatternMatch; + +STATISTIC(NumBlocksElim, "Number of blocks eliminated"); +STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated"); +STATISTIC(NumGEPsElim, "Number of GEPs converted to casts"); +STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of " + "sunken Cmps"); +STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses " + "of sunken Casts"); +STATISTIC(NumMemoryInsts, "Number of memory instructions whose address " + "computations were sunk"); +STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); +STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); +STATISTIC(NumRetsDup, "Number of return instructions duplicated"); +STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); +STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); + +static cl::opt<bool> DisableBranchOpts( + "disable-cgp-branch-opts", cl::Hidden, cl::init(false), + cl::desc("Disable branch optimizations in CodeGenPrepare")); + +static cl::opt<bool> DisableSelectToBranch( + "disable-cgp-select2branch", cl::Hidden, cl::init(false), + cl::desc("Disable select to branch conversion.")); + +namespace { + class CodeGenPrepare : public FunctionPass { + /// TLI - Keep a pointer of a TargetLowering to consult for determining + /// transformation profitability. + const TargetLowering *TLI; + const TargetLibraryInfo *TLInfo; + DominatorTree *DT; + ProfileInfo *PFI; + + /// CurInstIterator - As we scan instructions optimizing them, this is the + /// next instruction to optimize. Xforms that can invalidate this should + /// update it. + BasicBlock::iterator CurInstIterator; + + /// Keeps track of non-local addresses that have been sunk into a block. + /// This allows us to avoid inserting duplicate code for blocks with + /// multiple load/stores of the same address. + DenseMap<Value*, Value*> SunkAddrs; + + /// ModifiedDT - If CFG is modified in anyway, dominator tree may need to + /// be updated. + bool ModifiedDT; + + /// OptSize - True if optimizing for size. 
+ bool OptSize; + + public: + static char ID; // Pass identification, replacement for typeid + explicit CodeGenPrepare(const TargetLowering *tli = 0) + : FunctionPass(ID), TLI(tli) { + initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<DominatorTree>(); + AU.addPreserved<ProfileInfo>(); + AU.addRequired<TargetLibraryInfo>(); + } + + private: + bool EliminateFallThrough(Function &F); + bool EliminateMostlyEmptyBlocks(Function &F); + bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; + void EliminateMostlyEmptyBlock(BasicBlock *BB); + bool OptimizeBlock(BasicBlock &BB); + bool OptimizeInst(Instruction *I); + bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy); + bool OptimizeInlineAsmInst(CallInst *CS); + bool OptimizeCallInst(CallInst *CI); + bool MoveExtToFormExtLoad(Instruction *I); + bool OptimizeExtUses(Instruction *I); + bool OptimizeSelectInst(SelectInst *SI); + bool DupRetToEnableTailCallOpts(ReturnInst *RI); + bool PlaceDbgValues(Function &F); + }; +} + +char CodeGenPrepare::ID = 0; +INITIALIZE_PASS_BEGIN(CodeGenPrepare, "codegenprepare", + "Optimize for code generation", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(CodeGenPrepare, "codegenprepare", + "Optimize for code generation", false, false) + +FunctionPass *llvm::createCodeGenPreparePass(const TargetLowering *TLI) { + return new CodeGenPrepare(TLI); +} + +bool CodeGenPrepare::runOnFunction(Function &F) { + bool EverMadeChange = false; + + ModifiedDT = false; + TLInfo = &getAnalysis<TargetLibraryInfo>(); + DT = getAnalysisIfAvailable<DominatorTree>(); + PFI = getAnalysisIfAvailable<ProfileInfo>(); + OptSize = F.hasFnAttr(Attribute::OptimizeForSize); + + // First pass, eliminate blocks that contain only PHI nodes and an + // unconditional branch. + EverMadeChange |= EliminateMostlyEmptyBlocks(F); + + // llvm.dbg.value is far away from the value then iSel may not be able + // handle it properly. iSel will drop llvm.dbg.value if it can not + // find a node corresponding to the value. + EverMadeChange |= PlaceDbgValues(F); + + bool MadeChange = true; + while (MadeChange) { + MadeChange = false; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ) { + BasicBlock *BB = I++; + MadeChange |= OptimizeBlock(*BB); + } + EverMadeChange |= MadeChange; + } + + SunkAddrs.clear(); + + if (!DisableBranchOpts) { + MadeChange = false; + SmallPtrSet<BasicBlock*, 8> WorkList; + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB)); + MadeChange |= ConstantFoldTerminator(BB, true); + if (!MadeChange) continue; + + for (SmallVectorImpl<BasicBlock*>::iterator + II = Successors.begin(), IE = Successors.end(); II != IE; ++II) + if (pred_begin(*II) == pred_end(*II)) + WorkList.insert(*II); + } + + for (SmallPtrSet<BasicBlock*, 8>::iterator + I = WorkList.begin(), E = WorkList.end(); I != E; ++I) + DeleteDeadBlock(*I); + + // Merge pairs of basic blocks with unconditional branches, connected by + // a single edge. 
+ if (EverMadeChange || MadeChange) + MadeChange |= EliminateFallThrough(F); + + if (MadeChange) + ModifiedDT = true; + EverMadeChange |= MadeChange; + } + + if (ModifiedDT && DT) + DT->DT->recalculate(F); + + return EverMadeChange; +} + +/// EliminateFallThrough - Merge basic blocks which are connected +/// by a single edge, where one of the basic blocks has a single successor +/// pointing to the other basic block, which has a single predecessor. +bool CodeGenPrepare::EliminateFallThrough(Function &F) { + bool Changed = false; + // Scan all of the blocks in the function, except for the entry block. + for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) { + BasicBlock *BB = I++; + // If the destination block has a single pred, then this is a trivial + // edge, just collapse it. + BasicBlock *SinglePred = BB->getSinglePredecessor(); + + if (!SinglePred || SinglePred == BB) continue; + + BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator()); + if (Term && !Term->isConditional()) { + Changed = true; + // Remember if SinglePred was the entry block of the function. + // If so, we will need to move BB back to the entry position. + bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); + MergeBasicBlockIntoOnlyPred(BB, this); + + if (isEntry && BB != &BB->getParent()->getEntryBlock()) + BB->moveBefore(&BB->getParent()->getEntryBlock()); + + // We have erased a block. Update the iterator. + I = BB; + DEBUG(dbgs() << "Merged:\n"<< *SinglePred << "\n\n\n"); + } + } + return Changed; +} + +/// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes, +/// debug info directives, and an unconditional branch. Passes before isel +/// (e.g. LSR/loopsimplify) often split edges in ways that are non-optimal for +/// isel. Start by eliminating these blocks so we can split them the way we +/// want them. +bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) { + bool MadeChange = false; + // Note that this intentionally skips the entry block. + for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) { + BasicBlock *BB = I++; + + // If this block doesn't end with an uncond branch, ignore it. + BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); + if (!BI || !BI->isUnconditional()) + continue; + + // If the instruction before the branch (skipping debug info) isn't a phi + // node, then other stuff is happening here. + BasicBlock::iterator BBI = BI; + if (BBI != BB->begin()) { + --BBI; + while (isa<DbgInfoIntrinsic>(BBI)) { + if (BBI == BB->begin()) + break; + --BBI; + } + if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI)) + continue; + } + + // Do not break infinite loops. + BasicBlock *DestBB = BI->getSuccessor(0); + if (DestBB == BB) + continue; + + if (!CanMergeBlocks(BB, DestBB)) + continue; + + EliminateMostlyEmptyBlock(BB); + MadeChange = true; + } + return MadeChange; +} + +/// CanMergeBlocks - Return true if we can merge BB into DestBB if there is a +/// single uncond branch between them, and BB contains no other non-phi +/// instructions. +bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB, + const BasicBlock *DestBB) const { + // We only want to eliminate blocks whose phi nodes are used by phi nodes in + // the successor. If there are more complex condition (e.g. preheaders), + // don't mess around with them. 
+ BasicBlock::const_iterator BBI = BB->begin(); + while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) { + for (Value::const_use_iterator UI = PN->use_begin(), E = PN->use_end(); + UI != E; ++UI) { + const Instruction *User = cast<Instruction>(*UI); + if (User->getParent() != DestBB || !isa<PHINode>(User)) + return false; + // If User is inside DestBB block and it is a PHINode then check + // incoming value. If incoming value is not from BB then this is + // a complex condition (e.g. preheaders) we want to avoid here. + if (User->getParent() == DestBB) { + if (const PHINode *UPN = dyn_cast<PHINode>(User)) + for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) { + Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I)); + if (Insn && Insn->getParent() == BB && + Insn->getParent() != UPN->getIncomingBlock(I)) + return false; + } + } + } + } + + // If BB and DestBB contain any common predecessors, then the phi nodes in BB + // and DestBB may have conflicting incoming values for the block. If so, we + // can't merge the block. + const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin()); + if (!DestBBPN) return true; // no conflict. + + // Collect the preds of BB. + SmallPtrSet<const BasicBlock*, 16> BBPreds; + if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) { + // It is faster to get preds from a PHI than with pred_iterator. + for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i) + BBPreds.insert(BBPN->getIncomingBlock(i)); + } else { + BBPreds.insert(pred_begin(BB), pred_end(BB)); + } + + // Walk the preds of DestBB. + for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) { + BasicBlock *Pred = DestBBPN->getIncomingBlock(i); + if (BBPreds.count(Pred)) { // Common predecessor? + BBI = DestBB->begin(); + while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) { + const Value *V1 = PN->getIncomingValueForBlock(Pred); + const Value *V2 = PN->getIncomingValueForBlock(BB); + + // If V2 is a phi node in BB, look up what the mapped value will be. + if (const PHINode *V2PN = dyn_cast<PHINode>(V2)) + if (V2PN->getParent() == BB) + V2 = V2PN->getIncomingValueForBlock(Pred); + + // If there is a conflict, bail out. + if (V1 != V2) return false; + } + } + } + + return true; +} + + +/// EliminateMostlyEmptyBlock - Eliminate a basic block that have only phi's and +/// an unconditional branch in it. +void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { + BranchInst *BI = cast<BranchInst>(BB->getTerminator()); + BasicBlock *DestBB = BI->getSuccessor(0); + + DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB); + + // If the destination block has a single pred, then this is a trivial edge, + // just collapse it. + if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) { + if (SinglePred != DestBB) { + // Remember if SinglePred was the entry block of the function. If so, we + // will need to move BB back to the entry position. + bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); + MergeBasicBlockIntoOnlyPred(DestBB, this); + + if (isEntry && BB != &BB->getParent()->getEntryBlock()) + BB->moveBefore(&BB->getParent()->getEntryBlock()); + + DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); + return; + } + } + + // Otherwise, we have multiple predecessors of BB. Update the PHIs in DestBB + // to handle the new incoming edges it is about to have. 
+ PHINode *PN; + for (BasicBlock::iterator BBI = DestBB->begin(); + (PN = dyn_cast<PHINode>(BBI)); ++BBI) { + // Remove the incoming value for BB, and remember it. + Value *InVal = PN->removeIncomingValue(BB, false); + + // Two options: either the InVal is a phi node defined in BB or it is some + // value that dominates BB. + PHINode *InValPhi = dyn_cast<PHINode>(InVal); + if (InValPhi && InValPhi->getParent() == BB) { + // Add all of the input values of the input PHI as inputs of this phi. + for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i) + PN->addIncoming(InValPhi->getIncomingValue(i), + InValPhi->getIncomingBlock(i)); + } else { + // Otherwise, add one instance of the dominating value for each edge that + // we will be adding. + if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) { + for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i) + PN->addIncoming(InVal, BBPN->getIncomingBlock(i)); + } else { + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) + PN->addIncoming(InVal, *PI); + } + } + } + + // The PHIs are now updated, change everything that refers to BB to use + // DestBB and remove BB. + BB->replaceAllUsesWith(DestBB); + if (DT && !ModifiedDT) { + BasicBlock *BBIDom = DT->getNode(BB)->getIDom()->getBlock(); + BasicBlock *DestBBIDom = DT->getNode(DestBB)->getIDom()->getBlock(); + BasicBlock *NewIDom = DT->findNearestCommonDominator(BBIDom, DestBBIDom); + DT->changeImmediateDominator(DestBB, NewIDom); + DT->eraseNode(BB); + } + if (PFI) { + PFI->replaceAllUses(BB, DestBB); + PFI->removeEdge(ProfileInfo::getEdge(BB, DestBB)); + } + BB->eraseFromParent(); + ++NumBlocksElim; + + DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); +} + +/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop +/// copy (e.g. it's casting from one pointer type to another, i32->i8 on PPC), +/// sink it into user blocks to reduce the number of virtual +/// registers that must be created and coalesced. +/// +/// Return true if any changes are made. +/// +static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){ + // If this is a noop copy, + EVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(CI->getType()); + + // This is an fp<->int conversion? + if (SrcVT.isInteger() != DstVT.isInteger()) + return false; + + // If this is an extension, it will be a zero or sign extension, which + // isn't a noop. + if (SrcVT.bitsLT(DstVT)) return false; + + // If these values will be promoted, find out what they will be promoted + // to. This helps us consider truncates on PPC as noop copies when they + // are. + if (TLI.getTypeAction(CI->getContext(), SrcVT) == + TargetLowering::TypePromoteInteger) + SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT); + if (TLI.getTypeAction(CI->getContext(), DstVT) == + TargetLowering::TypePromoteInteger) + DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT); + + // If, after promotion, these are the same types, this is a noop copy. + if (SrcVT != DstVT) + return false; + + BasicBlock *DefBB = CI->getParent(); + + /// InsertedCasts - Only insert a cast in each block once. + DenseMap<BasicBlock*, CastInst*> InsertedCasts; + + bool MadeChange = false; + for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end(); + UI != E; ) { + Use &TheUse = UI.getUse(); + Instruction *User = cast<Instruction>(*UI); + + // Figure out which BB this cast is used in. For PHI's this is the + // appropriate predecessor block. 
+ BasicBlock *UserBB = User->getParent(); + if (PHINode *PN = dyn_cast<PHINode>(User)) { + UserBB = PN->getIncomingBlock(UI); + } + + // Preincrement use iterator so we don't invalidate it. + ++UI; + + // If this user is in the same block as the cast, don't change the cast. + if (UserBB == DefBB) continue; + + // If we have already inserted a cast into this block, use it. + CastInst *&InsertedCast = InsertedCasts[UserBB]; + + if (!InsertedCast) { + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); + InsertedCast = + CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "", + InsertPt); + MadeChange = true; + } + + // Replace a use of the cast with a use of the new cast. + TheUse = InsertedCast; + ++NumCastUses; + } + + // If we removed all uses, nuke the cast. + if (CI->use_empty()) { + CI->eraseFromParent(); + MadeChange = true; + } + + return MadeChange; +} + +/// OptimizeCmpExpression - sink the given CmpInst into user blocks to reduce +/// the number of virtual registers that must be created and coalesced. This is +/// a clear win except on targets with multiple condition code registers +/// (PowerPC), where it might lose; some adjustment may be wanted there. +/// +/// Return true if any changes are made. +static bool OptimizeCmpExpression(CmpInst *CI) { + BasicBlock *DefBB = CI->getParent(); + + /// InsertedCmp - Only insert a cmp in each block once. + DenseMap<BasicBlock*, CmpInst*> InsertedCmps; + + bool MadeChange = false; + for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end(); + UI != E; ) { + Use &TheUse = UI.getUse(); + Instruction *User = cast<Instruction>(*UI); + + // Preincrement use iterator so we don't invalidate it. + ++UI; + + // Don't bother for PHI nodes. + if (isa<PHINode>(User)) + continue; + + // Figure out which BB this cmp is used in. + BasicBlock *UserBB = User->getParent(); + + // If this user is in the same block as the cmp, don't change the cmp. + if (UserBB == DefBB) continue; + + // If we have already inserted a cmp into this block, use it. + CmpInst *&InsertedCmp = InsertedCmps[UserBB]; + + if (!InsertedCmp) { + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); + InsertedCmp = + CmpInst::Create(CI->getOpcode(), + CI->getPredicate(), CI->getOperand(0), + CI->getOperand(1), "", InsertPt); + MadeChange = true; + } + + // Replace a use of the cmp with a use of the new cmp. + TheUse = InsertedCmp; + ++NumCmpUses; + } + + // If we removed all uses, nuke the cmp. + if (CI->use_empty()) + CI->eraseFromParent(); + + return MadeChange; +} + +namespace { +class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls { +protected: + void replaceCall(Value *With) { + CI->replaceAllUsesWith(With); + CI->eraseFromParent(); + } + bool isFoldable(unsigned SizeCIOp, unsigned, bool) const { + if (ConstantInt *SizeCI = + dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) + return SizeCI->isAllOnesValue(); + return false; + } +}; +} // end anonymous namespace + +bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { + BasicBlock *BB = CI->getParent(); + + // Lower inline assembly if we can. + // If we found an inline asm expession, and if the target knows how to + // lower it to normal LLVM code, do so now. + if (TLI && isa<InlineAsm>(CI->getCalledValue())) { + if (TLI->ExpandInlineAsm(CI)) { + // Avoid invalidating the iterator. + CurInstIterator = BB->begin(); + // Avoid processing instructions out of order, which could cause + // reuse before a value is defined. 
+ SunkAddrs.clear(); + return true; + } + // Sink address computing for memory operands into the block. + if (OptimizeInlineAsmInst(CI)) + return true; + } + + // Lower all uses of llvm.objectsize.* + IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); + if (II && II->getIntrinsicID() == Intrinsic::objectsize) { + bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1); + Type *ReturnTy = CI->getType(); + Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); + + // Substituting this can cause recursive simplifications, which can + // invalidate our iterator. Use a WeakVH to hold onto it in case this + // happens. + WeakVH IterHandle(CurInstIterator); + + replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getTargetData() : 0, + TLInfo, ModifiedDT ? 0 : DT); + + // If the iterator instruction was recursively deleted, start over at the + // start of the block. + if (IterHandle != CurInstIterator) { + CurInstIterator = BB->begin(); + SunkAddrs.clear(); + } + return true; + } + + if (II && TLI) { + SmallVector<Value*, 2> PtrOps; + Type *AccessTy; + if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy)) + while (!PtrOps.empty()) + if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy)) + return true; + } + + // From here on out we're working with named functions. + if (CI->getCalledFunction() == 0) return false; + + // We'll need TargetData from here on out. + const TargetData *TD = TLI ? TLI->getTargetData() : 0; + if (!TD) return false; + + // Lower all default uses of _chk calls. This is very similar + // to what InstCombineCalls does, but here we are only lowering calls + // that have the default "don't know" as the objectsize. Anything else + // should be left alone. + CodeGenPrepareFortifiedLibCalls Simplifier; + return Simplifier.fold(CI, TD, TLInfo); +} + +/// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return +/// instructions to the predecessor to enable tail call optimizations. The +/// case it is currently looking for is: +/// bb0: +/// %tmp0 = tail call i32 @f0() +/// br label %return +/// bb1: +/// %tmp1 = tail call i32 @f1() +/// br label %return +/// bb2: +/// %tmp2 = tail call i32 @f2() +/// br label %return +/// return: +/// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ] +/// ret i32 %retval +/// +/// => +/// +/// bb0: +/// %tmp0 = tail call i32 @f0() +/// ret i32 %tmp0 +/// bb1: +/// %tmp1 = tail call i32 @f1() +/// ret i32 %tmp1 +/// bb2: +/// %tmp2 = tail call i32 @f2() +/// ret i32 %tmp2 +/// +bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { + if (!TLI) + return false; + + PHINode *PN = 0; + BitCastInst *BCI = 0; + Value *V = RI->getReturnValue(); + if (V) { + BCI = dyn_cast<BitCastInst>(V); + if (BCI) + V = BCI->getOperand(0); + + PN = dyn_cast<PHINode>(V); + if (!PN) + return false; + } + + BasicBlock *BB = RI->getParent(); + if (PN && PN->getParent() != BB) + return false; + + // It's not safe to eliminate the sign / zero extension of the return value. + // See llvm::isInTailCallPosition(). + const Function *F = BB->getParent(); + Attributes CallerRetAttr = F->getAttributes().getRetAttributes(); + if ((CallerRetAttr & Attribute::ZExt) || (CallerRetAttr & Attribute::SExt)) + return false; + + // Make sure there are no instructions between the PHI and return, or that the + // return is the first instruction in the block. + if (PN) { + BasicBlock::iterator BI = BB->begin(); + do { ++BI; } while (isa<DbgInfoIntrinsic>(BI)); + if (&*BI == BCI) + // Also skip over the bitcast. 
+ ++BI; + if (&*BI != RI) + return false; + } else { + BasicBlock::iterator BI = BB->begin(); + while (isa<DbgInfoIntrinsic>(BI)) ++BI; + if (&*BI != RI) + return false; + } + + /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail + /// call. + SmallVector<CallInst*, 4> TailCalls; + if (PN) { + for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) { + CallInst *CI = dyn_cast<CallInst>(PN->getIncomingValue(I)); + // Make sure the phi value is indeed produced by the tail call. + if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) && + TLI->mayBeEmittedAsTailCall(CI)) + TailCalls.push_back(CI); + } + } else { + SmallPtrSet<BasicBlock*, 4> VisitedBBs; + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { + if (!VisitedBBs.insert(*PI)) + continue; + + BasicBlock::InstListType &InstList = (*PI)->getInstList(); + BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin(); + BasicBlock::InstListType::reverse_iterator RE = InstList.rend(); + do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI)); + if (RI == RE) + continue; + + CallInst *CI = dyn_cast<CallInst>(&*RI); + if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI)) + TailCalls.push_back(CI); + } + } + + bool Changed = false; + for (unsigned i = 0, e = TailCalls.size(); i != e; ++i) { + CallInst *CI = TailCalls[i]; + CallSite CS(CI); + + // Conservatively require the attributes of the call to match those of the + // return. Ignore noalias because it doesn't affect the call sequence. + Attributes CalleeRetAttr = CS.getAttributes().getRetAttributes(); + if ((CalleeRetAttr ^ CallerRetAttr) & ~Attribute::NoAlias) + continue; + + // Make sure the call instruction is followed by an unconditional branch to + // the return block. + BasicBlock *CallBB = CI->getParent(); + BranchInst *BI = dyn_cast<BranchInst>(CallBB->getTerminator()); + if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB) + continue; + + // Duplicate the return into CallBB. + (void)FoldReturnIntoUncondBranch(RI, BB, CallBB); + ModifiedDT = Changed = true; + ++NumRetsDup; + } + + // If we eliminated all predecessors of the block, delete the block now. + if (Changed && pred_begin(BB) == pred_end(BB)) + BB->eraseFromParent(); + + return Changed; +} + +//===----------------------------------------------------------------------===// +// Memory Optimization +//===----------------------------------------------------------------------===// + +/// IsNonLocalValue - Return true if the specified values are defined in a +/// different basic block than BB. +static bool IsNonLocalValue(Value *V, BasicBlock *BB) { + if (Instruction *I = dyn_cast<Instruction>(V)) + return I->getParent() != BB; + return false; +} + +/// OptimizeMemoryInst - Load and Store Instructions often have +/// addressing modes that can do significant amounts of computation. As such, +/// instruction selection will try to get the load or store to do as much +/// computation as possible for the program. The problem is that isel can only +/// see within a single block. As such, we sink as much legal addressing mode +/// stuff into the block as possible. +/// +/// This method is used to optimize both load/store and inline asms with memory +/// operands. +bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, + Type *AccessTy) { + Value *Repl = Addr; + + // Try to collapse single-value PHI nodes. This is necessary to undo + // unprofitable PRE transformations. 
+ SmallVector<Value*, 8> worklist; + SmallPtrSet<Value*, 16> Visited; + worklist.push_back(Addr); + + // Use a worklist to iteratively look through PHI nodes, and ensure that + // the addressing mode obtained from the non-PHI roots of the graph + // are equivalent. + Value *Consensus = 0; + unsigned NumUsesConsensus = 0; + bool IsNumUsesConsensusValid = false; + SmallVector<Instruction*, 16> AddrModeInsts; + ExtAddrMode AddrMode; + while (!worklist.empty()) { + Value *V = worklist.back(); + worklist.pop_back(); + + // Break use-def graph loops. + if (!Visited.insert(V)) { + Consensus = 0; + break; + } + + // For a PHI node, push all of its incoming values. + if (PHINode *P = dyn_cast<PHINode>(V)) { + for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) + worklist.push_back(P->getIncomingValue(i)); + continue; + } + + // For non-PHIs, determine the addressing mode being computed. + SmallVector<Instruction*, 16> NewAddrModeInsts; + ExtAddrMode NewAddrMode = + AddressingModeMatcher::Match(V, AccessTy, MemoryInst, + NewAddrModeInsts, *TLI); + + // This check is broken into two cases with very similar code to avoid using + // getNumUses() as much as possible. Some values have a lot of uses, so + // calling getNumUses() unconditionally caused a significant compile-time + // regression. + if (!Consensus) { + Consensus = V; + AddrMode = NewAddrMode; + AddrModeInsts = NewAddrModeInsts; + continue; + } else if (NewAddrMode == AddrMode) { + if (!IsNumUsesConsensusValid) { + NumUsesConsensus = Consensus->getNumUses(); + IsNumUsesConsensusValid = true; + } + + // Ensure that the obtained addressing mode is equivalent to that obtained + // for all other roots of the PHI traversal. Also, when choosing one + // such root as representative, select the one with the most uses in order + // to keep the cost modeling heuristics in AddressingModeMatcher + // applicable. + unsigned NumUses = V->getNumUses(); + if (NumUses > NumUsesConsensus) { + Consensus = V; + NumUsesConsensus = NumUses; + AddrModeInsts = NewAddrModeInsts; + } + continue; + } + + Consensus = 0; + break; + } + + // If the addressing mode couldn't be determined, or if multiple different + // ones were determined, bail out now. + if (!Consensus) return false; + + // Check to see if any of the instructions supersumed by this addr mode are + // non-local to I's BB. + bool AnyNonLocal = false; + for (unsigned i = 0, e = AddrModeInsts.size(); i != e; ++i) { + if (IsNonLocalValue(AddrModeInsts[i], MemoryInst->getParent())) { + AnyNonLocal = true; + break; + } + } + + // If all the instructions matched are already in this BB, don't do anything. + if (!AnyNonLocal) { + DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode << "\n"); + return false; + } + + // Insert this computation right after this user. Since our caller is + // scanning from the top of the BB to the bottom, reuse of the expr are + // guaranteed to happen later. + IRBuilder<> Builder(MemoryInst); + + // Now that we determined the addressing expression we want to use and know + // that we have to sink it into this block. Check to see if we have already + // done this for some other load/store instr in this block. If so, reuse the + // computation. 
+ Value *&SunkAddr = SunkAddrs[Addr]; + if (SunkAddr) { + DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for " + << *MemoryInst); + if (SunkAddr->getType() != Addr->getType()) + SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); + } else { + DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " + << *MemoryInst); + Type *IntPtrTy = + TLI->getTargetData()->getIntPtrType(AccessTy->getContext()); + + Value *Result = 0; + + // Start with the base register. Do this first so that subsequent address + // matching finds it last, which will prevent it from trying to match it + // as the scaled value in case it happens to be a mul. That would be + // problematic if we've sunk a different mul for the scale, because then + // we'd end up sinking both muls. + if (AddrMode.BaseReg) { + Value *V = AddrMode.BaseReg; + if (V->getType()->isPointerTy()) + V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); + if (V->getType() != IntPtrTy) + V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); + Result = V; + } + + // Add the scale value. + if (AddrMode.Scale) { + Value *V = AddrMode.ScaledReg; + if (V->getType() == IntPtrTy) { + // done. + } else if (V->getType()->isPointerTy()) { + V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); + } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() < + cast<IntegerType>(V->getType())->getBitWidth()) { + V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); + } else { + V = Builder.CreateSExt(V, IntPtrTy, "sunkaddr"); + } + if (AddrMode.Scale != 1) + V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), + "sunkaddr"); + if (Result) + Result = Builder.CreateAdd(Result, V, "sunkaddr"); + else + Result = V; + } + + // Add in the BaseGV if present. + if (AddrMode.BaseGV) { + Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr"); + if (Result) + Result = Builder.CreateAdd(Result, V, "sunkaddr"); + else + Result = V; + } + + // Add in the Base Offset if present. + if (AddrMode.BaseOffs) { + Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); + if (Result) + Result = Builder.CreateAdd(Result, V, "sunkaddr"); + else + Result = V; + } + + if (Result == 0) + SunkAddr = Constant::getNullValue(Addr->getType()); + else + SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr"); + } + + MemoryInst->replaceUsesOfWith(Repl, SunkAddr); + + // If we have no uses, recursively delete the value and all dead instructions + // using it. + if (Repl->use_empty()) { + // This can cause recursive deletion, which can invalidate our iterator. + // Use a WeakVH to hold onto it in case this happens. + WeakVH IterHandle(CurInstIterator); + BasicBlock *BB = CurInstIterator->getParent(); + + RecursivelyDeleteTriviallyDeadInstructions(Repl); + + if (IterHandle != CurInstIterator) { + // If the iterator instruction was recursively deleted, start over at the + // start of the block. + CurInstIterator = BB->begin(); + SunkAddrs.clear(); + } else { + // This address is now available for reassignment, so erase the table + // entry; we don't want to match some completely different instruction. + SunkAddrs[Addr] = 0; + } + } + ++NumMemoryInsts; + return true; +} + +/// OptimizeInlineAsmInst - If there are any memory operands, use +/// OptimizeMemoryInst to sink their address computing into the block when +/// possible / profitable. 
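// Aside: the sinking code above rebuilds the matched addressing mode as one
// flat integer expression in the user's block, i.e.
//
//   addr = BaseReg + Scale * ScaledReg + BaseGV + BaseOffs
//
// with each term emitted only when present. A minimal stand-alone sketch of
// that arithmetic on plain integers (ToyAddrMode and materialize are
// hypothetical names, not LLVM's ExtAddrMode API):
#include <cstdint>

struct ToyAddrMode {
  std::uintptr_t BaseReg = 0;   // optional base register value
  std::intptr_t Scale = 0;      // 0 means "no scaled index"
  std::uintptr_t ScaledReg = 0; // index register, used when Scale != 0
  std::uintptr_t BaseGV = 0;    // address of a global, or 0
  std::intptr_t BaseOffs = 0;   // constant displacement
};

static std::uintptr_t materialize(const ToyAddrMode &AM) {
  std::uintptr_t Result = AM.BaseReg;
  if (AM.Scale)
    Result += static_cast<std::uintptr_t>(AM.Scale) * AM.ScaledReg;
  Result += AM.BaseGV;
  Result += static_cast<std::uintptr_t>(AM.BaseOffs);
  return Result;                // the pass then casts this back to a pointer
}
// (end of sketch)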
+bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) { + bool MadeChange = false; + + TargetLowering::AsmOperandInfoVector + TargetConstraints = TLI->ParseConstraints(CS); + unsigned ArgNo = 0; + for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { + TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; + + // Compute the constraint code and ConstraintType to use. + TLI->ComputeConstraintToUse(OpInfo, SDValue()); + + if (OpInfo.ConstraintType == TargetLowering::C_Memory && + OpInfo.isIndirect) { + Value *OpVal = CS->getArgOperand(ArgNo++); + MadeChange |= OptimizeMemoryInst(CS, OpVal, OpVal->getType()); + } else if (OpInfo.Type == InlineAsm::isInput) + ArgNo++; + } + + return MadeChange; +} + +/// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same +/// basic block as the load, unless conditions are unfavorable. This allows +/// SelectionDAG to fold the extend into the load. +/// +bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) { + // Look for a load being extended. + LoadInst *LI = dyn_cast<LoadInst>(I->getOperand(0)); + if (!LI) return false; + + // If they're already in the same block, there's nothing to do. + if (LI->getParent() == I->getParent()) + return false; + + // If the load has other users and the truncate is not free, this probably + // isn't worthwhile. + if (!LI->hasOneUse() && + TLI && (TLI->isTypeLegal(TLI->getValueType(LI->getType())) || + !TLI->isTypeLegal(TLI->getValueType(I->getType()))) && + !TLI->isTruncateFree(I->getType(), LI->getType())) + return false; + + // Check whether the target supports casts folded into loads. + unsigned LType; + if (isa<ZExtInst>(I)) + LType = ISD::ZEXTLOAD; + else { + assert(isa<SExtInst>(I) && "Unexpected ext type!"); + LType = ISD::SEXTLOAD; + } + if (TLI && !TLI->isLoadExtLegal(LType, TLI->getValueType(LI->getType()))) + return false; + + // Move the extend into the same block as the load, so that SelectionDAG + // can fold it. + I->removeFromParent(); + I->insertAfter(LI); + ++NumExtsMoved; + return true; +} + +bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { + BasicBlock *DefBB = I->getParent(); + + // If the result of a {s|z}ext and its source are both live out, rewrite all + // other uses of the source with result of extension. + Value *Src = I->getOperand(0); + if (Src->hasOneUse()) + return false; + + // Only do this xform if truncating is free. + if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType())) + return false; + + // Only safe to perform the optimization if the source is also defined in + // this block. + if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent()) + return false; + + bool DefIsLiveOut = false; + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); + UI != E; ++UI) { + Instruction *User = cast<Instruction>(*UI); + + // Figure out which BB this ext is used in. + BasicBlock *UserBB = User->getParent(); + if (UserBB == DefBB) continue; + DefIsLiveOut = true; + break; + } + if (!DefIsLiveOut) + return false; + + // Make sure non of the uses are PHI nodes. + for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end(); + UI != E; ++UI) { + Instruction *User = cast<Instruction>(*UI); + BasicBlock *UserBB = User->getParent(); + if (UserBB == DefBB) continue; + // Be conservative. We don't want this xform to end up introducing + // reloads just before load / store instructions. 
+ if (isa<PHINode>(User) || isa<LoadInst>(User) || isa<StoreInst>(User)) + return false; + } + + // InsertedTruncs - Only insert one trunc in each block once. + DenseMap<BasicBlock*, Instruction*> InsertedTruncs; + + bool MadeChange = false; + for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end(); + UI != E; ++UI) { + Use &TheUse = UI.getUse(); + Instruction *User = cast<Instruction>(*UI); + + // Figure out which BB this ext is used in. + BasicBlock *UserBB = User->getParent(); + if (UserBB == DefBB) continue; + + // Both src and def are live in this block. Rewrite the use. + Instruction *&InsertedTrunc = InsertedTruncs[UserBB]; + + if (!InsertedTrunc) { + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); + InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt); + } + + // Replace a use of the {s|z}ext source with a use of the result. + TheUse = InsertedTrunc; + ++NumExtUses; + MadeChange = true; + } + + return MadeChange; +} + +/// isFormingBranchFromSelectProfitable - Returns true if a SelectInst should be +/// turned into an explicit branch. +static bool isFormingBranchFromSelectProfitable(SelectInst *SI) { + // FIXME: This should use the same heuristics as IfConversion to determine + // whether a select is better represented as a branch. This requires that + // branch probability metadata is preserved for the select, which is not the + // case currently. + + CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition()); + + // If the branch is predicted right, an out of order CPU can avoid blocking on + // the compare. Emit cmovs on compares with a memory operand as branches to + // avoid stalls on the load from memory. If the compare has more than one use + // there's probably another cmov or setcc around so it's not worth emitting a + // branch. + if (!Cmp) + return false; + + Value *CmpOp0 = Cmp->getOperand(0); + Value *CmpOp1 = Cmp->getOperand(1); + + // We check that the memory operand has one use to avoid uses of the loaded + // value directly after the compare, making branches unprofitable. + return Cmp->hasOneUse() && + ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) || + (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse())); +} + + +bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) { + // If we have a SelectInst that will likely profit from branch prediction, + // turn it into a branch. + if (DisableSelectToBranch || OptSize || !TLI || + !TLI->isPredictableSelectExpensive()) + return false; + + if (!SI->getCondition()->getType()->isIntegerTy(1) || + !isFormingBranchFromSelectProfitable(SI)) + return false; + + ModifiedDT = true; + + // First, we split the block containing the select into 2 blocks. + BasicBlock *StartBlock = SI->getParent(); + BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI)); + BasicBlock *NextBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); + + // Create a new block serving as the landing pad for the branch. + BasicBlock *SmallBlock = BasicBlock::Create(SI->getContext(), "select.mid", + NextBlock->getParent(), NextBlock); + + // Move the unconditional branch from the block with the select in it into our + // landing pad block. + StartBlock->getTerminator()->eraseFromParent(); + BranchInst::Create(NextBlock, SmallBlock); + + // Insert the real conditional branch based on the original condition. + BranchInst::Create(NextBlock, SmallBlock, SI->getCondition(), SI); + + // The select itself is replaced with a PHI Node. 
+ PHINode *PN = PHINode::Create(SI->getType(), 2, "", NextBlock->begin()); + PN->takeName(SI); + PN->addIncoming(SI->getTrueValue(), StartBlock); + PN->addIncoming(SI->getFalseValue(), SmallBlock); + SI->replaceAllUsesWith(PN); + SI->eraseFromParent(); + + // Instruct OptimizeBlock to skip to the next block. + CurInstIterator = StartBlock->end(); + ++NumSelectsExpanded; + return true; +} + +bool CodeGenPrepare::OptimizeInst(Instruction *I) { + if (PHINode *P = dyn_cast<PHINode>(I)) { + // It is possible for very late stage optimizations (such as SimplifyCFG) + // to introduce PHI nodes too late to be cleaned up. If we detect such a + // trivial PHI, go ahead and zap it here. + if (Value *V = SimplifyInstruction(P)) { + P->replaceAllUsesWith(V); + P->eraseFromParent(); + ++NumPHIsElim; + return true; + } + return false; + } + + if (CastInst *CI = dyn_cast<CastInst>(I)) { + // If the source of the cast is a constant, then this should have + // already been constant folded. The only reason NOT to constant fold + // it is if something (e.g. LSR) was careful to place the constant + // evaluation in a block other than then one that uses it (e.g. to hoist + // the address of globals out of a loop). If this is the case, we don't + // want to forward-subst the cast. + if (isa<Constant>(CI->getOperand(0))) + return false; + + if (TLI && OptimizeNoopCopyExpression(CI, *TLI)) + return true; + + if (isa<ZExtInst>(I) || isa<SExtInst>(I)) { + bool MadeChange = MoveExtToFormExtLoad(I); + return MadeChange | OptimizeExtUses(I); + } + return false; + } + + if (CmpInst *CI = dyn_cast<CmpInst>(I)) + return OptimizeCmpExpression(CI); + + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + if (TLI) + return OptimizeMemoryInst(I, I->getOperand(0), LI->getType()); + return false; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + if (TLI) + return OptimizeMemoryInst(I, SI->getOperand(1), + SI->getOperand(0)->getType()); + return false; + } + + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) { + if (GEPI->hasAllZeroIndices()) { + /// The GEP operand must be a pointer, so must its result -> BitCast + Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(), + GEPI->getName(), GEPI); + GEPI->replaceAllUsesWith(NC); + GEPI->eraseFromParent(); + ++NumGEPsElim; + OptimizeInst(NC); + return true; + } + return false; + } + + if (CallInst *CI = dyn_cast<CallInst>(I)) + return OptimizeCallInst(CI); + + if (ReturnInst *RI = dyn_cast<ReturnInst>(I)) + return DupRetToEnableTailCallOpts(RI); + + if (SelectInst *SI = dyn_cast<SelectInst>(I)) + return OptimizeSelectInst(SI); + + return false; +} + +// In this pass we look for GEP and cast instructions that are used +// across basic blocks and rewrite them to improve basic-block-at-a-time +// selection. +bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { + SunkAddrs.clear(); + bool MadeChange = false; + + CurInstIterator = BB.begin(); + for (BasicBlock::iterator E = BB.end(); CurInstIterator != E; ) + MadeChange |= OptimizeInst(CurInstIterator++); + + return MadeChange; +} + +// llvm.dbg.value is far away from the value then iSel may not be able +// handle it properly. iSel will drop llvm.dbg.value if it can not +// find a node corresponding to the value. 
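// Aside: in IR terms, the OptimizeSelectInst rewrite above (a hedged
// transcription, block names taken from the code) turns
//
//   entry:
//     %res = select i1 %cmp, i32 %a, i32 %b
//
// into an explicit branch plus phi:
//
//   entry:
//     br i1 %cmp, label %select.end, label %select.mid
//   select.mid:
//     br label %select.end
//   select.end:
//     %res = phi i32 [ %a, %entry ], [ %b, %select.mid ]
//
// so that an expensive conditional move becomes a predictable branch.
// (end of sketch)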
+bool CodeGenPrepare::PlaceDbgValues(Function &F) { + bool MadeChange = false; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + Instruction *PrevNonDbgInst = NULL; + for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { + Instruction *Insn = BI; ++BI; + DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn); + if (!DVI) { + PrevNonDbgInst = Insn; + continue; + } + + Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue()); + if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) { + DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI); + DVI->removeFromParent(); + if (isa<PHINode>(VI)) + DVI->insertBefore(VI->getParent()->getFirstInsertionPt()); + else + DVI->insertAfter(VI); + MadeChange = true; + ++NumDbgValueMoved; + } + } + } + return MadeChange; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp new file mode 100644 index 0000000..5430f62 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp @@ -0,0 +1,98 @@ +//===- ConstantProp.cpp - Code to perform Simple Constant Propagation -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements constant propagation and merging: +// +// Specifically, this: +// * Converts instructions like "add int 1, 2" into 3 +// +// Notice that: +// * This pass has a habit of making definitions be dead. It is a good idea +// to run a DIE pass sometime after running this pass. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "constprop" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Constant.h" +#include "llvm/Instruction.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/ADT/Statistic.h" +#include <set> +using namespace llvm; + +STATISTIC(NumInstKilled, "Number of instructions killed"); + +namespace { + struct ConstantPropagation : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + ConstantPropagation() : FunctionPass(ID) { + initializeConstantPropagationPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<TargetLibraryInfo>(); + } + }; +} + +char ConstantPropagation::ID = 0; +INITIALIZE_PASS_BEGIN(ConstantPropagation, "constprop", + "Simple constant propagation", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(ConstantPropagation, "constprop", + "Simple constant propagation", false, false) + +FunctionPass *llvm::createConstantPropagationPass() { + return new ConstantPropagation(); +} + +bool ConstantPropagation::runOnFunction(Function &F) { + // Initialize the worklist to all of the instructions ready to process... 
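// Aside: the driver that follows is the classic constant-propagation worklist.
// A minimal sketch of the same loop over a hypothetical foldToConstant()
// callback (ToyValue, propagate and the callback are illustrative names, not
// LLVM's ConstantFoldInstruction API):
#include <functional>
#include <set>
#include <vector>

struct ToyValue {
  bool IsConstant = false;         // set by foldToConstant when it succeeds
  std::vector<ToyValue *> Users;   // who consumes this value
};

static void propagate(std::vector<ToyValue *> Insts,
                      const std::function<bool(ToyValue *)> &foldToConstant) {
  std::set<ToyValue *> Worklist(Insts.begin(), Insts.end());
  while (!Worklist.empty()) {
    ToyValue *I = *Worklist.begin();
    Worklist.erase(Worklist.begin());
    // If the instruction folds to a constant, its users may now fold too,
    // so push them back onto the worklist before moving on.
    if (!I->IsConstant && foldToConstant(I))
      for (ToyValue *U : I->Users)
        Worklist.insert(U);
  }
}
// (end of sketch)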
+ std::set<Instruction*> WorkList; + for(inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + WorkList.insert(&*i); + } + bool Changed = false; + TargetData *TD = getAnalysisIfAvailable<TargetData>(); + TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + + while (!WorkList.empty()) { + Instruction *I = *WorkList.begin(); + WorkList.erase(WorkList.begin()); // Get an element from the worklist... + + if (!I->use_empty()) // Don't muck with dead instructions... + if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) { + // Add all of the users of this instruction to the worklist, they might + // be constant propagatable now... + for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); + UI != UE; ++UI) + WorkList.insert(cast<Instruction>(*UI)); + + // Replace all of the uses of a variable with uses of the constant. + I->replaceAllUsesWith(C); + + // Remove the dead instruction. + WorkList.erase(I); + I->eraseFromParent(); + + // We made a change to the function... + Changed = true; + ++NumInstKilled; + } + } + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp new file mode 100644 index 0000000..9b0aadb --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -0,0 +1,297 @@ +//===- CorrelatedValuePropagation.cpp - Propagate CFG-derived info --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Correlated Value Propagation pass. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "correlated-value-propagation" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumPhis, "Number of phis propagated"); +STATISTIC(NumSelects, "Number of selects propagated"); +STATISTIC(NumMemAccess, "Number of memory access targets propagated"); +STATISTIC(NumCmps, "Number of comparisons propagated"); +STATISTIC(NumDeadCases, "Number of switch cases removed"); + +namespace { + class CorrelatedValuePropagation : public FunctionPass { + LazyValueInfo *LVI; + + bool processSelect(SelectInst *SI); + bool processPHI(PHINode *P); + bool processMemAccess(Instruction *I); + bool processCmp(CmpInst *C); + bool processSwitch(SwitchInst *SI); + + public: + static char ID; + CorrelatedValuePropagation(): FunctionPass(ID) { + initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LazyValueInfo>(); + } + }; +} + +char CorrelatedValuePropagation::ID = 0; +INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation", + "Value Propagation", false, false) +INITIALIZE_PASS_DEPENDENCY(LazyValueInfo) +INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation", + "Value Propagation", false, false) + +// Public interface to the Value Propagation pass +Pass 
*llvm::createCorrelatedValuePropagationPass() { + return new CorrelatedValuePropagation(); +} + +bool CorrelatedValuePropagation::processSelect(SelectInst *S) { + if (S->getType()->isVectorTy()) return false; + if (isa<Constant>(S->getOperand(0))) return false; + + Constant *C = LVI->getConstant(S->getOperand(0), S->getParent()); + if (!C) return false; + + ConstantInt *CI = dyn_cast<ConstantInt>(C); + if (!CI) return false; + + Value *ReplaceWith = S->getOperand(1); + Value *Other = S->getOperand(2); + if (!CI->isOne()) std::swap(ReplaceWith, Other); + if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType()); + + S->replaceAllUsesWith(ReplaceWith); + S->eraseFromParent(); + + ++NumSelects; + + return true; +} + +bool CorrelatedValuePropagation::processPHI(PHINode *P) { + bool Changed = false; + + BasicBlock *BB = P->getParent(); + for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) { + Value *Incoming = P->getIncomingValue(i); + if (isa<Constant>(Incoming)) continue; + + Constant *C = LVI->getConstantOnEdge(P->getIncomingValue(i), + P->getIncomingBlock(i), + BB); + if (!C) continue; + + P->setIncomingValue(i, C); + Changed = true; + } + + if (Value *V = SimplifyInstruction(P)) { + P->replaceAllUsesWith(V); + P->eraseFromParent(); + Changed = true; + } + + if (Changed) + ++NumPhis; + + return Changed; +} + +bool CorrelatedValuePropagation::processMemAccess(Instruction *I) { + Value *Pointer = 0; + if (LoadInst *L = dyn_cast<LoadInst>(I)) + Pointer = L->getPointerOperand(); + else + Pointer = cast<StoreInst>(I)->getPointerOperand(); + + if (isa<Constant>(Pointer)) return false; + + Constant *C = LVI->getConstant(Pointer, I->getParent()); + if (!C) return false; + + ++NumMemAccess; + I->replaceUsesOfWith(Pointer, C); + return true; +} + +/// processCmp - If the value of this comparison could be determined locally, +/// constant propagation would already have figured it out. Instead, walk +/// the predecessors and statically evaluate the comparison based on information +/// available on that edge. If a given static evaluation is true on ALL +/// incoming edges, then it's true universally and we can simplify the compare. +bool CorrelatedValuePropagation::processCmp(CmpInst *C) { + Value *Op0 = C->getOperand(0); + if (isa<Instruction>(Op0) && + cast<Instruction>(Op0)->getParent() == C->getParent()) + return false; + + Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); + if (!Op1) return false; + + pred_iterator PI = pred_begin(C->getParent()), PE = pred_end(C->getParent()); + if (PI == PE) return false; + + LazyValueInfo::Tristate Result = LVI->getPredicateOnEdge(C->getPredicate(), + C->getOperand(0), Op1, *PI, C->getParent()); + if (Result == LazyValueInfo::Unknown) return false; + + ++PI; + while (PI != PE) { + LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(C->getPredicate(), + C->getOperand(0), Op1, *PI, C->getParent()); + if (Res != Result) return false; + ++PI; + } + + ++NumCmps; + + if (Result == LazyValueInfo::True) + C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext())); + else + C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext())); + + C->eraseFromParent(); + + return true; +} + +/// processSwitch - Simplify a switch instruction by removing cases which can +/// never fire. If the uselessness of a case could be determined locally then +/// constant propagation would already have figured it out. Instead, walk the +/// predecessors and statically evaluate cases based on information available +/// on that edge. 
Cases that cannot fire no matter what the incoming edge can +/// safely be removed. If a case fires on every incoming edge then the entire +/// switch can be removed and replaced with a branch to the case destination. +bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { + Value *Cond = SI->getCondition(); + BasicBlock *BB = SI->getParent(); + + // If the condition was defined in same block as the switch then LazyValueInfo + // currently won't say anything useful about it, though in theory it could. + if (isa<Instruction>(Cond) && cast<Instruction>(Cond)->getParent() == BB) + return false; + + // If the switch is unreachable then trying to improve it is a waste of time. + pred_iterator PB = pred_begin(BB), PE = pred_end(BB); + if (PB == PE) return false; + + // Analyse each switch case in turn. This is done in reverse order so that + // removing a case doesn't cause trouble for the iteration. + bool Changed = false; + for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE; + ) { + ConstantInt *Case = CI.getCaseValue(); + + // Check to see if the switch condition is equal to/not equal to the case + // value on every incoming edge, equal/not equal being the same each time. + LazyValueInfo::Tristate State = LazyValueInfo::Unknown; + for (pred_iterator PI = PB; PI != PE; ++PI) { + // Is the switch condition equal to the case value? + LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ, + Cond, Case, *PI, BB); + // Give up on this case if nothing is known. + if (Value == LazyValueInfo::Unknown) { + State = LazyValueInfo::Unknown; + break; + } + + // If this was the first edge to be visited, record that all other edges + // need to give the same result. + if (PI == PB) { + State = Value; + continue; + } + + // If this case is known to fire for some edges and known not to fire for + // others then there is nothing we can do - give up. + if (Value != State) { + State = LazyValueInfo::Unknown; + break; + } + } + + if (State == LazyValueInfo::False) { + // This case never fires - remove it. + CI.getCaseSuccessor()->removePredecessor(BB); + SI->removeCase(CI); // Does not invalidate the iterator. + ++NumDeadCases; + Changed = true; + } else if (State == LazyValueInfo::True) { + // This case always fires. Arrange for the switch to be turned into an + // unconditional branch by replacing the switch condition with the case + // value. + SI->setCondition(Case); + NumDeadCases += SI->getNumCases(); + Changed = true; + break; + } + } + + if (Changed) + // If the switch has been simplified to the point where it can be replaced + // by a branch then do so now. 
+ ConstantFoldTerminator(BB); + + return Changed; +} + +bool CorrelatedValuePropagation::runOnFunction(Function &F) { + LVI = &getAnalysis<LazyValueInfo>(); + + bool FnChanged = false; + + for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { + bool BBChanged = false; + for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) { + Instruction *II = BI++; + switch (II->getOpcode()) { + case Instruction::Select: + BBChanged |= processSelect(cast<SelectInst>(II)); + break; + case Instruction::PHI: + BBChanged |= processPHI(cast<PHINode>(II)); + break; + case Instruction::ICmp: + case Instruction::FCmp: + BBChanged |= processCmp(cast<CmpInst>(II)); + break; + case Instruction::Load: + case Instruction::Store: + BBChanged |= processMemAccess(II); + break; + } + } + + Instruction *Term = FI->getTerminator(); + switch (Term->getOpcode()) { + case Instruction::Switch: + BBChanged |= processSwitch(cast<SwitchInst>(Term)); + break; + } + + FnChanged |= BBChanged; + } + + return FnChanged; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp new file mode 100644 index 0000000..8dbcc23 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp @@ -0,0 +1,135 @@ +//===- DCE.cpp - Code to perform dead code elimination --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements dead inst elimination and dead code elimination. +// +// Dead Inst Elimination performs a single pass over the function removing +// instructions that are obviously dead. Dead Code Elimination is similar, but +// it rechecks instructions that were used by removed instructions to see if +// they are newly dead. 
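+//
+// For example, deleting an unused "%sum = add i32 %a, %b" may leave the
+// instructions defining %a and %b without any remaining uses; Dead Code
+// Elimination re-queues and removes them in the same run, whereas Dead Inst
+// Elimination would only catch them on a later pass over the block.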
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "dce" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Instruction.h" +#include "llvm/Pass.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(DIEEliminated, "Number of insts removed by DIE pass"); +STATISTIC(DCEEliminated, "Number of insts removed"); + +namespace { + //===--------------------------------------------------------------------===// + // DeadInstElimination pass implementation + // + struct DeadInstElimination : public BasicBlockPass { + static char ID; // Pass identification, replacement for typeid + DeadInstElimination() : BasicBlockPass(ID) { + initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); + } + virtual bool runOnBasicBlock(BasicBlock &BB) { + bool Changed = false; + for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { + Instruction *Inst = DI++; + if (isInstructionTriviallyDead(Inst)) { + Inst->eraseFromParent(); + Changed = true; + ++DIEEliminated; + } + } + return Changed; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + } + }; +} + +char DeadInstElimination::ID = 0; +INITIALIZE_PASS(DeadInstElimination, "die", + "Dead Instruction Elimination", false, false) + +Pass *llvm::createDeadInstEliminationPass() { + return new DeadInstElimination(); +} + + +namespace { + //===--------------------------------------------------------------------===// + // DeadCodeElimination pass implementation + // + struct DCE : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + DCE() : FunctionPass(ID) { + initializeDCEPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + } + }; +} + +char DCE::ID = 0; +INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false) + +bool DCE::runOnFunction(Function &F) { + // Start out with all of the instructions in the worklist... + std::vector<Instruction*> WorkList; + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) + WorkList.push_back(&*i); + + // Loop over the worklist finding instructions that are dead. If they are + // dead make them drop all of their uses, making other instructions + // potentially dead, and work until the worklist is empty. + // + bool MadeChange = false; + while (!WorkList.empty()) { + Instruction *I = WorkList.back(); + WorkList.pop_back(); + + if (isInstructionTriviallyDead(I)) { // If the instruction is dead. + // Loop over all of the values that the instruction uses, if there are + // instructions being used, add them to the worklist, because they might + // go dead after this one is removed. + // + for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) + if (Instruction *Used = dyn_cast<Instruction>(*OI)) + WorkList.push_back(Used); + + // Remove the instruction. + I->eraseFromParent(); + + // Remove the instruction from the worklist if it still exists in it. 
+ for (std::vector<Instruction*>::iterator WI = WorkList.begin(); + WI != WorkList.end(); ) { + if (*WI == I) + WI = WorkList.erase(WI); + else + ++WI; + } + + MadeChange = true; + ++DCEEliminated; + } + } + return MadeChange; +} + +FunctionPass *llvm::createDeadCodeEliminationPass() { + return new DCE(); +} + diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp new file mode 100644 index 0000000..8b1283f --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -0,0 +1,849 @@ +//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a trivial dead store elimination that only considers +// basic-block local redundant stores. +// +// FIXME: This should eventually be extended to be a post-dominator tree +// traversal. Doing so would be pretty trivial. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "dse" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +using namespace llvm; + +STATISTIC(NumFastStores, "Number of stores deleted"); +STATISTIC(NumFastOther , "Number of other instrs removed"); + +namespace { + struct DSE : public FunctionPass { + AliasAnalysis *AA; + MemoryDependenceAnalysis *MD; + DominatorTree *DT; + + static char ID; // Pass identification, replacement for typeid + DSE() : FunctionPass(ID), AA(0), MD(0), DT(0) { + initializeDSEPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function &F) { + AA = &getAnalysis<AliasAnalysis>(); + MD = &getAnalysis<MemoryDependenceAnalysis>(); + DT = &getAnalysis<DominatorTree>(); + + bool Changed = false; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) + // Only check non-dead blocks. Dead blocks may have strange pointer + // cycles that will confuse alias analysis. 
+ if (DT->isReachableFromEntry(I)) + Changed |= runOnBasicBlock(*I); + + AA = 0; MD = 0; DT = 0; + return Changed; + } + + bool runOnBasicBlock(BasicBlock &BB); + bool HandleFree(CallInst *F); + bool handleEndBlock(BasicBlock &BB); + void RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, + SmallSetVector<Value*, 16> &DeadStackObjects); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<DominatorTree>(); + AU.addRequired<AliasAnalysis>(); + AU.addRequired<MemoryDependenceAnalysis>(); + AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<DominatorTree>(); + AU.addPreserved<MemoryDependenceAnalysis>(); + } + }; +} + +char DSE::ID = 0; +INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false) + +FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } + +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + +/// DeleteDeadInstruction - Delete this instruction. Before we do, go through +/// and zero out all the operands of this instruction. If any of them become +/// dead, delete them and the computation tree that feeds them. +/// +/// If ValueSet is non-null, remove any deleted instructions from it as well. +/// +static void DeleteDeadInstruction(Instruction *I, + MemoryDependenceAnalysis &MD, + SmallSetVector<Value*, 16> *ValueSet = 0) { + SmallVector<Instruction*, 32> NowDeadInsts; + + NowDeadInsts.push_back(I); + --NumFastOther; + + // Before we touch this instruction, remove it from memdep! + do { + Instruction *DeadInst = NowDeadInsts.pop_back_val(); + ++NumFastOther; + + // This instruction is dead, zap it, in stages. Start by removing it from + // MemDep, which needs to know the operands and needs it to be in the + // function. + MD.removeInstruction(DeadInst); + + for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { + Value *Op = DeadInst->getOperand(op); + DeadInst->setOperand(op, 0); + + // If this operand just became dead, add it to the NowDeadInsts list. + if (!Op->use_empty()) continue; + + if (Instruction *OpI = dyn_cast<Instruction>(Op)) + if (isInstructionTriviallyDead(OpI)) + NowDeadInsts.push_back(OpI); + } + + DeadInst->eraseFromParent(); + + if (ValueSet) ValueSet->remove(DeadInst); + } while (!NowDeadInsts.empty()); +} + + +/// hasMemoryWrite - Does this instruction write some memory? This only returns +/// true for things that we can analyze with other helpers below. +static bool hasMemoryWrite(Instruction *I) { + if (isa<StoreInst>(I)) + return true; + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: + return false; + case Intrinsic::memset: + case Intrinsic::memmove: + case Intrinsic::memcpy: + case Intrinsic::init_trampoline: + case Intrinsic::lifetime_end: + return true; + } + } + return false; +} + +/// getLocForWrite - Return a Location stored to by the specified instruction. +/// If isRemovable returns true, this function and getLocForRead completely +/// describe the memory operations for this instruction. 
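+///
+/// For a plain store this is the store's pointer operand and size; for the
+/// memset/memcpy/memmove intrinsics it is the destination operand and length;
+/// for lifetime.end it is the pointer argument together with the explicit
+/// length; and for init.trampoline only the pointer is known, not the size.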
+static AliasAnalysis::Location +getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + return AA.getLocation(SI); + + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) { + // memcpy/memmove/memset. + AliasAnalysis::Location Loc = AA.getLocationForDest(MI); + // If we don't have target data around, an unknown size in Location means + // that we should use the size of the pointee type. This isn't valid for + // memset/memcpy, which writes more than an i8. + if (Loc.Size == AliasAnalysis::UnknownSize && AA.getTargetData() == 0) + return AliasAnalysis::Location(); + return Loc; + } + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); + if (II == 0) return AliasAnalysis::Location(); + + switch (II->getIntrinsicID()) { + default: return AliasAnalysis::Location(); // Unhandled intrinsic. + case Intrinsic::init_trampoline: + // If we don't have target data around, an unknown size in Location means + // that we should use the size of the pointee type. This isn't valid for + // init.trampoline, which writes more than an i8. + if (AA.getTargetData() == 0) return AliasAnalysis::Location(); + + // FIXME: We don't know the size of the trampoline, so we can't really + // handle it here. + return AliasAnalysis::Location(II->getArgOperand(0)); + case Intrinsic::lifetime_end: { + uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue(); + return AliasAnalysis::Location(II->getArgOperand(1), Len); + } + } +} + +/// getLocForRead - Return the location read by the specified "hasMemoryWrite" +/// instruction if any. +static AliasAnalysis::Location +getLocForRead(Instruction *Inst, AliasAnalysis &AA) { + assert(hasMemoryWrite(Inst) && "Unknown instruction case"); + + // The only instructions that both read and write are the mem transfer + // instructions (memcpy/memmove). + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) + return AA.getLocationForSource(MTI); + return AliasAnalysis::Location(); +} + + +/// isRemovable - If the value of this instruction and the memory it writes to +/// is unused, may we delete this instruction? +static bool isRemovable(Instruction *I) { + // Don't remove volatile/atomic stores. + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isUnordered(); + + IntrinsicInst *II = cast<IntrinsicInst>(I); + switch (II->getIntrinsicID()) { + default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate"); + case Intrinsic::lifetime_end: + // Never remove dead lifetime_end's, e.g. because it is followed by a + // free. + return false; + case Intrinsic::init_trampoline: + // Always safe to remove init_trampoline. + return true; + + case Intrinsic::memset: + case Intrinsic::memmove: + case Intrinsic::memcpy: + // Don't remove volatile memory intrinsics. + return !cast<MemIntrinsic>(II)->isVolatile(); + } +} + + +/// isShortenable - Returns true if this instruction can be safely shortened in +/// length. +static bool isShortenable(Instruction *I) { + // Don't shorten stores for now + if (isa<StoreInst>(I)) + return false; + + IntrinsicInst *II = cast<IntrinsicInst>(I); + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::memset: + case Intrinsic::memcpy: + // Do shorten memory intrinsics. + return true; + } +} + +/// getStoredPointerOperand - Return the pointer that is being written to. 
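+/// This is the pointer operand of a store, the destination of a mem
+/// intrinsic, or the first argument of init.trampoline.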
+static Value *getStoredPointerOperand(Instruction *I) { + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->getPointerOperand(); + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) + return MI->getDest(); + + IntrinsicInst *II = cast<IntrinsicInst>(I); + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::init_trampoline: + return II->getArgOperand(0); + } +} + +static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) { + uint64_t Size; + if (getObjectSize(V, Size, AA.getTargetData())) + return Size; + return AliasAnalysis::UnknownSize; +} + +namespace { + enum OverwriteResult + { + OverwriteComplete, + OverwriteEnd, + OverwriteUnknown + }; +} + +/// isOverwrite - Return 'OverwriteComplete' if a store to the 'Later' location +/// completely overwrites a store to the 'Earlier' location. +/// 'OverwriteEnd' if the end of the 'Earlier' location is completely +/// overwritten by 'Later', or 'OverwriteUnknown' if nothing can be determined +static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, + const AliasAnalysis::Location &Earlier, + AliasAnalysis &AA, + int64_t &EarlierOff, + int64_t &LaterOff) { + const Value *P1 = Earlier.Ptr->stripPointerCasts(); + const Value *P2 = Later.Ptr->stripPointerCasts(); + + // If the start pointers are the same, we just have to compare sizes to see if + // the later store was larger than the earlier store. + if (P1 == P2) { + // If we don't know the sizes of either access, then we can't do a + // comparison. + if (Later.Size == AliasAnalysis::UnknownSize || + Earlier.Size == AliasAnalysis::UnknownSize) { + // If we have no TargetData information around, then the size of the store + // is inferrable from the pointee type. If they are the same type, then + // we know that the store is safe. + if (AA.getTargetData() == 0 && + Later.Ptr->getType() == Earlier.Ptr->getType()) + return OverwriteComplete; + + return OverwriteUnknown; + } + + // Make sure that the Later size is >= the Earlier size. + if (Later.Size >= Earlier.Size) + return OverwriteComplete; + } + + // Otherwise, we have to have size information, and the later store has to be + // larger than the earlier one. + if (Later.Size == AliasAnalysis::UnknownSize || + Earlier.Size == AliasAnalysis::UnknownSize || + AA.getTargetData() == 0) + return OverwriteUnknown; + + // Check to see if the later store is to the entire object (either a global, + // an alloca, or a byval argument). If so, then it clearly overwrites any + // other store to the same object. + const TargetData &TD = *AA.getTargetData(); + + const Value *UO1 = GetUnderlyingObject(P1, &TD), + *UO2 = GetUnderlyingObject(P2, &TD); + + // If we can't resolve the same pointers to the same object, then we can't + // analyze them at all. + if (UO1 != UO2) + return OverwriteUnknown; + + // If the "Later" store is to a recognizable object, get its size. + uint64_t ObjectSize = getPointerSize(UO2, AA); + if (ObjectSize != AliasAnalysis::UnknownSize) + if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size) + return OverwriteComplete; + + // Okay, we have stores to two completely different pointers. Try to + // decompose the pointer into a "base + constant_offset" form. If the base + // pointers are equal, then we can reason about the two stores. 
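+  //
+  // For example, an earlier 4-byte store at (base + 4) is completely covered
+  // by a later 16-byte store at (base + 0): the earlier byte range [4, 8)
+  // lies entirely inside the later range [0, 16), so the earlier store is
+  // dead.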
+ EarlierOff = 0; + LaterOff = 0; + const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, TD); + const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, TD); + + // If the base pointers still differ, we have two completely different stores. + if (BP1 != BP2) + return OverwriteUnknown; + + // The later store completely overlaps the earlier store if: + // + // 1. Both start at the same offset and the later one's size is greater than + // or equal to the earlier one's, or + // + // |--earlier--| + // |-- later --| + // + // 2. The earlier store has an offset greater than the later offset, but which + // still lies completely within the later store. + // + // |--earlier--| + // |----- later ------| + // + // We have to be careful here as *Off is signed while *.Size is unsigned. + if (EarlierOff >= LaterOff && + Later.Size >= Earlier.Size && + uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size) + return OverwriteComplete; + + // The other interesting case is if the later store overwrites the end of + // the earlier store + // + // |--earlier--| + // |-- later --| + // + // In this case we may want to trim the size of earlier to avoid generating + // writes to addresses which will definitely be overwritten later + if (LaterOff > EarlierOff && + LaterOff < int64_t(EarlierOff + Earlier.Size) && + int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size)) + return OverwriteEnd; + + // Otherwise, they don't completely overlap. + return OverwriteUnknown; +} + +/// isPossibleSelfRead - If 'Inst' might be a self read (i.e. a noop copy of a +/// memory region into an identical pointer) then it doesn't actually make its +/// input dead in the traditional sense. Consider this case: +/// +/// memcpy(A <- B) +/// memcpy(A <- A) +/// +/// In this case, the second store to A does not make the first store to A dead. +/// The usual situation isn't an explicit A<-A store like this (which can be +/// trivially removed) but a case where two pointers may alias. +/// +/// This function detects when it is unsafe to remove a dependent instruction +/// because the DSE inducing instruction may be a self-read. +static bool isPossibleSelfRead(Instruction *Inst, + const AliasAnalysis::Location &InstStoreLoc, + Instruction *DepWrite, AliasAnalysis &AA) { + // Self reads can only happen for instructions that read memory. Get the + // location read. + AliasAnalysis::Location InstReadLoc = getLocForRead(Inst, AA); + if (InstReadLoc.Ptr == 0) return false; // Not a reading instruction. + + // If the read and written loc obviously don't alias, it isn't a read. + if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false; + + // Okay, 'Inst' may copy over itself. However, we can still remove a the + // DepWrite instruction if we can prove that it reads from the same location + // as Inst. This handles useful cases like: + // memcpy(A <- B) + // memcpy(A <- B) + // Here we don't know if A/B may alias, but we do know that B/B are must + // aliases, so removing the first memcpy is safe (assuming it writes <= # + // bytes as the second one. + AliasAnalysis::Location DepReadLoc = getLocForRead(DepWrite, AA); + + if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr)) + return false; + + // If DepWrite doesn't read memory or if we can't prove it is a must alias, + // then it can't be considered dead. 
+ return true; +} + + +//===----------------------------------------------------------------------===// +// DSE Pass +//===----------------------------------------------------------------------===// + +bool DSE::runOnBasicBlock(BasicBlock &BB) { + bool MadeChange = false; + + // Do a top-down walk on the BB. + for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { + Instruction *Inst = BBI++; + + // Handle 'free' calls specially. + if (CallInst *F = isFreeCall(Inst)) { + MadeChange |= HandleFree(F); + continue; + } + + // If we find something that writes memory, get its memory dependence. + if (!hasMemoryWrite(Inst)) + continue; + + MemDepResult InstDep = MD->getDependency(Inst); + + // Ignore any store where we can't find a local dependence. + // FIXME: cross-block DSE would be fun. :) + if (!InstDep.isDef() && !InstDep.isClobber()) + continue; + + // If we're storing the same value back to a pointer that we just + // loaded from, then the store can be removed. + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) { + if (SI->getPointerOperand() == DepLoad->getPointerOperand() && + SI->getOperand(0) == DepLoad && isRemovable(SI)) { + DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n " + << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); + + // DeleteDeadInstruction can delete the current instruction. Save BBI + // in case we need it. + WeakVH NextInst(BBI); + + DeleteDeadInstruction(SI, *MD); + + if (NextInst == 0) // Next instruction deleted. + BBI = BB.begin(); + else if (BBI != BB.begin()) // Revisit this instruction if possible. + --BBI; + ++NumFastStores; + MadeChange = true; + continue; + } + } + } + + // Figure out what location is being stored to. + AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA); + + // If we didn't get a useful location, fail. + if (Loc.Ptr == 0) + continue; + + while (InstDep.isDef() || InstDep.isClobber()) { + // Get the memory clobbered by the instruction we depend on. MemDep will + // skip any instructions that 'Loc' clearly doesn't interact with. If we + // end up depending on a may- or must-aliased load, then we can't optimize + // away the store and we bail out. However, if we depend on on something + // that overwrites the memory location we *can* potentially optimize it. + // + // Find out what memory location the dependent instruction stores. + Instruction *DepWrite = InstDep.getInst(); + AliasAnalysis::Location DepLoc = getLocForWrite(DepWrite, *AA); + // If we didn't get a useful location, or if it isn't a size, bail out. + if (DepLoc.Ptr == 0) + break; + + // If we find a write that is a) removable (i.e., non-volatile), b) is + // completely obliterated by the store to 'Loc', and c) which we know that + // 'Inst' doesn't load from, then we can remove it. + if (isRemovable(DepWrite) && + !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { + int64_t InstWriteOffset, DepWriteOffset; + OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA, + DepWriteOffset, InstWriteOffset); + if (OR == OverwriteComplete) { + DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " + << *DepWrite << "\n KILLER: " << *Inst << '\n'); + + // Delete the store and now-dead instructions that feed it. + DeleteDeadInstruction(DepWrite, *MD); + ++NumFastStores; + MadeChange = true; + + // DeleteDeadInstruction can delete the current instruction in loop + // cases, reset BBI. 
+ BBI = Inst; + if (BBI != BB.begin()) + --BBI; + break; + } else if (OR == OverwriteEnd && isShortenable(DepWrite)) { + // TODO: base this on the target vector size so that if the earlier + // store was too small to get vector writes anyway then its likely + // a good idea to shorten it + // Power of 2 vector writes are probably always a bad idea to optimize + // as any store/memset/memcpy is likely using vector instructions so + // shortening it to not vector size is likely to be slower + MemIntrinsic* DepIntrinsic = cast<MemIntrinsic>(DepWrite); + unsigned DepWriteAlign = DepIntrinsic->getAlignment(); + if (llvm::isPowerOf2_64(InstWriteOffset) || + ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) { + + DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW END: " + << *DepWrite << "\n KILLER (offset " + << InstWriteOffset << ", " + << DepLoc.Size << ")" + << *Inst << '\n'); + + Value* DepWriteLength = DepIntrinsic->getLength(); + Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(), + InstWriteOffset - + DepWriteOffset); + DepIntrinsic->setLength(TrimmedLength); + MadeChange = true; + } + } + } + + // If this is a may-aliased store that is clobbering the store value, we + // can keep searching past it for another must-aliased pointer that stores + // to the same location. For example, in: + // store -> P + // store -> Q + // store -> P + // we can remove the first store to P even though we don't know if P and Q + // alias. + if (DepWrite == &BB.front()) break; + + // Can't look past this instruction if it might read 'Loc'. + if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref) + break; + + InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB); + } + } + + // If this block ends in a return, unwind, or unreachable, all allocas are + // dead at its end, which means stores to them are also dead. + if (BB.getTerminator()->getNumSuccessors() == 0) + MadeChange |= handleEndBlock(BB); + + return MadeChange; +} + +/// Find all blocks that will unconditionally lead to the block BB and append +/// them to F. +static void FindUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks, + BasicBlock *BB, DominatorTree *DT) { + for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) { + BasicBlock *Pred = *I; + if (Pred == BB) continue; + TerminatorInst *PredTI = Pred->getTerminator(); + if (PredTI->getNumSuccessors() != 1) + continue; + + if (DT->isReachableFromEntry(Pred)) + Blocks.push_back(Pred); + } +} + +/// HandleFree - Handle frees of entire structures whose dependency is a store +/// to a field of that structure. +bool DSE::HandleFree(CallInst *F) { + bool MadeChange = false; + + AliasAnalysis::Location Loc = AliasAnalysis::Location(F->getOperand(0)); + SmallVector<BasicBlock *, 16> Blocks; + Blocks.push_back(F->getParent()); + + while (!Blocks.empty()) { + BasicBlock *BB = Blocks.pop_back_val(); + Instruction *InstPt = BB->getTerminator(); + if (BB == F->getParent()) InstPt = F; + + MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt, BB); + while (Dep.isDef() || Dep.isClobber()) { + Instruction *Dependency = Dep.getInst(); + if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency)) + break; + + Value *DepPointer = + GetUnderlyingObject(getStoredPointerOperand(Dependency)); + + // Check for aliasing. 
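+      // Only a store whose underlying object is provably the very pointer
+      // being freed can be discarded; a may-alias relationship is not enough.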
+ if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) + break; + + Instruction *Next = llvm::next(BasicBlock::iterator(Dependency)); + + // DCE instructions only used to calculate that store + DeleteDeadInstruction(Dependency, *MD); + ++NumFastStores; + MadeChange = true; + + // Inst's old Dependency is now deleted. Compute the next dependency, + // which may also be dead, as in + // s[0] = 0; + // s[1] = 0; // This has just been deleted. + // free(s); + Dep = MD->getPointerDependencyFrom(Loc, false, Next, BB); + } + + if (Dep.isNonLocal()) + FindUnconditionalPreds(Blocks, BB, DT); + } + + return MadeChange; +} + +/// handleEndBlock - Remove dead stores to stack-allocated locations in the +/// function end block. Ex: +/// %A = alloca i32 +/// ... +/// store i32 1, i32* %A +/// ret void +bool DSE::handleEndBlock(BasicBlock &BB) { + bool MadeChange = false; + + // Keep track of all of the stack objects that are dead at the end of the + // function. + SmallSetVector<Value*, 16> DeadStackObjects; + + // Find all of the alloca'd pointers in the entry block. + BasicBlock *Entry = BB.getParent()->begin(); + for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) { + if (isa<AllocaInst>(I)) + DeadStackObjects.insert(I); + + // Okay, so these are dead heap objects, but if the pointer never escapes + // then it's leaked by this function anyways. + else if (isAllocLikeFn(I) && !PointerMayBeCaptured(I, true, true)) + DeadStackObjects.insert(I); + } + + // Treat byval arguments the same, stores to them are dead at the end of the + // function. + for (Function::arg_iterator AI = BB.getParent()->arg_begin(), + AE = BB.getParent()->arg_end(); AI != AE; ++AI) + if (AI->hasByValAttr()) + DeadStackObjects.insert(AI); + + // Scan the basic block backwards + for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){ + --BBI; + + // If we find a store, check to see if it points into a dead stack value. + if (hasMemoryWrite(BBI) && isRemovable(BBI)) { + // See through pointer-to-pointer bitcasts + SmallVector<Value *, 4> Pointers; + GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers); + + // Stores to stack values are valid candidates for removal. + bool AllDead = true; + for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(), + E = Pointers.end(); I != E; ++I) + if (!DeadStackObjects.count(*I)) { + AllDead = false; + break; + } + + if (AllDead) { + Instruction *Dead = BBI++; + + DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " + << *Dead << "\n Objects: "; + for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(), + E = Pointers.end(); I != E; ++I) { + dbgs() << **I; + if (llvm::next(I) != E) + dbgs() << ", "; + } + dbgs() << '\n'); + + // DCE instructions only used to calculate that store. + DeleteDeadInstruction(Dead, *MD, &DeadStackObjects); + ++NumFastStores; + MadeChange = true; + continue; + } + } + + // Remove any dead non-memory-mutating instructions. + if (isInstructionTriviallyDead(BBI)) { + Instruction *Inst = BBI++; + DeleteDeadInstruction(Inst, *MD, &DeadStackObjects); + ++NumFastOther; + MadeChange = true; + continue; + } + + if (isa<AllocaInst>(BBI)) { + // Remove allocas from the list of dead stack objects; there can't be + // any references before the definition. + DeadStackObjects.remove(BBI); + continue; + } + + if (CallSite CS = cast<Value>(BBI)) { + // Remove allocation function calls from the list of dead stack objects; + // there can't be any references before the definition. 
+ if (isAllocLikeFn(BBI)) + DeadStackObjects.remove(BBI); + + // If this call does not access memory, it can't be loading any of our + // pointers. + if (AA->doesNotAccessMemory(CS)) + continue; + + // If the call might load from any of our allocas, then any store above + // the call is live. + SmallVector<Value*, 8> LiveAllocas; + for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(), + E = DeadStackObjects.end(); I != E; ++I) { + // See if the call site touches it. + AliasAnalysis::ModRefResult A = + AA->getModRefInfo(CS, *I, getPointerSize(*I, *AA)); + + if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref) + LiveAllocas.push_back(*I); + } + + for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), + E = LiveAllocas.end(); I != E; ++I) + DeadStackObjects.remove(*I); + + // If all of the allocas were clobbered by the call then we're not going + // to find anything else to process. + if (DeadStackObjects.empty()) + break; + + continue; + } + + AliasAnalysis::Location LoadedLoc; + + // If we encounter a use of the pointer, it is no longer considered dead + if (LoadInst *L = dyn_cast<LoadInst>(BBI)) { + if (!L->isUnordered()) // Be conservative with atomic/volatile load + break; + LoadedLoc = AA->getLocation(L); + } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) { + LoadedLoc = AA->getLocation(V); + } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) { + LoadedLoc = AA->getLocationForSource(MTI); + } else if (!BBI->mayReadFromMemory()) { + // Instruction doesn't read memory. Note that stores that weren't removed + // above will hit this case. + continue; + } else { + // Unknown inst; assume it clobbers everything. + break; + } + + // Remove any allocas from the DeadPointer set that are loaded, as this + // makes any stores above the access live. + RemoveAccessedObjects(LoadedLoc, DeadStackObjects); + + // If all of the allocas were clobbered by the access then we're not going + // to find anything else to process. + if (DeadStackObjects.empty()) + break; + } + + return MadeChange; +} + +/// RemoveAccessedObjects - Check to see if the specified location may alias any +/// of the stack objects in the DeadStackObjects set. If so, they become live +/// because the location is being loaded. +void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, + SmallSetVector<Value*, 16> &DeadStackObjects) { + const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr); + + // A constant can't be in the dead pointer set. + if (isa<Constant>(UnderlyingPointer)) + return; + + // If the kill pointer can be easily reduced to an alloca, don't bother doing + // extraneous AA queries. + if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) { + DeadStackObjects.remove(const_cast<Value*>(UnderlyingPointer)); + return; + } + + SmallVector<Value*, 16> NowLive; + for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(), + E = DeadStackObjects.end(); I != E; ++I) { + // See if the loaded location could alias the stack location. 
+ AliasAnalysis::Location StackLoc(*I, getPointerSize(*I, *AA)); + if (!AA->isNoAlias(StackLoc, LoadedLoc)) + NowLive.push_back(*I); + } + + for (SmallVector<Value*, 16>::iterator I = NowLive.begin(), E = NowLive.end(); + I != E; ++I) + DeadStackObjects.remove(*I); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp new file mode 100644 index 0000000..9759549 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -0,0 +1,570 @@ +//===- EarlyCSE.cpp - Simple and fast CSE pass ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass performs a simple dominator tree walk that eliminates trivially +// redundant instructions. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "early-cse" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/RecyclingAllocator.h" +#include "llvm/ADT/ScopedHashTable.h" +#include "llvm/ADT/Statistic.h" +#include <deque> +using namespace llvm; + +STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd"); +STATISTIC(NumCSE, "Number of instructions CSE'd"); +STATISTIC(NumCSELoad, "Number of load instructions CSE'd"); +STATISTIC(NumCSECall, "Number of call instructions CSE'd"); +STATISTIC(NumDSE, "Number of trivial dead stores removed"); + +static unsigned getHash(const void *V) { + return DenseMapInfo<const void*>::getHashValue(V); +} + +//===----------------------------------------------------------------------===// +// SimpleValue +//===----------------------------------------------------------------------===// + +namespace { + /// SimpleValue - Instances of this struct represent available values in the + /// scoped hash table. + struct SimpleValue { + Instruction *Inst; + + SimpleValue(Instruction *I) : Inst(I) { + assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); + } + + bool isSentinel() const { + return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || + Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); + } + + static bool canHandle(Instruction *Inst) { + // This can only handle non-void readnone functions. + if (CallInst *CI = dyn_cast<CallInst>(Inst)) + return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy(); + return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) || + isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) || + isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || + isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) || + isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst); + } + }; +} + +namespace llvm { +// SimpleValue is POD. 
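+// Marking it as such lets LLVM's containers treat SimpleValue as trivially
+// copyable (e.g. relocated with memcpy) instead of running constructors and
+// destructors for every entry.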
+template<> struct isPodLike<SimpleValue> { + static const bool value = true; +}; + +template<> struct DenseMapInfo<SimpleValue> { + static inline SimpleValue getEmptyKey() { + return DenseMapInfo<Instruction*>::getEmptyKey(); + } + static inline SimpleValue getTombstoneKey() { + return DenseMapInfo<Instruction*>::getTombstoneKey(); + } + static unsigned getHashValue(SimpleValue Val); + static bool isEqual(SimpleValue LHS, SimpleValue RHS); +}; +} + +unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { + Instruction *Inst = Val.Inst; + + // Hash in all of the operands as pointers. + unsigned Res = 0; + for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) + Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); + + if (CastInst *CI = dyn_cast<CastInst>(Inst)) + Res ^= getHash(CI->getType()); + else if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) + Res ^= CI->getPredicate(); + else if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst)) { + for (ExtractValueInst::idx_iterator I = EVI->idx_begin(), + E = EVI->idx_end(); I != E; ++I) + Res ^= *I; + } else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst)) { + for (InsertValueInst::idx_iterator I = IVI->idx_begin(), + E = IVI->idx_end(); I != E; ++I) + Res ^= *I; + } else { + // nothing extra to hash in. + assert((isa<CallInst>(Inst) || + isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) || + isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || + isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst)) && + "Invalid/unknown instruction"); + } + + // Mix in the opcode. + return (Res << 1) ^ Inst->getOpcode(); +} + +bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { + Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst; + + if (LHS.isSentinel() || RHS.isSentinel()) + return LHSI == RHSI; + + if (LHSI->getOpcode() != RHSI->getOpcode()) return false; + return LHSI->isIdenticalTo(RHSI); +} + +//===----------------------------------------------------------------------===// +// CallValue +//===----------------------------------------------------------------------===// + +namespace { + /// CallValue - Instances of this struct represent available call values in + /// the scoped hash table. + struct CallValue { + Instruction *Inst; + + CallValue(Instruction *I) : Inst(I) { + assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); + } + + bool isSentinel() const { + return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || + Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); + } + + static bool canHandle(Instruction *Inst) { + // Don't value number anything that returns void. + if (Inst->getType()->isVoidTy()) + return false; + + CallInst *CI = dyn_cast<CallInst>(Inst); + if (CI == 0 || !CI->onlyReadsMemory()) + return false; + return true; + } + }; +} + +namespace llvm { + // CallValue is POD. + template<> struct isPodLike<CallValue> { + static const bool value = true; + }; + + template<> struct DenseMapInfo<CallValue> { + static inline CallValue getEmptyKey() { + return DenseMapInfo<Instruction*>::getEmptyKey(); + } + static inline CallValue getTombstoneKey() { + return DenseMapInfo<Instruction*>::getTombstoneKey(); + } + static unsigned getHashValue(CallValue Val); + static bool isEqual(CallValue LHS, CallValue RHS); + }; +} +unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { + Instruction *Inst = Val.Inst; + // Hash in all of the operands as pointers. 
+ unsigned Res = 0; + for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) { + assert(!Inst->getOperand(i)->getType()->isMetadataTy() && + "Cannot value number calls with metadata operands"); + Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); + } + + // Mix in the opcode. + return (Res << 1) ^ Inst->getOpcode(); +} + +bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { + Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst; + if (LHS.isSentinel() || RHS.isSentinel()) + return LHSI == RHSI; + return LHSI->isIdenticalTo(RHSI); +} + + +//===----------------------------------------------------------------------===// +// EarlyCSE pass. +//===----------------------------------------------------------------------===// + +namespace { + +/// EarlyCSE - This pass does a simple depth-first walk over the dominator +/// tree, eliminating trivially redundant instructions and using instsimplify +/// to canonicalize things as it goes. It is intended to be fast and catch +/// obvious cases so that instcombine and other passes are more effective. It +/// is expected that a later pass of GVN will catch the interesting/hard +/// cases. +class EarlyCSE : public FunctionPass { +public: + const TargetData *TD; + const TargetLibraryInfo *TLI; + DominatorTree *DT; + typedef RecyclingAllocator<BumpPtrAllocator, + ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy; + typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>, + AllocatorTy> ScopedHTType; + + /// AvailableValues - This scoped hash table contains the current values of + /// all of our simple scalar expressions. As we walk down the domtree, we + /// look to see if instructions are in this: if so, we replace them with what + /// we find, otherwise we insert them so that dominated values can succeed in + /// their lookup. + ScopedHTType *AvailableValues; + + /// AvailableLoads - This scoped hash table contains the current values + /// of loads. This allows us to get efficient access to dominating loads when + /// we have a fully redundant load. In addition to the most recent load, we + /// keep track of a generation count of the read, which is compared against + /// the current generation count. The current generation count is + /// incremented after every possibly writing memory operation, which ensures + /// that we only CSE loads with other loads that have no intervening store. + typedef RecyclingAllocator<BumpPtrAllocator, + ScopedHashTableVal<Value*, std::pair<Value*, unsigned> > > LoadMapAllocator; + typedef ScopedHashTable<Value*, std::pair<Value*, unsigned>, + DenseMapInfo<Value*>, LoadMapAllocator> LoadHTType; + LoadHTType *AvailableLoads; + + /// AvailableCalls - This scoped hash table contains the current values + /// of read-only call values. It uses the same generation count as loads. + typedef ScopedHashTable<CallValue, std::pair<Value*, unsigned> > CallHTType; + CallHTType *AvailableCalls; + + /// CurrentGeneration - This is the current generation of the memory value. + unsigned CurrentGeneration; + + static char ID; + explicit EarlyCSE() : FunctionPass(ID) { + initializeEarlyCSEPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + +private: + + // NodeScope - almost a POD, but needs to call the constructors for the + // scoped hash tables so that a new scope gets pushed on. These are RAII so + // that the scope gets popped when the NodeScope is destroyed. 
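+  // In practice this means that when the iterative dominator-tree walk in
+  // runOnFunction finishes a subtree and deletes the corresponding StackNode,
+  // every value, load, and call made available inside that subtree is
+  // discarded automatically.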
+ class NodeScope { + public: + NodeScope(ScopedHTType *availableValues, + LoadHTType *availableLoads, + CallHTType *availableCalls) : + Scope(*availableValues), + LoadScope(*availableLoads), + CallScope(*availableCalls) {} + + private: + NodeScope(const NodeScope&); // DO NOT IMPLEMENT + + ScopedHTType::ScopeTy Scope; + LoadHTType::ScopeTy LoadScope; + CallHTType::ScopeTy CallScope; + }; + + // StackNode - contains all the needed information to create a stack for + // doing a depth first tranversal of the tree. This includes scopes for + // values, loads, and calls as well as the generation. There is a child + // iterator so that the children do not need to be store spearately. + class StackNode { + public: + StackNode(ScopedHTType *availableValues, + LoadHTType *availableLoads, + CallHTType *availableCalls, + unsigned cg, DomTreeNode *n, + DomTreeNode::iterator child, DomTreeNode::iterator end) : + CurrentGeneration(cg), ChildGeneration(cg), Node(n), + ChildIter(child), EndIter(end), + Scopes(availableValues, availableLoads, availableCalls), + Processed(false) {} + + // Accessors. + unsigned currentGeneration() { return CurrentGeneration; } + unsigned childGeneration() { return ChildGeneration; } + void childGeneration(unsigned generation) { ChildGeneration = generation; } + DomTreeNode *node() { return Node; } + DomTreeNode::iterator childIter() { return ChildIter; } + DomTreeNode *nextChild() { + DomTreeNode *child = *ChildIter; + ++ChildIter; + return child; + } + DomTreeNode::iterator end() { return EndIter; } + bool isProcessed() { return Processed; } + void process() { Processed = true; } + + private: + StackNode(const StackNode&); // DO NOT IMPLEMENT + + // Members. + unsigned CurrentGeneration; + unsigned ChildGeneration; + DomTreeNode *Node; + DomTreeNode::iterator ChildIter; + DomTreeNode::iterator EndIter; + NodeScope Scopes; + bool Processed; + }; + + bool processNode(DomTreeNode *Node); + + // This transformation requires dominator postdominator info + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTree>(); + AU.addRequired<TargetLibraryInfo>(); + AU.setPreservesCFG(); + } +}; +} + +char EarlyCSE::ID = 0; + +// createEarlyCSEPass - The public interface to this file. +FunctionPass *llvm::createEarlyCSEPass() { + return new EarlyCSE(); +} + +INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false) + +bool EarlyCSE::processNode(DomTreeNode *Node) { + BasicBlock *BB = Node->getBlock(); + + // If this block has a single predecessor, then the predecessor is the parent + // of the domtree node and all of the live out memory values are still current + // in this block. If this block has multiple predecessors, then they could + // have invalidated the live-out memory values of our parent value. For now, + // just be conservative and invalidate memory if this block has multiple + // predecessors. + if (BB->getSinglePredecessor() == 0) + ++CurrentGeneration; + + /// LastStore - Keep track of the last non-volatile store that we saw... for + /// as long as there in no instruction that reads memory. If we see a store + /// to the same location, we delete the dead store. This zaps trivial dead + /// stores which can occur in bitfield code among other things. + StoreInst *LastStore = 0; + + bool Changed = false; + + // See if any instructions in the block can be eliminated. 
If so, do it. If + // not, add them to AvailableValues. + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { + Instruction *Inst = I++; + + // Dead instructions should just be removed. + if (isInstructionTriviallyDead(Inst)) { + DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n'); + Inst->eraseFromParent(); + Changed = true; + ++NumSimplify; + continue; + } + + // If the instruction can be simplified (e.g. X+0 = X) then replace it with + // its simpler value. + if (Value *V = SimplifyInstruction(Inst, TD, TLI, DT)) { + DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); + Inst->replaceAllUsesWith(V); + Inst->eraseFromParent(); + Changed = true; + ++NumSimplify; + continue; + } + + // If this is a simple instruction that we can value number, process it. + if (SimpleValue::canHandle(Inst)) { + // See if the instruction has an available value. If so, use it. + if (Value *V = AvailableValues->lookup(Inst)) { + DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n'); + Inst->replaceAllUsesWith(V); + Inst->eraseFromParent(); + Changed = true; + ++NumCSE; + continue; + } + + // Otherwise, just remember that this value is available. + AvailableValues->insert(Inst, Inst); + continue; + } + + // If this is a non-volatile load, process it. + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + // Ignore volatile loads. + if (!LI->isSimple()) { + LastStore = 0; + continue; + } + + // If we have an available version of this load, and if it is the right + // generation, replace this instruction. + std::pair<Value*, unsigned> InVal = + AvailableLoads->lookup(Inst->getOperand(0)); + if (InVal.first != 0 && InVal.second == CurrentGeneration) { + DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst << " to: " + << *InVal.first << '\n'); + if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); + Inst->eraseFromParent(); + Changed = true; + ++NumCSELoad; + continue; + } + + // Otherwise, remember that we have this instruction. + AvailableLoads->insert(Inst->getOperand(0), + std::pair<Value*, unsigned>(Inst, CurrentGeneration)); + LastStore = 0; + continue; + } + + // If this instruction may read from memory, forget LastStore. + if (Inst->mayReadFromMemory()) + LastStore = 0; + + // If this is a read-only call, process it. + if (CallValue::canHandle(Inst)) { + // If we have an available version of this call, and if it is the right + // generation, replace this instruction. + std::pair<Value*, unsigned> InVal = AvailableCalls->lookup(Inst); + if (InVal.first != 0 && InVal.second == CurrentGeneration) { + DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst << " to: " + << *InVal.first << '\n'); + if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); + Inst->eraseFromParent(); + Changed = true; + ++NumCSECall; + continue; + } + + // Otherwise, remember that we have this instruction. + AvailableCalls->insert(Inst, + std::pair<Value*, unsigned>(Inst, CurrentGeneration)); + continue; + } + + // Okay, this isn't something we can CSE at all. Check to see if it is + // something that could modify memory. If so, our available memory values + // cannot be used so bump the generation count. + if (Inst->mayWriteToMemory()) { + ++CurrentGeneration; + + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + // We do a trivial form of DSE if there are two stores to the same + // location with no intervening loads. Delete the earlier store. 
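+        // For example (illustrative IR):
+        //   store i32 1, i32* %p
+        //   store i32 2, i32* %p   ; the first store is deleted below
+        // A load of %p between the two stores would have cleared LastStore
+        // earlier in this loop and blocked the transformation.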
+ if (LastStore && + LastStore->getPointerOperand() == SI->getPointerOperand()) { + DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore << " due to: " + << *Inst << '\n'); + LastStore->eraseFromParent(); + Changed = true; + ++NumDSE; + LastStore = 0; + continue; + } + + // Okay, we just invalidated anything we knew about loaded values. Try + // to salvage *something* by remembering that the stored value is a live + // version of the pointer. It is safe to forward from volatile stores + // to non-volatile loads, so we don't have to check for volatility of + // the store. + AvailableLoads->insert(SI->getPointerOperand(), + std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration)); + + // Remember that this was the last store we saw for DSE. + if (SI->isSimple()) + LastStore = SI; + } + } + } + + return Changed; +} + + +bool EarlyCSE::runOnFunction(Function &F) { + std::deque<StackNode *> nodesToProcess; + + TD = getAnalysisIfAvailable<TargetData>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + DT = &getAnalysis<DominatorTree>(); + + // Tables that the pass uses when walking the domtree. + ScopedHTType AVTable; + AvailableValues = &AVTable; + LoadHTType LoadTable; + AvailableLoads = &LoadTable; + CallHTType CallTable; + AvailableCalls = &CallTable; + + CurrentGeneration = 0; + bool Changed = false; + + // Process the root node. + nodesToProcess.push_front( + new StackNode(AvailableValues, AvailableLoads, AvailableCalls, + CurrentGeneration, DT->getRootNode(), + DT->getRootNode()->begin(), + DT->getRootNode()->end())); + + // Save the current generation. + unsigned LiveOutGeneration = CurrentGeneration; + + // Process the stack. + while (!nodesToProcess.empty()) { + // Grab the first item off the stack. Set the current generation, remove + // the node from the stack, and process it. + StackNode *NodeToProcess = nodesToProcess.front(); + + // Initialize class members. + CurrentGeneration = NodeToProcess->currentGeneration(); + + // Check if the node needs to be processed. + if (!NodeToProcess->isProcessed()) { + // Process the node. + Changed |= processNode(NodeToProcess->node()); + NodeToProcess->childGeneration(CurrentGeneration); + NodeToProcess->process(); + } else if (NodeToProcess->childIter() != NodeToProcess->end()) { + // Push the next child onto the stack. + DomTreeNode *child = NodeToProcess->nextChild(); + nodesToProcess.push_front( + new StackNode(AvailableValues, + AvailableLoads, + AvailableCalls, + NodeToProcess->childGeneration(), child, + child->begin(), child->end())); + } else { + // It has been processed, and there are no more children to process, + // so delete it and pop it off the stack. + delete NodeToProcess; + nodesToProcess.pop_front(); + } + } // while (!nodes...) + + // Reset the current generation. + CurrentGeneration = LiveOutGeneration; + + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp new file mode 100644 index 0000000..4822fd0 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -0,0 +1,2612 @@ +//===- GVN.cpp - Eliminate redundant values and loads ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass performs global value numbering to eliminate fully redundant +// instructions. It also performs simple dead load elimination. 
+// +// Note that this pass does the value numbering itself; it does not use the +// ValueNumbering analysis passes. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "gvn" +#include "llvm/Transforms/Scalar.h" +#include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Metadata.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/PHITransAddr.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/PatternMatch.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +using namespace llvm; +using namespace PatternMatch; + +STATISTIC(NumGVNInstr, "Number of instructions deleted"); +STATISTIC(NumGVNLoad, "Number of loads deleted"); +STATISTIC(NumGVNPRE, "Number of instructions PRE'd"); +STATISTIC(NumGVNBlocks, "Number of blocks merged"); +STATISTIC(NumGVNSimpl, "Number of instructions simplified"); +STATISTIC(NumGVNEqProp, "Number of equalities propagated"); +STATISTIC(NumPRELoad, "Number of loads PRE'd"); + +static cl::opt<bool> EnablePRE("enable-pre", + cl::init(true), cl::Hidden); +static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true)); + +// Maximum allowed recursion depth. +static cl::opt<uint32_t> +MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore, + cl::desc("Max recurse depth (default = 1000)")); + +//===----------------------------------------------------------------------===// +// ValueTable Class +//===----------------------------------------------------------------------===// + +/// This class holds the mapping between values and value numbers. It is used +/// as an efficient mechanism to determine the expression-wise equivalence of +/// two values. 
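+/// For example (illustrative), given
+///   %a = add i32 %x, %y
+///   %b = add i32 %x, %y
+/// both instructions receive the same value number, because they reduce to the
+/// same Expression (same opcode, type, and operand value numbers).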
+namespace { + struct Expression { + uint32_t opcode; + Type *type; + SmallVector<uint32_t, 4> varargs; + + Expression(uint32_t o = ~2U) : opcode(o) { } + + bool operator==(const Expression &other) const { + if (opcode != other.opcode) + return false; + if (opcode == ~0U || opcode == ~1U) + return true; + if (type != other.type) + return false; + if (varargs != other.varargs) + return false; + return true; + } + + friend hash_code hash_value(const Expression &Value) { + return hash_combine(Value.opcode, Value.type, + hash_combine_range(Value.varargs.begin(), + Value.varargs.end())); + } + }; + + class ValueTable { + DenseMap<Value*, uint32_t> valueNumbering; + DenseMap<Expression, uint32_t> expressionNumbering; + AliasAnalysis *AA; + MemoryDependenceAnalysis *MD; + DominatorTree *DT; + + uint32_t nextValueNumber; + + Expression create_expression(Instruction* I); + Expression create_cmp_expression(unsigned Opcode, + CmpInst::Predicate Predicate, + Value *LHS, Value *RHS); + Expression create_extractvalue_expression(ExtractValueInst* EI); + uint32_t lookup_or_add_call(CallInst* C); + public: + ValueTable() : nextValueNumber(1) { } + uint32_t lookup_or_add(Value *V); + uint32_t lookup(Value *V) const; + uint32_t lookup_or_add_cmp(unsigned Opcode, CmpInst::Predicate Pred, + Value *LHS, Value *RHS); + void add(Value *V, uint32_t num); + void clear(); + void erase(Value *v); + void setAliasAnalysis(AliasAnalysis* A) { AA = A; } + AliasAnalysis *getAliasAnalysis() const { return AA; } + void setMemDep(MemoryDependenceAnalysis* M) { MD = M; } + void setDomTree(DominatorTree* D) { DT = D; } + uint32_t getNextUnusedValueNumber() { return nextValueNumber; } + void verifyRemoved(const Value *) const; + }; +} + +namespace llvm { +template <> struct DenseMapInfo<Expression> { + static inline Expression getEmptyKey() { + return ~0U; + } + + static inline Expression getTombstoneKey() { + return ~1U; + } + + static unsigned getHashValue(const Expression e) { + using llvm::hash_value; + return static_cast<unsigned>(hash_value(e)); + } + static bool isEqual(const Expression &LHS, const Expression &RHS) { + return LHS == RHS; + } +}; + +} + +//===----------------------------------------------------------------------===// +// ValueTable Internal Functions +//===----------------------------------------------------------------------===// + +Expression ValueTable::create_expression(Instruction *I) { + Expression e; + e.type = I->getType(); + e.opcode = I->getOpcode(); + for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end(); + OI != OE; ++OI) + e.varargs.push_back(lookup_or_add(*OI)); + if (I->isCommutative()) { + // Ensure that commutative instructions that only differ by a permutation + // of their operands get the same value number by sorting the operand value + // numbers. Since all commutative instructions have two operands it is more + // efficient to sort by hand rather than using, say, std::sort. + assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!"); + if (e.varargs[0] > e.varargs[1]) + std::swap(e.varargs[0], e.varargs[1]); + } + + if (CmpInst *C = dyn_cast<CmpInst>(I)) { + // Sort the operand value numbers so x<y and y>x get the same value number. 
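+    // For example (illustrative): "icmp slt %a, %b" and "icmp sgt %b, %a" end
+    // up with the same operand order and a swapped predicate, and therefore
+    // receive the same value number.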
+ CmpInst::Predicate Predicate = C->getPredicate(); + if (e.varargs[0] > e.varargs[1]) { + std::swap(e.varargs[0], e.varargs[1]); + Predicate = CmpInst::getSwappedPredicate(Predicate); + } + e.opcode = (C->getOpcode() << 8) | Predicate; + } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) { + for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); + II != IE; ++II) + e.varargs.push_back(*II); + } + + return e; +} + +Expression ValueTable::create_cmp_expression(unsigned Opcode, + CmpInst::Predicate Predicate, + Value *LHS, Value *RHS) { + assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && + "Not a comparison!"); + Expression e; + e.type = CmpInst::makeCmpResultType(LHS->getType()); + e.varargs.push_back(lookup_or_add(LHS)); + e.varargs.push_back(lookup_or_add(RHS)); + + // Sort the operand value numbers so x<y and y>x get the same value number. + if (e.varargs[0] > e.varargs[1]) { + std::swap(e.varargs[0], e.varargs[1]); + Predicate = CmpInst::getSwappedPredicate(Predicate); + } + e.opcode = (Opcode << 8) | Predicate; + return e; +} + +Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) { + assert(EI != 0 && "Not an ExtractValueInst?"); + Expression e; + e.type = EI->getType(); + e.opcode = 0; + + IntrinsicInst *I = dyn_cast<IntrinsicInst>(EI->getAggregateOperand()); + if (I != 0 && EI->getNumIndices() == 1 && *EI->idx_begin() == 0 ) { + // EI might be an extract from one of our recognised intrinsics. If it + // is we'll synthesize a semantically equivalent expression instead on + // an extract value expression. + switch (I->getIntrinsicID()) { + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + e.opcode = Instruction::Add; + break; + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + e.opcode = Instruction::Sub; + break; + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + e.opcode = Instruction::Mul; + break; + default: + break; + } + + if (e.opcode != 0) { + // Intrinsic recognized. Grab its args to finish building the expression. + assert(I->getNumArgOperands() == 2 && + "Expect two args for recognised intrinsics."); + e.varargs.push_back(lookup_or_add(I->getArgOperand(0))); + e.varargs.push_back(lookup_or_add(I->getArgOperand(1))); + return e; + } + } + + // Not a recognised intrinsic. Fall back to producing an extract value + // expression. + e.opcode = EI->getOpcode(); + for (Instruction::op_iterator OI = EI->op_begin(), OE = EI->op_end(); + OI != OE; ++OI) + e.varargs.push_back(lookup_or_add(*OI)); + + for (ExtractValueInst::idx_iterator II = EI->idx_begin(), IE = EI->idx_end(); + II != IE; ++II) + e.varargs.push_back(*II); + + return e; +} + +//===----------------------------------------------------------------------===// +// ValueTable External Functions +//===----------------------------------------------------------------------===// + +/// add - Insert a value into the table with a specified value number. 
+void ValueTable::add(Value *V, uint32_t num) { + valueNumbering.insert(std::make_pair(V, num)); +} + +uint32_t ValueTable::lookup_or_add_call(CallInst* C) { + if (AA->doesNotAccessMemory(C)) { + Expression exp = create_expression(C); + uint32_t& e = expressionNumbering[exp]; + if (!e) e = nextValueNumber++; + valueNumbering[C] = e; + return e; + } else if (AA->onlyReadsMemory(C)) { + Expression exp = create_expression(C); + uint32_t& e = expressionNumbering[exp]; + if (!e) { + e = nextValueNumber++; + valueNumbering[C] = e; + return e; + } + if (!MD) { + e = nextValueNumber++; + valueNumbering[C] = e; + return e; + } + + MemDepResult local_dep = MD->getDependency(C); + + if (!local_dep.isDef() && !local_dep.isNonLocal()) { + valueNumbering[C] = nextValueNumber; + return nextValueNumber++; + } + + if (local_dep.isDef()) { + CallInst* local_cdep = cast<CallInst>(local_dep.getInst()); + + if (local_cdep->getNumArgOperands() != C->getNumArgOperands()) { + valueNumbering[C] = nextValueNumber; + return nextValueNumber++; + } + + for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) { + uint32_t c_vn = lookup_or_add(C->getArgOperand(i)); + uint32_t cd_vn = lookup_or_add(local_cdep->getArgOperand(i)); + if (c_vn != cd_vn) { + valueNumbering[C] = nextValueNumber; + return nextValueNumber++; + } + } + + uint32_t v = lookup_or_add(local_cdep); + valueNumbering[C] = v; + return v; + } + + // Non-local case. + const MemoryDependenceAnalysis::NonLocalDepInfo &deps = + MD->getNonLocalCallDependency(CallSite(C)); + // FIXME: Move the checking logic to MemDep! + CallInst* cdep = 0; + + // Check to see if we have a single dominating call instruction that is + // identical to C. + for (unsigned i = 0, e = deps.size(); i != e; ++i) { + const NonLocalDepEntry *I = &deps[i]; + if (I->getResult().isNonLocal()) + continue; + + // We don't handle non-definitions. If we already have a call, reject + // instruction dependencies. + if (!I->getResult().isDef() || cdep != 0) { + cdep = 0; + break; + } + + CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->getResult().getInst()); + // FIXME: All duplicated with non-local case. + if (NonLocalDepCall && DT->properlyDominates(I->getBB(), C->getParent())){ + cdep = NonLocalDepCall; + continue; + } + + cdep = 0; + break; + } + + if (!cdep) { + valueNumbering[C] = nextValueNumber; + return nextValueNumber++; + } + + if (cdep->getNumArgOperands() != C->getNumArgOperands()) { + valueNumbering[C] = nextValueNumber; + return nextValueNumber++; + } + for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) { + uint32_t c_vn = lookup_or_add(C->getArgOperand(i)); + uint32_t cd_vn = lookup_or_add(cdep->getArgOperand(i)); + if (c_vn != cd_vn) { + valueNumbering[C] = nextValueNumber; + return nextValueNumber++; + } + } + + uint32_t v = lookup_or_add(cdep); + valueNumbering[C] = v; + return v; + + } else { + valueNumbering[C] = nextValueNumber; + return nextValueNumber++; + } +} + +/// lookup_or_add - Returns the value number for the specified value, assigning +/// it a new number if it did not have one before. 
+uint32_t ValueTable::lookup_or_add(Value *V) { + DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V); + if (VI != valueNumbering.end()) + return VI->second; + + if (!isa<Instruction>(V)) { + valueNumbering[V] = nextValueNumber; + return nextValueNumber++; + } + + Instruction* I = cast<Instruction>(V); + Expression exp; + switch (I->getOpcode()) { + case Instruction::Call: + return lookup_or_add_call(cast<CallInst>(I)); + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or : + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: + case Instruction::Select: + case Instruction::ExtractElement: + case Instruction::InsertElement: + case Instruction::ShuffleVector: + case Instruction::InsertValue: + case Instruction::GetElementPtr: + exp = create_expression(I); + break; + case Instruction::ExtractValue: + exp = create_extractvalue_expression(cast<ExtractValueInst>(I)); + break; + default: + valueNumbering[V] = nextValueNumber; + return nextValueNumber++; + } + + uint32_t& e = expressionNumbering[exp]; + if (!e) e = nextValueNumber++; + valueNumbering[V] = e; + return e; +} + +/// lookup - Returns the value number of the specified value. Fails if +/// the value has not yet been numbered. +uint32_t ValueTable::lookup(Value *V) const { + DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V); + assert(VI != valueNumbering.end() && "Value not numbered?"); + return VI->second; +} + +/// lookup_or_add_cmp - Returns the value number of the given comparison, +/// assigning it a new number if it did not have one before. Useful when +/// we deduced the result of a comparison, but don't immediately have an +/// instruction realizing that comparison to hand. +uint32_t ValueTable::lookup_or_add_cmp(unsigned Opcode, + CmpInst::Predicate Predicate, + Value *LHS, Value *RHS) { + Expression exp = create_cmp_expression(Opcode, Predicate, LHS, RHS); + uint32_t& e = expressionNumbering[exp]; + if (!e) e = nextValueNumber++; + return e; +} + +/// clear - Remove all entries from the ValueTable. +void ValueTable::clear() { + valueNumbering.clear(); + expressionNumbering.clear(); + nextValueNumber = 1; +} + +/// erase - Remove a value from the value numbering. +void ValueTable::erase(Value *V) { + valueNumbering.erase(V); +} + +/// verifyRemoved - Verify that the value is removed from all internal data +/// structures. 
+void ValueTable::verifyRemoved(const Value *V) const { + for (DenseMap<Value*, uint32_t>::const_iterator + I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) { + assert(I->first != V && "Inst still occurs in value numbering map!"); + } +} + +//===----------------------------------------------------------------------===// +// GVN Pass +//===----------------------------------------------------------------------===// + +namespace { + + class GVN : public FunctionPass { + bool NoLoads; + MemoryDependenceAnalysis *MD; + DominatorTree *DT; + const TargetData *TD; + const TargetLibraryInfo *TLI; + + ValueTable VN; + + /// LeaderTable - A mapping from value numbers to lists of Value*'s that + /// have that value number. Use findLeader to query it. + struct LeaderTableEntry { + Value *Val; + const BasicBlock *BB; + LeaderTableEntry *Next; + }; + DenseMap<uint32_t, LeaderTableEntry> LeaderTable; + BumpPtrAllocator TableAllocator; + + SmallVector<Instruction*, 8> InstrsToErase; + public: + static char ID; // Pass identification, replacement for typeid + explicit GVN(bool noloads = false) + : FunctionPass(ID), NoLoads(noloads), MD(0) { + initializeGVNPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + + /// markInstructionForDeletion - This removes the specified instruction from + /// our various maps and marks it for deletion. + void markInstructionForDeletion(Instruction *I) { + VN.erase(I); + InstrsToErase.push_back(I); + } + + const TargetData *getTargetData() const { return TD; } + DominatorTree &getDominatorTree() const { return *DT; } + AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); } + MemoryDependenceAnalysis &getMemDep() const { return *MD; } + private: + /// addToLeaderTable - Push a new Value to the LeaderTable onto the list for + /// its value number. + void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) { + LeaderTableEntry &Curr = LeaderTable[N]; + if (!Curr.Val) { + Curr.Val = V; + Curr.BB = BB; + return; + } + + LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>(); + Node->Val = V; + Node->BB = BB; + Node->Next = Curr.Next; + Curr.Next = Node; + } + + /// removeFromLeaderTable - Scan the list of values corresponding to a given + /// value number, and remove the given instruction if encountered. + void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) { + LeaderTableEntry* Prev = 0; + LeaderTableEntry* Curr = &LeaderTable[N]; + + while (Curr->Val != I || Curr->BB != BB) { + Prev = Curr; + Curr = Curr->Next; + } + + if (Prev) { + Prev->Next = Curr->Next; + } else { + if (!Curr->Next) { + Curr->Val = 0; + Curr->BB = 0; + } else { + LeaderTableEntry* Next = Curr->Next; + Curr->Val = Next->Val; + Curr->BB = Next->BB; + Curr->Next = Next->Next; + } + } + } + + // List of critical edges to be split between iterations. 
+    SmallVector<std::pair<TerminatorInst*, unsigned>, 4> toSplit;
+
+    // This transformation requires dominator info.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<TargetLibraryInfo>();
+      if (!NoLoads)
+        AU.addRequired<MemoryDependenceAnalysis>();
+      AU.addRequired<AliasAnalysis>();
+
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<AliasAnalysis>();
+    }
+
+
+    // Helper functions
+    // FIXME: eliminate or document these better
+    bool processLoad(LoadInst *L);
+    bool processInstruction(Instruction *I);
+    bool processNonLocalLoad(LoadInst *L);
+    bool processBlock(BasicBlock *BB);
+    void dump(DenseMap<uint32_t, Value*> &d);
+    bool iterateOnFunction(Function &F);
+    bool performPRE(Function &F);
+    Value *findLeader(const BasicBlock *BB, uint32_t num);
+    void cleanupGlobalSets();
+    void verifyRemoved(const Instruction *I) const;
+    bool splitCriticalEdges();
+    unsigned replaceAllDominatedUsesWith(Value *From, Value *To,
+                                         const BasicBlockEdge &Root);
+    bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root);
+  };
+
+  char GVN::ID = 0;
+}
+
+// createGVNPass - The public interface to this file...
+FunctionPass *llvm::createGVNPass(bool NoLoads) {
+  return new GVN(NoLoads);
+}
+
+INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false)
+
+void GVN::dump(DenseMap<uint32_t, Value*>& d) {
+  errs() << "{\n";
+  for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
+       E = d.end(); I != E; ++I) {
+    errs() << I->first << "\n";
+    I->second->dump();
+  }
+  errs() << "}\n";
+}
+
+/// IsValueFullyAvailableInBlock - Return true if we can prove that the value
+/// we're analyzing is fully available in the specified block. As we go, keep
+/// track of which blocks we know are fully alive in FullyAvailableBlocks. This
+/// map is actually a tri-state map with the following values:
+///   0) we know the block *is not* fully available.
+///   1) we know the block *is* fully available.
+///   2) we do not know whether the block is fully available or not, but we are
+///      currently speculating that it will be.
+///   3) we are speculating for this block and have used that to speculate for
+///      other blocks.
+static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
+                            DenseMap<BasicBlock*, char> &FullyAvailableBlocks,
+                            uint32_t RecurseDepth) {
+  if (RecurseDepth > MaxRecurseDepth)
+    return false;
+
+  // Optimistically assume that the block is fully available and check to see
+  // if we already know about this block in one lookup.
+  std::pair<DenseMap<BasicBlock*, char>::iterator, char> IV =
+    FullyAvailableBlocks.insert(std::make_pair(BB, 2));
+
+  // If the entry already existed for this block, return the precomputed value.
+  if (!IV.second) {
+    // If this is a speculative "available" value, mark it as being used for
+    // speculation of other blocks.
+    if (IV.first->second == 2)
+      IV.first->second = 3;
+    return IV.first->second != 0;
+  }
+
+  // Otherwise, see if it is fully available in all predecessors.
+  pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+
+  // If this block has no predecessors, it isn't live-in here.
+ if (PI == PE) + goto SpeculationFailure; + + for (; PI != PE; ++PI) + // If the value isn't fully available in one of our predecessors, then it + // isn't fully available in this block either. Undo our previous + // optimistic assumption and bail out. + if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks,RecurseDepth+1)) + goto SpeculationFailure; + + return true; + +// SpeculationFailure - If we get here, we found out that this is not, after +// all, a fully-available block. We have a problem if we speculated on this and +// used the speculation to mark other blocks as available. +SpeculationFailure: + char &BBVal = FullyAvailableBlocks[BB]; + + // If we didn't speculate on this, just return with it set to false. + if (BBVal == 2) { + BBVal = 0; + return false; + } + + // If we did speculate on this value, we could have blocks set to 1 that are + // incorrect. Walk the (transitive) successors of this block and mark them as + // 0 if set to one. + SmallVector<BasicBlock*, 32> BBWorklist; + BBWorklist.push_back(BB); + + do { + BasicBlock *Entry = BBWorklist.pop_back_val(); + // Note that this sets blocks to 0 (unavailable) if they happen to not + // already be in FullyAvailableBlocks. This is safe. + char &EntryVal = FullyAvailableBlocks[Entry]; + if (EntryVal == 0) continue; // Already unavailable. + + // Mark as unavailable. + EntryVal = 0; + + for (succ_iterator I = succ_begin(Entry), E = succ_end(Entry); I != E; ++I) + BBWorklist.push_back(*I); + } while (!BBWorklist.empty()); + + return false; +} + + +/// CanCoerceMustAliasedValueToLoad - Return true if +/// CoerceAvailableValueToLoadType will succeed. +static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, + Type *LoadTy, + const TargetData &TD) { + // If the loaded or stored value is an first class array or struct, don't try + // to transform them. We need to be able to bitcast to integer. + if (LoadTy->isStructTy() || LoadTy->isArrayTy() || + StoredVal->getType()->isStructTy() || + StoredVal->getType()->isArrayTy()) + return false; + + // The store has to be at least as big as the load. + if (TD.getTypeSizeInBits(StoredVal->getType()) < + TD.getTypeSizeInBits(LoadTy)) + return false; + + return true; +} + + +/// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and +/// then a load from a must-aliased pointer of a different type, try to coerce +/// the stored value. LoadedTy is the type of the load we want to replace and +/// InsertPt is the place to insert new instructions. +/// +/// If we can't do it, return null. +static Value *CoerceAvailableValueToLoadType(Value *StoredVal, + Type *LoadedTy, + Instruction *InsertPt, + const TargetData &TD) { + if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD)) + return 0; + + // If this is already the right type, just return it. + Type *StoredValTy = StoredVal->getType(); + + uint64_t StoreSize = TD.getTypeSizeInBits(StoredValTy); + uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy); + + // If the store and reload are the same size, we can always reuse it. + if (StoreSize == LoadSize) { + // Pointer to Pointer -> use bitcast. + if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy()) + return new BitCastInst(StoredVal, LoadedTy, "", InsertPt); + + // Convert source pointers to integers, which can be bitcast. 
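+    // For example (illustrative): forwarding a stored i8* to a load of i32 on
+    // a 32-bit target becomes a single ptrtoint; the reverse direction (integer
+    // stored, pointer loaded) uses the inttoptr below.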
+ if (StoredValTy->isPointerTy()) { + StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); + StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); + } + + Type *TypeToCastTo = LoadedTy; + if (TypeToCastTo->isPointerTy()) + TypeToCastTo = TD.getIntPtrType(StoredValTy->getContext()); + + if (StoredValTy != TypeToCastTo) + StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt); + + // Cast to pointer if the load needs a pointer type. + if (LoadedTy->isPointerTy()) + StoredVal = new IntToPtrInst(StoredVal, LoadedTy, "", InsertPt); + + return StoredVal; + } + + // If the loaded value is smaller than the available value, then we can + // extract out a piece from it. If the available value is too small, then we + // can't do anything. + assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail"); + + // Convert source pointers to integers, which can be manipulated. + if (StoredValTy->isPointerTy()) { + StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); + StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); + } + + // Convert vectors and fp to integer, which can be manipulated. + if (!StoredValTy->isIntegerTy()) { + StoredValTy = IntegerType::get(StoredValTy->getContext(), StoreSize); + StoredVal = new BitCastInst(StoredVal, StoredValTy, "", InsertPt); + } + + // If this is a big-endian system, we need to shift the value down to the low + // bits so that a truncate will work. + if (TD.isBigEndian()) { + Constant *Val = ConstantInt::get(StoredVal->getType(), StoreSize-LoadSize); + StoredVal = BinaryOperator::CreateLShr(StoredVal, Val, "tmp", InsertPt); + } + + // Truncate the integer to the right size now. + Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize); + StoredVal = new TruncInst(StoredVal, NewIntTy, "trunc", InsertPt); + + if (LoadedTy == NewIntTy) + return StoredVal; + + // If the result is a pointer, inttoptr. + if (LoadedTy->isPointerTy()) + return new IntToPtrInst(StoredVal, LoadedTy, "inttoptr", InsertPt); + + // Otherwise, bitcast. + return new BitCastInst(StoredVal, LoadedTy, "bitcast", InsertPt); +} + +/// AnalyzeLoadFromClobberingWrite - This function is called when we have a +/// memdep query of a load that ends up being a clobbering memory write (store, +/// memset, memcpy, memmove). This means that the write *may* provide bits used +/// by the load but we can't be sure because the pointers don't mustalias. +/// +/// Check this case to see if there is anything more we can do before we give +/// up. This returns -1 if we have to give up, or a byte number in the stored +/// value of the piece that feeds the load. +static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, + Value *WritePtr, + uint64_t WriteSizeInBits, + const TargetData &TD) { + // If the loaded or stored value is a first class array or struct, don't try + // to transform them. We need to be able to bitcast to integer. + if (LoadTy->isStructTy() || LoadTy->isArrayTy()) + return -1; + + int64_t StoreOffset = 0, LoadOffset = 0; + Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr, StoreOffset,TD); + Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, TD); + if (StoreBase != LoadBase) + return -1; + + // If the load and store are to the exact same address, they should have been + // a must alias. AA must have gotten confused. + // FIXME: Study to see if/when this happens. One case is forwarding a memset + // to a load from the base of the memset. 
+#if 0
+  if (LoadOffset == StoreOffset) {
+    dbgs() << "STORE/LOAD DEP WITH COMMON POINTER MISSED:\n"
+    << "Base = " << *StoreBase << "\n"
+    << "Store Ptr = " << *WritePtr << "\n"
+    << "Store Offs = " << StoreOffset << "\n"
+    << "Load Ptr = " << *LoadPtr << "\n";
+    abort();
+  }
+#endif
+
+  // If the load and store don't overlap at all, the store doesn't provide
+  // anything to the load. In this case, they really don't alias at all, AA
+  // must have gotten confused.
+  uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy);
+
+  if ((WriteSizeInBits & 7) | (LoadSize & 7))
+    return -1;
+  uint64_t StoreSize = WriteSizeInBits >> 3;  // Convert to bytes.
+  LoadSize >>= 3;
+
+
+  bool isAAFailure = false;
+  if (StoreOffset < LoadOffset)
+    isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset;
+  else
+    isAAFailure = LoadOffset+int64_t(LoadSize) <= StoreOffset;
+
+  if (isAAFailure) {
+#if 0
+    dbgs() << "STORE LOAD DEP WITH COMMON BASE:\n"
+    << "Base = " << *StoreBase << "\n"
+    << "Store Ptr = " << *WritePtr << "\n"
+    << "Store Offs = " << StoreOffset << "\n"
+    << "Load Ptr = " << *LoadPtr << "\n";
+    abort();
+#endif
+    return -1;
+  }
+
+  // If the Load isn't completely contained within the stored bits, we don't
+  // have all the bits to feed it. We could do something crazy in the future
+  // (issue a smaller load then merge the bits in) but this seems unlikely to be
+  // valuable.
+  if (StoreOffset > LoadOffset ||
+      StoreOffset+StoreSize < LoadOffset+LoadSize)
+    return -1;
+
+  // Okay, we can do this transformation. Return the number of bytes into the
+  // store that the load is.
+  return LoadOffset-StoreOffset;
+}
+
+/// AnalyzeLoadFromClobberingStore - This function is called when we have a
+/// memdep query of a load that ends up being a clobbering store.
+static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
+                                          StoreInst *DepSI,
+                                          const TargetData &TD) {
+  // Cannot handle reading from store of first-class aggregate yet.
+  if (DepSI->getValueOperand()->getType()->isStructTy() ||
+      DepSI->getValueOperand()->getType()->isArrayTy())
+    return -1;
+
+  Value *StorePtr = DepSI->getPointerOperand();
+  uint64_t StoreSize =TD.getTypeSizeInBits(DepSI->getValueOperand()->getType());
+  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
+                                        StorePtr, StoreSize, TD);
+}
+
+/// AnalyzeLoadFromClobberingLoad - This function is called when we have a
+/// memdep query of a load that ends up being clobbered by another load. See if
+/// the other load can feed into the second load.
+static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
+                                         LoadInst *DepLI, const TargetData &TD){
+  // Cannot handle reading from store of first-class aggregate yet.
+  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+    return -1;
+
+  Value *DepPtr = DepLI->getPointerOperand();
+  uint64_t DepSize = TD.getTypeSizeInBits(DepLI->getType());
+  int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, TD);
+  if (R != -1) return R;
+
+  // If we have a load/load clobber and DepLI can be widened to cover this load,
+  // then we should widen it!
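+  // For example (illustrative): if DepLI is "load i8* %P" and we are querying
+  // "load i8* (%P+3)", DepLI may be widened to an i32 load covering both, after
+  // which the queried value is just a byte extracted from the wider result.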
+  int64_t LoadOffs = 0;
+  const Value *LoadBase =
+    GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, TD);
+  unsigned LoadSize = TD.getTypeStoreSize(LoadTy);
+
+  unsigned Size = MemoryDependenceAnalysis::
+    getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, TD);
+  if (Size == 0) return -1;
+
+  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, TD);
+}
+
+
+
+static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
+                                            MemIntrinsic *MI,
+                                            const TargetData &TD) {
+  // If the mem operation is a non-constant size, we can't handle it.
+  ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
+  if (SizeCst == 0) return -1;
+  uint64_t MemSizeInBits = SizeCst->getZExtValue()*8;
+
+  // If this is memset, we just need to see if the offset is valid in the size
+  // of the memset.
+  if (MI->getIntrinsicID() == Intrinsic::memset)
+    return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+                                          MemSizeInBits, TD);
+
+  // If we have a memcpy/memmove, the only case we can handle is if this is a
+  // copy from constant memory. In that case, we can read directly from the
+  // constant memory.
+  MemTransferInst *MTI = cast<MemTransferInst>(MI);
+
+  Constant *Src = dyn_cast<Constant>(MTI->getSource());
+  if (Src == 0) return -1;
+
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &TD));
+  if (GV == 0 || !GV->isConstant()) return -1;
+
+  // See if the access is within the bounds of the transfer.
+  int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
+                                              MI->getDest(), MemSizeInBits, TD);
+  if (Offset == -1)
+    return Offset;
+
+  // Otherwise, see if we can constant fold a load from the constant with the
+  // offset applied as appropriate.
+  Src = ConstantExpr::getBitCast(Src,
+                                 llvm::Type::getInt8PtrTy(Src->getContext()));
+  Constant *OffsetCst =
+    ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+  Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
+  Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
+  if (ConstantFoldLoadFromConstPtr(Src, &TD))
+    return Offset;
+  return -1;
+}
+
+
+/// GetStoreValueForLoad - This function is called when we have a
+/// memdep query of a load that ends up being a clobbering store. This means
+/// that the store provides bits used by the load but the pointers don't
+/// mustalias. Check this case to see if there is anything more we can do
+/// before we give up.
+static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
+                                   Type *LoadTy,
+                                   Instruction *InsertPt, const TargetData &TD){
+  LLVMContext &Ctx = SrcVal->getType()->getContext();
+
+  uint64_t StoreSize = (TD.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
+  uint64_t LoadSize = (TD.getTypeSizeInBits(LoadTy) + 7) / 8;
+
+  IRBuilder<> Builder(InsertPt->getParent(), InsertPt);
+
+  // Compute which bits of the stored value are being used by the load. Convert
+  // to an integer type to start with.
+  if (SrcVal->getType()->isPointerTy())
+    SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx));
+  if (!SrcVal->getType()->isIntegerTy())
+    SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8));
+
+  // Shift the bits to the least significant depending on endianness.
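+  // For example (illustrative): with StoreSize = 4, LoadSize = 1 and Offset = 2,
+  // a little-endian target shifts right by 2*8 = 16 bits, while a big-endian
+  // target shifts right by (4-1-2)*8 = 8 bits before truncating to i8.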
+  unsigned ShiftAmt;
+  if (TD.isLittleEndian())
+    ShiftAmt = Offset*8;
+  else
+    ShiftAmt = (StoreSize-LoadSize-Offset)*8;
+
+  if (ShiftAmt)
+    SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt);
+
+  if (LoadSize != StoreSize)
+    SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8));
+
+  return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD);
+}
+
+/// GetLoadValueForLoad - This function is called when we have a
+/// memdep query of a load that ends up being a clobbering load. This means
+/// that the load *may* provide bits used by the load but we can't be sure
+/// because the pointers don't mustalias. Check this case to see if there is
+/// anything more we can do before we give up.
+static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
+                                  Type *LoadTy, Instruction *InsertPt,
+                                  GVN &gvn) {
+  const TargetData &TD = *gvn.getTargetData();
+  // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
+  // widen SrcVal out to a larger load.
+  unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType());
+  unsigned LoadSize = TD.getTypeStoreSize(LoadTy);
+  if (Offset+LoadSize > SrcValSize) {
+    assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
+    assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
+    // If we have a load/load clobber and DepLI can be widened to cover this
+    // load, then we should widen it to the next power of 2 size big enough!
+    unsigned NewLoadSize = Offset+LoadSize;
+    if (!isPowerOf2_32(NewLoadSize))
+      NewLoadSize = NextPowerOf2(NewLoadSize);
+
+    Value *PtrVal = SrcVal->getPointerOperand();
+
+    // Insert the new load after the old load. This ensures that subsequent
+    // memdep queries will find the new load. We can't easily remove the old
+    // load completely because it is already in the value numbering table.
+    IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
+    Type *DestPTy =
+      IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
+    DestPTy = PointerType::get(DestPTy,
+                       cast<PointerType>(PtrVal->getType())->getAddressSpace());
+    Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
+    PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
+    LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
+    NewLoad->takeName(SrcVal);
+    NewLoad->setAlignment(SrcVal->getAlignment());
+
+    DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
+    DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
+
+    // Replace uses of the original load with the wider load. On a big endian
+    // system, we need to shift down to get the relevant bits.
+    Value *RV = NewLoad;
+    if (TD.isBigEndian())
+      RV = Builder.CreateLShr(RV,
+                    NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits());
+    RV = Builder.CreateTrunc(RV, SrcVal->getType());
+    SrcVal->replaceAllUsesWith(RV);
+
+    // We would like to use gvn.markInstructionForDeletion here, but we can't
+    // because the load is already memoized into the leader map table that GVN
+    // tracks. It is potentially possible to remove the load from the table,
+    // but then all of the operations based on it would need to be rehashed.
+    // Just leave the dead load around.
+    gvn.getMemDep().removeInstruction(SrcVal);
+    SrcVal = NewLoad;
+  }
+
+  return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, TD);
+}
+
+
+/// GetMemInstValueForLoad - This function is called when we have a
+/// memdep query of a load that ends up being a clobbering mem intrinsic.
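+/// For example (illustrative): an i16 load fully covered by a memset of the
+/// byte 0x2A is rewritten to the splatted constant 0x2A2A, and a load out of a
+/// memcpy from a constant global is constant folded from the initializer.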
+static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, + Type *LoadTy, Instruction *InsertPt, + const TargetData &TD){ + LLVMContext &Ctx = LoadTy->getContext(); + uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8; + + IRBuilder<> Builder(InsertPt->getParent(), InsertPt); + + // We know that this method is only called when the mem transfer fully + // provides the bits for the load. + if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) { + // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and + // independently of what the offset is. + Value *Val = MSI->getValue(); + if (LoadSize != 1) + Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8)); + + Value *OneElt = Val; + + // Splat the value out to the right number of bits. + for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) { + // If we can double the number of bytes set, do it. + if (NumBytesSet*2 <= LoadSize) { + Value *ShVal = Builder.CreateShl(Val, NumBytesSet*8); + Val = Builder.CreateOr(Val, ShVal); + NumBytesSet <<= 1; + continue; + } + + // Otherwise insert one byte at a time. + Value *ShVal = Builder.CreateShl(Val, 1*8); + Val = Builder.CreateOr(OneElt, ShVal); + ++NumBytesSet; + } + + return CoerceAvailableValueToLoadType(Val, LoadTy, InsertPt, TD); + } + + // Otherwise, this is a memcpy/memmove from a constant global. + MemTransferInst *MTI = cast<MemTransferInst>(SrcInst); + Constant *Src = cast<Constant>(MTI->getSource()); + + // Otherwise, see if we can constant fold a load from the constant with the + // offset applied as appropriate. + Src = ConstantExpr::getBitCast(Src, + llvm::Type::getInt8PtrTy(Src->getContext())); + Constant *OffsetCst = + ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); + Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); + Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); + return ConstantFoldLoadFromConstPtr(Src, &TD); +} + +namespace { + +struct AvailableValueInBlock { + /// BB - The basic block in question. + BasicBlock *BB; + enum ValType { + SimpleVal, // A simple offsetted value that is accessed. + LoadVal, // A value produced by a load. + MemIntrin // A memory intrinsic which is loaded from. + }; + + /// V - The value that is live out of the block. + PointerIntPair<Value *, 2, ValType> Val; + + /// Offset - The byte offset in Val that is interesting for the load query. 
+ unsigned Offset; + + static AvailableValueInBlock get(BasicBlock *BB, Value *V, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(V); + Res.Val.setInt(SimpleVal); + Res.Offset = Offset; + return Res; + } + + static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(MI); + Res.Val.setInt(MemIntrin); + Res.Offset = Offset; + return Res; + } + + static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(LI); + Res.Val.setInt(LoadVal); + Res.Offset = Offset; + return Res; + } + + bool isSimpleValue() const { return Val.getInt() == SimpleVal; } + bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } + bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } + + Value *getSimpleValue() const { + assert(isSimpleValue() && "Wrong accessor"); + return Val.getPointer(); + } + + LoadInst *getCoercedLoadValue() const { + assert(isCoercedLoadValue() && "Wrong accessor"); + return cast<LoadInst>(Val.getPointer()); + } + + MemIntrinsic *getMemIntrinValue() const { + assert(isMemIntrinValue() && "Wrong accessor"); + return cast<MemIntrinsic>(Val.getPointer()); + } + + /// MaterializeAdjustedValue - Emit code into this block to adjust the value + /// defined here to the specified type. This handles various coercion cases. + Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { + Value *Res; + if (isSimpleValue()) { + Res = getSimpleValue(); + if (Res->getType() != LoadTy) { + const TargetData *TD = gvn.getTargetData(); + assert(TD && "Need target data to handle type mismatch case"); + Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), + *TD); + + DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " + << *getSimpleValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + } else if (isCoercedLoadValue()) { + LoadInst *Load = getCoercedLoadValue(); + if (Load->getType() == LoadTy && Offset == 0) { + Res = Load; + } else { + Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), + gvn); + + DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " + << *getCoercedLoadValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + } else { + const TargetData *TD = gvn.getTargetData(); + assert(TD && "Need target data to handle type mismatch case"); + Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, + LoadTy, BB->getTerminator(), *TD); + DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset + << " " << *getMemIntrinValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + return Res; + } +}; + +} // end anonymous namespace + +/// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock, +/// construct SSA form, allowing us to eliminate LI. This returns the value +/// that should be used at LI's definition site. +static Value *ConstructSSAForLoadSet(LoadInst *LI, + SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock, + GVN &gvn) { + // Check for the fully redundant, dominating load case. In this case, we can + // just use the dominating value directly. + if (ValuesPerBlock.size() == 1 && + gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB, + LI->getParent())) + return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn); + + // Otherwise, we have to construct SSA form. 
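+  // For example (illustrative): if the value is available as %v1 in one
+  // predecessor and as %v2 in another, SSAUpdater inserts
+  //   %merge = phi <ty> [ %v1, %pred1 ], [ %v2, %pred2 ]
+  // in the load's block, and %merge replaces the load.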
+ SmallVector<PHINode*, 8> NewPHIs; + SSAUpdater SSAUpdate(&NewPHIs); + SSAUpdate.Initialize(LI->getType(), LI->getName()); + + Type *LoadTy = LI->getType(); + + for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { + const AvailableValueInBlock &AV = ValuesPerBlock[i]; + BasicBlock *BB = AV.BB; + + if (SSAUpdate.HasValueForBlock(BB)) + continue; + + SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, gvn)); + } + + // Perform PHI construction. + Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); + + // If new PHI nodes were created, notify alias analysis. + if (V->getType()->isPointerTy()) { + AliasAnalysis *AA = gvn.getAliasAnalysis(); + + for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) + AA->copyValue(LI, NewPHIs[i]); + + // Now that we've copied information to the new PHIs, scan through + // them again and inform alias analysis that we've added potentially + // escaping uses to any values that are operands to these PHIs. + for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) { + PHINode *P = NewPHIs[i]; + for (unsigned ii = 0, ee = P->getNumIncomingValues(); ii != ee; ++ii) { + unsigned jj = PHINode::getOperandNumForIncomingValue(ii); + AA->addEscapingUse(P->getOperandUse(jj)); + } + } + } + + return V; +} + +static bool isLifetimeStart(const Instruction *Inst) { + if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst)) + return II->getIntrinsicID() == Intrinsic::lifetime_start; + return false; +} + +/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are +/// non-local by performing PHI construction. +bool GVN::processNonLocalLoad(LoadInst *LI) { + // Find the non-local dependencies of the load. + SmallVector<NonLocalDepResult, 64> Deps; + AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); + MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps); + //DEBUG(dbgs() << "INVESTIGATING NONLOCAL LOAD: " + // << Deps.size() << *LI << '\n'); + + // If we had to process more than one hundred blocks to find the + // dependencies, this load isn't worth worrying about. Optimizing + // it will be too expensive. + unsigned NumDeps = Deps.size(); + if (NumDeps > 100) + return false; + + // If we had a phi translation failure, we'll have a single entry which is a + // clobber in the current block. Reject this early. + if (NumDeps == 1 && + !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) { + DEBUG( + dbgs() << "GVN: non-local load "; + WriteAsOperand(dbgs(), LI); + dbgs() << " has unknown dependencies\n"; + ); + return false; + } + + // Filter out useless results (non-locals, etc). Keep track of the blocks + // where we have a value available in repl, also keep track of whether we see + // dependencies that produce an unknown value for the load (such as a call + // that could potentially clobber the load). + SmallVector<AvailableValueInBlock, 64> ValuesPerBlock; + SmallVector<BasicBlock*, 64> UnavailableBlocks; + + for (unsigned i = 0, e = NumDeps; i != e; ++i) { + BasicBlock *DepBB = Deps[i].getBB(); + MemDepResult DepInfo = Deps[i].getResult(); + + if (!DepInfo.isDef() && !DepInfo.isClobber()) { + UnavailableBlocks.push_back(DepBB); + continue; + } + + if (DepInfo.isClobber()) { + // The address being loaded in this non-local block may not be the same as + // the pointer operand of the load if PHI translation occurs. Make sure + // to consider the right address. 
+ Value *Address = Deps[i].getAddress(); + + // If the dependence is to a store that writes to a superset of the bits + // read by the load, we can extract the bits we need for the load from the + // stored value. + if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) { + if (TD && Address) { + int Offset = AnalyzeLoadFromClobberingStore(LI->getType(), Address, + DepSI, *TD); + if (Offset != -1) { + ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, + DepSI->getValueOperand(), + Offset)); + continue; + } + } + } + + // Check to see if we have something like this: + // load i32* P + // load i8* (P+1) + // if we have this, replace the later with an extraction from the former. + if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) { + // If this is a clobber and L is the first instruction in its block, then + // we have the first instruction in the entry block. + if (DepLI != LI && Address && TD) { + int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(), + LI->getPointerOperand(), + DepLI, *TD); + + if (Offset != -1) { + ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI, + Offset)); + continue; + } + } + } + + // If the clobbering value is a memset/memcpy/memmove, see if we can + // forward a value on from it. + if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) { + if (TD && Address) { + int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address, + DepMI, *TD); + if (Offset != -1) { + ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI, + Offset)); + continue; + } + } + } + + UnavailableBlocks.push_back(DepBB); + continue; + } + + // DepInfo.isDef() here + + Instruction *DepInst = DepInfo.getInst(); + + // Loading the allocation -> undef. + if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst) || + // Loading immediately after lifetime begin -> undef. + isLifetimeStart(DepInst)) { + ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, + UndefValue::get(LI->getType()))); + continue; + } + + if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) { + // Reject loads and stores that are to the same address but are of + // different types if we have to. + if (S->getValueOperand()->getType() != LI->getType()) { + // If the stored value is larger or equal to the loaded value, we can + // reuse it. + if (TD == 0 || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(), + LI->getType(), *TD)) { + UnavailableBlocks.push_back(DepBB); + continue; + } + } + + ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, + S->getValueOperand())); + continue; + } + + if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) { + // If the types mismatch and we can't handle it, reject reuse of the load. + if (LD->getType() != LI->getType()) { + // If the stored value is larger or equal to the loaded value, we can + // reuse it. + if (TD == 0 || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*TD)){ + UnavailableBlocks.push_back(DepBB); + continue; + } + } + ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB, LD)); + continue; + } + + UnavailableBlocks.push_back(DepBB); + continue; + } + + // If we have no predecessors that produce a known value for this load, exit + // early. + if (ValuesPerBlock.empty()) return false; + + // If all of the instructions we depend on produce a known value for this + // load, then it is fully redundant and we can use PHI insertion to compute + // its value. Insert PHIs and remove the fully redundant value now. 
+ if (UnavailableBlocks.empty()) { + DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); + + // Perform PHI construction. + Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); + LI->replaceAllUsesWith(V); + + if (isa<PHINode>(V)) + V->takeName(LI); + if (V->getType()->isPointerTy()) + MD->invalidateCachedPointerInfo(V); + markInstructionForDeletion(LI); + ++NumGVNLoad; + return true; + } + + if (!EnablePRE || !EnableLoadPRE) + return false; + + // Okay, we have *some* definitions of the value. This means that the value + // is available in some of our (transitive) predecessors. Lets think about + // doing PRE of this load. This will involve inserting a new load into the + // predecessor when it's not available. We could do this in general, but + // prefer to not increase code size. As such, we only do this when we know + // that we only have to insert *one* load (which means we're basically moving + // the load, not inserting a new one). + + SmallPtrSet<BasicBlock *, 4> Blockers; + for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) + Blockers.insert(UnavailableBlocks[i]); + + // Let's find the first basic block with more than one predecessor. Walk + // backwards through predecessors if needed. + BasicBlock *LoadBB = LI->getParent(); + BasicBlock *TmpBB = LoadBB; + + bool isSinglePred = false; + bool allSingleSucc = true; + while (TmpBB->getSinglePredecessor()) { + isSinglePred = true; + TmpBB = TmpBB->getSinglePredecessor(); + if (TmpBB == LoadBB) // Infinite (unreachable) loop. + return false; + if (Blockers.count(TmpBB)) + return false; + + // If any of these blocks has more than one successor (i.e. if the edge we + // just traversed was critical), then there are other paths through this + // block along which the load may not be anticipated. Hoisting the load + // above this block would be adding the load to execution paths along + // which it was not previously executed. + if (TmpBB->getTerminator()->getNumSuccessors() != 1) + return false; + } + + assert(TmpBB); + LoadBB = TmpBB; + + // FIXME: It is extremely unclear what this loop is doing, other than + // artificially restricting loadpre. + if (isSinglePred) { + bool isHot = false; + for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { + const AvailableValueInBlock &AV = ValuesPerBlock[i]; + if (AV.isSimpleValue()) + // "Hot" Instruction is in some loop (because it dominates its dep. + // instruction). + if (Instruction *I = dyn_cast<Instruction>(AV.getSimpleValue())) + if (DT->dominates(LI, I)) { + isHot = true; + break; + } + } + + // We are interested only in "hot" instructions. We don't want to do any + // mis-optimizations here. + if (!isHot) + return false; + } + + // Check to see how many predecessors have the loaded value fully + // available. 
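+  // FullyAvailableBlocks is seeded below with 'true' for blocks known to
+  // produce the value and 'false' for the unavailable ones; each predecessor
+  // of LoadBB is then queried to see whether every path into it already
+  // provides the value.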
+ DenseMap<BasicBlock*, Value*> PredLoads; + DenseMap<BasicBlock*, char> FullyAvailableBlocks; + for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) + FullyAvailableBlocks[ValuesPerBlock[i].BB] = true; + for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) + FullyAvailableBlocks[UnavailableBlocks[i]] = false; + + SmallVector<std::pair<TerminatorInst*, unsigned>, 4> NeedToSplit; + for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); + PI != E; ++PI) { + BasicBlock *Pred = *PI; + if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { + continue; + } + PredLoads[Pred] = 0; + + if (Pred->getTerminator()->getNumSuccessors() != 1) { + if (isa<IndirectBrInst>(Pred->getTerminator())) { + DEBUG(dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '" + << Pred->getName() << "': " << *LI << '\n'); + return false; + } + + if (LoadBB->isLandingPad()) { + DEBUG(dbgs() + << "COULD NOT PRE LOAD BECAUSE OF LANDING PAD CRITICAL EDGE '" + << Pred->getName() << "': " << *LI << '\n'); + return false; + } + + unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB); + NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum)); + } + } + + if (!NeedToSplit.empty()) { + toSplit.append(NeedToSplit.begin(), NeedToSplit.end()); + return false; + } + + // Decide whether PRE is profitable for this load. + unsigned NumUnavailablePreds = PredLoads.size(); + assert(NumUnavailablePreds != 0 && + "Fully available value should be eliminated above!"); + + // If this load is unavailable in multiple predecessors, reject it. + // FIXME: If we could restructure the CFG, we could make a common pred with + // all the preds that don't have an available LI and insert a new load into + // that one block. + if (NumUnavailablePreds != 1) + return false; + + // Check if the load can safely be moved to all the unavailable predecessors. + bool CanDoPRE = true; + SmallVector<Instruction*, 8> NewInsts; + for (DenseMap<BasicBlock*, Value*>::iterator I = PredLoads.begin(), + E = PredLoads.end(); I != E; ++I) { + BasicBlock *UnavailablePred = I->first; + + // Do PHI translation to get its value in the predecessor if necessary. The + // returned pointer (if non-null) is guaranteed to dominate UnavailablePred. + + // If all preds have a single successor, then we know it is safe to insert + // the load on the pred (?!?), so we can insert code to materialize the + // pointer if it is not available. + PHITransAddr Address(LI->getPointerOperand(), TD); + Value *LoadPtr = 0; + if (allSingleSucc) { + LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, + *DT, NewInsts); + } else { + Address.PHITranslateValue(LoadBB, UnavailablePred, DT); + LoadPtr = Address.getAddr(); + } + + // If we couldn't find or insert a computation of this phi translated value, + // we fail PRE. + if (LoadPtr == 0) { + DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: " + << *LI->getPointerOperand() << "\n"); + CanDoPRE = false; + break; + } + + // Make sure it is valid to move this load here. We have to watch out for: + // @1 = getelementptr (i8* p, ... + // test p and branch if == 0 + // load @1 + // It is valid to have the getelementptr before the test, even if p can + // be 0, as getelementptr only does address arithmetic. + // If we are not pushing the value through any multiple-successor blocks + // we do not have this case. Otherwise, check that the load is safe to + // put anywhere; this can be improved, but should be conservatively safe. 
+ if (!allSingleSucc && + // FIXME: REEVALUTE THIS. + !isSafeToLoadUnconditionally(LoadPtr, + UnavailablePred->getTerminator(), + LI->getAlignment(), TD)) { + CanDoPRE = false; + break; + } + + I->second = LoadPtr; + } + + if (!CanDoPRE) { + while (!NewInsts.empty()) { + Instruction *I = NewInsts.pop_back_val(); + if (MD) MD->removeInstruction(I); + I->eraseFromParent(); + } + return false; + } + + // Okay, we can eliminate this load by inserting a reload in the predecessor + // and using PHI construction to get the value in the other predecessors, do + // it. + DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n'); + DEBUG(if (!NewInsts.empty()) + dbgs() << "INSERTED " << NewInsts.size() << " INSTS: " + << *NewInsts.back() << '\n'); + + // Assign value numbers to the new instructions. + for (unsigned i = 0, e = NewInsts.size(); i != e; ++i) { + // FIXME: We really _ought_ to insert these value numbers into their + // parent's availability map. However, in doing so, we risk getting into + // ordering issues. If a block hasn't been processed yet, we would be + // marking a value as AVAIL-IN, which isn't what we intend. + VN.lookup_or_add(NewInsts[i]); + } + + for (DenseMap<BasicBlock*, Value*>::iterator I = PredLoads.begin(), + E = PredLoads.end(); I != E; ++I) { + BasicBlock *UnavailablePred = I->first; + Value *LoadPtr = I->second; + + Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false, + LI->getAlignment(), + UnavailablePred->getTerminator()); + + // Transfer the old load's TBAA tag to the new load. + if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) + NewLoad->setMetadata(LLVMContext::MD_tbaa, Tag); + + // Transfer DebugLoc. + NewLoad->setDebugLoc(LI->getDebugLoc()); + + // Add the newly created load. + ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred, + NewLoad)); + MD->invalidateCachedPointerInfo(LoadPtr); + DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n'); + } + + // Perform PHI construction. + Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); + LI->replaceAllUsesWith(V); + if (isa<PHINode>(V)) + V->takeName(LI); + if (V->getType()->isPointerTy()) + MD->invalidateCachedPointerInfo(V); + markInstructionForDeletion(LI); + ++NumPRELoad; + return true; +} + +static void patchReplacementInstruction(Value *Repl, Instruction *I) { + // Patch the replacement so that it is not more restrictive than the value + // being replaced. 
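+  // For instance (illustrative), replacing "%a = add i32 %x, %y" with an
+  // equivalent "%b = add nsw i32 %x, %y" must clear the nsw flag on %b: the
+  // flag was only justified along the path that computed %b.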
+ BinaryOperator *Op = dyn_cast<BinaryOperator>(I); + BinaryOperator *ReplOp = dyn_cast<BinaryOperator>(Repl); + if (Op && ReplOp && isa<OverflowingBinaryOperator>(Op) && + isa<OverflowingBinaryOperator>(ReplOp)) { + if (ReplOp->hasNoSignedWrap() && !Op->hasNoSignedWrap()) + ReplOp->setHasNoSignedWrap(false); + if (ReplOp->hasNoUnsignedWrap() && !Op->hasNoUnsignedWrap()) + ReplOp->setHasNoUnsignedWrap(false); + } + if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) { + SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata; + ReplInst->getAllMetadataOtherThanDebugLoc(Metadata); + for (int i = 0, n = Metadata.size(); i < n; ++i) { + unsigned Kind = Metadata[i].first; + MDNode *IMD = I->getMetadata(Kind); + MDNode *ReplMD = Metadata[i].second; + switch(Kind) { + default: + ReplInst->setMetadata(Kind, NULL); // Remove unknown metadata + break; + case LLVMContext::MD_dbg: + llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg"); + case LLVMContext::MD_tbaa: + ReplInst->setMetadata(Kind, MDNode::getMostGenericTBAA(IMD, ReplMD)); + break; + case LLVMContext::MD_range: + ReplInst->setMetadata(Kind, MDNode::getMostGenericRange(IMD, ReplMD)); + break; + case LLVMContext::MD_prof: + llvm_unreachable("MD_prof in a non terminator instruction"); + break; + case LLVMContext::MD_fpmath: + ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD)); + break; + } + } + } +} + +static void patchAndReplaceAllUsesWith(Value *Repl, Instruction *I) { + patchReplacementInstruction(Repl, I); + I->replaceAllUsesWith(Repl); +} + +/// processLoad - Attempt to eliminate a load, first by eliminating it +/// locally, and then attempting non-local elimination if that fails. +bool GVN::processLoad(LoadInst *L) { + if (!MD) + return false; + + if (!L->isSimple()) + return false; + + if (L->use_empty()) { + markInstructionForDeletion(L); + return true; + } + + // ... to a pointer that has been loaded from before... + MemDepResult Dep = MD->getDependency(L); + + // If we have a clobber and target data is around, see if this is a clobber + // that we can fix up through code synthesis. + if (Dep.isClobber() && TD) { + // Check to see if we have something like this: + // store i32 123, i32* %P + // %A = bitcast i32* %P to i8* + // %B = gep i8* %A, i32 1 + // %C = load i8* %B + // + // We could do that by recognizing if the clobber instructions are obviously + // a common base + constant offset, and if the previous store (or memset) + // completely covers this load. This sort of thing can happen in bitfield + // access code. + Value *AvailVal = 0; + if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) { + int Offset = AnalyzeLoadFromClobberingStore(L->getType(), + L->getPointerOperand(), + DepSI, *TD); + if (Offset != -1) + AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset, + L->getType(), L, *TD); + } + + // Check to see if we have something like this: + // load i32* P + // load i8* (P+1) + // if we have this, replace the later with an extraction from the former. + if (LoadInst *DepLI = dyn_cast<LoadInst>(Dep.getInst())) { + // If this is a clobber and L is the first instruction in its block, then + // we have the first instruction in the entry block. 
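+      // In that case memdep reports the load as clobbered by itself; there
+      // is nothing earlier in the function to forward from, so bail out.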
+ if (DepLI == L) + return false; + + int Offset = AnalyzeLoadFromClobberingLoad(L->getType(), + L->getPointerOperand(), + DepLI, *TD); + if (Offset != -1) + AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this); + } + + // If the clobbering value is a memset/memcpy/memmove, see if we can forward + // a value on from it. + if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) { + int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(), + L->getPointerOperand(), + DepMI, *TD); + if (Offset != -1) + AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *TD); + } + + if (AvailVal) { + DEBUG(dbgs() << "GVN COERCED INST:\n" << *Dep.getInst() << '\n' + << *AvailVal << '\n' << *L << "\n\n\n"); + + // Replace the load! + L->replaceAllUsesWith(AvailVal); + if (AvailVal->getType()->isPointerTy()) + MD->invalidateCachedPointerInfo(AvailVal); + markInstructionForDeletion(L); + ++NumGVNLoad; + return true; + } + } + + // If the value isn't available, don't do anything! + if (Dep.isClobber()) { + DEBUG( + // fast print dep, using operator<< on instruction is too slow. + dbgs() << "GVN: load "; + WriteAsOperand(dbgs(), L); + Instruction *I = Dep.getInst(); + dbgs() << " is clobbered by " << *I << '\n'; + ); + return false; + } + + // If it is defined in another block, try harder. + if (Dep.isNonLocal()) + return processNonLocalLoad(L); + + if (!Dep.isDef()) { + DEBUG( + // fast print dep, using operator<< on instruction is too slow. + dbgs() << "GVN: load "; + WriteAsOperand(dbgs(), L); + dbgs() << " has unknown dependence\n"; + ); + return false; + } + + Instruction *DepInst = Dep.getInst(); + if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { + Value *StoredVal = DepSI->getValueOperand(); + + // The store and load are to a must-aliased pointer, but they may not + // actually have the same type. See if we know how to reuse the stored + // value (depending on its type). + if (StoredVal->getType() != L->getType()) { + if (TD) { + StoredVal = CoerceAvailableValueToLoadType(StoredVal, L->getType(), + L, *TD); + if (StoredVal == 0) + return false; + + DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal + << '\n' << *L << "\n\n\n"); + } + else + return false; + } + + // Remove it! + L->replaceAllUsesWith(StoredVal); + if (StoredVal->getType()->isPointerTy()) + MD->invalidateCachedPointerInfo(StoredVal); + markInstructionForDeletion(L); + ++NumGVNLoad; + return true; + } + + if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) { + Value *AvailableVal = DepLI; + + // The loads are of a must-aliased pointer, but they may not actually have + // the same type. See if we know how to reuse the previously loaded value + // (depending on its type). + if (DepLI->getType() != L->getType()) { + if (TD) { + AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), + L, *TD); + if (AvailableVal == 0) + return false; + + DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal + << "\n" << *L << "\n\n\n"); + } + else + return false; + } + + // Remove it! + patchAndReplaceAllUsesWith(AvailableVal, L); + if (DepLI->getType()->isPointerTy()) + MD->invalidateCachedPointerInfo(DepLI); + markInstructionForDeletion(L); + ++NumGVNLoad; + return true; + } + + // If this load really doesn't depend on anything, then we must be loading an + // undef value. This can happen when loading for a fresh allocation with no + // intervening stores, for example. 
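+  // e.g. (illustrative)
+  //   %p = alloca i32
+  //   %v = load i32* %p   ; no store has touched %p yet, so %v is undef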
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst)) { + L->replaceAllUsesWith(UndefValue::get(L->getType())); + markInstructionForDeletion(L); + ++NumGVNLoad; + return true; + } + + // If this load occurs either right after a lifetime begin, + // then the loaded value is undefined. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DepInst)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start) { + L->replaceAllUsesWith(UndefValue::get(L->getType())); + markInstructionForDeletion(L); + ++NumGVNLoad; + return true; + } + } + + return false; +} + +// findLeader - In order to find a leader for a given value number at a +// specific basic block, we first obtain the list of all Values for that number, +// and then scan the list to find one whose block dominates the block in +// question. This is fast because dominator tree queries consist of only +// a few comparisons of DFS numbers. +Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) { + LeaderTableEntry Vals = LeaderTable[num]; + if (!Vals.Val) return 0; + + Value *Val = 0; + if (DT->dominates(Vals.BB, BB)) { + Val = Vals.Val; + if (isa<Constant>(Val)) return Val; + } + + LeaderTableEntry* Next = Vals.Next; + while (Next) { + if (DT->dominates(Next->BB, BB)) { + if (isa<Constant>(Next->Val)) return Next->Val; + if (!Val) Val = Next->Val; + } + + Next = Next->Next; + } + + return Val; +} + +/// replaceAllDominatedUsesWith - Replace all uses of 'From' with 'To' if the +/// use is dominated by the given basic block. Returns the number of uses that +/// were replaced. +unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, + const BasicBlockEdge &Root) { + unsigned Count = 0; + for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); + UI != UE; ) { + Use &U = (UI++).getUse(); + + if (DT->dominates(Root, U)) { + U.set(To); + ++Count; + } + } + return Count; +} + +/// isOnlyReachableViaThisEdge - There is an edge from 'Src' to 'Dst'. Return +/// true if every path from the entry block to 'Dst' passes via this edge. In +/// particular 'Dst' must not be reachable via another edge from 'Src'. +static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, + DominatorTree *DT) { + // While in theory it is interesting to consider the case in which Dst has + // more than one predecessor, because Dst might be part of a loop which is + // only reachable from Src, in practice it is pointless since at the time + // GVN runs all such loops have preheaders, which means that Dst will have + // been changed to have only one predecessor, namely Src. + const BasicBlock *Pred = E.getEnd()->getSinglePredecessor(); + const BasicBlock *Src = E.getStart(); + assert((!Pred || Pred == Src) && "No edge between these basic blocks!"); + (void)Src; + return Pred != 0; +} + +/// propagateEquality - The given values are known to be equal in every block +/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with +/// 'RHS' everywhere in the scope. Returns whether a change was made. 
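+/// For example (illustrative IR), after
+///   br i1 (icmp eq i32 %x, 7), label %then, label %else
+/// every use of %x dominated by the edge into %then may be rewritten to 7.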
+bool GVN::propagateEquality(Value *LHS, Value *RHS, + const BasicBlockEdge &Root) { + SmallVector<std::pair<Value*, Value*>, 4> Worklist; + Worklist.push_back(std::make_pair(LHS, RHS)); + bool Changed = false; + // For speed, compute a conservative fast approximation to + // DT->dominates(Root, Root.getEnd()); + bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT); + + while (!Worklist.empty()) { + std::pair<Value*, Value*> Item = Worklist.pop_back_val(); + LHS = Item.first; RHS = Item.second; + + if (LHS == RHS) continue; + assert(LHS->getType() == RHS->getType() && "Equality but unequal types!"); + + // Don't try to propagate equalities between constants. + if (isa<Constant>(LHS) && isa<Constant>(RHS)) continue; + + // Prefer a constant on the right-hand side, or an Argument if no constants. + if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS))) + std::swap(LHS, RHS); + assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!"); + + // If there is no obvious reason to prefer the left-hand side over the right- + // hand side, ensure the longest lived term is on the right-hand side, so the + // shortest lived term will be replaced by the longest lived. This tends to + // expose more simplifications. + uint32_t LVN = VN.lookup_or_add(LHS); + if ((isa<Argument>(LHS) && isa<Argument>(RHS)) || + (isa<Instruction>(LHS) && isa<Instruction>(RHS))) { + // Move the 'oldest' value to the right-hand side, using the value number as + // a proxy for age. + uint32_t RVN = VN.lookup_or_add(RHS); + if (LVN < RVN) { + std::swap(LHS, RHS); + LVN = RVN; + } + } + + // If value numbering later sees that an instruction in the scope is equal + // to 'LHS' then ensure it will be turned into 'RHS'. In order to preserve + // the invariant that instructions only occur in the leader table for their + // own value number (this is used by removeFromLeaderTable), do not do this + // if RHS is an instruction (if an instruction in the scope is morphed into + // LHS then it will be turned into RHS by the next GVN iteration anyway, so + // using the leader table is about compiling faster, not optimizing better). + // The leader table only tracks basic blocks, not edges. Only add to if we + // have the simple case where the edge dominates the end. + if (RootDominatesEnd && !isa<Instruction>(RHS)) + addToLeaderTable(LVN, RHS, Root.getEnd()); + + // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As + // LHS always has at least one use that is not dominated by Root, this will + // never do anything if LHS has only one use. + if (!LHS->hasOneUse()) { + unsigned NumReplacements = replaceAllDominatedUsesWith(LHS, RHS, Root); + Changed |= NumReplacements > 0; + NumGVNEqProp += NumReplacements; + } + + // Now try to deduce additional equalities from this one. For example, if the + // known equality was "(A != B)" == "false" then it follows that A and B are + // equal in the scope. Only boolean equalities with an explicit true or false + // RHS are currently supported. + if (!RHS->getType()->isIntegerTy(1)) + // Not a boolean equality - bail out. + continue; + ConstantInt *CI = dyn_cast<ConstantInt>(RHS); + if (!CI) + // RHS neither 'true' nor 'false' - bail out. + continue; + // Whether RHS equals 'true'. Otherwise it equals 'false'. + bool isKnownTrue = CI->isAllOnesValue(); + bool isKnownFalse = !isKnownTrue; + + // If "A && B" is known true then both A and B are known true. If "A || B" + // is known false then both A and B are known false. 
+ Value *A, *B; + if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) || + (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) { + Worklist.push_back(std::make_pair(A, RHS)); + Worklist.push_back(std::make_pair(B, RHS)); + continue; + } + + // If we are propagating an equality like "(A == B)" == "true" then also + // propagate the equality A == B. When propagating a comparison such as + // "(A >= B)" == "true", replace all instances of "A < B" with "false". + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) { + Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); + + // If "A == B" is known true, or "A != B" is known false, then replace + // A with B everywhere in the scope. + if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || + (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) + Worklist.push_back(std::make_pair(Op0, Op1)); + + // If "A >= B" is known true, replace "A < B" with false everywhere. + CmpInst::Predicate NotPred = Cmp->getInversePredicate(); + Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); + // Since we don't have the instruction "A < B" immediately to hand, work out + // the value number that it would have and use that to find an appropriate + // instruction (if any). + uint32_t NextNum = VN.getNextUnusedValueNumber(); + uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1); + // If the number we were assigned was brand new then there is no point in + // looking for an instruction realizing it: there cannot be one! + if (Num < NextNum) { + Value *NotCmp = findLeader(Root.getEnd(), Num); + if (NotCmp && isa<Instruction>(NotCmp)) { + unsigned NumReplacements = + replaceAllDominatedUsesWith(NotCmp, NotVal, Root); + Changed |= NumReplacements > 0; + NumGVNEqProp += NumReplacements; + } + } + // Ensure that any instruction in scope that gets the "A < B" value number + // is replaced with false. + // The leader table only tracks basic blocks, not edges. Only add to if we + // have the simple case where the edge dominates the end. + if (RootDominatesEnd) + addToLeaderTable(Num, NotVal, Root.getEnd()); + + continue; + } + } + + return Changed; +} + +/// processInstruction - When calculating availability, handle an instruction +/// by inserting it into the appropriate sets +bool GVN::processInstruction(Instruction *I) { + // Ignore dbg info intrinsics. + if (isa<DbgInfoIntrinsic>(I)) + return false; + + // If the instruction can be easily simplified then do so now in preference + // to value numbering it. Value numbering often exposes redundancies, for + // example if it determines that %y is equal to %x then the instruction + // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. + if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) { + I->replaceAllUsesWith(V); + if (MD && V->getType()->isPointerTy()) + MD->invalidateCachedPointerInfo(V); + markInstructionForDeletion(I); + ++NumGVNSimpl; + return true; + } + + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + if (processLoad(LI)) + return true; + + unsigned Num = VN.lookup_or_add(LI); + addToLeaderTable(Num, LI, LI->getParent()); + return false; + } + + // For conditional branches, we can perform simple conditional propagation on + // the condition value itself. 
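+  // That is, given "br i1 %c, label %T, label %F", %c is known true along
+  // the edge into %T and false along the edge into %F, and propagateEquality
+  // exploits this in whatever those edges dominate.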
+ if (BranchInst *BI = dyn_cast<BranchInst>(I)) { + if (!BI->isConditional() || isa<Constant>(BI->getCondition())) + return false; + + Value *BranchCond = BI->getCondition(); + + BasicBlock *TrueSucc = BI->getSuccessor(0); + BasicBlock *FalseSucc = BI->getSuccessor(1); + // Avoid multiple edges early. + if (TrueSucc == FalseSucc) + return false; + + BasicBlock *Parent = BI->getParent(); + bool Changed = false; + + Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext()); + BasicBlockEdge TrueE(Parent, TrueSucc); + Changed |= propagateEquality(BranchCond, TrueVal, TrueE); + + Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext()); + BasicBlockEdge FalseE(Parent, FalseSucc); + Changed |= propagateEquality(BranchCond, FalseVal, FalseE); + + return Changed; + } + + // For switches, propagate the case values into the case destinations. + if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { + Value *SwitchCond = SI->getCondition(); + BasicBlock *Parent = SI->getParent(); + bool Changed = false; + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + BasicBlock *Dst = i.getCaseSuccessor(); + BasicBlockEdge E(Parent, Dst); + if (E.isSingleEdge()) + Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E); + } + return Changed; + } + + // Instructions with void type don't return a value, so there's + // no point in trying to find redundancies in them. + if (I->getType()->isVoidTy()) return false; + + uint32_t NextNum = VN.getNextUnusedValueNumber(); + unsigned Num = VN.lookup_or_add(I); + + // Allocations are always uniquely numbered, so we can save time and memory + // by fast failing them. + if (isa<AllocaInst>(I) || isa<TerminatorInst>(I) || isa<PHINode>(I)) { + addToLeaderTable(Num, I, I->getParent()); + return false; + } + + // If the number we were assigned was a brand new VN, then we don't + // need to do a lookup to see if the number already exists + // somewhere in the domtree: it can't! + if (Num >= NextNum) { + addToLeaderTable(Num, I, I->getParent()); + return false; + } + + // Perform fast-path value-number based elimination of values inherited from + // dominators. + Value *repl = findLeader(I->getParent(), Num); + if (repl == 0) { + // Failure, just remember this instance for future use. + addToLeaderTable(Num, I, I->getParent()); + return false; + } + + // Remove it! + patchAndReplaceAllUsesWith(repl, I); + if (MD && repl->getType()->isPointerTy()) + MD->invalidateCachedPointerInfo(repl); + markInstructionForDeletion(I); + return true; +} + +/// runOnFunction - This is the main transformation entry point for a function. +bool GVN::runOnFunction(Function& F) { + if (!NoLoads) + MD = &getAnalysis<MemoryDependenceAnalysis>(); + DT = &getAnalysis<DominatorTree>(); + TD = getAnalysisIfAvailable<TargetData>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); + VN.setMemDep(MD); + VN.setDomTree(DT); + + bool Changed = false; + bool ShouldContinue = true; + + // Merge unconditional branches, allowing PRE to catch more + // optimization opportunities. 
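+  // A block whose lone predecessor ends in an unconditional branch to it is
+  // folded into that predecessor, giving later phases fewer, larger blocks.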
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) { + BasicBlock *BB = FI++; + + bool removedBlock = MergeBlockIntoPredecessor(BB, this); + if (removedBlock) ++NumGVNBlocks; + + Changed |= removedBlock; + } + + unsigned Iteration = 0; + while (ShouldContinue) { + DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n"); + ShouldContinue = iterateOnFunction(F); + if (splitCriticalEdges()) + ShouldContinue = true; + Changed |= ShouldContinue; + ++Iteration; + } + + if (EnablePRE) { + bool PREChanged = true; + while (PREChanged) { + PREChanged = performPRE(F); + Changed |= PREChanged; + } + } + // FIXME: Should perform GVN again after PRE does something. PRE can move + // computations into blocks where they become fully redundant. Note that + // we can't do this until PRE's critical edge splitting updates memdep. + // Actually, when this happens, we should just fully integrate PRE into GVN. + + cleanupGlobalSets(); + + return Changed; +} + + +bool GVN::processBlock(BasicBlock *BB) { + // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function + // (and incrementing BI before processing an instruction). + assert(InstrsToErase.empty() && + "We expect InstrsToErase to be empty across iterations"); + bool ChangedFunction = false; + + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); + BI != BE;) { + ChangedFunction |= processInstruction(BI); + if (InstrsToErase.empty()) { + ++BI; + continue; + } + + // If we need some instructions deleted, do it now. + NumGVNInstr += InstrsToErase.size(); + + // Avoid iterator invalidation. + bool AtStart = BI == BB->begin(); + if (!AtStart) + --BI; + + for (SmallVector<Instruction*, 4>::iterator I = InstrsToErase.begin(), + E = InstrsToErase.end(); I != E; ++I) { + DEBUG(dbgs() << "GVN removed: " << **I << '\n'); + if (MD) MD->removeInstruction(*I); + (*I)->eraseFromParent(); + DEBUG(verifyRemoved(*I)); + } + InstrsToErase.clear(); + + if (AtStart) + BI = BB->begin(); + else + ++BI; + } + + return ChangedFunction; +} + +/// performPRE - Perform a purely local form of PRE that looks for diamond +/// control flow patterns and attempts to perform simple PRE at the join point. +bool GVN::performPRE(Function &F) { + bool Changed = false; + DenseMap<BasicBlock*, Value*> predMap; + for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()), + DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) { + BasicBlock *CurrentBlock = *DI; + + // Nothing to PRE in the entry block. + if (CurrentBlock == &F.getEntryBlock()) continue; + + // Don't perform PRE on a landing pad. + if (CurrentBlock->isLandingPad()) continue; + + for (BasicBlock::iterator BI = CurrentBlock->begin(), + BE = CurrentBlock->end(); BI != BE; ) { + Instruction *CurInst = BI++; + + if (isa<AllocaInst>(CurInst) || + isa<TerminatorInst>(CurInst) || isa<PHINode>(CurInst) || + CurInst->getType()->isVoidTy() || + CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() || + isa<DbgInfoIntrinsic>(CurInst)) + continue; + + // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from + // sinking the compare again, and it would force the code generator to + // move the i1 from processor flags or predicate registers into a general + // purpose register. + if (isa<CmpInst>(CurInst)) + continue; + + // We don't currently value number ANY inline asm calls. + if (CallInst *CallI = dyn_cast<CallInst>(CurInst)) + if (CallI->isInlineAsm()) + continue; + + uint32_t ValNo = VN.lookup(CurInst); + + // Look for the predecessors for PRE opportunities. 
We're + // only trying to solve the basic diamond case, where + // a value is computed in the successor and one predecessor, + // but not the other. We also explicitly disallow cases + // where the successor is its own predecessor, because they're + // more complicated to get right. + unsigned NumWith = 0; + unsigned NumWithout = 0; + BasicBlock *PREPred = 0; + predMap.clear(); + + for (pred_iterator PI = pred_begin(CurrentBlock), + PE = pred_end(CurrentBlock); PI != PE; ++PI) { + BasicBlock *P = *PI; + // We're not interested in PRE where the block is its + // own predecessor, or in blocks with predecessors + // that are not reachable. + if (P == CurrentBlock) { + NumWithout = 2; + break; + } else if (!DT->dominates(&F.getEntryBlock(), P)) { + NumWithout = 2; + break; + } + + Value* predV = findLeader(P, ValNo); + if (predV == 0) { + PREPred = P; + ++NumWithout; + } else if (predV == CurInst) { + NumWithout = 2; + } else { + predMap[P] = predV; + ++NumWith; + } + } + + // Don't do PRE when it might increase code size, i.e. when + // we would need to insert instructions in more than one pred. + if (NumWithout != 1 || NumWith == 0) + continue; + + // Don't do PRE across indirect branch. + if (isa<IndirectBrInst>(PREPred->getTerminator())) + continue; + + // We can't do PRE safely on a critical edge, so instead we schedule + // the edge to be split and perform the PRE the next time we iterate + // on the function. + unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock); + if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) { + toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum)); + continue; + } + + // Instantiate the expression in the predecessor that lacked it. + // Because we are going top-down through the block, all value numbers + // will be available in the predecessor by the time we need them. Any + // that weren't originally present will have been instantiated earlier + // in this loop. + Instruction *PREInstr = CurInst->clone(); + bool success = true; + for (unsigned i = 0, e = CurInst->getNumOperands(); i != e; ++i) { + Value *Op = PREInstr->getOperand(i); + if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) + continue; + + if (Value *V = findLeader(PREPred, VN.lookup(Op))) { + PREInstr->setOperand(i, V); + } else { + success = false; + break; + } + } + + // Fail out if we encounter an operand that is not available in + // the PRE predecessor. This is typically because of loads which + // are not value numbered precisely. + if (!success) { + delete PREInstr; + DEBUG(verifyRemoved(PREInstr)); + continue; + } + + PREInstr->insertBefore(PREPred->getTerminator()); + PREInstr->setName(CurInst->getName() + ".pre"); + PREInstr->setDebugLoc(CurInst->getDebugLoc()); + predMap[PREPred] = PREInstr; + VN.add(PREInstr, ValNo); + ++NumGVNPRE; + + // Update the availability map to include the new instruction. + addToLeaderTable(ValNo, PREInstr, PREPred); + + // Create a PHI to make the value available in this block. 
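+      // Every predecessor now has the value: it was either already recorded
+      // in predMap or just materialized in PREPred, so the PHI built below
+      // makes it available in CurrentBlock and can replace CurInst.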
+ pred_iterator PB = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock); + PHINode* Phi = PHINode::Create(CurInst->getType(), std::distance(PB, PE), + CurInst->getName() + ".pre-phi", + CurrentBlock->begin()); + for (pred_iterator PI = PB; PI != PE; ++PI) { + BasicBlock *P = *PI; + Phi->addIncoming(predMap[P], P); + } + + VN.add(Phi, ValNo); + addToLeaderTable(ValNo, Phi, CurrentBlock); + Phi->setDebugLoc(CurInst->getDebugLoc()); + CurInst->replaceAllUsesWith(Phi); + if (Phi->getType()->isPointerTy()) { + // Because we have added a PHI-use of the pointer value, it has now + // "escaped" from alias analysis' perspective. We need to inform + // AA of this. + for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; + ++ii) { + unsigned jj = PHINode::getOperandNumForIncomingValue(ii); + VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); + } + + if (MD) + MD->invalidateCachedPointerInfo(Phi); + } + VN.erase(CurInst); + removeFromLeaderTable(ValNo, CurInst, CurrentBlock); + + DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n'); + if (MD) MD->removeInstruction(CurInst); + CurInst->eraseFromParent(); + DEBUG(verifyRemoved(CurInst)); + Changed = true; + } + } + + if (splitCriticalEdges()) + Changed = true; + + return Changed; +} + +/// splitCriticalEdges - Split critical edges found during the previous +/// iteration that may enable further optimization. +bool GVN::splitCriticalEdges() { + if (toSplit.empty()) + return false; + do { + std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val(); + SplitCriticalEdge(Edge.first, Edge.second, this); + } while (!toSplit.empty()); + if (MD) MD->invalidateCachedPredecessors(); + return true; +} + +/// iterateOnFunction - Executes one iteration of GVN +bool GVN::iterateOnFunction(Function &F) { + cleanupGlobalSets(); + + // Top-down walk of the dominator tree + bool Changed = false; +#if 0 + // Needed for value numbering with phi construction to work. + ReversePostOrderTraversal<Function*> RPOT(&F); + for (ReversePostOrderTraversal<Function*>::rpo_iterator RI = RPOT.begin(), + RE = RPOT.end(); RI != RE; ++RI) + Changed |= processBlock(*RI); +#else + for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()), + DE = df_end(DT->getRootNode()); DI != DE; ++DI) + Changed |= processBlock(DI->getBlock()); +#endif + + return Changed; +} + +void GVN::cleanupGlobalSets() { + VN.clear(); + LeaderTable.clear(); + TableAllocator.Reset(); +} + +/// verifyRemoved - Verify that the specified instruction does not occur in our +/// internal data structures. +void GVN::verifyRemoved(const Instruction *Inst) const { + VN.verifyRemoved(Inst); + + // Walk through the value number scope to make sure the instruction isn't + // ferreted away in it. 
+ for (DenseMap<uint32_t, LeaderTableEntry>::const_iterator + I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) { + const LeaderTableEntry *Node = &I->second; + assert(Node->Val != Inst && "Inst still in value numbering scope!"); + + while (Node->Next) { + Node = Node->Next; + assert(Node->Val != Inst && "Inst still in value numbering scope!"); + } + } +} diff --git a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp new file mode 100644 index 0000000..b36a3cb --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp @@ -0,0 +1,226 @@ +//===-- GlobalMerge.cpp - Internal globals merging -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This pass merges globals with internal linkage into one. This way all the +// globals which were merged into a biggest one can be addressed using offsets +// from the same base pointer (no need for separate base pointer for each of the +// global). Such a transformation can significantly reduce the register pressure +// when many globals are involved. +// +// For example, consider the code which touches several global variables at +// once: +// +// static int foo[N], bar[N], baz[N]; +// +// for (i = 0; i < N; ++i) { +// foo[i] = bar[i] * baz[i]; +// } +// +// On ARM the addresses of 3 arrays should be kept in the registers, thus +// this code has quite large register pressure (loop body): +// +// ldr r1, [r5], #4 +// ldr r2, [r6], #4 +// mul r1, r2, r1 +// str r1, [r0], #4 +// +// Pass converts the code to something like: +// +// static struct { +// int foo[N]; +// int bar[N]; +// int baz[N]; +// } merged; +// +// for (i = 0; i < N; ++i) { +// merged.foo[i] = merged.bar[i] * merged.baz[i]; +// } +// +// and in ARM code this becomes: +// +// ldr r0, [r5, #40] +// ldr r1, [r5, #80] +// mul r0, r1, r0 +// str r0, [r5], #4 +// +// note that we saved 2 registers here almostly "for free". +// ===---------------------------------------------------------------------===// + +#define DEBUG_TYPE "global-merge" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Attributes.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumMerged , "Number of globals merged"); +namespace { + class GlobalMerge : public FunctionPass { + /// TLI - Keep a pointer of a TargetLowering to consult for determining + /// target type sizes. + const TargetLowering *TLI; + + bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, + Module &M, bool isConst) const; + + public: + static char ID; // Pass identification, replacement for typeid. 
+ explicit GlobalMerge(const TargetLowering *tli = 0) + : FunctionPass(ID), TLI(tli) { + initializeGlobalMergePass(*PassRegistry::getPassRegistry()); + } + + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + + const char *getPassName() const { + return "Merge internal globals"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } + + struct GlobalCmp { + const TargetData *TD; + + GlobalCmp(const TargetData *td) : TD(td) { } + + bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) { + Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType(); + Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType(); + + return (TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2)); + } + }; + }; +} // end anonymous namespace + +char GlobalMerge::ID = 0; +INITIALIZE_PASS(GlobalMerge, "global-merge", + "Global Merge", false, false) + + +bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, + Module &M, bool isConst) const { + const TargetData *TD = TLI->getTargetData(); + + // FIXME: Infer the maximum possible offset depending on the actual users + // (these max offsets are different for the users inside Thumb or ARM + // functions) + unsigned MaxOffset = TLI->getMaximalGlobalOffset(); + + // FIXME: Find better heuristics + std::stable_sort(Globals.begin(), Globals.end(), GlobalCmp(TD)); + + Type *Int32Ty = Type::getInt32Ty(M.getContext()); + + for (size_t i = 0, e = Globals.size(); i != e; ) { + size_t j = 0; + uint64_t MergedSize = 0; + std::vector<Type*> Tys; + std::vector<Constant*> Inits; + for (j = i; j != e; ++j) { + Type *Ty = Globals[j]->getType()->getElementType(); + MergedSize += TD->getTypeAllocSize(Ty); + if (MergedSize > MaxOffset) { + break; + } + Tys.push_back(Ty); + Inits.push_back(Globals[j]->getInitializer()); + } + + StructType *MergedTy = StructType::get(M.getContext(), Tys); + Constant *MergedInit = ConstantStruct::get(MergedTy, Inits); + GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst, + GlobalValue::InternalLinkage, + MergedInit, "_MergedGlobals"); + for (size_t k = i; k < j; ++k) { + Constant *Idx[2] = { + ConstantInt::get(Int32Ty, 0), + ConstantInt::get(Int32Ty, k-i) + }; + Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx); + Globals[k]->replaceAllUsesWith(GEP); + Globals[k]->eraseFromParent(); + NumMerged++; + } + i = j; + } + + return true; +} + + +bool GlobalMerge::doInitialization(Module &M) { + SmallVector<GlobalVariable*, 16> Globals, ConstGlobals, BSSGlobals; + const TargetData *TD = TLI->getTargetData(); + unsigned MaxOffset = TLI->getMaximalGlobalOffset(); + bool Changed = false; + + // Grab all non-const globals. + for (Module::global_iterator I = M.global_begin(), + E = M.global_end(); I != E; ++I) { + // Merge is safe for "normal" internal globals only + if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection()) + continue; + + // Ignore fancy-aligned globals for now. + unsigned Alignment = TD->getPreferredAlignment(I); + Type *Ty = I->getType()->getElementType(); + if (Alignment > TD->getABITypeAlignment(Ty)) + continue; + + // Ignore all 'special' globals. 
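+    // e.g. anything in the reserved "llvm." namespace (llvm.used,
+    // llvm.global_ctors, ...); such globals must keep their own symbols and
+    // are never folded into the merged struct.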
+ if (I->getName().startswith("llvm.") || + I->getName().startswith(".llvm.")) + continue; + + if (TD->getTypeAllocSize(Ty) < MaxOffset) { + if (TargetLoweringObjectFile::getKindForGlobal(I, TLI->getTargetMachine()) + .isBSSLocal()) + BSSGlobals.push_back(I); + else if (I->isConstant()) + ConstGlobals.push_back(I); + else + Globals.push_back(I); + } + } + + if (Globals.size() > 1) + Changed |= doMerge(Globals, M, false); + if (BSSGlobals.size() > 1) + Changed |= doMerge(BSSGlobals, M, false); + + // FIXME: This currently breaks the EH processing due to way how the + // typeinfo detection works. We might want to detect the TIs and ignore + // them in the future. + // if (ConstGlobals.size() > 1) + // Changed |= doMerge(ConstGlobals, M, true); + + return Changed; +} + +bool GlobalMerge::runOnFunction(Function &F) { + return false; +} + +Pass *llvm::createGlobalMergePass(const TargetLowering *tli) { + return new GlobalMerge(tli); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp new file mode 100644 index 0000000..37f8bdf --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -0,0 +1,1798 @@ +//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This transformation analyzes and transforms the induction variables (and +// computations derived from them) into simpler forms suitable for subsequent +// analysis and transformation. +// +// If the trip count of a loop is computable, this pass also makes the following +// changes: +// 1. The exit condition for the loop is canonicalized to compare the +// induction value against the exit value. This turns loops like: +// 'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)' +// 2. Any use outside of the loop of an expression derived from the indvar +// is changed to compute the derived value outside of the loop, eliminating +// the dependence on the exit value of the induction variable. If the only +// purpose of the loop is to compute the exit value of some derived +// expression, this transformation will make the loop dead. 
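+//    For example (illustrative), a use of 'i' after
+//    'for (i = 0; i != n; ++i) {}' is rewritten to read 'n' directly,
+//    leaving the loop itself dead.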
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "indvars" +#include "llvm/Transforms/Scalar.h" +#include "llvm/BasicBlock.h" +#include "llvm/Constants.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Type.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SimplifyIndVar.h" +#include "llvm/Target/TargetData.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumWidened , "Number of indvars widened"); +STATISTIC(NumReplaced , "Number of exit values replaced"); +STATISTIC(NumLFTR , "Number of loop exit tests replaced"); +STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated"); +STATISTIC(NumElimIV , "Number of congruent IVs eliminated"); + +// Trip count verification can be enabled by default under NDEBUG if we +// implement a strong expression equivalence checker in SCEV. Until then, we +// use the verify-indvars flag, which may assert in some cases. +static cl::opt<bool> VerifyIndvars( + "verify-indvars", cl::Hidden, + cl::desc("Verify the ScalarEvolution result after running indvars")); + +namespace { + class IndVarSimplify : public LoopPass { + LoopInfo *LI; + ScalarEvolution *SE; + DominatorTree *DT; + TargetData *TD; + + SmallVector<WeakVH, 16> DeadInsts; + bool Changed; + public: + + static char ID; // Pass identification, replacement for typeid + IndVarSimplify() : LoopPass(ID), LI(0), SE(0), DT(0), TD(0), + Changed(false) { + initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnLoop(Loop *L, LPPassManager &LPM); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTree>(); + AU.addRequired<LoopInfo>(); + AU.addRequired<ScalarEvolution>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreserved<ScalarEvolution>(); + AU.addPreservedID(LoopSimplifyID); + AU.addPreservedID(LCSSAID); + AU.setPreservesCFG(); + } + + private: + virtual void releaseMemory() { + DeadInsts.clear(); + } + + bool isValidRewrite(Value *FromVal, Value *ToVal); + + void HandleFloatingPointIV(Loop *L, PHINode *PH); + void RewriteNonIntegerIVs(Loop *L); + + void SimplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LPPassManager &LPM); + + void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); + + Value *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, + PHINode *IndVar, SCEVExpander &Rewriter); + + void SinkUnusedInvariants(Loop *L); + }; +} + +char IndVarSimplify::ID = 0; +INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars", + "Induction Variable Simplification", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(IndVarSimplify, "indvars", + "Induction Variable Simplification", false, false) + +Pass *llvm::createIndVarSimplifyPass() { + return new IndVarSimplify(); +} + +/// isValidRewrite - Return true if the SCEV 
expansion generated by the +/// rewriter can replace the original value. SCEV guarantees that it +/// produces the same value, but the way it is produced may be illegal IR. +/// Ideally, this function will only be called for verification. +bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { + // If an SCEV expression subsumed multiple pointers, its expansion could + // reassociate the GEP changing the base pointer. This is illegal because the + // final address produced by a GEP chain must be inbounds relative to its + // underlying object. Otherwise basic alias analysis, among other things, + // could fail in a dangerous way. Ultimately, SCEV will be improved to avoid + // producing an expression involving multiple pointers. Until then, we must + // bail out here. + // + // Retrieve the pointer operand of the GEP. Don't use GetUnderlyingObject + // because it understands lcssa phis while SCEV does not. + Value *FromPtr = FromVal; + Value *ToPtr = ToVal; + if (GEPOperator *GEP = dyn_cast<GEPOperator>(FromVal)) { + FromPtr = GEP->getPointerOperand(); + } + if (GEPOperator *GEP = dyn_cast<GEPOperator>(ToVal)) { + ToPtr = GEP->getPointerOperand(); + } + if (FromPtr != FromVal || ToPtr != ToVal) { + // Quickly check the common case + if (FromPtr == ToPtr) + return true; + + // SCEV may have rewritten an expression that produces the GEP's pointer + // operand. That's ok as long as the pointer operand has the same base + // pointer. Unlike GetUnderlyingObject(), getPointerBase() will find the + // base of a recurrence. This handles the case in which SCEV expansion + // converts a pointer type recurrence into a nonrecurrent pointer base + // indexed by an integer recurrence. + + // If the GEP base pointer is a vector of pointers, abort. + if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy()) + return false; + + const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr)); + const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr)); + if (FromBase == ToBase) + return true; + + DEBUG(dbgs() << "INDVARS: GEP rewrite bail out " + << *FromBase << " != " << *ToBase << "\n"); + + return false; + } + return true; +} + +/// Determine the insertion point for this user. By default, insert immediately +/// before the user. SCEVExpander or LICM will hoist loop invariants out of the +/// loop. For PHI nodes, there may be multiple uses, so compute the nearest +/// common dominator for the incoming blocks. +static Instruction *getInsertPointForUses(Instruction *User, Value *Def, + DominatorTree *DT) { + PHINode *PHI = dyn_cast<PHINode>(User); + if (!PHI) + return User; + + Instruction *InsertPt = 0; + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) { + if (PHI->getIncomingValue(i) != Def) + continue; + + BasicBlock *InsertBB = PHI->getIncomingBlock(i); + if (!InsertPt) { + InsertPt = InsertBB->getTerminator(); + continue; + } + InsertBB = DT->findNearestCommonDominator(InsertPt->getParent(), InsertBB); + InsertPt = InsertBB->getTerminator(); + } + assert(InsertPt && "Missing phi operand"); + assert((!isa<Instruction>(Def) || + DT->dominates(cast<Instruction>(Def), InsertPt)) && + "def does not dominate all uses"); + return InsertPt; +} + +//===----------------------------------------------------------------------===// +// RewriteNonIntegerIVs and helpers. Prefer integer IVs. +//===----------------------------------------------------------------------===// + +/// ConvertToSInt - Convert APF to an integer, if possible. 
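+/// For example 10.0 converts exactly to 10, while 0.5 (not an integer) or a
+/// value outside the int64_t range makes this return false.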
+static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { + bool isExact = false; + if (&APF.getSemantics() == &APFloat::PPCDoubleDouble) + return false; + // See if we can convert this to an int64_t + uint64_t UIntVal; + if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero, + &isExact) != APFloat::opOK || !isExact) + return false; + IntVal = UIntVal; + return true; +} + +/// HandleFloatingPointIV - If the loop has floating induction variable +/// then insert corresponding integer induction variable if possible. +/// For example, +/// for(double i = 0; i < 10000; ++i) +/// bar(i) +/// is converted into +/// for(int i = 0; i < 10000; ++i) +/// bar((double)i); +/// +void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { + unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0)); + unsigned BackEdge = IncomingEdge^1; + + // Check incoming value. + ConstantFP *InitValueVal = + dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); + + int64_t InitValue; + if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue)) + return; + + // Check IV increment. Reject this PN if increment operation is not + // an add or increment value can not be represented by an integer. + BinaryOperator *Incr = + dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); + if (Incr == 0 || Incr->getOpcode() != Instruction::FAdd) return; + + // If this is not an add of the PHI with a constantfp, or if the constant fp + // is not an integer, bail out. + ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1)); + int64_t IncValue; + if (IncValueVal == 0 || Incr->getOperand(0) != PN || + !ConvertToSInt(IncValueVal->getValueAPF(), IncValue)) + return; + + // Check Incr uses. One user is PN and the other user is an exit condition + // used by the conditional terminator. + Value::use_iterator IncrUse = Incr->use_begin(); + Instruction *U1 = cast<Instruction>(*IncrUse++); + if (IncrUse == Incr->use_end()) return; + Instruction *U2 = cast<Instruction>(*IncrUse++); + if (IncrUse != Incr->use_end()) return; + + // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't + // only used by a branch, we can't transform it. + FCmpInst *Compare = dyn_cast<FCmpInst>(U1); + if (!Compare) + Compare = dyn_cast<FCmpInst>(U2); + if (Compare == 0 || !Compare->hasOneUse() || + !isa<BranchInst>(Compare->use_back())) + return; + + BranchInst *TheBr = cast<BranchInst>(Compare->use_back()); + + // We need to verify that the branch actually controls the iteration count + // of the loop. If not, the new IV can overflow and no one will notice. + // The branch block must be in the loop and one of the successors must be out + // of the loop. + assert(TheBr->isConditional() && "Can't use fcmp if not conditional"); + if (!L->contains(TheBr->getParent()) || + (L->contains(TheBr->getSuccessor(0)) && + L->contains(TheBr->getSuccessor(1)))) + return; + + + // If it isn't a comparison with an integer-as-fp (the exit value), we can't + // transform it. + ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1)); + int64_t ExitValue; + if (ExitValueVal == 0 || + !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue)) + return; + + // Find new predicate for integer comparison. + CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE; + switch (Compare->getPredicate()) { + default: return; // Unknown comparison. 
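+  // Ordered and unordered forms of each comparison map to the same signed
+  // integer predicate: once the IV is an integer there are no NaNs left to
+  // distinguish the two.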
+ case CmpInst::FCMP_OEQ: + case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break; + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break; + case CmpInst::FCMP_OGT: + case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break; + case CmpInst::FCMP_OGE: + case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break; + case CmpInst::FCMP_OLT: + case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break; + case CmpInst::FCMP_OLE: + case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break; + } + + // We convert the floating point induction variable to a signed i32 value if + // we can. This is only safe if the comparison will not overflow in a way + // that won't be trapped by the integer equivalent operations. Check for this + // now. + // TODO: We could use i64 if it is native and the range requires it. + + // The start/stride/exit values must all fit in signed i32. + if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue)) + return; + + // If not actually striding (add x, 0.0), avoid touching the code. + if (IncValue == 0) + return; + + // Positive and negative strides have different safety conditions. + if (IncValue > 0) { + // If we have a positive stride, we require the init to be less than the + // exit value. + if (InitValue >= ExitValue) + return; + + uint32_t Range = uint32_t(ExitValue-InitValue); + // Check for infinite loop, either: + // while (i <= Exit) or until (i > Exit) + if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) { + if (++Range == 0) return; // Range overflows. + } + + unsigned Leftover = Range % uint32_t(IncValue); + + // If this is an equality comparison, we require that the strided value + // exactly land on the exit value, otherwise the IV condition will wrap + // around and do things the fp IV wouldn't. + if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) && + Leftover != 0) + return; + + // If the stride would wrap around the i32 before exiting, we can't + // transform the IV. + if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue) + return; + + } else { + // If we have a negative stride, we require the init to be greater than the + // exit value. + if (InitValue <= ExitValue) + return; + + uint32_t Range = uint32_t(InitValue-ExitValue); + // Check for infinite loop, either: + // while (i >= Exit) or until (i < Exit) + if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) { + if (++Range == 0) return; // Range overflows. + } + + unsigned Leftover = Range % uint32_t(-IncValue); + + // If this is an equality comparison, we require that the strided value + // exactly land on the exit value, otherwise the IV condition will wrap + // around and do things the fp IV wouldn't. + if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) && + Leftover != 0) + return; + + // If the stride would wrap around the i32 before exiting, we can't + // transform the IV. + if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue) + return; + } + + IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext()); + + // Insert new integer induction variable. 
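+  // The new phi mirrors the FP recurrence: it starts at InitValue on loop
+  // entry and adds IncValue along the backedge; the fcmp/branch pair is then
+  // rewritten below into an equivalent icmp against ExitValue.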
+ PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN); + NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue), + PN->getIncomingBlock(IncomingEdge)); + + Value *NewAdd = + BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue), + Incr->getName()+".int", Incr); + NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge)); + + ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd, + ConstantInt::get(Int32Ty, ExitValue), + Compare->getName()); + + // In the following deletions, PN may become dead and may be deleted. + // Use a WeakVH to observe whether this happens. + WeakVH WeakPH = PN; + + // Delete the old floating point exit comparison. The branch starts using the + // new comparison. + NewCompare->takeName(Compare); + Compare->replaceAllUsesWith(NewCompare); + RecursivelyDeleteTriviallyDeadInstructions(Compare); + + // Delete the old floating point increment. + Incr->replaceAllUsesWith(UndefValue::get(Incr->getType())); + RecursivelyDeleteTriviallyDeadInstructions(Incr); + + // If the FP induction variable still has uses, this is because something else + // in the loop uses its value. In order to canonicalize the induction + // variable, we chose to eliminate the IV and rewrite it in terms of an + // int->fp cast. + // + // We give preference to sitofp over uitofp because it is faster on most + // platforms. + if (WeakPH) { + Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", + PN->getParent()->getFirstInsertionPt()); + PN->replaceAllUsesWith(Conv); + RecursivelyDeleteTriviallyDeadInstructions(PN); + } + Changed = true; +} + +void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { + // First step. Check to see if there are any floating-point recurrences. + // If there are, change them into integer recurrences, permitting analysis by + // the SCEV routines. + // + BasicBlock *Header = L->getHeader(); + + SmallVector<WeakVH, 8> PHIs; + for (BasicBlock::iterator I = Header->begin(); + PHINode *PN = dyn_cast<PHINode>(I); ++I) + PHIs.push_back(PN); + + for (unsigned i = 0, e = PHIs.size(); i != e; ++i) + if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i])) + HandleFloatingPointIV(L, PN); + + // If the loop previously had floating-point IV, ScalarEvolution + // may not have been able to compute a trip count. Now that we've done some + // re-writing, the trip count may be computable. + if (Changed) + SE->forgetLoop(L); +} + +//===----------------------------------------------------------------------===// +// RewriteLoopExitValues - Optimize IV users outside the loop. +// As a side effect, reduces the amount of IV processing within the loop. +//===----------------------------------------------------------------------===// + +/// RewriteLoopExitValues - Check to see if this loop has a computable +/// loop-invariant execution count. If so, this means that we can compute the +/// final value of any expressions that are recurrent in the loop, and +/// substitute the exit values from the loop into any instructions outside of +/// the loop that use the final values of the current expressions. +/// +/// This is mostly redundant with the regular IndVarSimplify activities that +/// happen later, except that it's more powerful in some cases, because it's +/// able to brute-force evaluate arbitrary instructions as long as they have +/// constant operands at the beginning of the loop. +void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { + // Verify the input to the pass in already in LCSSA form. 
+ assert(L->isLCSSAForm(*DT)); + + SmallVector<BasicBlock*, 8> ExitBlocks; + L->getUniqueExitBlocks(ExitBlocks); + + // Find all values that are computed inside the loop, but used outside of it. + // Because of LCSSA, these values will only occur in LCSSA PHI Nodes. Scan + // the exit blocks of the loop to find them. + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { + BasicBlock *ExitBB = ExitBlocks[i]; + + // If there are no PHI nodes in this exit block, then no values defined + // inside the loop are used on this path, skip it. + PHINode *PN = dyn_cast<PHINode>(ExitBB->begin()); + if (!PN) continue; + + unsigned NumPreds = PN->getNumIncomingValues(); + + // Iterate over all of the PHI nodes. + BasicBlock::iterator BBI = ExitBB->begin(); + while ((PN = dyn_cast<PHINode>(BBI++))) { + if (PN->use_empty()) + continue; // dead use, don't replace it + + // SCEV only supports integer expressions for now. + if (!PN->getType()->isIntegerTy() && !PN->getType()->isPointerTy()) + continue; + + // It's necessary to tell ScalarEvolution about this explicitly so that + // it can walk the def-use list and forget all SCEVs, as it may not be + // watching the PHI itself. Once the new exit value is in place, there + // may not be a def-use connection between the loop and every instruction + // which got a SCEVAddRecExpr for that loop. + SE->forgetValue(PN); + + // Iterate over all of the values in all the PHI nodes. + for (unsigned i = 0; i != NumPreds; ++i) { + // If the value being merged in is not integer or is not defined + // in the loop, skip it. + Value *InVal = PN->getIncomingValue(i); + if (!isa<Instruction>(InVal)) + continue; + + // If this pred is for a subloop, not L itself, skip it. + if (LI->getLoopFor(PN->getIncomingBlock(i)) != L) + continue; // The Block is in a subloop, skip it. + + // Check that InVal is defined in the loop. + Instruction *Inst = cast<Instruction>(InVal); + if (!L->contains(Inst)) + continue; + + // Okay, this instruction has a user outside of the current loop + // and varies predictably *inside* the loop. Evaluate the value it + // contains when the loop exits, if possible. + const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop()); + if (!SE->isLoopInvariant(ExitValue, L)) + continue; + + Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst); + + DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n' + << " LoopVal = " << *Inst << "\n"); + + if (!isValidRewrite(Inst, ExitVal)) { + DeadInsts.push_back(ExitVal); + continue; + } + Changed = true; + ++NumReplaced; + + PN->setIncomingValue(i, ExitVal); + + // If this instruction is dead now, delete it. + RecursivelyDeleteTriviallyDeadInstructions(Inst); + + if (NumPreds == 1) { + // Completely replace a single-pred PHI. This is safe, because the + // NewVal won't be variant in the loop, so we don't need an LCSSA phi + // node anymore. + PN->replaceAllUsesWith(ExitVal); + RecursivelyDeleteTriviallyDeadInstructions(PN); + } + } + if (NumPreds != 1) { + // Clone the PHI and delete the original one. This lets IVUsers and + // any other maps purge the original user from their records. + PHINode *NewPN = cast<PHINode>(PN->clone()); + NewPN->takeName(PN); + NewPN->insertBefore(PN); + PN->replaceAllUsesWith(NewPN); + PN->eraseFromParent(); + } + } + } + + // The insertion point instruction may have been deleted; clear it out + // so that the rewriter doesn't trip over it later. 
+ Rewriter.clearInsertPoint(); +} + +//===----------------------------------------------------------------------===// +// IV Widening - Extend the width of an IV to cover its widest uses. +//===----------------------------------------------------------------------===// + +namespace { + // Collect information about induction variables that are used by sign/zero + // extend operations. This information is recorded by CollectExtend and + // provides the input to WidenIV. + struct WideIVInfo { + PHINode *NarrowIV; + Type *WidestNativeType; // Widest integer type created [sz]ext + bool IsSigned; // Was an sext user seen before a zext? + + WideIVInfo() : NarrowIV(0), WidestNativeType(0), IsSigned(false) {} + }; + + class WideIVVisitor : public IVVisitor { + ScalarEvolution *SE; + const TargetData *TD; + + public: + WideIVInfo WI; + + WideIVVisitor(PHINode *NarrowIV, ScalarEvolution *SCEV, + const TargetData *TData) : + SE(SCEV), TD(TData) { WI.NarrowIV = NarrowIV; } + + // Implement the interface used by simplifyUsersOfIV. + virtual void visitCast(CastInst *Cast); + }; +} + +/// visitCast - Update information about the induction variable that is +/// extended by this sign or zero extend operation. This is used to determine +/// the final width of the IV before actually widening it. +void WideIVVisitor::visitCast(CastInst *Cast) { + bool IsSigned = Cast->getOpcode() == Instruction::SExt; + if (!IsSigned && Cast->getOpcode() != Instruction::ZExt) + return; + + Type *Ty = Cast->getType(); + uint64_t Width = SE->getTypeSizeInBits(Ty); + if (TD && !TD->isLegalInteger(Width)) + return; + + if (!WI.WidestNativeType) { + WI.WidestNativeType = SE->getEffectiveSCEVType(Ty); + WI.IsSigned = IsSigned; + return; + } + + // We extend the IV to satisfy the sign of its first user, arbitrarily. + if (WI.IsSigned != IsSigned) + return; + + if (Width > SE->getTypeSizeInBits(WI.WidestNativeType)) + WI.WidestNativeType = SE->getEffectiveSCEVType(Ty); +} + +namespace { + +/// NarrowIVDefUse - Record a link in the Narrow IV def-use chain along with the +/// WideIV that computes the same value as the Narrow IV def. This avoids +/// caching Use* pointers. +struct NarrowIVDefUse { + Instruction *NarrowDef; + Instruction *NarrowUse; + Instruction *WideDef; + + NarrowIVDefUse(): NarrowDef(0), NarrowUse(0), WideDef(0) {} + + NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD): + NarrowDef(ND), NarrowUse(NU), WideDef(WD) {} +}; + +/// WidenIV - The goal of this transform is to remove sign and zero extends +/// without creating any new induction variables. To do this, it creates a new +/// phi of the wider type and redirects all users, either removing extends or +/// inserting truncs whenever we stop propagating the type. 
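+/// For example (schematically), a sign-extended narrow IV such as
+///   %iv   = phi i32 [ 0, %preheader ], [ %iv.next, %latch ]
+///   %wide = sext i32 %iv to i64
+/// becomes a single i64 recurrence and the sext goes away:
+///   %iv   = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]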
+/// +class WidenIV { + // Parameters + PHINode *OrigPhi; + Type *WideType; + bool IsSigned; + + // Context + LoopInfo *LI; + Loop *L; + ScalarEvolution *SE; + DominatorTree *DT; + + // Result + PHINode *WidePhi; + Instruction *WideInc; + const SCEV *WideIncExpr; + SmallVectorImpl<WeakVH> &DeadInsts; + + SmallPtrSet<Instruction*,16> Widened; + SmallVector<NarrowIVDefUse, 8> NarrowIVUsers; + +public: + WidenIV(const WideIVInfo &WI, LoopInfo *LInfo, + ScalarEvolution *SEv, DominatorTree *DTree, + SmallVectorImpl<WeakVH> &DI) : + OrigPhi(WI.NarrowIV), + WideType(WI.WidestNativeType), + IsSigned(WI.IsSigned), + LI(LInfo), + L(LI->getLoopFor(OrigPhi->getParent())), + SE(SEv), + DT(DTree), + WidePhi(0), + WideInc(0), + WideIncExpr(0), + DeadInsts(DI) { + assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV"); + } + + PHINode *CreateWideIV(SCEVExpander &Rewriter); + +protected: + Value *getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, + Instruction *Use); + + Instruction *CloneIVUser(NarrowIVDefUse DU); + + const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse); + + const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU); + + Instruction *WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); + + void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); +}; +} // anonymous namespace + +/// isLoopInvariant - Perform a quick domtree based check for loop invariance +/// assuming that V is used within the loop. LoopInfo::isLoopInvariant() seems +/// gratuitous for this purpose. +static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) { + Instruction *Inst = dyn_cast<Instruction>(V); + if (!Inst) + return true; + + return DT->properlyDominates(Inst->getParent(), L->getHeader()); +} + +Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, + Instruction *Use) { + // Set the debug location and conservative insertion point. + IRBuilder<> Builder(Use); + // Hoist the insertion point into loop preheaders as far as possible. + for (const Loop *L = LI->getLoopFor(Use->getParent()); + L && L->getLoopPreheader() && isLoopInvariant(NarrowOper, L, DT); + L = L->getParentLoop()) + Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator()); + + return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) : + Builder.CreateZExt(NarrowOper, WideType); +} + +/// CloneIVUser - Instantiate a wide operation to replace a narrow +/// operation. This only needs to handle operations that can evaluation to +/// SCEVAddRec. It can safely return 0 for any operation we decide not to clone. +Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { + unsigned Opcode = DU.NarrowUse->getOpcode(); + switch (Opcode) { + default: + return 0; + case Instruction::Add: + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::Sub: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + DEBUG(dbgs() << "Cloning IVUser: " << *DU.NarrowUse << "\n"); + + // Replace NarrowDef operands with WideDef. Otherwise, we don't know + // anything about the narrow operand yet so must insert a [sz]ext. It is + // probably loop invariant and will be folded or hoisted. If it actually + // comes from a widened IV, it should be removed during a future call to + // WidenIVUse. + Value *LHS = (DU.NarrowUse->getOperand(0) == DU.NarrowDef) ? 
DU.WideDef : + getExtend(DU.NarrowUse->getOperand(0), WideType, IsSigned, DU.NarrowUse); + Value *RHS = (DU.NarrowUse->getOperand(1) == DU.NarrowDef) ? DU.WideDef : + getExtend(DU.NarrowUse->getOperand(1), WideType, IsSigned, DU.NarrowUse); + + BinaryOperator *NarrowBO = cast<BinaryOperator>(DU.NarrowUse); + BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), + LHS, RHS, + NarrowBO->getName()); + IRBuilder<> Builder(DU.NarrowUse); + Builder.Insert(WideBO); + if (const OverflowingBinaryOperator *OBO = + dyn_cast<OverflowingBinaryOperator>(NarrowBO)) { + if (OBO->hasNoUnsignedWrap()) WideBO->setHasNoUnsignedWrap(); + if (OBO->hasNoSignedWrap()) WideBO->setHasNoSignedWrap(); + } + return WideBO; + } +} + +/// No-wrap operations can transfer sign extension of their result to their +/// operands. Generate the SCEV value for the widened operation without +/// actually modifying the IR yet. If the expression after extending the +/// operands is an AddRec for this loop, return it. +const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { + // Handle the common case of add<nsw/nuw> + if (DU.NarrowUse->getOpcode() != Instruction::Add) + return 0; + + // One operand (NarrowDef) has already been extended to WideDef. Now determine + // if extending the other will lead to a recurrence. + unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0; + assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU"); + + const SCEV *ExtendOperExpr = 0; + const OverflowingBinaryOperator *OBO = + cast<OverflowingBinaryOperator>(DU.NarrowUse); + if (IsSigned && OBO->hasNoSignedWrap()) + ExtendOperExpr = SE->getSignExtendExpr( + SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType); + else if(!IsSigned && OBO->hasNoUnsignedWrap()) + ExtendOperExpr = SE->getZeroExtendExpr( + SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType); + else + return 0; + + // When creating this AddExpr, don't apply the current operations NSW or NUW + // flags. This instruction may be guarded by control flow that the no-wrap + // behavior depends on. Non-control-equivalent instructions can be mapped to + // the same SCEV expression, and it would be incorrect to transfer NSW/NUW + // semantics to those operations. + const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>( + SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr)); + + if (!AddRec || AddRec->getLoop() != L) + return 0; + return AddRec; +} + +/// GetWideRecurrence - Is this instruction potentially interesting from +/// IVUsers' perspective after widening it's type? In other words, can the +/// extend be safely hoisted out of the loop with SCEV reducing the value to a +/// recurrence on the same loop. If so, return the sign or zero extended +/// recurrence. Otherwise return NULL. +const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { + if (!SE->isSCEVable(NarrowUse->getType())) + return 0; + + const SCEV *NarrowExpr = SE->getSCEV(NarrowUse); + if (SE->getTypeSizeInBits(NarrowExpr->getType()) + >= SE->getTypeSizeInBits(WideType)) { + // NarrowUse implicitly widens its operand. e.g. a gep with a narrow + // index. So don't follow this use. + return 0; + } + + const SCEV *WideExpr = IsSigned ? 
+    SE->getSignExtendExpr(NarrowExpr, WideType) :
+    SE->getZeroExtendExpr(NarrowExpr, WideType);
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr);
+  if (!AddRec || AddRec->getLoop() != L)
+    return 0;
+  return AddRec;
+}
+
+/// WidenIVUse - Determine whether an individual user of the narrow IV can be
+/// widened. If so, return the wide clone of the user.
+Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
+
+  // Stop traversing the def-use chain at inner-loop phis or post-loop phis.
+  if (isa<PHINode>(DU.NarrowUse) &&
+      LI->getLoopFor(DU.NarrowUse->getParent()) != L)
+    return 0;
+
+  // Our raison d'etre! Eliminate sign and zero extension.
+  if (IsSigned ? isa<SExtInst>(DU.NarrowUse) : isa<ZExtInst>(DU.NarrowUse)) {
+    Value *NewDef = DU.WideDef;
+    if (DU.NarrowUse->getType() != WideType) {
+      unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType());
+      unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+      if (CastWidth < IVWidth) {
+        // The cast isn't as wide as the IV, so insert a Trunc.
+        IRBuilder<> Builder(DU.NarrowUse);
+        NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType());
+      }
+      else {
+        // A wider extend was hidden behind a narrower one. This may induce
+        // another round of IV widening in which the intermediate IV becomes
+        // dead. It should be very rare.
+        DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
+              << " not wide enough to subsume " << *DU.NarrowUse << "\n");
+        DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+        NewDef = DU.NarrowUse;
+      }
+    }
+    if (NewDef != DU.NarrowUse) {
+      DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse
+            << " replaced by " << *DU.WideDef << "\n");
+      ++NumElimExt;
+      DU.NarrowUse->replaceAllUsesWith(NewDef);
+      DeadInsts.push_back(DU.NarrowUse);
+    }
+    // Now that the extend is gone, we want to expose its uses for potential
+    // further simplification. We don't need to directly inform SimplifyIVUsers
+    // of the new users, because their parent IV will be processed later as a
+    // new loop phi. If we preserved IVUsers analysis, we would also want to
+    // push the uses of WideDef here.
+
+    // No further widening is needed. The deceased [sz]ext had done it for us.
+    return 0;
+  }
+
+  // Does this user itself evaluate to a recurrence after widening?
+  const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse);
+  if (!WideAddRec) {
+    WideAddRec = GetExtendedOperandRecurrence(DU);
+  }
+  if (!WideAddRec) {
+    // This user does not evaluate to a recurrence after widening, so don't
+    // follow it. Instead insert a Trunc to kill off the original use,
+    // eventually isolating the original narrow IV so it can be removed.
+    IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT));
+    Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType());
+    DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
+    return 0;
+  }
+  // Assume block terminators cannot evaluate to a recurrence. We can't
+  // insert a Trunc after a terminator if there happens to be a critical edge.
+  assert(DU.NarrowUse != DU.NarrowUse->getParent()->getTerminator() &&
+         "SCEV is not expected to evaluate a block terminator");
+
+  // Reuse the IV increment that SCEVExpander created as long as it dominates
+  // NarrowUse.
+ Instruction *WideUse = 0; + if (WideAddRec == WideIncExpr + && Rewriter.hoistIVInc(WideInc, DU.NarrowUse)) + WideUse = WideInc; + else { + WideUse = CloneIVUser(DU); + if (!WideUse) + return 0; + } + // Evaluation of WideAddRec ensured that the narrow expression could be + // extended outside the loop without overflow. This suggests that the wide use + // evaluates to the same expression as the extended narrow use, but doesn't + // absolutely guarantee it. Hence the following failsafe check. In rare cases + // where it fails, we simply throw away the newly created wide use. + if (WideAddRec != SE->getSCEV(WideUse)) { + DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse + << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec << "\n"); + DeadInsts.push_back(WideUse); + return 0; + } + + // Returning WideUse pushes it on the worklist. + return WideUse; +} + +/// pushNarrowIVUsers - Add eligible users of NarrowDef to NarrowIVUsers. +/// +void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { + for (Value::use_iterator UI = NarrowDef->use_begin(), + UE = NarrowDef->use_end(); UI != UE; ++UI) { + Instruction *NarrowUse = cast<Instruction>(*UI); + + // Handle data flow merges and bizarre phi cycles. + if (!Widened.insert(NarrowUse)) + continue; + + NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUse, WideDef)); + } +} + +/// CreateWideIV - Process a single induction variable. First use the +/// SCEVExpander to create a wide induction variable that evaluates to the same +/// recurrence as the original narrow IV. Then use a worklist to forward +/// traverse the narrow IV's def-use chain. After WidenIVUse has processed all +/// interesting IV users, the narrow IV will be isolated for removal by +/// DeleteDeadPHIs. +/// +/// It would be simpler to delete uses as they are processed, but we must avoid +/// invalidating SCEV expressions. +/// +PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { + // Is this phi an induction variable? + const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi)); + if (!AddRec) + return NULL; + + // Widen the induction variable expression. + const SCEV *WideIVExpr = IsSigned ? + SE->getSignExtendExpr(AddRec, WideType) : + SE->getZeroExtendExpr(AddRec, WideType); + + assert(SE->getEffectiveSCEVType(WideIVExpr->getType()) == WideType && + "Expect the new IV expression to preserve its type"); + + // Can the IV be extended outside the loop without overflow? + AddRec = dyn_cast<SCEVAddRecExpr>(WideIVExpr); + if (!AddRec || AddRec->getLoop() != L) + return NULL; + + // An AddRec must have loop-invariant operands. Since this AddRec is + // materialized by a loop header phi, the expression cannot have any post-loop + // operands, so they must dominate the loop header. + assert(SE->properlyDominates(AddRec->getStart(), L->getHeader()) && + SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader()) + && "Loop header phi recurrence inputs do not dominate the loop"); + + // The rewriter provides a value for the desired IV expression. This may + // either find an existing phi or materialize a new one. Either way, we + // expect a well-formed cyclic phi-with-increments. i.e. any operand not part + // of the phi-SCC dominates the loop entry. 
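+  // Expanding the AddRec at the top of the header materializes (or reuses) the
+  // wide header phi; its incoming value from the latch is the increment that
+  // is remembered below for reuse by WidenIVUse.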
+ Instruction *InsertPt = L->getHeader()->begin(); + WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt)); + + // Remembering the WideIV increment generated by SCEVExpander allows + // WidenIVUse to reuse it when widening the narrow IV's increment. We don't + // employ a general reuse mechanism because the call above is the only call to + // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses. + if (BasicBlock *LatchBlock = L->getLoopLatch()) { + WideInc = + cast<Instruction>(WidePhi->getIncomingValueForBlock(LatchBlock)); + WideIncExpr = SE->getSCEV(WideInc); + } + + DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n"); + ++NumWidened; + + // Traverse the def-use chain using a worklist starting at the original IV. + assert(Widened.empty() && NarrowIVUsers.empty() && "expect initial state" ); + + Widened.insert(OrigPhi); + pushNarrowIVUsers(OrigPhi, WidePhi); + + while (!NarrowIVUsers.empty()) { + NarrowIVDefUse DU = NarrowIVUsers.pop_back_val(); + + // Process a def-use edge. This may replace the use, so don't hold a + // use_iterator across it. + Instruction *WideUse = WidenIVUse(DU, Rewriter); + + // Follow all def-use edges from the previous narrow use. + if (WideUse) + pushNarrowIVUsers(DU.NarrowUse, WideUse); + + // WidenIVUse may have removed the def-use edge. + if (DU.NarrowDef->use_empty()) + DeadInsts.push_back(DU.NarrowDef); + } + return WidePhi; +} + +//===----------------------------------------------------------------------===// +// Simplification of IV users based on SCEV evaluation. +//===----------------------------------------------------------------------===// + + +/// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV +/// users. Each successive simplification may push more users which may +/// themselves be candidates for simplification. +/// +/// Sign/Zero extend elimination is interleaved with IV simplification. +/// +void IndVarSimplify::SimplifyAndExtend(Loop *L, + SCEVExpander &Rewriter, + LPPassManager &LPM) { + SmallVector<WideIVInfo, 8> WideIVs; + + SmallVector<PHINode*, 8> LoopPhis; + for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { + LoopPhis.push_back(cast<PHINode>(I)); + } + // Each round of simplification iterates through the SimplifyIVUsers worklist + // for all current phis, then determines whether any IVs can be + // widened. Widening adds new phis to LoopPhis, inducing another round of + // simplification on the wide IVs. + while (!LoopPhis.empty()) { + // Evaluate as many IV expressions as possible before widening any IVs. This + // forces SCEV to set no-wrap flags before evaluating sign/zero + // extension. The first time SCEV attempts to normalize sign/zero extension, + // the result becomes final. So for the most predictable results, we delay + // evaluation of sign/zero extend evaluation until needed, and avoid running + // other SCEV based analysis prior to SimplifyAndExtend. + do { + PHINode *CurrIV = LoopPhis.pop_back_val(); + + // Information about sign/zero extensions of CurrIV. 
+ WideIVVisitor WIV(CurrIV, SE, TD); + + Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &WIV); + + if (WIV.WI.WidestNativeType) { + WideIVs.push_back(WIV.WI); + } + } while(!LoopPhis.empty()); + + for (; !WideIVs.empty(); WideIVs.pop_back()) { + WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts); + if (PHINode *WidePhi = Widener.CreateWideIV(Rewriter)) { + Changed = true; + LoopPhis.push_back(WidePhi); + } + } + } +} + +//===----------------------------------------------------------------------===// +// LinearFunctionTestReplace and its kin. Rewrite the loop exit condition. +//===----------------------------------------------------------------------===// + +/// Check for expressions that ScalarEvolution generates to compute +/// BackedgeTakenInfo. If these expressions have not been reduced, then +/// expanding them may incur additional cost (albeit in the loop preheader). +static bool isHighCostExpansion(const SCEV *S, BranchInst *BI, + SmallPtrSet<const SCEV*, 8> &Processed, + ScalarEvolution *SE) { + if (!Processed.insert(S)) + return false; + + // If the backedge-taken count is a UDiv, it's very likely a UDiv that + // ScalarEvolution's HowFarToZero or HowManyLessThans produced to compute a + // precise expression, rather than a UDiv from the user's code. If we can't + // find a UDiv in the code with some simple searching, assume the former and + // forego rewriting the loop. + if (isa<SCEVUDivExpr>(S)) { + ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition()); + if (!OrigCond) return true; + const SCEV *R = SE->getSCEV(OrigCond->getOperand(1)); + R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1)); + if (R != S) { + const SCEV *L = SE->getSCEV(OrigCond->getOperand(0)); + L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1)); + if (L != S) + return true; + } + } + + // Recurse past add expressions, which commonly occur in the + // BackedgeTakenCount. They may already exist in program code, and if not, + // they are not too expensive rematerialize. + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); + I != E; ++I) { + if (isHighCostExpansion(*I, BI, Processed, SE)) + return true; + } + return false; + } + + // HowManyLessThans uses a Max expression whenever the loop is not guarded by + // the exit condition. + if (isa<SCEVSMaxExpr>(S) || isa<SCEVUMaxExpr>(S)) + return true; + + // If we haven't recognized an expensive SCEV pattern, assume it's an + // expression produced by program code. + return false; +} + +/// canExpandBackedgeTakenCount - Return true if this loop's backedge taken +/// count expression can be safely and cheaply expanded into an instruction +/// sequence that can be used by LinearFunctionTestReplace. +/// +/// TODO: This fails for pointer-type loop counters with greater than one byte +/// strides, consequently preventing LFTR from running. For the purpose of LFTR +/// we could skip this check in the case that the LFTR loop counter (chosen by +/// FindLoopCounter) is also pointer type. Instead, we could directly convert +/// the loop test to an inequality test by checking the target data's alignment +/// of element types (given that the initial pointer value originates from or is +/// used by ABI constrained operation, as opposed to inttoptr/ptrtoint). +/// However, we don't yet have a strong motivation for converting loop tests +/// into inequality tests. 
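+/// In short: the backedge-taken count must be computable and non-zero, the
+/// loop must have a single exiting block terminated by a branch, and expanding
+/// the count expression must not be considered high cost.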
+static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) { + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) || + BackedgeTakenCount->isZero()) + return false; + + if (!L->getExitingBlock()) + return false; + + // Can't rewrite non-branch yet. + BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); + if (!BI) + return false; + + SmallPtrSet<const SCEV*, 8> Processed; + if (isHighCostExpansion(BackedgeTakenCount, BI, Processed, SE)) + return false; + + return true; +} + +/// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop +/// invariant value to the phi. +static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { + Instruction *IncI = dyn_cast<Instruction>(IncV); + if (!IncI) + return 0; + + switch (IncI->getOpcode()) { + case Instruction::Add: + case Instruction::Sub: + break; + case Instruction::GetElementPtr: + // An IV counter must preserve its type. + if (IncI->getNumOperands() == 2) + break; + default: + return 0; + } + + PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0)); + if (Phi && Phi->getParent() == L->getHeader()) { + if (isLoopInvariant(IncI->getOperand(1), L, DT)) + return Phi; + return 0; + } + if (IncI->getOpcode() == Instruction::GetElementPtr) + return 0; + + // Allow add/sub to be commuted. + Phi = dyn_cast<PHINode>(IncI->getOperand(1)); + if (Phi && Phi->getParent() == L->getHeader()) { + if (isLoopInvariant(IncI->getOperand(0), L, DT)) + return Phi; + } + return 0; +} + +/// Return the compare guarding the loop latch, or NULL for unrecognized tests. +static ICmpInst *getLoopTest(Loop *L) { + assert(L->getExitingBlock() && "expected loop exit"); + + BasicBlock *LatchBlock = L->getLoopLatch(); + // Don't bother with LFTR if the loop is not properly simplified. + if (!LatchBlock) + return 0; + + BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); + assert(BI && "expected exit branch"); + + return dyn_cast<ICmpInst>(BI->getCondition()); +} + +/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show +/// that the current exit test is already sufficiently canonical. +static bool needsLFTR(Loop *L, DominatorTree *DT) { + // Do LFTR to simplify the exit condition to an ICMP. + ICmpInst *Cond = getLoopTest(L); + if (!Cond) + return true; + + // Do LFTR to simplify the exit ICMP to EQ/NE + ICmpInst::Predicate Pred = Cond->getPredicate(); + if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ) + return true; + + // Look for a loop invariant RHS + Value *LHS = Cond->getOperand(0); + Value *RHS = Cond->getOperand(1); + if (!isLoopInvariant(RHS, L, DT)) { + if (!isLoopInvariant(LHS, L, DT)) + return true; + std::swap(LHS, RHS); + } + // Look for a simple IV counter LHS + PHINode *Phi = dyn_cast<PHINode>(LHS); + if (!Phi) + Phi = getLoopPhiForCounter(LHS, L, DT); + + if (!Phi) + return true; + + // Do LFTR if the exit condition's IV is *not* a simple counter. + Value *IncV = Phi->getIncomingValueForBlock(L->getLoopLatch()); + return Phi != getLoopPhiForCounter(IncV, L, DT); +} + +/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils +/// down to checking that all operands are constant and listing instructions +/// that may hide undef. 
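+/// The walk is depth-limited, and loads, calls, invokes, and non-constant
+/// non-instruction values (e.g. arguments) are conservatively treated as
+/// possibly undef.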
+static bool hasConcreteDefImpl(Value *V, SmallPtrSet<Value*, 8> &Visited,
+                               unsigned Depth) {
+  if (isa<Constant>(V))
+    return !isa<UndefValue>(V);
+
+  if (Depth >= 6)
+    return false;
+
+  // Conservatively handle non-constant non-instructions. For example, Arguments
+  // may be undef.
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+
+  // Load and return values may be undef.
+  if (I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I))
+    return false;
+
+  // Optimistically handle other instructions.
+  for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) {
+    if (!Visited.insert(*OI))
+      continue;
+    if (!hasConcreteDefImpl(*OI, Visited, Depth+1))
+      return false;
+  }
+  return true;
+}
+
+/// Return true if the given value is concrete. We must prove that undef can
+/// never reach it.
+///
+/// TODO: If we decide that this is a good approach to checking for undef, we
+/// may factor it into a common location.
+static bool hasConcreteDef(Value *V) {
+  SmallPtrSet<Value*, 8> Visited;
+  Visited.insert(V);
+  return hasConcreteDefImpl(V, Visited, 0);
+}
+
+/// AlmostDeadIV - Return true if this IV has no uses other than the (soon to
+/// be rewritten) loop exit test.
+static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
+  int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
+  Value *IncV = Phi->getIncomingValue(LatchIdx);
+
+  for (Value::use_iterator UI = Phi->use_begin(), UE = Phi->use_end();
+       UI != UE; ++UI) {
+    if (*UI != Cond && *UI != IncV) return false;
+  }
+
+  for (Value::use_iterator UI = IncV->use_begin(), UE = IncV->use_end();
+       UI != UE; ++UI) {
+    if (*UI != Cond && *UI != Phi) return false;
+  }
+  return true;
+}
+
+/// FindLoopCounter - Find an affine IV in canonical form.
+///
+/// BECount may be an i8* pointer type. The pointer difference is already a
+/// valid count without scaling the address stride, so it remains a pointer
+/// expression as far as SCEV is concerned.
+///
+/// Currently only valid for LFTR. See the comments on hasConcreteDef above.
+///
+/// FIXME: Accept -1 stride and set IVLimit = IVInit - BECount
+///
+/// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride.
+/// This is difficult in general for SCEV because of potential overflow. But we
+/// could at least handle constant BECounts.
+static PHINode *
+FindLoopCounter(Loop *L, const SCEV *BECount,
+                ScalarEvolution *SE, DominatorTree *DT, const TargetData *TD) {
+  uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
+
+  Value *Cond =
+    cast<BranchInst>(L->getExitingBlock()->getTerminator())->getCondition();
+
+  // Loop over all of the PHI nodes, looking for a simple counter.
+  PHINode *BestPhi = 0;
+  const SCEV *BestInit = 0;
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  assert(LatchBlock && "needsLFTR should guarantee a loop latch");
+
+  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+    PHINode *Phi = cast<PHINode>(I);
+    if (!SE->isSCEVable(Phi->getType()))
+      continue;
+
+    // Avoid comparing an integer IV against a pointer Limit.
+    if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy())
+      continue;
+
+    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
+    if (!AR || AR->getLoop() != L || !AR->isAffine())
+      continue;
+
+    // AR may be a pointer type, while BECount is an integer type.
+    // AR may be wider than BECount. With eq/ne tests overflow is immaterial.
+    // AR may not be a narrower type, or we may never exit.
+ uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType()); + if (PhiWidth < BCWidth || (TD && !TD->isLegalInteger(PhiWidth))) + continue; + + const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)); + if (!Step || !Step->isOne()) + continue; + + int LatchIdx = Phi->getBasicBlockIndex(LatchBlock); + Value *IncV = Phi->getIncomingValue(LatchIdx); + if (getLoopPhiForCounter(IncV, L, DT) != Phi) + continue; + + // Avoid reusing a potentially undef value to compute other values that may + // have originally had a concrete definition. + if (!hasConcreteDef(Phi)) { + // We explicitly allow unknown phis as long as they are already used by + // the loop test. In this case we assume that performing LFTR could not + // increase the number of undef users. + if (ICmpInst *Cond = getLoopTest(L)) { + if (Phi != getLoopPhiForCounter(Cond->getOperand(0), L, DT) + && Phi != getLoopPhiForCounter(Cond->getOperand(1), L, DT)) { + continue; + } + } + } + const SCEV *Init = AR->getStart(); + + if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) { + // Don't force a live loop counter if another IV can be used. + if (AlmostDeadIV(Phi, LatchBlock, Cond)) + continue; + + // Prefer to count-from-zero. This is a more "canonical" counter form. It + // also prefers integer to pointer IVs. + if (BestInit->isZero() != Init->isZero()) { + if (BestInit->isZero()) + continue; + } + // If two IVs both count from zero or both count from nonzero then the + // narrower is likely a dead phi that has been widened. Use the wider phi + // to allow the other to be eliminated. + else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType())) + continue; + } + BestPhi = Phi; + BestInit = Init; + } + return BestPhi; +} + +/// genLoopLimit - Help LinearFunctionTestReplace by generating a value that +/// holds the RHS of the new loop test. +static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, + SCEVExpander &Rewriter, ScalarEvolution *SE) { + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); + assert(AR && AR->getLoop() == L && AR->isAffine() && "bad loop counter"); + const SCEV *IVInit = AR->getStart(); + + // IVInit may be a pointer while IVCount is an integer when FindLoopCounter + // finds a valid pointer IV. Sign extend BECount in order to materialize a + // GEP. Avoid running SCEVExpander on a new pointer value, instead reusing + // the existing GEPs whenever possible. + if (IndVar->getType()->isPointerTy() + && !IVCount->getType()->isPointerTy()) { + + Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType()); + const SCEV *IVOffset = SE->getTruncateOrSignExtend(IVCount, OfsTy); + + // Expand the code for the iteration count. + assert(SE->isLoopInvariant(IVOffset, L) && + "Computed iteration count is not loop invariant!"); + BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator()); + Value *GEPOffset = Rewriter.expandCodeFor(IVOffset, OfsTy, BI); + + Value *GEPBase = IndVar->getIncomingValueForBlock(L->getLoopPreheader()); + assert(AR->getStart() == SE->getSCEV(GEPBase) && "bad loop counter"); + // We could handle pointer IVs other than i8*, but we need to compensate for + // gep index scaling. See canExpandBackedgeTakenCount comments. 
+ assert(SE->getSizeOfExpr( + cast<PointerType>(GEPBase->getType())->getElementType())->isOne() + && "unit stride pointer IV must be i8*"); + + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + return Builder.CreateGEP(GEPBase, GEPOffset, "lftr.limit"); + } + else { + // In any other case, convert both IVInit and IVCount to integers before + // comparing. This may result in SCEV expension of pointers, but in practice + // SCEV will fold the pointer arithmetic away as such: + // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc). + // + // Valid Cases: (1) both integers is most common; (2) both may be pointers + // for simple memset-style loops; (3) IVInit is an integer and IVCount is a + // pointer may occur when enable-iv-rewrite generates a canonical IV on top + // of case #2. + + const SCEV *IVLimit = 0; + // For unit stride, IVCount = Start + BECount with 2's complement overflow. + // For non-zero Start, compute IVCount here. + if (AR->getStart()->isZero()) + IVLimit = IVCount; + else { + assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); + const SCEV *IVInit = AR->getStart(); + + // For integer IVs, truncate the IV before computing IVInit + BECount. + if (SE->getTypeSizeInBits(IVInit->getType()) + > SE->getTypeSizeInBits(IVCount->getType())) + IVInit = SE->getTruncateExpr(IVInit, IVCount->getType()); + + IVLimit = SE->getAddExpr(IVInit, IVCount); + } + // Expand the code for the iteration count. + BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator()); + IRBuilder<> Builder(BI); + assert(SE->isLoopInvariant(IVLimit, L) && + "Computed iteration count is not loop invariant!"); + // Ensure that we generate the same type as IndVar, or a smaller integer + // type. In the presence of null pointer values, we have an integer type + // SCEV expression (IVInit) for a pointer type IV value (IndVar). + Type *LimitTy = IVCount->getType()->isPointerTy() ? + IndVar->getType() : IVCount->getType(); + return Rewriter.expandCodeFor(IVLimit, LimitTy, BI); + } +} + +/// LinearFunctionTestReplace - This method rewrites the exit condition of the +/// loop to be a canonical != comparison against the incremented loop induction +/// variable. This pass is able to rewrite the exit tests of any loop where the +/// SCEV analysis can determine a loop-invariant trip count of the loop, which +/// is actually a much broader range than just linear tests. +Value *IndVarSimplify:: +LinearFunctionTestReplace(Loop *L, + const SCEV *BackedgeTakenCount, + PHINode *IndVar, + SCEVExpander &Rewriter) { + assert(canExpandBackedgeTakenCount(L, SE) && "precondition"); + + // LFTR can ignore IV overflow and truncate to the width of + // BECount. This avoids materializing the add(zext(add)) expression. + Type *CntTy = BackedgeTakenCount->getType(); + + const SCEV *IVCount = BackedgeTakenCount; + + // If the exiting block is the same as the backedge block, we prefer to + // compare against the post-incremented value, otherwise we must compare + // against the preincremented value. + Value *CmpIndVar; + if (L->getExitingBlock() == L->getLoopLatch()) { + // Add one to the "backedge-taken" count to get the trip count. + // If this addition may overflow, we have to be more pessimistic and + // cast the induction variable before doing the add. 
+ const SCEV *N = + SE->getAddExpr(IVCount, SE->getConstant(IVCount->getType(), 1)); + if (CntTy == IVCount->getType()) + IVCount = N; + else { + const SCEV *Zero = SE->getConstant(IVCount->getType(), 0); + if ((isa<SCEVConstant>(N) && !N->isZero()) || + SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) { + // No overflow. Cast the sum. + IVCount = SE->getTruncateOrZeroExtend(N, CntTy); + } else { + // Potential overflow. Cast before doing the add. + IVCount = SE->getTruncateOrZeroExtend(IVCount, CntTy); + IVCount = SE->getAddExpr(IVCount, SE->getConstant(CntTy, 1)); + } + } + // The BackedgeTaken expression contains the number of times that the + // backedge branches to the loop header. This is one less than the + // number of times the loop executes, so use the incremented indvar. + CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock()); + } else { + // We must use the preincremented value... + IVCount = SE->getTruncateOrZeroExtend(IVCount, CntTy); + CmpIndVar = IndVar; + } + + Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE); + assert(ExitCnt->getType()->isPointerTy() == IndVar->getType()->isPointerTy() + && "genLoopLimit missed a cast"); + + // Insert a new icmp_ne or icmp_eq instruction before the branch. + BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator()); + ICmpInst::Predicate P; + if (L->contains(BI->getSuccessor(0))) + P = ICmpInst::ICMP_NE; + else + P = ICmpInst::ICMP_EQ; + + DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n" + << " LHS:" << *CmpIndVar << '\n' + << " op:\t" + << (P == ICmpInst::ICMP_NE ? "!=" : "==") << "\n" + << " RHS:\t" << *ExitCnt << "\n" + << " IVCount:\t" << *IVCount << "\n"); + + IRBuilder<> Builder(BI); + if (SE->getTypeSizeInBits(CmpIndVar->getType()) + > SE->getTypeSizeInBits(ExitCnt->getType())) { + CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(), + "lftr.wideiv"); + } + + Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond"); + Value *OrigCond = BI->getCondition(); + // It's tempting to use replaceAllUsesWith here to fully replace the old + // comparison, but that's not immediately safe, since users of the old + // comparison may not be dominated by the new comparison. Instead, just + // update the branch to use the new comparison; in the common case this + // will make old comparison dead. + BI->setCondition(Cond); + DeadInsts.push_back(OrigCond); + + ++NumLFTR; + Changed = true; + return Cond; +} + +//===----------------------------------------------------------------------===// +// SinkUnusedInvariants. A late subpass to cleanup loop preheaders. +//===----------------------------------------------------------------------===// + +/// If there's a single exit block, sink any loop-invariant values that +/// were defined in the preheader but not used inside the loop into the +/// exit block to reduce register pressure in the loop. +void IndVarSimplify::SinkUnusedInvariants(Loop *L) { + BasicBlock *ExitBlock = L->getExitBlock(); + if (!ExitBlock) return; + + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) return; + + Instruction *InsertPt = ExitBlock->getFirstInsertionPt(); + BasicBlock::iterator I = Preheader->getTerminator(); + while (I != Preheader->begin()) { + --I; + // New instructions were inserted at the end of the preheader. + if (isa<PHINode>(I)) + break; + + // Don't move instructions which might have side effects, since the side + // effects need to complete before instructions inside the loop. 
Also don't + // move instructions which might read memory, since the loop may modify + // memory. Note that it's okay if the instruction might have undefined + // behavior: LoopSimplify guarantees that the preheader dominates the exit + // block. + if (I->mayHaveSideEffects() || I->mayReadFromMemory()) + continue; + + // Skip debug info intrinsics. + if (isa<DbgInfoIntrinsic>(I)) + continue; + + // Skip landingpad instructions. + if (isa<LandingPadInst>(I)) + continue; + + // Don't sink alloca: we never want to sink static alloca's out of the + // entry block, and correctly sinking dynamic alloca's requires + // checks for stacksave/stackrestore intrinsics. + // FIXME: Refactor this check somehow? + if (isa<AllocaInst>(I)) + continue; + + // Determine if there is a use in or before the loop (direct or + // otherwise). + bool UsedInLoop = false; + for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); + UI != UE; ++UI) { + User *U = *UI; + BasicBlock *UseBB = cast<Instruction>(U)->getParent(); + if (PHINode *P = dyn_cast<PHINode>(U)) { + unsigned i = + PHINode::getIncomingValueNumForOperand(UI.getOperandNo()); + UseBB = P->getIncomingBlock(i); + } + if (UseBB == Preheader || L->contains(UseBB)) { + UsedInLoop = true; + break; + } + } + + // If there is, the def must remain in the preheader. + if (UsedInLoop) + continue; + + // Otherwise, sink it to the exit block. + Instruction *ToMove = I; + bool Done = false; + + if (I != Preheader->begin()) { + // Skip debug info intrinsics. + do { + --I; + } while (isa<DbgInfoIntrinsic>(I) && I != Preheader->begin()); + + if (isa<DbgInfoIntrinsic>(I) && I == Preheader->begin()) + Done = true; + } else { + Done = true; + } + + ToMove->moveBefore(InsertPt); + if (Done) break; + InsertPt = ToMove; + } +} + +//===----------------------------------------------------------------------===// +// IndVarSimplify driver. Manage several subpasses of IV simplification. +//===----------------------------------------------------------------------===// + +bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { + // If LoopSimplify form is not available, stay out of trouble. Some notes: + // - LSR currently only supports LoopSimplify-form loops. Indvars' + // canonicalization can be a pessimization without LSR to "clean up" + // afterwards. + // - We depend on having a preheader; in particular, + // Loop::getCanonicalInductionVariable only supports loops with preheaders, + // and we're in trouble if we can't find the induction variable even when + // we've manually inserted one. + if (!L->isLoopSimplifyForm()) + return false; + + LI = &getAnalysis<LoopInfo>(); + SE = &getAnalysis<ScalarEvolution>(); + DT = &getAnalysis<DominatorTree>(); + TD = getAnalysisIfAvailable<TargetData>(); + + DeadInsts.clear(); + Changed = false; + + // If there are any floating-point recurrences, attempt to + // transform them to use integer recurrences. + RewriteNonIntegerIVs(L); + + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); + + // Create a rewriter object which we'll use to transform the code with. + SCEVExpander Rewriter(*SE, "indvars"); +#ifndef NDEBUG + Rewriter.setDebugType(DEBUG_TYPE); +#endif + + // Eliminate redundant IV users. + // + // Simplification works best when run before other consumers of SCEV. We + // attempt to avoid evaluating SCEVs for sign/zero extend operations until + // other expressions involving loop IVs have been evaluated. This helps SCEV + // set no-wrap flags before normalizing sign/zero extension. 
+ Rewriter.disableCanonicalMode(); + SimplifyAndExtend(L, Rewriter, LPM); + + // Check to see if this loop has a computable loop-invariant execution count. + // If so, this means that we can compute the final value of any expressions + // that are recurrent in the loop, and substitute the exit values from the + // loop into any instructions outside of the loop that use the final values of + // the current expressions. + // + if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) + RewriteLoopExitValues(L, Rewriter); + + // Eliminate redundant IV cycles. + NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); + + // If we have a trip count expression, rewrite the loop's exit condition + // using it. We can currently only handle loops with a single exit. + if (canExpandBackedgeTakenCount(L, SE) && needsLFTR(L, DT)) { + PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD); + if (IndVar) { + // Check preconditions for proper SCEVExpander operation. SCEV does not + // express SCEVExpander's dependencies, such as LoopSimplify. Instead any + // pass that uses the SCEVExpander must do it. This does not work well for + // loop passes because SCEVExpander makes assumptions about all loops, while + // LoopPassManager only forces the current loop to be simplified. + // + // FIXME: SCEV expansion has no way to bail out, so the caller must + // explicitly check any assumptions made by SCEV. Brittle. + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount); + if (!AR || AR->getLoop()->getLoopPreheader()) + (void)LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, + Rewriter); + } + } + // Clear the rewriter cache, because values that are in the rewriter's cache + // can be deleted in the loop below, causing the AssertingVH in the cache to + // trigger. + Rewriter.clear(); + + // Now that we're done iterating through lists, clean up any instructions + // which are now dead. + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); + + // The Rewriter may not be used from this point on. + + // Loop-invariant instructions in the preheader that aren't used in the + // loop may be sunk below the loop to reduce register pressure. + SinkUnusedInvariants(L); + + // Clean up dead instructions. + Changed |= DeleteDeadPHIs(L->getHeader()); + // Check a post-condition. + assert(L->isLCSSAForm(*DT) && + "Indvars did not leave the loop in lcssa form!"); + + // Verify that LFTR, and any other change have not interfered with SCEV's + // ability to compute trip count. 
+#ifndef NDEBUG + if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { + SE->forgetLoop(L); + const SCEV *NewBECount = SE->getBackedgeTakenCount(L); + if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) < + SE->getTypeSizeInBits(NewBECount->getType())) + NewBECount = SE->getTruncateOrNoop(NewBECount, + BackedgeTakenCount->getType()); + else + BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, + NewBECount->getType()); + assert(BackedgeTakenCount == NewBECount && "indvars must preserve SCEV"); + } +#endif + + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp new file mode 100644 index 0000000..dd42c59 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -0,0 +1,1609 @@ +//===- JumpThreading.cpp - Thread control through conditional blocks ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Jump Threading pass. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "jump-threading" +#include "llvm/Transforms/Scalar.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ValueHandle.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +STATISTIC(NumThreads, "Number of jumps threaded"); +STATISTIC(NumFolds, "Number of terminators folded"); +STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi"); + +static cl::opt<unsigned> +Threshold("jump-threading-threshold", + cl::desc("Max block size to duplicate for jump threading"), + cl::init(6), cl::Hidden); + +namespace { + // These are at global scope so static functions can use them too. + typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo; + typedef SmallVector<std::pair<Constant*, BasicBlock*>, 8> PredValueInfoTy; + + // This is used to keep track of what kind of constant we're currently hoping + // to find. + enum ConstantPreference { + WantInteger, + WantBlockAddress + }; + + /// This pass performs 'jump threading', which looks at blocks that have + /// multiple predecessors and multiple successors. If one or more of the + /// predecessors of the block can be proven to always jump to one of the + /// successors, we forward the edge from the predecessor to the successor by + /// duplicating the contents of this block. + /// + /// An example of when this can occur is code like this: + /// + /// if () { ... 
+ /// X = 4; + /// } + /// if (X < 3) { + /// + /// In this case, the unconditional branch at the end of the first if can be + /// revectored to the false side of the second if. + /// + class JumpThreading : public FunctionPass { + TargetData *TD; + TargetLibraryInfo *TLI; + LazyValueInfo *LVI; +#ifdef NDEBUG + SmallPtrSet<BasicBlock*, 16> LoopHeaders; +#else + SmallSet<AssertingVH<BasicBlock>, 16> LoopHeaders; +#endif + DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet; + + // RAII helper for updating the recursion stack. + struct RecursionSetRemover { + DenseSet<std::pair<Value*, BasicBlock*> > &TheSet; + std::pair<Value*, BasicBlock*> ThePair; + + RecursionSetRemover(DenseSet<std::pair<Value*, BasicBlock*> > &S, + std::pair<Value*, BasicBlock*> P) + : TheSet(S), ThePair(P) { } + + ~RecursionSetRemover() { + TheSet.erase(ThePair); + } + }; + public: + static char ID; // Pass identification + JumpThreading() : FunctionPass(ID) { + initializeJumpThreadingPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LazyValueInfo>(); + AU.addPreserved<LazyValueInfo>(); + AU.addRequired<TargetLibraryInfo>(); + } + + void FindLoopHeaders(Function &F); + bool ProcessBlock(BasicBlock *BB); + bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs, + BasicBlock *SuccBB); + bool DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, + const SmallVectorImpl<BasicBlock *> &PredBBs); + + bool ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, + PredValueInfo &Result, + ConstantPreference Preference); + bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB, + ConstantPreference Preference); + + bool ProcessBranchOnPHI(PHINode *PN); + bool ProcessBranchOnXOR(BinaryOperator *BO); + + bool SimplifyPartiallyRedundantLoad(LoadInst *LI); + }; +} + +char JumpThreading::ID = 0; +INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading", + "Jump Threading", false, false) +INITIALIZE_PASS_DEPENDENCY(LazyValueInfo) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(JumpThreading, "jump-threading", + "Jump Threading", false, false) + +// Public interface to the Jump Threading pass +FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); } + +/// runOnFunction - Top level algorithm. +/// +bool JumpThreading::runOnFunction(Function &F) { + DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); + TD = getAnalysisIfAvailable<TargetData>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + LVI = &getAnalysis<LazyValueInfo>(); + + FindLoopHeaders(F); + + bool Changed, EverChanged = false; + do { + Changed = false; + for (Function::iterator I = F.begin(), E = F.end(); I != E;) { + BasicBlock *BB = I; + // Thread all of the branches we can over this block. + while (ProcessBlock(BB)) + Changed = true; + + ++I; + + // If the block is trivially dead, zap it. This eliminates the successor + // edges which simplifies the CFG. + if (pred_begin(BB) == pred_end(BB) && + BB != &BB->getParent()->getEntryBlock()) { + DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName() + << "' with terminator: " << *BB->getTerminator() << '\n'); + LoopHeaders.erase(BB); + LVI->eraseBlock(BB); + DeleteDeadBlock(BB); + Changed = true; + continue; + } + + BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); + + // Can't thread an unconditional jump, but if the block is "almost + // empty", we can replace uses of it with uses of the successor and make + // this dead. 
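+ // Illustrative example (block names are hypothetical): a block such as
+ //   bb:
+ //     br label %succ
+ // that contains nothing beyond phis, debug intrinsics and its unconditional
+ // branch can usually be removed by pointing its predecessors at %succ.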
+ if (BI && BI->isUnconditional() && + BB != &BB->getParent()->getEntryBlock() && + // If the terminator is the only non-phi instruction, try to nuke it. + BB->getFirstNonPHIOrDbg()->isTerminator()) { + // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the + // block, we have to make sure it isn't in the LoopHeaders set. We + // reinsert afterward if needed. + bool ErasedFromLoopHeaders = LoopHeaders.erase(BB); + BasicBlock *Succ = BI->getSuccessor(0); + + // FIXME: It is always conservatively correct to drop the info + // for a block even if it doesn't get erased. This isn't totally + // awesome, but it allows us to use AssertingVH to prevent nasty + // dangling pointer issues within LazyValueInfo. + LVI->eraseBlock(BB); + if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) { + Changed = true; + // If we deleted BB and BB was the header of a loop, then the + // successor is now the header of the loop. + BB = Succ; + } + + if (ErasedFromLoopHeaders) + LoopHeaders.insert(BB); + } + } + EverChanged |= Changed; + } while (Changed); + + LoopHeaders.clear(); + return EverChanged; +} + +/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to +/// thread across it. +static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) { + /// Ignore PHI nodes, these will be flattened when duplication happens. + BasicBlock::const_iterator I = BB->getFirstNonPHI(); + + // FIXME: THREADING will delete values that are just used to compute the + // branch, so they shouldn't count against the duplication cost. + + + // Sum up the cost of each instruction until we get to the terminator. Don't + // include the terminator because the copy won't include it. + unsigned Size = 0; + for (; !isa<TerminatorInst>(I); ++I) { + // Debugger intrinsics don't incur code size. + if (isa<DbgInfoIntrinsic>(I)) continue; + + // If this is a pointer->pointer bitcast, it is free. + if (isa<BitCastInst>(I) && I->getType()->isPointerTy()) + continue; + + // All other instructions count for at least one unit. + ++Size; + + // Calls are more expensive. If they are non-intrinsic calls, we model them + // as having cost of 4. If they are a non-vector intrinsic, we model them + // as having cost of 2 total, and if they are a vector intrinsic, we model + // them as having cost 1. + if (const CallInst *CI = dyn_cast<CallInst>(I)) { + if (!isa<IntrinsicInst>(CI)) + Size += 3; + else if (!CI->getType()->isVectorTy()) + Size += 1; + } + } + + // Threading through a switch statement is particularly profitable. If this + // block ends in a switch, decrease its cost to make it more likely to happen. + if (isa<SwitchInst>(I)) + Size = Size > 6 ? Size-6 : 0; + + // The same holds for indirect branches, but slightly more so. + if (isa<IndirectBrInst>(I)) + Size = Size > 8 ? Size-8 : 0; + + return Size; +} + +/// FindLoopHeaders - We do not want jump threading to turn proper loop +/// structures into irreducible loops. Doing this breaks up the loop nesting +/// hierarchy and pessimizes later transformations. To prevent this from +/// happening, we first have to find the loop headers. Here we approximate this +/// by finding targets of backedges in the CFG. +/// +/// Note that there definitely are cases when we want to allow threading of +/// edges across a loop header. For example, threading a jump from outside the +/// loop (the preheader) to an exit block of the loop is definitely profitable. 
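+/// (For instance, if values known in the preheader already decide the exit
+/// condition, the jump from the preheader can be threaded straight to an
+/// exit block without entering the loop.)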
+/// It is also almost always profitable to thread backedges from within the loop +/// to exit blocks, and is often profitable to thread backedges to other blocks +/// within the loop (forming a nested loop). This simple analysis is not rich +/// enough to track all of these properties and keep it up-to-date as the CFG +/// mutates, so we don't allow any of these transformations. +/// +void JumpThreading::FindLoopHeaders(Function &F) { + SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges; + FindFunctionBackedges(F, Edges); + + for (unsigned i = 0, e = Edges.size(); i != e; ++i) + LoopHeaders.insert(const_cast<BasicBlock*>(Edges[i].second)); +} + +/// getKnownConstant - Helper method to determine if we can thread over a +/// terminator with the given value as its condition, and if so what value to +/// use for that. What kind of value this is depends on whether we want an +/// integer or a block address, but an undef is always accepted. +/// Returns null if Val is null or not an appropriate constant. +static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) { + if (!Val) + return 0; + + // Undef is "known" enough. + if (UndefValue *U = dyn_cast<UndefValue>(Val)) + return U; + + if (Preference == WantBlockAddress) + return dyn_cast<BlockAddress>(Val->stripPointerCasts()); + + return dyn_cast<ConstantInt>(Val); +} + +/// ComputeValueKnownInPredecessors - Given a basic block BB and a value V, see +/// if we can infer that the value is a known ConstantInt/BlockAddress or undef +/// in any of our predecessors. If so, return the known list of value and pred +/// BB in the result vector. +/// +/// This returns true if there were any known values. +/// +bool JumpThreading:: +ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, + ConstantPreference Preference) { + // This method walks up use-def chains recursively. Because of this, we could + // get into an infinite loop going around loops in the use-def chain. To + // prevent this, keep track of what (value, block) pairs we've already visited + // and terminate the search if we loop back to them + if (!RecursionSet.insert(std::make_pair(V, BB)).second) + return false; + + // An RAII help to remove this pair from the recursion set once the recursion + // stack pops back out again. + RecursionSetRemover remover(RecursionSet, std::make_pair(V, BB)); + + // If V is a constant, then it is known in all predecessors. + if (Constant *KC = getKnownConstant(V, Preference)) { + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) + Result.push_back(std::make_pair(KC, *PI)); + + return true; + } + + // If V is a non-instruction value, or an instruction in a different block, + // then it can't be derived from a PHI. + Instruction *I = dyn_cast<Instruction>(V); + if (I == 0 || I->getParent() != BB) { + + // Okay, if this is a live-in value, see if it has a known value at the end + // of any of our predecessors. + // + // FIXME: This should be an edge property, not a block end property. + /// TODO: Per PR2563, we could infer value range information about a + /// predecessor based on its terminator. + // + // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if + // "I" is a non-local compare-with-a-constant instruction. This would be + // able to handle value inequalities better, for example if the compare is + // "X < 4" and "X < 3" is known true but "X < 4" itself is not available. + // Perhaps getConstantOnEdge should be smart enough to do this? 
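+ // Illustrative example (names are hypothetical): if a predecessor P ends in
+ //   %c = icmp eq i32 %v, 42
+ //   br i1 %c, label %BB, label %other
+ // then LazyValueInfo can report that %v is 42 on the edge from P into BB,
+ // even though %v is not a constant anywhere else.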
+ + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + BasicBlock *P = *PI; + // If the value is known by LazyValueInfo to be a constant in a + // predecessor, use that information to try to thread this block. + Constant *PredCst = LVI->getConstantOnEdge(V, P, BB); + if (Constant *KC = getKnownConstant(PredCst, Preference)) + Result.push_back(std::make_pair(KC, P)); + } + + return !Result.empty(); + } + + /// If I is a PHI node, then we know the incoming values for any constants. + if (PHINode *PN = dyn_cast<PHINode>(I)) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *InVal = PN->getIncomingValue(i); + if (Constant *KC = getKnownConstant(InVal, Preference)) { + Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i))); + } else { + Constant *CI = LVI->getConstantOnEdge(InVal, + PN->getIncomingBlock(i), BB); + if (Constant *KC = getKnownConstant(CI, Preference)) + Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i))); + } + } + + return !Result.empty(); + } + + PredValueInfoTy LHSVals, RHSVals; + + // Handle some boolean conditions. + if (I->getType()->getPrimitiveSizeInBits() == 1) { + assert(Preference == WantInteger && "One-bit non-integer type?"); + // X | true -> true + // X & false -> false + if (I->getOpcode() == Instruction::Or || + I->getOpcode() == Instruction::And) { + ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, + WantInteger); + ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals, + WantInteger); + + if (LHSVals.empty() && RHSVals.empty()) + return false; + + ConstantInt *InterestingVal; + if (I->getOpcode() == Instruction::Or) + InterestingVal = ConstantInt::getTrue(I->getContext()); + else + InterestingVal = ConstantInt::getFalse(I->getContext()); + + SmallPtrSet<BasicBlock*, 4> LHSKnownBBs; + + // Scan for the sentinel. If we find an undef, force it to the + // interesting value: x|undef -> true and x&undef -> false. + for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) + if (LHSVals[i].first == InterestingVal || + isa<UndefValue>(LHSVals[i].first)) { + Result.push_back(LHSVals[i]); + Result.back().first = InterestingVal; + LHSKnownBBs.insert(LHSVals[i].second); + } + for (unsigned i = 0, e = RHSVals.size(); i != e; ++i) + if (RHSVals[i].first == InterestingVal || + isa<UndefValue>(RHSVals[i].first)) { + // If we already inferred a value for this block on the LHS, don't + // re-add it. + if (!LHSKnownBBs.count(RHSVals[i].second)) { + Result.push_back(RHSVals[i]); + Result.back().first = InterestingVal; + } + } + + return !Result.empty(); + } + + // Handle the NOT form of XOR. + if (I->getOpcode() == Instruction::Xor && + isa<ConstantInt>(I->getOperand(1)) && + cast<ConstantInt>(I->getOperand(1))->isOne()) { + ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result, + WantInteger); + if (Result.empty()) + return false; + + // Invert the known values. + for (unsigned i = 0, e = Result.size(); i != e; ++i) + Result[i].first = ConstantExpr::getNot(Result[i].first); + + return true; + } + + // Try to simplify some other binary operator values. + } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { + assert(Preference != WantBlockAddress + && "A binary operator creating a block address?"); + if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) { + PredValueInfoTy LHSVals; + ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals, + WantInteger); + + // Try to use constant folding to simplify the binary operator. 
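+ // Illustrative example (names are hypothetical): if the operator is
+ //   %z = add i32 %x, 1
+ // and %x is known to be 4 in some predecessor, the add folds to the
+ // constant 5 on that edge.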
+ for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) { + Constant *V = LHSVals[i].first; + Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI); + + if (Constant *KC = getKnownConstant(Folded, WantInteger)) + Result.push_back(std::make_pair(KC, LHSVals[i].second)); + } + } + + return !Result.empty(); + } + + // Handle compare with phi operand, where the PHI is defined in this block. + if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) { + assert(Preference == WantInteger && "Compares only produce integers"); + PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0)); + if (PN && PN->getParent() == BB) { + // We can do this simplification if any comparisons fold to true or false. + // See if any do. + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + BasicBlock *PredBB = PN->getIncomingBlock(i); + Value *LHS = PN->getIncomingValue(i); + Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB); + + Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, TD); + if (Res == 0) { + if (!isa<Constant>(RHS)) + continue; + + LazyValueInfo::Tristate + ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS, + cast<Constant>(RHS), PredBB, BB); + if (ResT == LazyValueInfo::Unknown) + continue; + Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT); + } + + if (Constant *KC = getKnownConstant(Res, WantInteger)) + Result.push_back(std::make_pair(KC, PredBB)); + } + + return !Result.empty(); + } + + + // If comparing a live-in value against a constant, see if we know the + // live-in value on any predecessors. + if (isa<Constant>(Cmp->getOperand(1)) && Cmp->getType()->isIntegerTy()) { + if (!isa<Instruction>(Cmp->getOperand(0)) || + cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) { + Constant *RHSCst = cast<Constant>(Cmp->getOperand(1)); + + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB);PI != E; ++PI){ + BasicBlock *P = *PI; + // If the value is known by LazyValueInfo to be a constant in a + // predecessor, use that information to try to thread this block. + LazyValueInfo::Tristate Res = + LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0), + RHSCst, P, BB); + if (Res == LazyValueInfo::Unknown) + continue; + + Constant *ResC = ConstantInt::get(Cmp->getType(), Res); + Result.push_back(std::make_pair(ResC, P)); + } + + return !Result.empty(); + } + + // Try to find a constant value for the LHS of a comparison, + // and evaluate it statically if we can. + if (Constant *CmpConst = dyn_cast<Constant>(Cmp->getOperand(1))) { + PredValueInfoTy LHSVals; + ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, + WantInteger); + + for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) { + Constant *V = LHSVals[i].first; + Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(), + V, CmpConst); + if (Constant *KC = getKnownConstant(Folded, WantInteger)) + Result.push_back(std::make_pair(KC, LHSVals[i].second)); + } + + return !Result.empty(); + } + } + } + + if (SelectInst *SI = dyn_cast<SelectInst>(I)) { + // Handle select instructions where at least one operand is a known constant + // and we can figure out the condition value for any predecessor block. 
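+ // Illustrative example (names are hypothetical): given
+ //   %s = select i1 %c, i32 7, i32 %y
+ // if %c is known to be true in some predecessor, then %s is the constant 7
+ // on the edge from that predecessor, even though %y is unknown.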
+ Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference); + Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference); + PredValueInfoTy Conds; + if ((TrueVal || FalseVal) && + ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds, + WantInteger)) { + for (unsigned i = 0, e = Conds.size(); i != e; ++i) { + Constant *Cond = Conds[i].first; + + // Figure out what value to use for the condition. + bool KnownCond; + if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond)) { + // A known boolean. + KnownCond = CI->isOne(); + } else { + assert(isa<UndefValue>(Cond) && "Unexpected condition value"); + // Either operand will do, so be sure to pick the one that's a known + // constant. + // FIXME: Do this more cleverly if both values are known constants? + KnownCond = (TrueVal != 0); + } + + // See if the select has a known constant value for this predecessor. + if (Constant *Val = KnownCond ? TrueVal : FalseVal) + Result.push_back(std::make_pair(Val, Conds[i].second)); + } + + return !Result.empty(); + } + } + + // If all else fails, see if LVI can figure out a constant value for us. + Constant *CI = LVI->getConstant(V, BB); + if (Constant *KC = getKnownConstant(CI, Preference)) { + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) + Result.push_back(std::make_pair(KC, *PI)); + } + + return !Result.empty(); +} + + + +/// GetBestDestForBranchOnUndef - If we determine that the specified block ends +/// in an undefined jump, decide which block is best to revector to. +/// +/// Since we can pick an arbitrary destination, we pick the successor with the +/// fewest predecessors. This should reduce the in-degree of the others. +/// +static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) { + TerminatorInst *BBTerm = BB->getTerminator(); + unsigned MinSucc = 0; + BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc); + // Compute the successor with the minimum number of predecessors. + unsigned MinNumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB)); + for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) { + TestBB = BBTerm->getSuccessor(i); + unsigned NumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB)); + if (NumPreds < MinNumPreds) { + MinSucc = i; + MinNumPreds = NumPreds; + } + } + + return MinSucc; +} + +static bool hasAddressTakenAndUsed(BasicBlock *BB) { + if (!BB->hasAddressTaken()) return false; + + // If the block has its address taken, it may be a tree of dead constants + // hanging off of it. These shouldn't keep the block alive. + BlockAddress *BA = BlockAddress::get(BB); + BA->removeDeadConstantUsers(); + return !BA->use_empty(); +} + +/// ProcessBlock - If there are any predecessors whose control can be threaded +/// through to a successor, transform them now. +bool JumpThreading::ProcessBlock(BasicBlock *BB) { + // If the block is trivially dead, just return and let the caller nuke it. + // This simplifies other transformations. + if (pred_begin(BB) == pred_end(BB) && + BB != &BB->getParent()->getEntryBlock()) + return false; + + // If this block has a single predecessor, and if that pred has a single + // successor, merge the blocks. This encourages recursive jump threading + // because now the condition in this block can be threaded through + // predecessors of our predecessor block. 
+ if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { + if (SinglePred->getTerminator()->getNumSuccessors() == 1 && + SinglePred != BB && !hasAddressTakenAndUsed(BB)) { + // If SinglePred was a loop header, BB becomes one. + if (LoopHeaders.erase(SinglePred)) + LoopHeaders.insert(BB); + + // Remember if SinglePred was the entry block of the function. If so, we + // will need to move BB back to the entry position. + bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); + LVI->eraseBlock(SinglePred); + MergeBasicBlockIntoOnlyPred(BB); + + if (isEntry && BB != &BB->getParent()->getEntryBlock()) + BB->moveBefore(&BB->getParent()->getEntryBlock()); + return true; + } + } + + // What kind of constant we're looking for. + ConstantPreference Preference = WantInteger; + + // Look to see if the terminator is a conditional branch, switch or indirect + // branch, if not we can't thread it. + Value *Condition; + Instruction *Terminator = BB->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) { + // Can't thread an unconditional jump. + if (BI->isUnconditional()) return false; + Condition = BI->getCondition(); + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) { + Condition = SI->getCondition(); + } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) { + // Can't thread indirect branch with no successors. + if (IB->getNumSuccessors() == 0) return false; + Condition = IB->getAddress()->stripPointerCasts(); + Preference = WantBlockAddress; + } else { + return false; // Must be an invoke. + } + + // Run constant folding to see if we can reduce the condition to a simple + // constant. + if (Instruction *I = dyn_cast<Instruction>(Condition)) { + Value *SimpleVal = ConstantFoldInstruction(I, TD, TLI); + if (SimpleVal) { + I->replaceAllUsesWith(SimpleVal); + I->eraseFromParent(); + Condition = SimpleVal; + } + } + + // If the terminator is branching on an undef, we can pick any of the + // successors to branch to. Let GetBestDestForJumpOnUndef decide. + if (isa<UndefValue>(Condition)) { + unsigned BestSucc = GetBestDestForJumpOnUndef(BB); + + // Fold the branch/switch. + TerminatorInst *BBTerm = BB->getTerminator(); + for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) { + if (i == BestSucc) continue; + BBTerm->getSuccessor(i)->removePredecessor(BB, true); + } + + DEBUG(dbgs() << " In block '" << BB->getName() + << "' folding undef terminator: " << *BBTerm << '\n'); + BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm); + BBTerm->eraseFromParent(); + return true; + } + + // If the terminator of this block is branching on a constant, simplify the + // terminator to an unconditional branch. This can occur due to threading in + // other blocks. + if (getKnownConstant(Condition, Preference)) { + DEBUG(dbgs() << " In block '" << BB->getName() + << "' folding terminator: " << *BB->getTerminator() << '\n'); + ++NumFolds; + ConstantFoldTerminator(BB, true); + return true; + } + + Instruction *CondInst = dyn_cast<Instruction>(Condition); + + // All the rest of our checks depend on the condition being an instruction. + if (CondInst == 0) { + // FIXME: Unify this with code below. + if (ProcessThreadableEdges(Condition, BB, Preference)) + return true; + return false; + } + + + if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) { + // For a comparison where the LHS is outside this block, it's possible + // that we've branched on it before. Used LVI to see if we can simplify + // the branch based on that. 
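+ // Illustrative example (names are hypothetical): if the block branches on
+ //   %c = icmp slt i32 %x, 10
+ // with %x defined elsewhere, and every incoming edge already implies
+ // %x < 10, the conditional branch can be rewritten as an unconditional one.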
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); + Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1)); + pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + if (CondBr && CondConst && CondBr->isConditional() && PI != PE && + (!isa<Instruction>(CondCmp->getOperand(0)) || + cast<Instruction>(CondCmp->getOperand(0))->getParent() != BB)) { + // For predecessor edge, determine if the comparison is true or false + // on that edge. If they're all true or all false, we can simplify the + // branch. + // FIXME: We could handle mixed true/false by duplicating code. + LazyValueInfo::Tristate Baseline = + LVI->getPredicateOnEdge(CondCmp->getPredicate(), CondCmp->getOperand(0), + CondConst, *PI, BB); + if (Baseline != LazyValueInfo::Unknown) { + // Check that all remaining incoming values match the first one. + while (++PI != PE) { + LazyValueInfo::Tristate Ret = + LVI->getPredicateOnEdge(CondCmp->getPredicate(), + CondCmp->getOperand(0), CondConst, *PI, BB); + if (Ret != Baseline) break; + } + + // If we terminated early, then one of the values didn't match. + if (PI == PE) { + unsigned ToRemove = Baseline == LazyValueInfo::True ? 1 : 0; + unsigned ToKeep = Baseline == LazyValueInfo::True ? 0 : 1; + CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true); + BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr); + CondBr->eraseFromParent(); + return true; + } + } + } + } + + // Check for some cases that are worth simplifying. Right now we want to look + // for loads that are used by a switch or by the condition for the branch. If + // we see one, check to see if it's partially redundant. If so, insert a PHI + // which can then be used to thread the values. + // + Value *SimplifyValue = CondInst; + if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue)) + if (isa<Constant>(CondCmp->getOperand(1))) + SimplifyValue = CondCmp->getOperand(0); + + // TODO: There are other places where load PRE would be profitable, such as + // more complex comparisons. + if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue)) + if (SimplifyPartiallyRedundantLoad(LI)) + return true; + + + // Handle a variety of cases where we are branching on something derived from + // a PHI node in the current block. If we can prove that any predecessors + // compute a predictable value based on a PHI node, thread those predecessors. + // + if (ProcessThreadableEdges(CondInst, BB, Preference)) + return true; + + // If this is an otherwise-unfoldable branch on a phi node in the current + // block, see if we can simplify. + if (PHINode *PN = dyn_cast<PHINode>(CondInst)) + if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator())) + return ProcessBranchOnPHI(PN); + + + // If this is an otherwise-unfoldable branch on a XOR, see if we can simplify. + if (CondInst->getOpcode() == Instruction::Xor && + CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator())) + return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst)); + + + // TODO: If we have: "br (X > 0)" and we have a predecessor where we know + // "(X == 4)", thread through this block. + + return false; +} + + +/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant +/// load instruction, eliminate it by replacing it with a PHI node. This is an +/// important optimization that encourages jump threading, and needs to be run +/// interlaced with other jump threading tasks. +bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { + // Don't hack volatile/atomic loads. 
+ if (!LI->isSimple()) return false; + + // If the load is defined in a block with exactly one predecessor, it can't be + // partially redundant. + BasicBlock *LoadBB = LI->getParent(); + if (LoadBB->getSinglePredecessor()) + return false; + + Value *LoadedPtr = LI->getOperand(0); + + // If the loaded operand is defined in the LoadBB, it can't be available. + // TODO: Could do simple PHI translation, that would be fun :) + if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr)) + if (PtrOp->getParent() == LoadBB) + return false; + + // Scan a few instructions up from the load, to see if it is obviously live at + // the entry to its block. + BasicBlock::iterator BBIt = LI; + + if (Value *AvailableVal = + FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, 6)) { + // If the value if the load is locally available within the block, just use + // it. This frequently occurs for reg2mem'd allocas. + //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n"; + + // If the returned value is the load itself, replace with an undef. This can + // only happen in dead loops. + if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType()); + LI->replaceAllUsesWith(AvailableVal); + LI->eraseFromParent(); + return true; + } + + // Otherwise, if we scanned the whole block and got to the top of the block, + // we know the block is locally transparent to the load. If not, something + // might clobber its value. + if (BBIt != LoadBB->begin()) + return false; + + // If all of the loads and stores that feed the value have the same TBAA tag, + // then we can propagate it onto any newly inserted loads. + MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa); + + SmallPtrSet<BasicBlock*, 8> PredsScanned; + typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy; + AvailablePredsTy AvailablePreds; + BasicBlock *OneUnavailablePred = 0; + + // If we got here, the loaded value is transparent through to the start of the + // block. Check to see if it is available in any of the predecessor blocks. + for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB); + PI != PE; ++PI) { + BasicBlock *PredBB = *PI; + + // If we already scanned this predecessor, skip it. + if (!PredsScanned.insert(PredBB)) + continue; + + // Scan the predecessor to see if the value is available in the pred. + BBIt = PredBB->end(); + MDNode *ThisTBAATag = 0; + Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6, + 0, &ThisTBAATag); + if (!PredAvailable) { + OneUnavailablePred = PredBB; + continue; + } + + // If tbaa tags disagree or are not present, forget about them. + if (TBAATag != ThisTBAATag) TBAATag = 0; + + // If so, this load is partially redundant. Remember this info so that we + // can create a PHI node. + AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable)); + } + + // If the loaded value isn't available in any predecessor, it isn't partially + // redundant. + if (AvailablePreds.empty()) return false; + + // Okay, the loaded value is available in at least one (and maybe all!) + // predecessors. If the value is unavailable in more than one unique + // predecessor, we want to insert a merge block for those common predecessors. + // This ensures that we only have to insert one reload, thus not increasing + // code size. + BasicBlock *UnavailablePred = 0; + + // If there is exactly one predecessor where the value is unavailable, the + // already computed 'OneUnavailablePred' block is it. If it ends in an + // unconditional branch, we know that it isn't a critical edge. 
+ if (PredsScanned.size() == AvailablePreds.size()+1 && + OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) { + UnavailablePred = OneUnavailablePred; + } else if (PredsScanned.size() != AvailablePreds.size()) { + // Otherwise, we had multiple unavailable predecessors or we had a critical + // edge from the one. + SmallVector<BasicBlock*, 8> PredsToSplit; + SmallPtrSet<BasicBlock*, 8> AvailablePredSet; + + for (unsigned i = 0, e = AvailablePreds.size(); i != e; ++i) + AvailablePredSet.insert(AvailablePreds[i].first); + + // Add all the unavailable predecessors to the PredsToSplit list. + for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB); + PI != PE; ++PI) { + BasicBlock *P = *PI; + // If the predecessor is an indirect goto, we can't split the edge. + if (isa<IndirectBrInst>(P->getTerminator())) + return false; + + if (!AvailablePredSet.count(P)) + PredsToSplit.push_back(P); + } + + // Split them out to their own block. + UnavailablePred = + SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split", this); + } + + // If the value isn't available in all predecessors, then there will be + // exactly one where it isn't available. Insert a load on that edge and add + // it to the AvailablePreds list. + if (UnavailablePred) { + assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 && + "Can't handle critical edge here!"); + LoadInst *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false, + LI->getAlignment(), + UnavailablePred->getTerminator()); + NewVal->setDebugLoc(LI->getDebugLoc()); + if (TBAATag) + NewVal->setMetadata(LLVMContext::MD_tbaa, TBAATag); + + AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal)); + } + + // Now we know that each predecessor of this block has a value in + // AvailablePreds, sort them for efficient access as we're walking the preds. + array_pod_sort(AvailablePreds.begin(), AvailablePreds.end()); + + // Create a PHI node at the start of the block for the PRE'd load value. + pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB); + PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "", + LoadBB->begin()); + PN->takeName(LI); + PN->setDebugLoc(LI->getDebugLoc()); + + // Insert new entries into the PHI for each predecessor. A single block may + // have multiple entries here. + for (pred_iterator PI = PB; PI != PE; ++PI) { + BasicBlock *P = *PI; + AvailablePredsTy::iterator I = + std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(), + std::make_pair(P, (Value*)0)); + + assert(I != AvailablePreds.end() && I->first == P && + "Didn't find entry for predecessor!"); + + PN->addIncoming(I->second, I->first); + } + + //cerr << "PRE: " << *LI << *PN << "\n"; + + LI->replaceAllUsesWith(PN); + LI->eraseFromParent(); + + return true; +} + +/// FindMostPopularDest - The specified list contains multiple possible +/// threadable destinations. Pick the one that occurs the most frequently in +/// the list. +static BasicBlock * +FindMostPopularDest(BasicBlock *BB, + const SmallVectorImpl<std::pair<BasicBlock*, + BasicBlock*> > &PredToDestList) { + assert(!PredToDestList.empty()); + + // Determine popularity. If there are multiple possible destinations, we + // explicitly choose to ignore 'undef' destinations. We prefer to thread + // blocks with known and real destinations to threading undef. We'll handle + // them later if interesting. 
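+ // Illustrative example (names are hypothetical): if three predecessors are
+ // known to jump to %then and one to %else, %then is the most popular
+ // destination, so its predecessors are factored together and threaded first.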
+ DenseMap<BasicBlock*, unsigned> DestPopularity; + for (unsigned i = 0, e = PredToDestList.size(); i != e; ++i) + if (PredToDestList[i].second) + DestPopularity[PredToDestList[i].second]++; + + // Find the most popular dest. + DenseMap<BasicBlock*, unsigned>::iterator DPI = DestPopularity.begin(); + BasicBlock *MostPopularDest = DPI->first; + unsigned Popularity = DPI->second; + SmallVector<BasicBlock*, 4> SamePopularity; + + for (++DPI; DPI != DestPopularity.end(); ++DPI) { + // If the popularity of this entry isn't higher than the popularity we've + // seen so far, ignore it. + if (DPI->second < Popularity) + ; // ignore. + else if (DPI->second == Popularity) { + // If it is the same as what we've seen so far, keep track of it. + SamePopularity.push_back(DPI->first); + } else { + // If it is more popular, remember it. + SamePopularity.clear(); + MostPopularDest = DPI->first; + Popularity = DPI->second; + } + } + + // Okay, now we know the most popular destination. If there is more than one + // destination, we need to determine one. This is arbitrary, but we need + // to make a deterministic decision. Pick the first one that appears in the + // successor list. + if (!SamePopularity.empty()) { + SamePopularity.push_back(MostPopularDest); + TerminatorInst *TI = BB->getTerminator(); + for (unsigned i = 0; ; ++i) { + assert(i != TI->getNumSuccessors() && "Didn't find any successor!"); + + if (std::find(SamePopularity.begin(), SamePopularity.end(), + TI->getSuccessor(i)) == SamePopularity.end()) + continue; + + MostPopularDest = TI->getSuccessor(i); + break; + } + } + + // Okay, we have finally picked the most popular destination. + return MostPopularDest; +} + +bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, + ConstantPreference Preference) { + // If threading this would thread across a loop header, don't even try to + // thread the edge. + if (LoopHeaders.count(BB)) + return false; + + PredValueInfoTy PredValues; + if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference)) + return false; + + assert(!PredValues.empty() && + "ComputeValueKnownInPredecessors returned true with no values"); + + DEBUG(dbgs() << "IN BB: " << *BB; + for (unsigned i = 0, e = PredValues.size(); i != e; ++i) { + dbgs() << " BB '" << BB->getName() << "': FOUND condition = " + << *PredValues[i].first + << " for pred '" << PredValues[i].second->getName() << "'.\n"; + }); + + // Decide what we want to thread through. Convert our list of known values to + // a list of known destinations for each pred. This also discards duplicate + // predecessors and keeps track of the undefined inputs (which are represented + // as a null dest in the PredToDestList). + SmallPtrSet<BasicBlock*, 16> SeenPreds; + SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList; + + BasicBlock *OnlyDest = 0; + BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL; + + for (unsigned i = 0, e = PredValues.size(); i != e; ++i) { + BasicBlock *Pred = PredValues[i].second; + if (!SeenPreds.insert(Pred)) + continue; // Duplicate predecessor entry. + + // If the predecessor ends with an indirect goto, we can't change its + // destination. 
+ if (isa<IndirectBrInst>(Pred->getTerminator())) + continue; + + Constant *Val = PredValues[i].first; + + BasicBlock *DestBB; + if (isa<UndefValue>(Val)) + DestBB = 0; + else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) + DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero()); + else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { + DestBB = SI->findCaseValue(cast<ConstantInt>(Val)).getCaseSuccessor(); + } else { + assert(isa<IndirectBrInst>(BB->getTerminator()) + && "Unexpected terminator"); + DestBB = cast<BlockAddress>(Val)->getBasicBlock(); + } + + // If we have exactly one destination, remember it for efficiency below. + if (PredToDestList.empty()) + OnlyDest = DestBB; + else if (OnlyDest != DestBB) + OnlyDest = MultipleDestSentinel; + + PredToDestList.push_back(std::make_pair(Pred, DestBB)); + } + + // If all edges were unthreadable, we fail. + if (PredToDestList.empty()) + return false; + + // Determine which is the most common successor. If we have many inputs and + // this block is a switch, we want to start by threading the batch that goes + // to the most popular destination first. If we only know about one + // threadable destination (the common case) we can avoid this. + BasicBlock *MostPopularDest = OnlyDest; + + if (MostPopularDest == MultipleDestSentinel) + MostPopularDest = FindMostPopularDest(BB, PredToDestList); + + // Now that we know what the most popular destination is, factor all + // predecessors that will jump to it into a single predecessor. + SmallVector<BasicBlock*, 16> PredsToFactor; + for (unsigned i = 0, e = PredToDestList.size(); i != e; ++i) + if (PredToDestList[i].second == MostPopularDest) { + BasicBlock *Pred = PredToDestList[i].first; + + // This predecessor may be a switch or something else that has multiple + // edges to the block. Factor each of these edges by listing them + // according to # occurrences in PredsToFactor. + TerminatorInst *PredTI = Pred->getTerminator(); + for (unsigned i = 0, e = PredTI->getNumSuccessors(); i != e; ++i) + if (PredTI->getSuccessor(i) == BB) + PredsToFactor.push_back(Pred); + } + + // If the threadable edges are branching on an undefined value, we get to pick + // the destination that these predecessors should get to. + if (MostPopularDest == 0) + MostPopularDest = BB->getTerminator()-> + getSuccessor(GetBestDestForJumpOnUndef(BB)); + + // Ok, try to thread it! + return ThreadEdge(BB, PredsToFactor, MostPopularDest); +} + +/// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on +/// a PHI node in the current block. See if there are any simplifications we +/// can do based on inputs to the phi node. +/// +bool JumpThreading::ProcessBranchOnPHI(PHINode *PN) { + BasicBlock *BB = PN->getParent(); + + // TODO: We could make use of this to do it once for blocks with common PHI + // values. + SmallVector<BasicBlock*, 1> PredBBs; + PredBBs.resize(1); + + // If any of the predecessor blocks end in an unconditional branch, we can + // *duplicate* the conditional branch into that block in order to further + // encourage jump threading and to eliminate cases where we have branch on a + // phi of an icmp (branch on icmp is much better). + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + BasicBlock *PredBB = PN->getIncomingBlock(i); + if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator())) + if (PredBr->isUnconditional()) { + PredBBs[0] = PredBB; + // Try to duplicate BB into PredBB. 
+ if (DuplicateCondBranchOnPHIIntoPred(BB, PredBBs)) + return true; + } + } + + return false; +} + +/// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on +/// a xor instruction in the current block. See if there are any +/// simplifications we can do based on inputs to the xor. +/// +bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { + BasicBlock *BB = BO->getParent(); + + // If either the LHS or RHS of the xor is a constant, don't do this + // optimization. + if (isa<ConstantInt>(BO->getOperand(0)) || + isa<ConstantInt>(BO->getOperand(1))) + return false; + + // If the first instruction in BB isn't a phi, we won't be able to infer + // anything special about any particular predecessor. + if (!isa<PHINode>(BB->front())) + return false; + + // If we have a xor as the branch input to this block, and we know that the + // LHS or RHS of the xor in any predecessor is true/false, then we can clone + // the condition into the predecessor and fix that value to true, saving some + // logical ops on that path and encouraging other paths to simplify. + // + // This copies something like this: + // + // BB: + // %X = phi i1 [1], [%X'] + // %Y = icmp eq i32 %A, %B + // %Z = xor i1 %X, %Y + // br i1 %Z, ... + // + // Into: + // BB': + // %Y = icmp ne i32 %A, %B + // br i1 %Z, ... + + PredValueInfoTy XorOpValues; + bool isLHS = true; + if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues, + WantInteger)) { + assert(XorOpValues.empty()); + if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues, + WantInteger)) + return false; + isLHS = false; + } + + assert(!XorOpValues.empty() && + "ComputeValueKnownInPredecessors returned true with no values"); + + // Scan the information to see which is most popular: true or false. The + // predecessors can be of the set true, false, or undef. + unsigned NumTrue = 0, NumFalse = 0; + for (unsigned i = 0, e = XorOpValues.size(); i != e; ++i) { + if (isa<UndefValue>(XorOpValues[i].first)) + // Ignore undefs for the count. + continue; + if (cast<ConstantInt>(XorOpValues[i].first)->isZero()) + ++NumFalse; + else + ++NumTrue; + } + + // Determine which value to split on, true, false, or undef if neither. + ConstantInt *SplitVal = 0; + if (NumTrue > NumFalse) + SplitVal = ConstantInt::getTrue(BB->getContext()); + else if (NumTrue != 0 || NumFalse != 0) + SplitVal = ConstantInt::getFalse(BB->getContext()); + + // Collect all of the blocks that this can be folded into so that we can + // factor this once and clone it once. + SmallVector<BasicBlock*, 8> BlocksToFoldInto; + for (unsigned i = 0, e = XorOpValues.size(); i != e; ++i) { + if (XorOpValues[i].first != SplitVal && + !isa<UndefValue>(XorOpValues[i].first)) + continue; + + BlocksToFoldInto.push_back(XorOpValues[i].second); + } + + // If we inferred a value for all of the predecessors, then duplication won't + // help us. However, we can just replace the LHS or RHS with the constant. + if (BlocksToFoldInto.size() == + cast<PHINode>(BB->front()).getNumIncomingValues()) { + if (SplitVal == 0) { + // If all preds provide undef, just nuke the xor, because it is undef too. + BO->replaceAllUsesWith(UndefValue::get(BO->getType())); + BO->eraseFromParent(); + } else if (SplitVal->isZero()) { + // If all preds provide 0, replace the xor with the other input. + BO->replaceAllUsesWith(BO->getOperand(isLHS)); + BO->eraseFromParent(); + } else { + // If all preds provide 1, set the computed value to 1. 
+ BO->setOperand(!isLHS, SplitVal); + } + + return true; + } + + // Try to duplicate BB into PredBB. + return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto); +} + + +/// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new +/// predecessor to the PHIBB block. If it has PHI nodes, add entries for +/// NewPred using the entries from OldPred (suitably mapped). +static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB, + BasicBlock *OldPred, + BasicBlock *NewPred, + DenseMap<Instruction*, Value*> &ValueMap) { + for (BasicBlock::iterator PNI = PHIBB->begin(); + PHINode *PN = dyn_cast<PHINode>(PNI); ++PNI) { + // Ok, we have a PHI node. Figure out what the incoming value was for the + // DestBlock. + Value *IV = PN->getIncomingValueForBlock(OldPred); + + // Remap the value if necessary. + if (Instruction *Inst = dyn_cast<Instruction>(IV)) { + DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst); + if (I != ValueMap.end()) + IV = I->second; + } + + PN->addIncoming(IV, NewPred); + } +} + +/// ThreadEdge - We have decided that it is safe and profitable to factor the +/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB +/// across BB. Transform the IR to reflect this change. +bool JumpThreading::ThreadEdge(BasicBlock *BB, + const SmallVectorImpl<BasicBlock*> &PredBBs, + BasicBlock *SuccBB) { + // If threading to the same block as we come from, we would infinite loop. + if (SuccBB == BB) { + DEBUG(dbgs() << " Not threading across BB '" << BB->getName() + << "' - would thread to self!\n"); + return false; + } + + // If threading this would thread across a loop header, don't thread the edge. + // See the comments above FindLoopHeaders for justifications and caveats. + if (LoopHeaders.count(BB)) { + DEBUG(dbgs() << " Not threading across loop header BB '" << BB->getName() + << "' to dest BB '" << SuccBB->getName() + << "' - it might create an irreducible loop!\n"); + return false; + } + + unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB); + if (JumpThreadCost > Threshold) { + DEBUG(dbgs() << " Not threading BB '" << BB->getName() + << "' - Cost is too high: " << JumpThreadCost << "\n"); + return false; + } + + // And finally, do it! Start by factoring the predecessors is needed. + BasicBlock *PredBB; + if (PredBBs.size() == 1) + PredBB = PredBBs[0]; + else { + DEBUG(dbgs() << " Factoring out " << PredBBs.size() + << " common predecessors.\n"); + PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this); + } + + // And finally, do it! + DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '" + << SuccBB->getName() << "' with cost: " << JumpThreadCost + << ", across block:\n " + << *BB << "\n"); + + LVI->threadEdge(PredBB, BB, SuccBB); + + // We are going to have to map operands from the original BB block to the new + // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to + // account for entry from PredBB. + DenseMap<Instruction*, Value*> ValueMapping; + + BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), + BB->getName()+".thread", + BB->getParent(), BB); + NewBB->moveAfter(PredBB); + + BasicBlock::iterator BI = BB->begin(); + for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) + ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); + + // Clone the non-phi instructions of BB into NewBB, keeping track of the + // mapping and using it to remap operands in the cloned instructions. 
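+ // Illustrative example (names are hypothetical): if BB contains
+ //   %a = add i32 %p, 1
+ //   %b = mul i32 %a, 2
+ // the clone of %b must use the clone of %a rather than the original, which
+ // is what the ValueMapping lookup below arranges.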
+ for (; !isa<TerminatorInst>(BI); ++BI) { + Instruction *New = BI->clone(); + New->setName(BI->getName()); + NewBB->getInstList().push_back(New); + ValueMapping[BI] = New; + + // Remap operands to patch up intra-block references. + for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) + if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { + DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst); + if (I != ValueMapping.end()) + New->setOperand(i, I->second); + } + } + + // We didn't copy the terminator from BB over to NewBB, because there is now + // an unconditional jump to SuccBB. Insert the unconditional jump. + BranchInst *NewBI =BranchInst::Create(SuccBB, NewBB); + NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc()); + + // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the + // PHI nodes for NewBB now. + AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping); + + // If there were values defined in BB that are used outside the block, then we + // now have to update all uses of the value to use either the original value, + // the cloned value, or some PHI derived value. This can require arbitrary + // PHI insertion, of which we are prepared to do, clean these up now. + SSAUpdater SSAUpdate; + SmallVector<Use*, 16> UsesToRename; + for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { + // Scan all uses of this instruction to see if it is used outside of its + // block, and if so, record them in UsesToRename. + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E; + ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (PHINode *UserPN = dyn_cast<PHINode>(User)) { + if (UserPN->getIncomingBlock(UI) == BB) + continue; + } else if (User->getParent() == BB) + continue; + + UsesToRename.push_back(&UI.getUse()); + } + + // If there are no uses outside the block, we're done with this instruction. + if (UsesToRename.empty()) + continue; + + DEBUG(dbgs() << "JT: Renaming non-local uses of: " << *I << "\n"); + + // We found a use of I outside of BB. Rename all uses of I that are outside + // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks + // with the two values we know. + SSAUpdate.Initialize(I->getType(), I->getName()); + SSAUpdate.AddAvailableValue(BB, I); + SSAUpdate.AddAvailableValue(NewBB, ValueMapping[I]); + + while (!UsesToRename.empty()) + SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); + DEBUG(dbgs() << "\n"); + } + + + // Ok, NewBB is good to go. Update the terminator of PredBB to jump to + // NewBB instead of BB. This eliminates predecessors from BB, which requires + // us to simplify any PHI nodes in BB. + TerminatorInst *PredTerm = PredBB->getTerminator(); + for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i) + if (PredTerm->getSuccessor(i) == BB) { + BB->removePredecessor(PredBB, true); + PredTerm->setSuccessor(i, NewBB); + } + + // At this point, the IR is fully up to date and consistent. Do a quick scan + // over the new instructions and zap any that are constants or dead. This + // frequently happens because of phi translation. + SimplifyInstructionsInBlock(NewBB, TD); + + // Threaded an edge! + ++NumThreads; + return true; +} + +/// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch +/// to BB which contains an i1 PHI node and a conditional branch on that PHI. 
+/// If we can duplicate the contents of BB up into PredBB do so now, this +/// improves the odds that the branch will be on an analyzable instruction like +/// a compare. +bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, + const SmallVectorImpl<BasicBlock *> &PredBBs) { + assert(!PredBBs.empty() && "Can't handle an empty set"); + + // If BB is a loop header, then duplicating this block outside the loop would + // cause us to transform this into an irreducible loop, don't do this. + // See the comments above FindLoopHeaders for justifications and caveats. + if (LoopHeaders.count(BB)) { + DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName() + << "' into predecessor block '" << PredBBs[0]->getName() + << "' - it might create an irreducible loop!\n"); + return false; + } + + unsigned DuplicationCost = getJumpThreadDuplicationCost(BB); + if (DuplicationCost > Threshold) { + DEBUG(dbgs() << " Not duplicating BB '" << BB->getName() + << "' - Cost is too high: " << DuplicationCost << "\n"); + return false; + } + + // And finally, do it! Start by factoring the predecessors is needed. + BasicBlock *PredBB; + if (PredBBs.size() == 1) + PredBB = PredBBs[0]; + else { + DEBUG(dbgs() << " Factoring out " << PredBBs.size() + << " common predecessors.\n"); + PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this); + } + + // Okay, we decided to do this! Clone all the instructions in BB onto the end + // of PredBB. + DEBUG(dbgs() << " Duplicating block '" << BB->getName() << "' into end of '" + << PredBB->getName() << "' to eliminate branch on phi. Cost: " + << DuplicationCost << " block is:" << *BB << "\n"); + + // Unless PredBB ends with an unconditional branch, split the edge so that we + // can just clone the bits from BB into the end of the new PredBB. + BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator()); + + if (OldPredBranch == 0 || !OldPredBranch->isUnconditional()) { + PredBB = SplitEdge(PredBB, BB, this); + OldPredBranch = cast<BranchInst>(PredBB->getTerminator()); + } + + // We are going to have to map operands from the original BB block into the + // PredBB block. Evaluate PHI nodes in BB. + DenseMap<Instruction*, Value*> ValueMapping; + + BasicBlock::iterator BI = BB->begin(); + for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) + ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); + + // Clone the non-phi instructions of BB into PredBB, keeping track of the + // mapping and using it to remap operands in the cloned instructions. + for (; BI != BB->end(); ++BI) { + Instruction *New = BI->clone(); + + // Remap operands to patch up intra-block references. + for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) + if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { + DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst); + if (I != ValueMapping.end()) + New->setOperand(i, I->second); + } + + // If this instruction can be simplified after the operands are updated, + // just use the simplified value instead. This frequently happens due to + // phi translation. + if (Value *IV = SimplifyInstruction(New, TD)) { + delete New; + ValueMapping[BI] = IV; + } else { + // Otherwise, insert the new instruction into the block. + New->setName(BI->getName()); + PredBB->getInstList().insert(OldPredBranch, New); + ValueMapping[BI] = New; + } + } + + // Check to see if the targets of the branch had PHI nodes. If so, we need to + // add entries to the PHI nodes for branch from PredBB now. 
+ BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator()); + AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB, + ValueMapping); + AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB, + ValueMapping); + + // If there were values defined in BB that are used outside the block, then we + // now have to update all uses of the value to use either the original value, + // the cloned value, or some PHI derived value. This can require arbitrary + // PHI insertion, of which we are prepared to do, clean these up now. + SSAUpdater SSAUpdate; + SmallVector<Use*, 16> UsesToRename; + for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { + // Scan all uses of this instruction to see if it is used outside of its + // block, and if so, record them in UsesToRename. + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E; + ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (PHINode *UserPN = dyn_cast<PHINode>(User)) { + if (UserPN->getIncomingBlock(UI) == BB) + continue; + } else if (User->getParent() == BB) + continue; + + UsesToRename.push_back(&UI.getUse()); + } + + // If there are no uses outside the block, we're done with this instruction. + if (UsesToRename.empty()) + continue; + + DEBUG(dbgs() << "JT: Renaming non-local uses of: " << *I << "\n"); + + // We found a use of I outside of BB. Rename all uses of I that are outside + // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks + // with the two values we know. + SSAUpdate.Initialize(I->getType(), I->getName()); + SSAUpdate.AddAvailableValue(BB, I); + SSAUpdate.AddAvailableValue(PredBB, ValueMapping[I]); + + while (!UsesToRename.empty()) + SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); + DEBUG(dbgs() << "\n"); + } + + // PredBB no longer jumps to BB, remove entries in the PHI node for the edge + // that we nuked. + BB->removePredecessor(PredBB, true); + + // Remove the unconditional branch at the end of the PredBB block. + OldPredBranch->eraseFromParent(); + + ++NumDupes; + return true; +} + + diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp new file mode 100644 index 0000000..0192e92 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -0,0 +1,857 @@ +//===-- LICM.cpp - Loop Invariant Code Motion Pass ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass performs loop invariant code motion, attempting to remove as much +// code from the body of a loop as possible. It does this by either hoisting +// code into the preheader block, or by sinking code to the exit blocks if it is +// safe. This pass also promotes must-aliased memory locations in the loop to +// live in registers, thus hoisting and sinking "invariant" loads and stores. +// +// This pass uses alias analysis for two purposes: +// +// 1. Moving loop invariant loads and calls out of loops. If we can determine +// that a load or call inside of a loop never aliases anything stored to, +// we can hoist it or sink it like any other instruction. +// 2. Scalar Promotion of Memory - If there is a store instruction inside of +// the loop, we try to move the store to happen AFTER the loop instead of +// inside of the loop. This can only happen if a few conditions are true: +// A. 
The pointer stored through is loop invariant +// B. There are no stores or loads in the loop which _may_ alias the +// pointer. There are no calls in the loop which mod/ref the pointer. +// If these conditions are true, we can promote the loads and stores in the +// loop of the pointer to use a temporary alloca'd variable. We then use +// the SSAUpdater to construct the appropriate SSA form for the value. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "licm" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Instructions.h" +#include "llvm/LLVMContext.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/Statistic.h" +#include <algorithm> +using namespace llvm; + +STATISTIC(NumSunk , "Number of instructions sunk out of loop"); +STATISTIC(NumHoisted , "Number of instructions hoisted out of loop"); +STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk"); +STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk"); +STATISTIC(NumPromoted , "Number of memory locations promoted to registers"); + +static cl::opt<bool> +DisablePromotion("disable-licm-promotion", cl::Hidden, + cl::desc("Disable memory promotion in LICM pass")); + +namespace { + struct LICM : public LoopPass { + static char ID; // Pass identification, replacement for typeid + LICM() : LoopPass(ID) { + initializeLICMPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnLoop(Loop *L, LPPassManager &LPM); + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG... + /// + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<DominatorTree>(); + AU.addRequired<LoopInfo>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequired<AliasAnalysis>(); + AU.addPreserved<AliasAnalysis>(); + AU.addPreserved("scalar-evolution"); + AU.addPreservedID(LoopSimplifyID); + AU.addRequired<TargetLibraryInfo>(); + } + + bool doFinalization() { + assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets"); + return false; + } + + private: + AliasAnalysis *AA; // Current AliasAnalysis information + LoopInfo *LI; // Current LoopInfo + DominatorTree *DT; // Dominator Tree for the current Loop. + + TargetData *TD; // TargetData for constant folding. + TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding. + + // State that is updated as we process loops. + bool Changed; // Set to true when we change anything. + BasicBlock *Preheader; // The preheader block of the current loop... + Loop *CurLoop; // The current loop we are working on... + AliasSetTracker *CurAST; // AliasSet information for the current loop... + DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap; + + /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. 
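+ /// This hook is invoked when loop transformation utilities clone a basic
+ /// block; the alias set information recorded for the original block is
+ /// copied to the clone so the tracker stays consistent.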
+ void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L); + + /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias + /// set. + void deleteAnalysisValue(Value *V, Loop *L); + + /// SinkRegion - Walk the specified region of the CFG (defined by all blocks + /// dominated by the specified block, and that are in the current loop) in + /// reverse depth first order w.r.t the DominatorTree. This allows us to + /// visit uses before definitions, allowing us to sink a loop body in one + /// pass without iteration. + /// + void SinkRegion(DomTreeNode *N); + + /// HoistRegion - Walk the specified region of the CFG (defined by all + /// blocks dominated by the specified block, and that are in the current + /// loop) in depth first order w.r.t the DominatorTree. This allows us to + /// visit definitions before uses, allowing us to hoist a loop body in one + /// pass without iteration. + /// + void HoistRegion(DomTreeNode *N); + + /// inSubLoop - Little predicate that returns true if the specified basic + /// block is in a subloop of the current one, not the current one itself. + /// + bool inSubLoop(BasicBlock *BB) { + assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop"); + return LI->getLoopFor(BB) != CurLoop; + } + + /// sink - When an instruction is found to only be used outside of the loop, + /// this function moves it to the exit blocks and patches up SSA form as + /// needed. + /// + void sink(Instruction &I); + + /// hoist - When an instruction is found to only use loop invariant operands + /// that is safe to hoist, this instruction is called to do the dirty work. + /// + void hoist(Instruction &I); + + /// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it + /// is not a trapping instruction or if it is a trapping instruction and is + /// guaranteed to execute. + /// + bool isSafeToExecuteUnconditionally(Instruction &I); + + /// isGuaranteedToExecute - Check that the instruction is guaranteed to + /// execute. + /// + bool isGuaranteedToExecute(Instruction &I); + + /// pointerInvalidatedByLoop - Return true if the body of this loop may + /// store into the memory location pointed to by V. + /// + bool pointerInvalidatedByLoop(Value *V, uint64_t Size, + const MDNode *TBAAInfo) { + // Check to see if any of the basic blocks in CurLoop invalidate *V. + return CurAST->getAliasSetForPointer(V, Size, TBAAInfo).isMod(); + } + + bool canSinkOrHoistInst(Instruction &I); + bool isNotUsedInLoop(Instruction &I); + + void PromoteAliasSet(AliasSet &AS, + SmallVectorImpl<BasicBlock*> &ExitBlocks, + SmallVectorImpl<Instruction*> &InsertPts); + }; +} + +char LICM::ID = 0; +INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) + +Pass *llvm::createLICMPass() { return new LICM(); } + +/// Hoist expressions out of the specified loop. Note, alias info for inner +/// loop is not preserved so it is not a good idea to run LICM multiple +/// times on one loop. +/// +bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { + Changed = false; + + // Get our Loop and Alias Analysis information... 
+ LI = &getAnalysis<LoopInfo>(); + AA = &getAnalysis<AliasAnalysis>(); + DT = &getAnalysis<DominatorTree>(); + + TD = getAnalysisIfAvailable<TargetData>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + + CurAST = new AliasSetTracker(*AA); + // Collect Alias info from subloops. + for (Loop::iterator LoopItr = L->begin(), LoopItrE = L->end(); + LoopItr != LoopItrE; ++LoopItr) { + Loop *InnerL = *LoopItr; + AliasSetTracker *InnerAST = LoopToAliasSetMap[InnerL]; + assert(InnerAST && "Where is my AST?"); + + // What if InnerLoop was modified by other passes ? + CurAST->add(*InnerAST); + + // Once we've incorporated the inner loop's AST into ours, we don't need the + // subloop's anymore. + delete InnerAST; + LoopToAliasSetMap.erase(InnerL); + } + + CurLoop = L; + + // Get the preheader block to move instructions into... + Preheader = L->getLoopPreheader(); + + // Loop over the body of this loop, looking for calls, invokes, and stores. + // Because subloops have already been incorporated into AST, we skip blocks in + // subloops. + // + for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); + I != E; ++I) { + BasicBlock *BB = *I; + if (LI->getLoopFor(BB) == L) // Ignore blocks in subloops. + CurAST->add(*BB); // Incorporate the specified basic block + } + + // We want to visit all of the instructions in this loop... that are not parts + // of our subloops (they have already had their invariants hoisted out of + // their loop, into this loop, so there is no need to process the BODIES of + // the subloops). + // + // Traverse the body of the loop in depth first order on the dominator tree so + // that we are guaranteed to see definitions before we see uses. This allows + // us to sink instructions in one pass, without iteration. After sinking + // instructions, we perform another pass to hoist them out of the loop. + // + if (L->hasDedicatedExits()) + SinkRegion(DT->getNode(L->getHeader())); + if (Preheader) + HoistRegion(DT->getNode(L->getHeader())); + + // Now that all loop invariants have been removed from the loop, promote any + // memory references to scalars that we can. + if (!DisablePromotion && Preheader && L->hasDedicatedExits()) { + SmallVector<BasicBlock *, 8> ExitBlocks; + SmallVector<Instruction *, 8> InsertPts; + + // Loop over all of the alias sets in the tracker object. + for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); + I != E; ++I) + PromoteAliasSet(*I, ExitBlocks, InsertPts); + } + + // Clear out loops state information for the next iteration + CurLoop = 0; + Preheader = 0; + + // If this loop is nested inside of another one, save the alias information + // for when we process the outer loop. + if (L->getParentLoop()) + LoopToAliasSetMap[L] = CurAST; + else + delete CurAST; + return Changed; +} + +/// SinkRegion - Walk the specified region of the CFG (defined by all blocks +/// dominated by the specified block, and that are in the current loop) in +/// reverse depth first order w.r.t the DominatorTree. This allows us to visit +/// uses before definitions, allowing us to sink a loop body in one pass without +/// iteration. +/// +void LICM::SinkRegion(DomTreeNode *N) { + assert(N != 0 && "Null dominator tree node?"); + BasicBlock *BB = N->getBlock(); + + // If this subregion is not in the top level loop at all, exit. + if (!CurLoop->contains(BB)) return; + + // We are processing blocks in reverse dfo, so process children first. 
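+ // Example: given "%a = add ..." followed by "%b = mul %a, ..." where only %b
+ // is used outside the loop, the bottom-up walk sinks %b first; %a then has no
+ // remaining uses inside the loop and can be sunk in the same pass.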
+ const std::vector<DomTreeNode*> &Children = N->getChildren(); + for (unsigned i = 0, e = Children.size(); i != e; ++i) + SinkRegion(Children[i]); + + // Only need to process the contents of this block if it is not part of a + // subloop (which would already have been processed). + if (inSubLoop(BB)) return; + + for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) { + Instruction &I = *--II; + + // If the instruction is dead, we would try to sink it because it isn't used + // in the loop, instead, just delete it. + if (isInstructionTriviallyDead(&I)) { + DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n'); + ++II; + CurAST->deleteValue(&I); + I.eraseFromParent(); + Changed = true; + continue; + } + + // Check to see if we can sink this instruction to the exit blocks + // of the loop. We can do this if the all users of the instruction are + // outside of the loop. In this case, it doesn't even matter if the + // operands of the instruction are loop invariant. + // + if (isNotUsedInLoop(I) && canSinkOrHoistInst(I)) { + ++II; + sink(I); + } + } +} + +/// HoistRegion - Walk the specified region of the CFG (defined by all blocks +/// dominated by the specified block, and that are in the current loop) in depth +/// first order w.r.t the DominatorTree. This allows us to visit definitions +/// before uses, allowing us to hoist a loop body in one pass without iteration. +/// +void LICM::HoistRegion(DomTreeNode *N) { + assert(N != 0 && "Null dominator tree node?"); + BasicBlock *BB = N->getBlock(); + + // If this subregion is not in the top level loop at all, exit. + if (!CurLoop->contains(BB)) return; + + // Only need to process the contents of this block if it is not part of a + // subloop (which would already have been processed). + if (!inSubLoop(BB)) + for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) { + Instruction &I = *II++; + + // Try constant folding this instruction. If all the operands are + // constants, it is technically hoistable, but it would be better to just + // fold it. + if (Constant *C = ConstantFoldInstruction(&I, TD, TLI)) { + DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C << '\n'); + CurAST->copyValue(&I, C); + CurAST->deleteValue(&I); + I.replaceAllUsesWith(C); + I.eraseFromParent(); + continue; + } + + // Try hoisting the instruction out to the preheader. We can only do this + // if all of the operands of the instruction are loop invariant and if it + // is safe to hoist the instruction. + // + if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I) && + isSafeToExecuteUnconditionally(I)) + hoist(I); + } + + const std::vector<DomTreeNode*> &Children = N->getChildren(); + for (unsigned i = 0, e = Children.size(); i != e; ++i) + HoistRegion(Children[i]); +} + +/// canSinkOrHoistInst - Return true if the hoister and sinker can handle this +/// instruction. +/// +bool LICM::canSinkOrHoistInst(Instruction &I) { + // Loads have extra constraints we have to verify before we can hoist them. + if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { + if (!LI->isUnordered()) + return false; // Don't hoist volatile/atomic loads! + + // Loads from constant memory are always safe to move, even if they end up + // in the same alias set as something that ends up being modified. + if (AA->pointsToConstantMemory(LI->getOperand(0))) + return true; + if (LI->getMetadata("invariant.load")) + return true; + + // Don't hoist loads which have may-aliased stores in loop. 
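+ // pointerInvalidatedByLoop queries the Mod bit of the alias set containing
+ // the loaded pointer, so any store in the loop that may alias this address
+ // blocks the hoist/sink.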
+ uint64_t Size = 0; + if (LI->getType()->isSized()) + Size = AA->getTypeStoreSize(LI->getType()); + return !pointerInvalidatedByLoop(LI->getOperand(0), Size, + LI->getMetadata(LLVMContext::MD_tbaa)); + } else if (CallInst *CI = dyn_cast<CallInst>(&I)) { + // Don't sink or hoist dbg info; it's legal, but not useful. + if (isa<DbgInfoIntrinsic>(I)) + return false; + + // Handle simple cases by querying alias analysis. + AliasAnalysis::ModRefBehavior Behavior = AA->getModRefBehavior(CI); + if (Behavior == AliasAnalysis::DoesNotAccessMemory) + return true; + if (AliasAnalysis::onlyReadsMemory(Behavior)) { + // If this call only reads from memory and there are no writes to memory + // in the loop, we can hoist or sink the call as appropriate. + bool FoundMod = false; + for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); + I != E; ++I) { + AliasSet &AS = *I; + if (!AS.isForwardingAliasSet() && AS.isMod()) { + FoundMod = true; + break; + } + } + if (!FoundMod) return true; + } + + // FIXME: This should use mod/ref information to see if we can hoist or sink + // the call. + + return false; + } + + // Otherwise these instructions are hoistable/sinkable + return isa<BinaryOperator>(I) || isa<CastInst>(I) || + isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) || + isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || + isa<ShuffleVectorInst>(I); +} + +/// isNotUsedInLoop - Return true if the only users of this instruction are +/// outside of the loop. If this is true, we can sink the instruction to the +/// exit blocks of the loop. +/// +bool LICM::isNotUsedInLoop(Instruction &I) { + for (Value::use_iterator UI = I.use_begin(), E = I.use_end(); UI != E; ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (PHINode *PN = dyn_cast<PHINode>(User)) { + // PHI node uses occur in predecessor blocks! + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (PN->getIncomingValue(i) == &I) + if (CurLoop->contains(PN->getIncomingBlock(i))) + return false; + } else if (CurLoop->contains(User)) { + return false; + } + } + return true; +} + + +/// sink - When an instruction is found to only be used outside of the loop, +/// this function moves it to the exit blocks and patches up SSA form as needed. +/// This method is guaranteed to remove the original instruction from its +/// position, and may either delete it or move it to outside of the loop. +/// +void LICM::sink(Instruction &I) { + DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n"); + + SmallVector<BasicBlock*, 8> ExitBlocks; + CurLoop->getUniqueExitBlocks(ExitBlocks); + + if (isa<LoadInst>(I)) ++NumMovedLoads; + else if (isa<CallInst>(I)) ++NumMovedCalls; + ++NumSunk; + Changed = true; + + // The case where there is only a single exit node of this loop is common + // enough that we handle it as a special (more efficient) case. It is more + // efficient to handle because there are no PHI nodes that need to be placed. + if (ExitBlocks.size() == 1) { + if (!DT->dominates(I.getParent(), ExitBlocks[0])) { + // Instruction is not used, just delete it. + CurAST->deleteValue(&I); + // If I has users in unreachable blocks, eliminate. + // If I is not void type then replaceAllUsesWith undef. + // This allows ValueHandlers and custom metadata to adjust itself. + if (!I.use_empty()) + I.replaceAllUsesWith(UndefValue::get(I.getType())); + I.eraseFromParent(); + } else { + // Move the instruction to the start of the exit block, after any PHI + // nodes in it. 
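+ // getFirstInsertionPt() skips past any PHI nodes (and a landingpad, if
+ // present), so the sunk instruction lands at the first legal position in the
+ // exit block.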
+ I.moveBefore(ExitBlocks[0]->getFirstInsertionPt()); + + // This instruction is no longer in the AST for the current loop, because + // we just sunk it out of the loop. If we just sunk it into an outer + // loop, we will rediscover the operation when we process it. + CurAST->deleteValue(&I); + } + return; + } + + if (ExitBlocks.empty()) { + // The instruction is actually dead if there ARE NO exit blocks. + CurAST->deleteValue(&I); + // If I has users in unreachable blocks, eliminate. + // If I is not void type then replaceAllUsesWith undef. + // This allows ValueHandlers and custom metadata to adjust itself. + if (!I.use_empty()) + I.replaceAllUsesWith(UndefValue::get(I.getType())); + I.eraseFromParent(); + return; + } + + // Otherwise, if we have multiple exits, use the SSAUpdater to do all of the + // hard work of inserting PHI nodes as necessary. + SmallVector<PHINode*, 8> NewPHIs; + SSAUpdater SSA(&NewPHIs); + + if (!I.use_empty()) + SSA.Initialize(I.getType(), I.getName()); + + // Insert a copy of the instruction in each exit block of the loop that is + // dominated by the instruction. Each exit block is known to only be in the + // ExitBlocks list once. + BasicBlock *InstOrigBB = I.getParent(); + unsigned NumInserted = 0; + + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { + BasicBlock *ExitBlock = ExitBlocks[i]; + + if (!DT->dominates(InstOrigBB, ExitBlock)) + continue; + + // Insert the code after the last PHI node. + BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt(); + + // If this is the first exit block processed, just move the original + // instruction, otherwise clone the original instruction and insert + // the copy. + Instruction *New; + if (NumInserted++ == 0) { + I.moveBefore(InsertPt); + New = &I; + } else { + New = I.clone(); + if (!I.getName().empty()) + New->setName(I.getName()+".le"); + ExitBlock->getInstList().insert(InsertPt, New); + } + + // Now that we have inserted the instruction, inform SSAUpdater. + if (!I.use_empty()) + SSA.AddAvailableValue(ExitBlock, New); + } + + // If the instruction doesn't dominate any exit blocks, it must be dead. + if (NumInserted == 0) { + CurAST->deleteValue(&I); + if (!I.use_empty()) + I.replaceAllUsesWith(UndefValue::get(I.getType())); + I.eraseFromParent(); + return; + } + + // Next, rewrite uses of the instruction, inserting PHI nodes as needed. + for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; ) { + // Grab the use before incrementing the iterator. + Use &U = UI.getUse(); + // Increment the iterator before removing the use from the list. + ++UI; + SSA.RewriteUseAfterInsertions(U); + } + + // Update CurAST for NewPHIs if I had pointer type. + if (I.getType()->isPointerTy()) + for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) + CurAST->copyValue(&I, NewPHIs[i]); + + // Finally, remove the instruction from CurAST. It is no longer in the loop. + CurAST->deleteValue(&I); +} + +/// hoist - When an instruction is found to only use loop invariant operands +/// that is safe to hoist, this instruction is called to do the dirty work. +/// +void LICM::hoist(Instruction &I) { + DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " + << I << "\n"); + + // Move the new node to the Preheader, before its terminator. 
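+ // The operands of I are loop invariant (checked in HoistRegion), so their
+ // definitions are already available above the preheader's terminator.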
+ I.moveBefore(Preheader->getTerminator()); + + if (isa<LoadInst>(I)) ++NumMovedLoads; + else if (isa<CallInst>(I)) ++NumMovedCalls; + ++NumHoisted; + Changed = true; +} + +/// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it is +/// not a trapping instruction or if it is a trapping instruction and is +/// guaranteed to execute. +/// +bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { + // If it is not a trapping instruction, it is always safe to hoist. + if (isSafeToSpeculativelyExecute(&Inst)) + return true; + + return isGuaranteedToExecute(Inst); +} + +bool LICM::isGuaranteedToExecute(Instruction &Inst) { + // Otherwise we have to check to make sure that the instruction dominates all + // of the exit blocks. If it doesn't, then there is a path out of the loop + // which does not execute this instruction, so we can't hoist it. + + // If the instruction is in the header block for the loop (which is very + // common), it is always guaranteed to dominate the exit blocks. Since this + // is a common case, and can save some work, check it now. + if (Inst.getParent() == CurLoop->getHeader()) + return true; + + // Get the exit blocks for the current loop. + SmallVector<BasicBlock*, 8> ExitBlocks; + CurLoop->getExitBlocks(ExitBlocks); + + // Verify that the block dominates each of the exit blocks of the loop. + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) + if (!DT->dominates(Inst.getParent(), ExitBlocks[i])) + return false; + + // As a degenerate case, if the loop is statically infinite then we haven't + // proven anything since there are no exit blocks. + if (ExitBlocks.empty()) + return false; + + return true; +} + +namespace { + class LoopPromoter : public LoadAndStorePromoter { + Value *SomePtr; // Designated pointer to store to. + SmallPtrSet<Value*, 4> &PointerMustAliases; + SmallVectorImpl<BasicBlock*> &LoopExitBlocks; + SmallVectorImpl<Instruction*> &LoopInsertPts; + AliasSetTracker &AST; + DebugLoc DL; + int Alignment; + public: + LoopPromoter(Value *SP, + const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, + SmallPtrSet<Value*, 4> &PMA, + SmallVectorImpl<BasicBlock*> &LEB, + SmallVectorImpl<Instruction*> &LIP, + AliasSetTracker &ast, DebugLoc dl, int alignment) + : LoadAndStorePromoter(Insts, S), SomePtr(SP), + PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), + AST(ast), DL(dl), Alignment(alignment) {} + + virtual bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &) const { + Value *Ptr; + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + Ptr = LI->getOperand(0); + else + Ptr = cast<StoreInst>(I)->getPointerOperand(); + return PointerMustAliases.count(Ptr); + } + + virtual void doExtraRewritesBeforeFinalDeletion() const { + // Insert stores after in the loop exit blocks. Each exit block gets a + // store of the live-out values that feed them. Since we've already told + // the SSA updater about the defs in the loop and the preheader + // definition, it is all set and we can start using it. + for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) { + BasicBlock *ExitBlock = LoopExitBlocks[i]; + Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); + Instruction *InsertPos = LoopInsertPts[i]; + StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos); + NewSI->setAlignment(Alignment); + NewSI->setDebugLoc(DL); + } + } + + virtual void replaceLoadWithValue(LoadInst *LI, Value *V) const { + // Update alias analysis. 
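+ // The rewritten load LI is about to go away in favor of the SSA value V;
+ // copy LI's alias-set membership to V so the tracker stays accurate.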
+ AST.copyValue(LI, V); + } + virtual void instructionDeleted(Instruction *I) const { + AST.deleteValue(I); + } + }; +} // end anon namespace + +/// PromoteAliasSet - Try to promote memory values to scalars by sinking +/// stores out of the loop and moving loads to before the loop. We do this by +/// looping over the stores in the loop, looking for stores to Must pointers +/// which are loop invariant. +/// +void LICM::PromoteAliasSet(AliasSet &AS, + SmallVectorImpl<BasicBlock*> &ExitBlocks, + SmallVectorImpl<Instruction*> &InsertPts) { + // We can promote this alias set if it has a store, if it is a "Must" alias + // set, if the pointer is loop invariant, and if we are not eliminating any + // volatile loads or stores. + if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() || + AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue())) + return; + + assert(!AS.empty() && + "Must alias set should have at least one pointer element in it!"); + Value *SomePtr = AS.begin()->getValue(); + + // It isn't safe to promote a load/store from the loop if the load/store is + // conditional. For example, turning: + // + // for () { if (c) *P += 1; } + // + // into: + // + // tmp = *P; for () { if (c) tmp +=1; } *P = tmp; + // + // is not safe, because *P may only be valid to access if 'c' is true. + // + // It is safe to promote P if all uses are direct load/stores and if at + // least one is guaranteed to be executed. + bool GuaranteedToExecute = false; + + SmallVector<Instruction*, 64> LoopUses; + SmallPtrSet<Value*, 4> PointerMustAliases; + + // We start with an alignment of one and try to find instructions that allow + // us to prove better alignment. + unsigned Alignment = 1; + + // Check that all of the pointers in the alias set have the same type. We + // cannot (yet) promote a memory location that is loaded and stored in + // different sizes. + for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) { + Value *ASIV = ASI->getValue(); + PointerMustAliases.insert(ASIV); + + // Check that all of the pointers in the alias set have the same type. We + // cannot (yet) promote a memory location that is loaded and stored in + // different sizes. + if (SomePtr->getType() != ASIV->getType()) + return; + + for (Value::use_iterator UI = ASIV->use_begin(), UE = ASIV->use_end(); + UI != UE; ++UI) { + // Ignore instructions that are outside the loop. + Instruction *Use = dyn_cast<Instruction>(*UI); + if (!Use || !CurLoop->contains(Use)) + continue; + + // If there is an non-load/store instruction in the loop, we can't promote + // it. + if (LoadInst *load = dyn_cast<LoadInst>(Use)) { + assert(!load->isVolatile() && "AST broken"); + if (!load->isSimple()) + return; + } else if (StoreInst *store = dyn_cast<StoreInst>(Use)) { + // Stores *of* the pointer are not interesting, only stores *to* the + // pointer. + if (Use->getOperand(1) != ASIV) + continue; + assert(!store->isVolatile() && "AST broken"); + if (!store->isSimple()) + return; + + // Note that we only check GuaranteedToExecute inside the store case + // so that we do not introduce stores where they did not exist before + // (which would break the LLVM concurrency model). + + // If the alignment of this instruction allows us to specify a more + // restrictive (and performant) alignment and if we are sure this + // instruction will be executed, update the alignment. + // Larger is better, with the exception of 0 being the best alignment. 
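+ // (An explicit alignment of 0 means the store uses the type's ABI alignment,
+ // which is why it is treated as the strongest value below.)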
+ unsigned InstAlignment = store->getAlignment(); + if ((InstAlignment > Alignment || InstAlignment == 0) + && (Alignment != 0)) + if (isGuaranteedToExecute(*Use)) { + GuaranteedToExecute = true; + Alignment = InstAlignment; + } + + if (!GuaranteedToExecute) + GuaranteedToExecute = isGuaranteedToExecute(*Use); + + } else + return; // Not a load or store. + + LoopUses.push_back(Use); + } + } + + // If there isn't a guaranteed-to-execute instruction, we can't promote. + if (!GuaranteedToExecute) + return; + + // Otherwise, this is safe to promote, lets do it! + DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n'); + Changed = true; + ++NumPromoted; + + // Grab a debug location for the inserted loads/stores; given that the + // inserted loads/stores have little relation to the original loads/stores, + // this code just arbitrarily picks a location from one, since any debug + // location is better than none. + DebugLoc DL = LoopUses[0]->getDebugLoc(); + + // Figure out the loop exits and their insertion points, if this is the + // first promotion. + if (ExitBlocks.empty()) { + CurLoop->getUniqueExitBlocks(ExitBlocks); + InsertPts.resize(ExitBlocks.size()); + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) + InsertPts[i] = ExitBlocks[i]->getFirstInsertionPt(); + } + + // We use the SSAUpdater interface to insert phi nodes as required. + SmallVector<PHINode*, 16> NewPHIs; + SSAUpdater SSA(&NewPHIs); + LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, + InsertPts, *CurAST, DL, Alignment); + + // Set up the preheader to have a definition of the value. It is the live-out + // value from the preheader that uses in the loop will use. + LoadInst *PreheaderLoad = + new LoadInst(SomePtr, SomePtr->getName()+".promoted", + Preheader->getTerminator()); + PreheaderLoad->setAlignment(Alignment); + PreheaderLoad->setDebugLoc(DL); + SSA.AddAvailableValue(Preheader, PreheaderLoad); + + // Rewrite all the loads in the loop and remember all the definitions from + // stores in the loop. + Promoter.run(LoopUses); + + // If the SSAUpdater didn't use the load in the preheader, just zap it now. + if (PreheaderLoad->use_empty()) + PreheaderLoad->eraseFromParent(); +} + + +/// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. +void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) { + AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); + if (!AST) + return; + + AST->copyValue(From, To); +} + +/// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias +/// set. +void LICM::deleteAnalysisValue(Value *V, Loop *L) { + AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); + if (!AST) + return; + + AST->deleteValue(V); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp new file mode 100644 index 0000000..3771f5a --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -0,0 +1,248 @@ +//===- LoopDeletion.cpp - Dead Loop Deletion Pass ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the Dead Loop Deletion Pass. 
This pass is responsible +// for eliminating loops with non-infinite computable trip counts that have no +// side effects or volatile instructions, and do not contribute to the +// computation of the function's return value. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-delete" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallVector.h" +using namespace llvm; + +STATISTIC(NumDeleted, "Number of loops deleted"); + +namespace { + class LoopDeletion : public LoopPass { + public: + static char ID; // Pass ID, replacement for typeid + LoopDeletion() : LoopPass(ID) { + initializeLoopDeletionPass(*PassRegistry::getPassRegistry()); + } + + // Possibly eliminate loop L if it is dead. + bool runOnLoop(Loop* L, LPPassManager& LPM); + + bool IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitingBlocks, + SmallVector<BasicBlock*, 4>& exitBlocks, + bool &Changed, BasicBlock *Preheader); + + virtual void getAnalysisUsage(AnalysisUsage& AU) const { + AU.addRequired<DominatorTree>(); + AU.addRequired<LoopInfo>(); + AU.addRequired<ScalarEvolution>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + + AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<DominatorTree>(); + AU.addPreserved<LoopInfo>(); + AU.addPreservedID(LoopSimplifyID); + AU.addPreservedID(LCSSAID); + } + }; +} + +char LoopDeletion::ID = 0; +INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", + "Delete dead loops", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(LoopDeletion, "loop-deletion", + "Delete dead loops", false, false) + +Pass* llvm::createLoopDeletionPass() { + return new LoopDeletion(); +} + +/// IsLoopDead - Determined if a loop is dead. This assumes that we've already +/// checked for unique exit and exiting blocks, and that the code is in LCSSA +/// form. +bool LoopDeletion::IsLoopDead(Loop* L, + SmallVector<BasicBlock*, 4>& exitingBlocks, + SmallVector<BasicBlock*, 4>& exitBlocks, + bool &Changed, BasicBlock *Preheader) { + BasicBlock* exitBlock = exitBlocks[0]; + + // Make sure that all PHI entries coming from the loop are loop invariant. + // Because the code is in LCSSA form, any values used outside of the loop + // must pass through a PHI in the exit block, meaning that this check is + // sufficient to guarantee that no loop-variant values are used outside + // of the loop. + BasicBlock::iterator BI = exitBlock->begin(); + while (PHINode* P = dyn_cast<PHINode>(BI)) { + Value* incoming = P->getIncomingValueForBlock(exitingBlocks[0]); + + // Make sure all exiting blocks produce the same incoming value for the exit + // block. If there are different incoming values for different exiting + // blocks, then it is impossible to statically determine which value should + // be used. + for (unsigned i = 1; i < exitingBlocks.size(); ++i) { + if (incoming != P->getIncomingValueForBlock(exitingBlocks[i])) + return false; + } + + if (Instruction* I = dyn_cast<Instruction>(incoming)) + if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) + return false; + + ++BI; + } + + // Make sure that no instructions in the block have potential side-effects. 
+ // This includes instructions that could write to memory, and loads that are + // marked volatile. This could be made more aggressive by using aliasing + // information to identify readonly and readnone calls. + for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); + LI != LE; ++LI) { + for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end(); + BI != BE; ++BI) { + if (BI->mayHaveSideEffects()) + return false; + } + } + + return true; +} + +/// runOnLoop - Remove dead loops, by which we mean loops that do not impact the +/// observable behavior of the program other than finite running time. Note +/// we do ensure that this never remove a loop that might be infinite, as doing +/// so could change the halting/non-halting nature of a program. +/// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA +/// in order to make various safety checks work. +bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { + // We can only remove the loop if there is a preheader that we can + // branch from after removing it. + BasicBlock* preheader = L->getLoopPreheader(); + if (!preheader) + return false; + + // If LoopSimplify form is not available, stay out of trouble. + if (!L->hasDedicatedExits()) + return false; + + // We can't remove loops that contain subloops. If the subloops were dead, + // they would already have been removed in earlier executions of this pass. + if (L->begin() != L->end()) + return false; + + SmallVector<BasicBlock*, 4> exitingBlocks; + L->getExitingBlocks(exitingBlocks); + + SmallVector<BasicBlock*, 4> exitBlocks; + L->getUniqueExitBlocks(exitBlocks); + + // We require that the loop only have a single exit block. Otherwise, we'd + // be in the situation of needing to be able to solve statically which exit + // block will be branched to, or trying to preserve the branching logic in + // a loop invariant manner. + if (exitBlocks.size() != 1) + return false; + + // Finally, we have to check that the loop really is dead. + bool Changed = false; + if (!IsLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader)) + return Changed; + + // Don't remove loops for which we can't solve the trip count. + // They could be infinite, in which case we'd be changing program behavior. + ScalarEvolution& SE = getAnalysis<ScalarEvolution>(); + const SCEV *S = SE.getMaxBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(S)) + return Changed; + + // Now that we know the removal is safe, remove the loop by changing the + // branch from the preheader to go to the single exit block. + BasicBlock* exitBlock = exitBlocks[0]; + + // Because we're deleting a large chunk of code at once, the sequence in which + // we remove things is very important to avoid invalidation issues. Don't + // mess with this unless you have good reason and know what you're doing. + + // Tell ScalarEvolution that the loop is deleted. Do this before + // deleting the loop so that ScalarEvolution can look at the loop + // to determine what it needs to clean up. + SE.forgetLoop(L); + + // Connect the preheader directly to the exit block. + TerminatorInst* TI = preheader->getTerminator(); + TI->replaceUsesOfWith(L->getHeader(), exitBlock); + + // Rewrite phis in the exit block to get their inputs from + // the preheader instead of the exiting block. 
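+ // IsLoopDead verified that every exiting block feeds the same value into
+ // these PHIs, so it is safe to keep the value from exitingBlocks[0], retarget
+ // that entry at the preheader, and drop the entries for the other exiting
+ // blocks.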
+ BasicBlock* exitingBlock = exitingBlocks[0]; + BasicBlock::iterator BI = exitBlock->begin(); + while (PHINode* P = dyn_cast<PHINode>(BI)) { + int j = P->getBasicBlockIndex(exitingBlock); + assert(j >= 0 && "Can't find exiting block in exit block's phi node!"); + P->setIncomingBlock(j, preheader); + for (unsigned i = 1; i < exitingBlocks.size(); ++i) + P->removeIncomingValue(exitingBlocks[i]); + ++BI; + } + + // Update the dominator tree and remove the instructions and blocks that will + // be deleted from the reference counting scheme. + DominatorTree& DT = getAnalysis<DominatorTree>(); + SmallVector<DomTreeNode*, 8> ChildNodes; + for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); + LI != LE; ++LI) { + // Move all of the block's children to be children of the preheader, which + // allows us to remove the domtree entry for the block. + ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end()); + for (SmallVector<DomTreeNode*, 8>::iterator DI = ChildNodes.begin(), + DE = ChildNodes.end(); DI != DE; ++DI) { + DT.changeImmediateDominator(*DI, DT[preheader]); + } + + ChildNodes.clear(); + DT.eraseNode(*LI); + + // Remove the block from the reference counting scheme, so that we can + // delete it freely later. + (*LI)->dropAllReferences(); + } + + // Erase the instructions and the blocks without having to worry + // about ordering because we already dropped the references. + // NOTE: This iteration is safe because erasing the block does not remove its + // entry from the loop's block list. We do that in the next section. + for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); + LI != LE; ++LI) + (*LI)->eraseFromParent(); + + // Finally, the blocks from loopinfo. This has to happen late because + // otherwise our loop iterators won't work. + LoopInfo& loopInfo = getAnalysis<LoopInfo>(); + SmallPtrSet<BasicBlock*, 8> blocks; + blocks.insert(L->block_begin(), L->block_end()); + for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(), + E = blocks.end(); I != E; ++I) + loopInfo.removeBlock(*I); + + // The last step is to inform the loop pass manager that we've + // eliminated this loop. + LPM.deleteLoopFromQueue(L); + Changed = true; + + ++NumDeleted; + + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp new file mode 100644 index 0000000..ac1082c --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -0,0 +1,634 @@ +//===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements an idiom recognizer that transforms simple loops into a +// non-loop form. In cases that this kicks in, it can be a significant +// performance win. +// +//===----------------------------------------------------------------------===// +// +// TODO List: +// +// Future loop memory idioms to recognize: +// memcmp, memmove, strlen, etc. +// Future floating point idioms to recognize in -ffast-math mode: +// fpowi +// Future integer operation idioms to recognize: +// ctpop, ctlz, cttz +// +// Beware that isel's default lowering for ctpop is highly inefficient for +// i64 and larger types when i64 is legal and the value has few bits set. 
It +// would be good to enhance isel to emit a loop for ctpop in this case. +// +// We should enhance the memset/memcpy recognition to handle multiple stores in +// the loop. This would handle things like: +// void foo(_Complex float *P) +// for (i) { __real__(*P) = 0; __imag__(*P) = 0; } +// +// We should enhance this to handle negative strides through memory. +// Alternatively (and perhaps better) we could rely on an earlier pass to force +// forward iteration through memory, which is generally better for cache +// behavior. Negative strides *do* happen for memset/memcpy loops. +// +// This could recognize common matrix multiplies and dot product idioms and +// replace them with calls to BLAS (if linked in??). +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-idiom" +#include "llvm/Transforms/Scalar.h" +#include "llvm/IRBuilder.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Module.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" +using namespace llvm; + +STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); +STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); + +namespace { + class LoopIdiomRecognize : public LoopPass { + Loop *CurLoop; + const TargetData *TD; + DominatorTree *DT; + ScalarEvolution *SE; + TargetLibraryInfo *TLI; + public: + static char ID; + explicit LoopIdiomRecognize() : LoopPass(ID) { + initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock*> &ExitBlocks); + + bool processLoopStore(StoreInst *SI, const SCEV *BECount); + bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); + + bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, + unsigned StoreAlignment, + Value *SplatValue, Instruction *TheStore, + const SCEVAddRecExpr *Ev, + const SCEV *BECount); + bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, + const SCEVAddRecExpr *StoreEv, + const SCEVAddRecExpr *LoadEv, + const SCEV *BECount); + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG. 
+ /// + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addRequired<AliasAnalysis>(); + AU.addPreserved<AliasAnalysis>(); + AU.addRequired<ScalarEvolution>(); + AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<DominatorTree>(); + AU.addRequired<DominatorTree>(); + AU.addRequired<TargetLibraryInfo>(); + } + }; +} + +char LoopIdiomRecognize::ID = 0; +INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", + false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", + false, false) + +Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); } + +/// deleteDeadInstruction - Delete this instruction. Before we do, go through +/// and zero out all the operands of this instruction. If any of them become +/// dead, delete them and the computation tree that feeds them. +/// +static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) { + SmallVector<Instruction*, 32> NowDeadInsts; + + NowDeadInsts.push_back(I); + + // Before we touch this instruction, remove it from SE! + do { + Instruction *DeadInst = NowDeadInsts.pop_back_val(); + + // This instruction is dead, zap it, in stages. Start by removing it from + // SCEV. + SE.forgetValue(DeadInst); + + for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { + Value *Op = DeadInst->getOperand(op); + DeadInst->setOperand(op, 0); + + // If this operand just became dead, add it to the NowDeadInsts list. + if (!Op->use_empty()) continue; + + if (Instruction *OpI = dyn_cast<Instruction>(Op)) + if (isInstructionTriviallyDead(OpI)) + NowDeadInsts.push_back(OpI); + } + + DeadInst->eraseFromParent(); + + } while (!NowDeadInsts.empty()); +} + +/// deleteIfDeadInstruction - If the specified value is a dead instruction, +/// delete it and any recursively used instructions. +static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE) { + if (Instruction *I = dyn_cast<Instruction>(V)) + if (isInstructionTriviallyDead(I)) + deleteDeadInstruction(I, SE); +} + +bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { + CurLoop = L; + + // Disable loop idiom recognition if the function's name is a common idiom. + StringRef Name = L->getHeader()->getParent()->getName(); + if (Name == "memset" || Name == "memcpy") + return false; + + // The trip count of the loop must be analyzable. + SE = &getAnalysis<ScalarEvolution>(); + if (!SE->hasLoopInvariantBackedgeTakenCount(L)) + return false; + const SCEV *BECount = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BECount)) return false; + + // If this loop executes exactly one time, then it should be peeled, not + // optimized by this pass. + if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) + if (BECst->getValue()->getValue() == 0) + return false; + + // We require target data for now. 
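+ // TargetData supplies type store sizes and the target's pointer-sized
+ // integer type, both of which are needed below to compute the memset/memcpy
+ // length.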
+ TD = getAnalysisIfAvailable<TargetData>(); + if (TD == 0) return false; + + DT = &getAnalysis<DominatorTree>(); + LoopInfo &LI = getAnalysis<LoopInfo>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + + SmallVector<BasicBlock*, 8> ExitBlocks; + CurLoop->getUniqueExitBlocks(ExitBlocks); + + DEBUG(dbgs() << "loop-idiom Scanning: F[" + << L->getHeader()->getParent()->getName() + << "] Loop %" << L->getHeader()->getName() << "\n"); + + bool MadeChange = false; + // Scan all the blocks in the loop that are not in subloops. + for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; + ++BI) { + // Ignore blocks in subloops. + if (LI.getLoopFor(*BI) != CurLoop) + continue; + + MadeChange |= runOnLoopBlock(*BI, BECount, ExitBlocks); + } + return MadeChange; +} + +/// runOnLoopBlock - Process the specified block, which lives in a counted loop +/// with the specified backedge count. This block is known to be in the current +/// loop and not in any subloops. +bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock*> &ExitBlocks) { + // We can only promote stores in this block if they are unconditionally + // executed in the loop. For a block to be unconditionally executed, it has + // to dominate all the exit blocks of the loop. Verify this now. + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) + if (!DT->dominates(BB, ExitBlocks[i])) + return false; + + bool MadeChange = false; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { + Instruction *Inst = I++; + // Look for store instructions, which may be optimized to memset/memcpy. + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + WeakVH InstPtr(I); + if (!processLoopStore(SI, BECount)) continue; + MadeChange = true; + + // If processing the store invalidated our iterator, start over from the + // top of the block. + if (InstPtr == 0) + I = BB->begin(); + continue; + } + + // Look for memset instructions, which may be optimized to a larger memset. + if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { + WeakVH InstPtr(I); + if (!processLoopMemSet(MSI, BECount)) continue; + MadeChange = true; + + // If processing the memset invalidated our iterator, start over from the + // top of the block. + if (InstPtr == 0) + I = BB->begin(); + continue; + } + } + + return MadeChange; +} + + +/// processLoopStore - See if this store can be promoted to a memset or memcpy. +bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { + if (!SI->isSimple()) return false; + + Value *StoredVal = SI->getValueOperand(); + Value *StorePtr = SI->getPointerOperand(); + + // Reject stores that are so large that they overflow an unsigned. + uint64_t SizeInBits = TD->getTypeSizeInBits(StoredVal->getType()); + if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) + return false; + + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided store. If we have something else, it's a + // random store we can't handle. + const SCEVAddRecExpr *StoreEv = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); + if (StoreEv == 0 || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) + return false; + + // Check to see if the stride matches the size of the store. If so, then we + // know that every byte is touched in the loop. 
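+ // For example, "for (i) A[i] = 0;" over an i32 array gives StoreSize == 4
+ // and an AddRec stride of 4, so successive iterations write adjacent,
+ // non-overlapping 4-byte chunks.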
+ unsigned StoreSize = (unsigned)SizeInBits >> 3; + const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1)); + + if (Stride == 0 || StoreSize != Stride->getValue()->getValue()) { + // TODO: Could also handle negative stride here someday, that will require + // the validity check in mayLoopAccessLocation to be updated though. + // Enable this to print exact negative strides. + if (0 && Stride && StoreSize == -Stride->getValue()->getValue()) { + dbgs() << "NEGATIVE STRIDE: " << *SI << "\n"; + dbgs() << "BB: " << *SI->getParent(); + } + + return false; + } + + // See if we can optimize just this store in isolation. + if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), + StoredVal, SI, StoreEv, BECount)) + return true; + + // If the stored value is a strided load in the same loop with the same stride + // this this may be transformable into a memcpy. This kicks in for stuff like + // for (i) A[i] = B[i]; + if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) { + const SCEVAddRecExpr *LoadEv = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0))); + if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() && + StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple()) + if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount)) + return true; + } + //errs() << "UNHANDLED strided store: " << *StoreEv << " - " << *SI << "\n"; + + return false; +} + +/// processLoopMemSet - See if this memset can be promoted to a large memset. +bool LoopIdiomRecognize:: +processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { + // We can only handle non-volatile memsets with a constant size. + if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) return false; + + // If we're not allowed to hack on memset, we fail. + if (!TLI->has(LibFunc::memset)) + return false; + + Value *Pointer = MSI->getDest(); + + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided store. If we have something else, it's a + // random store we can't handle. + const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer)); + if (Ev == 0 || Ev->getLoop() != CurLoop || !Ev->isAffine()) + return false; + + // Reject memsets that are so large that they overflow an unsigned. + uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue(); + if ((SizeInBytes >> 32) != 0) + return false; + + // Check to see if the stride matches the size of the memset. If so, then we + // know that every byte is touched in the loop. + const SCEVConstant *Stride = dyn_cast<SCEVConstant>(Ev->getOperand(1)); + + // TODO: Could also handle negative stride here someday, that will require the + // validity check in mayLoopAccessLocation to be updated though. + if (Stride == 0 || MSI->getLength() != Stride->getValue()) + return false; + + return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, + MSI->getAlignment(), MSI->getValue(), + MSI, Ev, BECount); +} + + +/// mayLoopAccessLocation - Return true if the specified loop might access the +/// specified pointer location, which is a loop-strided access. The 'Access' +/// argument specifies what the verboten forms of access are (read or write). +static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, + Loop *L, const SCEV *BECount, + unsigned StoreSize, AliasAnalysis &AA, + Instruction *IgnoredStore) { + // Get the location that may be stored across the loop. 
Since the access is + // strided positively through memory, we say that the modified location starts + // at the pointer and has infinite size. + uint64_t AccessSize = AliasAnalysis::UnknownSize; + + // If the loop iterates a fixed number of times, we can refine the access size + // to be exactly the size of the memset, which is (BECount+1)*StoreSize + if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) + AccessSize = (BECst->getValue()->getZExtValue()+1)*StoreSize; + + // TODO: For this to be really effective, we have to dive into the pointer + // operand in the store. Store to &A[i] of 100 will always return may alias + // with store of &A[100], we need to StoreLoc to be "A" with size of 100, + // which will then no-alias a store to &A[100]. + AliasAnalysis::Location StoreLoc(Ptr, AccessSize); + + for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; + ++BI) + for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) + if (&*I != IgnoredStore && + (AA.getModRefInfo(I, StoreLoc) & Access)) + return true; + + return false; +} + +/// getMemSetPatternValue - If a strided store of the specified value is safe to +/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should +/// be passed in. Otherwise, return null. +/// +/// Note that we don't ever attempt to use memset_pattern8 or 4, because these +/// just replicate their input array and then pass on to memset_pattern16. +static Constant *getMemSetPatternValue(Value *V, const TargetData &TD) { + // If the value isn't a constant, we can't promote it to being in a constant + // array. We could theoretically do a store to an alloca or something, but + // that doesn't seem worthwhile. + Constant *C = dyn_cast<Constant>(V); + if (C == 0) return 0; + + // Only handle simple values that are a power of two bytes in size. + uint64_t Size = TD.getTypeSizeInBits(V->getType()); + if (Size == 0 || (Size & 7) || (Size & (Size-1))) + return 0; + + // Don't care enough about darwin/ppc to implement this. + if (TD.isBigEndian()) + return 0; + + // Convert to size in bytes. + Size /= 8; + + // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see + // if the top and bottom are the same (e.g. for vectors and large integers). + if (Size > 16) return 0; + + // If the constant is exactly 16 bytes, just use it. + if (Size == 16) return C; + + // Otherwise, we'll use an array of the constants. + unsigned ArraySize = 16/Size; + ArrayType *AT = ArrayType::get(V->getType(), ArraySize); + return ConstantArray::get(AT, std::vector<Constant*>(ArraySize, C)); +} + + +/// processLoopStridedStore - We see a strided store of some value. If we can +/// transform this into a memset or memset_pattern in the loop preheader, do so. +bool LoopIdiomRecognize:: +processLoopStridedStore(Value *DestPtr, unsigned StoreSize, + unsigned StoreAlignment, Value *StoredVal, + Instruction *TheStore, const SCEVAddRecExpr *Ev, + const SCEV *BECount) { + + // If the stored value is a byte-wise value (like i32 -1), then it may be + // turned into a memset of i8 -1, assuming that all the consecutive bytes + // are stored. A store of i32 0x01020304 can never be turned into a memset, + // but it can be turned into memset_pattern if the target supports it. + Value *SplatValue = isBytewiseValue(StoredVal); + Constant *PatternValue = 0; + + // If we're allowed to form a memset, and the stored value would be acceptable + // for memset, use it. 
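+ // isBytewiseValue succeeded above only if every byte of the stored value is
+ // identical (e.g. i32 0 or i32 -1); in that case SplatValue is that single
+ // i8 value.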
+  if (SplatValue && TLI->has(LibFunc::memset) &&
+      // Verify that the stored value is loop invariant. If not, we can't
+      // promote the memset.
+      CurLoop->isLoopInvariant(SplatValue)) {
+    // Keep and use SplatValue.
+    PatternValue = 0;
+  } else if (TLI->has(LibFunc::memset_pattern16) &&
+             (PatternValue = getMemSetPatternValue(StoredVal, *TD))) {
+    // It looks like we can use PatternValue!
+    SplatValue = 0;
+  } else {
+    // Otherwise, this isn't an idiom we can transform. For example, we can't
+    // do anything with a 3-byte store.
+    return false;
+  }
+
+  // The trip count of the loop and the base pointer of the addrec SCEV are
+  // guaranteed to be loop invariant, which means that they should dominate the
+  // header. This allows us to insert code for it in the preheader.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  IRBuilder<> Builder(Preheader->getTerminator());
+  SCEVExpander Expander(*SE, "loop-idiom");
+
+  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
+  // this into a memset in the loop preheader now if we want. However, this
+  // would be unsafe to do if there is anything else in the loop that may read
+  // or write to the aliased location. Check for any overlap by generating the
+  // base pointer and checking the region.
+  unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace();
+  Value *BasePtr =
+    Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace),
+                           Preheader->getTerminator());
+
+
+  if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef,
+                            CurLoop, BECount,
+                            StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    deleteIfDeadInstruction(BasePtr, *SE);
+    return false;
+  }
+
+  // Okay, everything looks good, insert the memset.
+
+  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
+  // pointer size if it isn't already.
+  Type *IntPtr = TD->getIntPtrType(DestPtr->getContext());
+  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+
+  const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
+                                         SCEV::FlagNUW);
+  if (StoreSize != 1)
+    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+                               SCEV::FlagNUW);
+
+  Value *NumBytes =
+    Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
+
+  CallInst *NewCall;
+  if (SplatValue)
+    NewCall = Builder.CreateMemSet(BasePtr, SplatValue,NumBytes,StoreAlignment);
+  else {
+    Module *M = TheStore->getParent()->getParent()->getParent();
+    Value *MSP = M->getOrInsertFunction("memset_pattern16",
+                                        Builder.getVoidTy(),
+                                        Builder.getInt8PtrTy(),
+                                        Builder.getInt8PtrTy(), IntPtr,
+                                        (void*)0);
+
+    // Otherwise we should form a memset_pattern16. PatternValue is known to be
+    // a constant array of 16 bytes. Plop the value into a mergable global.
+    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
+                                            GlobalValue::InternalLinkage,
+                                            PatternValue, ".memset_pattern");
+    GV->setUnnamedAddr(true); // Ok to merge these.
+    GV->setAlignment(16);
+    Value *PatternPtr = ConstantExpr::getBitCast(GV, Builder.getInt8PtrTy());
+    NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes);
+  }
+
+  DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
+               << " from store to: " << *Ev << " at: " << *TheStore << "\n");
+  NewCall->setDebugLoc(TheStore->getDebugLoc());
+
+  // Okay, the memset has been formed. Zap the original store and anything that
+  // feeds into it.
+  deleteDeadInstruction(TheStore, *SE);
+  ++NumMemSet;
+  return true;
+}
+
+/// processLoopStoreOfLoopLoad - We see a strided store whose value is a
+/// same-strided load.
+bool LoopIdiomRecognize::
+processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
+                           const SCEVAddRecExpr *StoreEv,
+                           const SCEVAddRecExpr *LoadEv,
+                           const SCEV *BECount) {
+  // If we're not allowed to form memcpy, we fail.
+  if (!TLI->has(LibFunc::memcpy))
+    return false;
+
+  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
+
+  // The trip count of the loop and the base pointer of the addrec SCEV are
+  // guaranteed to be loop invariant, which means that they should dominate the
+  // header. This allows us to insert code for it in the preheader.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  IRBuilder<> Builder(Preheader->getTerminator());
+  SCEVExpander Expander(*SE, "loop-idiom");
+
+  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
+  // this into a memcpy in the loop preheader now if we want. However, this
+  // would be unsafe to do if there is anything else in the loop that may read
+  // or write the memory region we're storing to. This includes the load that
+  // feeds the stores. Check for an alias by generating the base address and
+  // checking everything.
+  Value *StoreBasePtr =
+    Expander.expandCodeFor(StoreEv->getStart(),
+                           Builder.getInt8PtrTy(SI->getPointerAddressSpace()),
+                           Preheader->getTerminator());
+
+  if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef,
+                            CurLoop, BECount, StoreSize,
+                            getAnalysis<AliasAnalysis>(), SI)) {
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    deleteIfDeadInstruction(StoreBasePtr, *SE);
+    return false;
+  }
+
+  // For a memcpy, we have to make sure that the input array is not being
+  // mutated by the loop.
+  Value *LoadBasePtr =
+    Expander.expandCodeFor(LoadEv->getStart(),
+                           Builder.getInt8PtrTy(LI->getPointerAddressSpace()),
+                           Preheader->getTerminator());
+
+  if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount,
+                            StoreSize, getAnalysis<AliasAnalysis>(), SI)) {
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    deleteIfDeadInstruction(LoadBasePtr, *SE);
+    deleteIfDeadInstruction(StoreBasePtr, *SE);
+    return false;
+  }
+
+  // Okay, everything is safe, we can transform this!
+
+
+  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
+  // pointer size if it isn't already.
+  Type *IntPtr = TD->getIntPtrType(SI->getContext());
+  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+
+  const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
+                                         SCEV::FlagNUW);
+  if (StoreSize != 1)
+    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+                               SCEV::FlagNUW);
+
+  Value *NumBytes =
+    Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
+
+  CallInst *NewCall =
+    Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
+                         std::min(SI->getAlignment(), LI->getAlignment()));
+  NewCall->setDebugLoc(SI->getDebugLoc());
+
+  DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
+               << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+               << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
+
+
+  // Okay, the memcpy has been formed. Zap the original store and anything that
+  // feeds into it.
+ deleteDeadInstruction(SI, *SE); + ++NumMemCpy; + return true; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp new file mode 100644 index 0000000..982400c --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -0,0 +1,174 @@ +//===- LoopInstSimplify.cpp - Loop Instruction Simplification Pass --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass performs lightweight instruction simplification on loop bodies. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-instsimplify" +#include "llvm/Instructions.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumSimplified, "Number of redundant instructions simplified"); + +namespace { + class LoopInstSimplify : public LoopPass { + public: + static char ID; // Pass ID, replacement for typeid + LoopInstSimplify() : LoopPass(ID) { + initializeLoopInstSimplifyPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop*, LPPassManager&); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<LoopInfo>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addPreservedID(LCSSAID); + AU.addPreserved("scalar-evolution"); + AU.addRequired<TargetLibraryInfo>(); + } + }; +} + +char LoopInstSimplify::ID = 0; +INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify", + "Simplify instructions in loops", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify", + "Simplify instructions in loops", false, false) + +Pass *llvm::createLoopInstSimplifyPass() { + return new LoopInstSimplify(); +} + +bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { + DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>(); + LoopInfo *LI = &getAnalysis<LoopInfo>(); + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + + SmallVector<BasicBlock*, 8> ExitBlocks; + L->getUniqueExitBlocks(ExitBlocks); + array_pod_sort(ExitBlocks.begin(), ExitBlocks.end()); + + SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2; + + // The bit we are stealing from the pointer represents whether this basic + // block is the header of a subloop, in which case we only process its phis. 
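+  // In other words, WorklistItem(BB, false) is an ordinary block whose
+  // instructions are all simplified, while WorklistItem(BB, true) marks BB as
+  // a subloop header; the walk below stops once it reaches a non-PHI
+  // instruction in such a block.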
+ typedef PointerIntPair<BasicBlock*, 1> WorklistItem; + SmallVector<WorklistItem, 16> VisitStack; + SmallPtrSet<BasicBlock*, 32> Visited; + + bool Changed = false; + bool LocalChanged; + do { + LocalChanged = false; + + VisitStack.clear(); + Visited.clear(); + + VisitStack.push_back(WorklistItem(L->getHeader(), false)); + + while (!VisitStack.empty()) { + WorklistItem Item = VisitStack.pop_back_val(); + BasicBlock *BB = Item.getPointer(); + bool IsSubloopHeader = Item.getInt(); + + // Simplify instructions in the current basic block. + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { + Instruction *I = BI++; + + // The first time through the loop ToSimplify is empty and we try to + // simplify all instructions. On later iterations ToSimplify is not + // empty and we only bother simplifying instructions that are in it. + if (!ToSimplify->empty() && !ToSimplify->count(I)) + continue; + + // Don't bother simplifying unused instructions. + if (!I->use_empty()) { + Value *V = SimplifyInstruction(I, TD, TLI, DT); + if (V && LI->replacementPreservesLCSSAForm(I, V)) { + // Mark all uses for resimplification next time round the loop. + for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); + UI != UE; ++UI) + Next->insert(cast<Instruction>(*UI)); + + I->replaceAllUsesWith(V); + LocalChanged = true; + ++NumSimplified; + } + } + LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I); + + if (IsSubloopHeader && !isa<PHINode>(I)) + break; + } + + // Add all successors to the worklist, except for loop exit blocks and the + // bodies of subloops. We visit the headers of loops so that we can process + // their phis, but we contract the rest of the subloop body and only follow + // edges leading back to the original loop. + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; + ++SI) { + BasicBlock *SuccBB = *SI; + if (!Visited.insert(SuccBB)) + continue; + + const Loop *SuccLoop = LI->getLoopFor(SuccBB); + if (SuccLoop && SuccLoop->getHeader() == SuccBB + && L->contains(SuccLoop)) { + VisitStack.push_back(WorklistItem(SuccBB, true)); + + SmallVector<BasicBlock*, 8> SubLoopExitBlocks; + SuccLoop->getExitBlocks(SubLoopExitBlocks); + + for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) { + BasicBlock *ExitBB = SubLoopExitBlocks[i]; + if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB)) + VisitStack.push_back(WorklistItem(ExitBB, false)); + } + + continue; + } + + bool IsExitBlock = std::binary_search(ExitBlocks.begin(), + ExitBlocks.end(), SuccBB); + if (IsExitBlock) + continue; + + VisitStack.push_back(WorklistItem(SuccBB, false)); + } + } + + // Place the list of instructions to simplify on the next loop iteration + // into ToSimplify. + std::swap(ToSimplify, Next); + Next->clear(); + + Changed |= LocalChanged; + } while (LocalChanged); + + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp new file mode 100644 index 0000000..7eeb152 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -0,0 +1,458 @@ +//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements Loop Rotation Pass. 
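+//
+// As a rough illustration, rotation turns a "top-tested" loop into a
+// "bottom-tested" one:
+//
+//   while (cond) { body; }   =>   if (cond) { do { body; } while (cond); }
+//
+// The original header (which contains the exit test) is cloned into the
+// preheader to act as a guard, and the exit test ends up at the bottom of the
+// loop.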
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-rotate" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Function.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +#define MAX_HEADER_SIZE 16 + +STATISTIC(NumRotated, "Number of loops rotated"); +namespace { + + class LoopRotate : public LoopPass { + public: + static char ID; // Pass ID, replacement for typeid + LoopRotate() : LoopPass(ID) { + initializeLoopRotatePass(*PassRegistry::getPassRegistry()); + } + + // LCSSA form makes instruction renaming easier. + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<DominatorTree>(); + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addPreserved<ScalarEvolution>(); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM); + void simplifyLoopLatch(Loop *L); + bool rotateLoop(Loop *L); + + private: + LoopInfo *LI; + }; +} + +char LoopRotate::ID = 0; +INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) + +Pass *llvm::createLoopRotatePass() { return new LoopRotate(); } + +/// Rotate Loop L as many times as possible. Return true if +/// the loop is rotated at least once. +bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { + LI = &getAnalysis<LoopInfo>(); + + // Simplify the loop latch before attempting to rotate the header + // upward. Rotation may not be needed if the loop tail can be folded into the + // loop exit. + simplifyLoopLatch(L); + + // One loop can be rotated multiple times. + bool MadeChange = false; + while (rotateLoop(L)) + MadeChange = true; + + return MadeChange; +} + +/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the +/// old header into the preheader. If there were uses of the values produced by +/// these instruction that were outside of the loop, we have to insert PHI nodes +/// to merge the two values. Do this now. +static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, + BasicBlock *OrigPreheader, + ValueToValueMapTy &ValueMap) { + // Remove PHI node entries that are no longer live. + BasicBlock::iterator I, E = OrigHeader->end(); + for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) + PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader)); + + // Now fix up users of the instructions in OrigHeader, inserting PHI nodes + // as necessary. + SSAUpdater SSA; + for (I = OrigHeader->begin(); I != E; ++I) { + Value *OrigHeaderVal = I; + + // If there are no uses of the value (e.g. because it returns void), there + // is nothing to rewrite. 
+    if (OrigHeaderVal->use_empty())
+      continue;
+
+    Value *OrigPreHeaderVal = ValueMap[OrigHeaderVal];
+
+    // The value now exists in two versions: the initial value in the preheader
+    // and the loop "next" value in the original header.
+    SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
+    SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
+    SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
+
+    // Visit each use of the OrigHeader instruction.
+    for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
+         UE = OrigHeaderVal->use_end(); UI != UE; ) {
+      // Grab the use before incrementing the iterator.
+      Use &U = UI.getUse();
+
+      // Increment the iterator before removing the use from the list.
+      ++UI;
+
+      // SSAUpdater can't handle a non-PHI use in the same block as an
+      // earlier def. We can easily handle those cases manually.
+      Instruction *UserInst = cast<Instruction>(U.getUser());
+      if (!isa<PHINode>(UserInst)) {
+        BasicBlock *UserBB = UserInst->getParent();
+
+        // The original users in the OrigHeader are already using the
+        // original definitions.
+        if (UserBB == OrigHeader)
+          continue;
+
+        // Users in the OrigPreHeader need to use the value to which the
+        // original definitions are mapped.
+        if (UserBB == OrigPreheader) {
+          U = OrigPreHeaderVal;
+          continue;
+        }
+      }
+
+      // Anything else can be handled by SSAUpdater.
+      SSA.RewriteUse(U);
+    }
+  }
+}
+
+/// Determine whether the instructions in this range may be safely and cheaply
+/// speculated. This is not an important enough situation to develop complex
+/// heuristics. We handle a single arithmetic instruction along with any type
+/// conversions.
+static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
+                                  BasicBlock::iterator End) {
+  bool seenIncrement = false;
+  for (BasicBlock::iterator I = Begin; I != End; ++I) {
+
+    if (!isSafeToSpeculativelyExecute(I))
+      return false;
+
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    switch (I->getOpcode()) {
+    default:
+      return false;
+    case Instruction::GetElementPtr:
+      // GEPs are cheap if all indices are constant.
+      if (!cast<GEPOperator>(I)->hasAllConstantIndices())
+        return false;
+      // fall-thru to increment case
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+      if (seenIncrement)
+        return false;
+      seenIncrement = true;
+      break;
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      // ignore type conversions
+      break;
+    }
+  }
+  return true;
+}
+
+/// Fold the loop tail into the loop exit by speculating the loop tail
+/// instructions. Typically, this is a single post-increment. In the case of a
+/// simple 2-block loop, hoisting the increment can be much better than
+/// duplicating the entire loop header. In the case of loops with early exits,
+/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
+/// canonical form so downstream passes can handle it.
+///
+/// I don't believe this invalidates SCEV.
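+///
+/// A rough sketch of the shape this handles (block and value names are
+/// illustrative only):
+///
+///   before:   exiting: %c = icmp ...
+///                      br i1 %c, label %latch, label %exit
+///             latch:   %i.next = add i32 %i, 1
+///                      br label %header
+///
+///   after:    exiting: %c = icmp ...
+///                      %i.next = add i32 %i, 1
+///                      br i1 %c, label %header, label %exit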
+void LoopRotate::simplifyLoopLatch(Loop *L) { + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch || Latch->hasAddressTaken()) + return; + + BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); + if (!Jmp || !Jmp->isUnconditional()) + return; + + BasicBlock *LastExit = Latch->getSinglePredecessor(); + if (!LastExit || !L->isLoopExiting(LastExit)) + return; + + BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); + if (!BI) + return; + + if (!shouldSpeculateInstrs(Latch->begin(), Jmp)) + return; + + DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " + << LastExit->getName() << "\n"); + + // Hoist the instructions from Latch into LastExit. + LastExit->getInstList().splice(BI, Latch->getInstList(), Latch->begin(), Jmp); + + unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; + BasicBlock *Header = Jmp->getSuccessor(0); + assert(Header == L->getHeader() && "expected a backward branch"); + + // Remove Latch from the CFG so that LastExit becomes the new Latch. + BI->setSuccessor(FallThruPath, Header); + Latch->replaceSuccessorsPhiUsesWith(LastExit); + Jmp->eraseFromParent(); + + // Nuke the Latch block. + assert(Latch->empty() && "unable to evacuate Latch"); + LI->removeBlock(Latch); + if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) + DT->eraseNode(Latch); + Latch->eraseFromParent(); +} + +/// Rotate loop LP. Return true if the loop is rotated. +bool LoopRotate::rotateLoop(Loop *L) { + // If the loop has only one block then there is not much to rotate. + if (L->getBlocks().size() == 1) + return false; + + BasicBlock *OrigHeader = L->getHeader(); + + BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); + if (BI == 0 || BI->isUnconditional()) + return false; + + // If the loop header is not one of the loop exiting blocks then + // either this loop is already rotated or it is not + // suitable for loop rotation transformations. + if (!L->isLoopExiting(OrigHeader)) + return false; + + // Updating PHInodes in loops with multiple exits adds complexity. + // Keep it simple, and restrict loop rotation to loops with one exit only. + // In future, lift this restriction and support for multiple exits if + // required. + SmallVector<BasicBlock*, 8> ExitBlocks; + L->getExitBlocks(ExitBlocks); + if (ExitBlocks.size() > 1) + return false; + + // Check size of original header and reject loop if it is very big. + { + CodeMetrics Metrics; + Metrics.analyzeBasicBlock(OrigHeader); + if (Metrics.NumInsts > MAX_HEADER_SIZE) + return false; + } + + // Now, this loop is suitable for rotation. + BasicBlock *OrigPreheader = L->getLoopPreheader(); + BasicBlock *OrigLatch = L->getLoopLatch(); + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (OrigPreheader == 0 || OrigLatch == 0) + return false; + + // Anything ScalarEvolution may know about this loop or the PHI nodes + // in its header will soon be invalidated. + if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) + SE->forgetLoop(L); + + // Find new Loop header. NewHeader is a Header's one and only successor + // that is inside loop. Header's other successor is outside the + // loop. Otherwise loop is not suitable for rotation. 
+ BasicBlock *Exit = BI->getSuccessor(0); + BasicBlock *NewHeader = BI->getSuccessor(1); + if (L->contains(Exit)) + std::swap(Exit, NewHeader); + assert(NewHeader && "Unable to determine new loop header"); + assert(L->contains(NewHeader) && !L->contains(Exit) && + "Unable to determine loop header and exit blocks"); + + // This code assumes that the new header has exactly one predecessor. + // Remove any single-entry PHI nodes in it. + assert(NewHeader->getSinglePredecessor() && + "New header doesn't have one pred!"); + FoldSingleEntryPHINodes(NewHeader); + + // Begin by walking OrigHeader and populating ValueMap with an entry for + // each Instruction. + BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); + ValueToValueMapTy ValueMap; + + // For PHI nodes, the value available in OldPreHeader is just the + // incoming value from OldPreHeader. + for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) + ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); + + // For the rest of the instructions, either hoist to the OrigPreheader if + // possible or create a clone in the OldPreHeader if not. + TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); + while (I != E) { + Instruction *Inst = I++; + + // If the instruction's operands are invariant and it doesn't read or write + // memory, then it is safe to hoist. Doing this doesn't change the order of + // execution in the preheader, but does prevent the instruction from + // executing in each iteration of the loop. This means it is safe to hoist + // something that might trap, but isn't safe to hoist something that reads + // memory (without proving that the loop doesn't write). + if (L->hasLoopInvariantOperands(Inst) && + !Inst->mayReadFromMemory() && !Inst->mayWriteToMemory() && + !isa<TerminatorInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst) && + !isa<AllocaInst>(Inst)) { + Inst->moveBefore(LoopEntryBranch); + continue; + } + + // Otherwise, create a duplicate of the instruction. + Instruction *C = Inst->clone(); + + // Eagerly remap the operands of the instruction. + RemapInstruction(C, ValueMap, + RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); + + // With the operands remapped, see if the instruction constant folds or is + // otherwise simplifyable. This commonly occurs because the entry from PHI + // nodes allows icmps and other instructions to fold. + Value *V = SimplifyInstruction(C); + if (V && LI->replacementPreservesLCSSAForm(C, V)) { + // If so, then delete the temporary instruction and stick the folded value + // in the map. + delete C; + ValueMap[Inst] = V; + } else { + // Otherwise, stick the new instruction into the new block! + C->setName(Inst->getName()); + C->insertBefore(LoopEntryBranch); + ValueMap[Inst] = C; + } + } + + // Along with all the other instructions, we just cloned OrigHeader's + // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's + // successors by duplicating their incoming values for OrigHeader. + TerminatorInst *TI = OrigHeader->getTerminator(); + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) + for (BasicBlock::iterator BI = TI->getSuccessor(i)->begin(); + PHINode *PN = dyn_cast<PHINode>(BI); ++BI) + PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); + + // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove + // OrigPreHeader's old terminator (the original branch into the loop), and + // remove the corresponding incoming values from the PHI nodes in OrigHeader. 
+ LoopEntryBranch->eraseFromParent(); + + // If there were any uses of instructions in the duplicated block outside the + // loop, update them, inserting PHI nodes as required + RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap); + + // NewHeader is now the header of the loop. + L->moveToHeader(NewHeader); + assert(L->getHeader() == NewHeader && "Latch block is our new header"); + + + // At this point, we've finished our major CFG changes. As part of cloning + // the loop into the preheader we've simplified instructions and the + // duplicated conditional branch may now be branching on a constant. If it is + // branching on a constant and if that constant means that we enter the loop, + // then we fold away the cond branch to an uncond branch. This simplifies the + // loop in cases important for nested loops, and it also means we don't have + // to split as many edges. + BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); + assert(PHBI->isConditional() && "Should be clone of BI condbr!"); + if (!isa<ConstantInt>(PHBI->getCondition()) || + PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) + != NewHeader) { + // The conditional branch can't be folded, handle the general case. + // Update DominatorTree to reflect the CFG change we just made. Then split + // edges as necessary to preserve LoopSimplify form. + if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) { + // Since OrigPreheader now has the conditional branch to Exit block, it is + // the dominator of Exit. + DT->changeImmediateDominator(Exit, OrigPreheader); + DT->changeImmediateDominator(NewHeader, OrigPreheader); + + // Update OrigHeader to be dominated by the new header block. + DT->changeImmediateDominator(OrigHeader, OrigLatch); + } + + // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and + // thus is not a preheader anymore. + // Split the edge to form a real preheader. + BasicBlock *NewPH = SplitCriticalEdge(OrigPreheader, NewHeader, this); + NewPH->setName(NewHeader->getName() + ".lr.ph"); + + // Preserve canonical loop form, which means that 'Exit' should have only + // one predecessor. + BasicBlock *ExitSplit = SplitCriticalEdge(L->getLoopLatch(), Exit, this); + ExitSplit->moveBefore(Exit); + } else { + // We can fold the conditional branch in the preheader, this makes things + // simpler. The first step is to remove the extra edge to the Exit block. + Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); + BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI); + NewBI->setDebugLoc(PHBI->getDebugLoc()); + PHBI->eraseFromParent(); + + // With our CFG finalized, update DomTree if it is available. + if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) { + // Update OrigHeader to be dominated by the new header block. + DT->changeImmediateDominator(NewHeader, OrigPreheader); + DT->changeImmediateDominator(OrigHeader, OrigLatch); + } + } + + assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); + assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); + + // Now that the CFG and DomTree are in a consistent state again, try to merge + // the OrigHeader block into OrigLatch. This will succeed if they are + // connected by an unconditional branch. This is just a cleanup so the + // emitted code isn't too gross in this common case. 
+ MergeBlockIntoPredecessor(OrigHeader, this); + + ++NumRotated; + return true; +} + diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp new file mode 100644 index 0000000..b14a713 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -0,0 +1,4818 @@ +//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This transformation analyzes and transforms the induction variables (and +// computations derived from them) into forms suitable for efficient execution +// on the target. +// +// This pass performs a strength reduction on array references inside loops that +// have as one or more of their components the loop induction variable, it +// rewrites expressions to take advantage of scaled-index addressing modes +// available on the target, and it performs a variety of other optimizations +// related to loop induction variables. +// +// Terminology note: this code has a lot of handling for "post-increment" or +// "post-inc" users. This is not talking about post-increment addressing modes; +// it is instead talking about code like this: +// +// %i = phi [ 0, %entry ], [ %i.next, %latch ] +// ... +// %i.next = add %i, 1 +// %c = icmp eq %i.next, %n +// +// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however +// it's useful to think about these as the same register, with some uses using +// the value of the register before the add and some using // it after. In this +// example, the icmp is a post-increment user, since it uses %i.next, which is +// the value of the induction variable after the increment. The other common +// case of post-increment users is users outside the loop. +// +// TODO: More sophistication in the way Formulae are generated and filtered. +// +// TODO: Handle multiple loops at a time. +// +// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr +// instead of a GlobalValue? +// +// TODO: When truncation is free, truncate ICmp users' operands to make it a +// smaller encoding (on x86 at least). +// +// TODO: When a negated register is used by an add (such as in a list of +// multiple base registers, or as the increment expression in an addrec), +// we may not actually need both reg and (-1 * reg) in registers; the +// negation can be implemented by using a sub instead of an add. The +// lack of support for taking this into consideration when making +// register pressure decisions is partly worked around by the "Special" +// use kind. 
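+//
+// As a small illustration of the kind of rewrite this enables (the IR below is
+// made up), an address that is recomputed on every iteration, such as
+//   %addr = getelementptr i32* %base, i64 %i
+// can instead be carried as its own pointer induction variable that is simply
+// bumped by the element size (4 bytes here) each time around the loop.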
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-reduce"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+using namespace llvm;
+
+/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
+/// bail out. This threshold is far beyond the number of users that LSR can
+/// conceivably solve, so it should not affect generated code, but catches the
+/// worst cases before LSR burns too much compile time and stack space.
+static const unsigned MaxIVUsers = 200;
+
+// Temporary flag to clean up congruent phis after LSR phi expansion.
+// It's currently disabled until we can determine whether it's truly useful or
+// not. The flag should be removed after the v3.0 release.
+// This is now needed for ivchains.
+static cl::opt<bool> EnablePhiElim(
+  "enable-lsr-phielim", cl::Hidden, cl::init(true),
+  cl::desc("Enable LSR phi elimination"));
+
+#ifndef NDEBUG
+// Stress test IV chain generation.
+static cl::opt<bool> StressIVChain(
+  "stress-ivchain", cl::Hidden, cl::init(false),
+  cl::desc("Stress test LSR IV chains"));
+#else
+static bool StressIVChain = false;
+#endif
+
+namespace {
+
+/// RegSortData - This class holds data which is used to order reuse candidates.
+class RegSortData {
+public:
+  /// UsedByIndices - This represents the set of LSRUse indices which reference
+  /// a particular register.
+  SmallBitVector UsedByIndices;
+
+  RegSortData() {}
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+}
+
+void RegSortData::print(raw_ostream &OS) const {
+  OS << "[NumUses=" << UsedByIndices.count() << ']';
+}
+
+void RegSortData::dump() const {
+  print(errs()); errs() << '\n';
+}
+
+namespace {
+
+/// RegUseTracker - Map register candidates to information about how they are
+/// used.
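+/// Each candidate register is mapped to the set of LSRUse indices that
+/// reference it, and the registers are also kept in the order in which they
+/// were first seen, so iteration over them does not depend on DenseMap
+/// ordering.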
+class RegUseTracker { + typedef DenseMap<const SCEV *, RegSortData> RegUsesTy; + + RegUsesTy RegUsesMap; + SmallVector<const SCEV *, 16> RegSequence; + +public: + void CountRegister(const SCEV *Reg, size_t LUIdx); + void DropRegister(const SCEV *Reg, size_t LUIdx); + void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx); + + bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const; + + const SmallBitVector &getUsedByIndices(const SCEV *Reg) const; + + void clear(); + + typedef SmallVectorImpl<const SCEV *>::iterator iterator; + typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator; + iterator begin() { return RegSequence.begin(); } + iterator end() { return RegSequence.end(); } + const_iterator begin() const { return RegSequence.begin(); } + const_iterator end() const { return RegSequence.end(); } +}; + +} + +void +RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) { + std::pair<RegUsesTy::iterator, bool> Pair = + RegUsesMap.insert(std::make_pair(Reg, RegSortData())); + RegSortData &RSD = Pair.first->second; + if (Pair.second) + RegSequence.push_back(Reg); + RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1)); + RSD.UsedByIndices.set(LUIdx); +} + +void +RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) { + RegUsesTy::iterator It = RegUsesMap.find(Reg); + assert(It != RegUsesMap.end()); + RegSortData &RSD = It->second; + assert(RSD.UsedByIndices.size() > LUIdx); + RSD.UsedByIndices.reset(LUIdx); +} + +void +RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) { + assert(LUIdx <= LastLUIdx); + + // Update RegUses. The data structure is not optimized for this purpose; + // we must iterate through it and update each of the bit vectors. + for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end(); + I != E; ++I) { + SmallBitVector &UsedByIndices = I->second.UsedByIndices; + if (LUIdx < UsedByIndices.size()) + UsedByIndices[LUIdx] = + LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0; + UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx)); + } +} + +bool +RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const { + RegUsesTy::const_iterator I = RegUsesMap.find(Reg); + if (I == RegUsesMap.end()) + return false; + const SmallBitVector &UsedByIndices = I->second.UsedByIndices; + int i = UsedByIndices.find_first(); + if (i == -1) return false; + if ((size_t)i != LUIdx) return true; + return UsedByIndices.find_next(i) != -1; +} + +const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const { + RegUsesTy::const_iterator I = RegUsesMap.find(Reg); + assert(I != RegUsesMap.end() && "Unknown register!"); + return I->second.UsedByIndices; +} + +void RegUseTracker::clear() { + RegUsesMap.clear(); + RegSequence.clear(); +} + +namespace { + +/// Formula - This class holds information that describes a formula for +/// computing satisfying a use. It may include broken-out immediates and scaled +/// registers. +struct Formula { + /// AM - This is used to represent complex addressing, as well as other kinds + /// of interesting uses. + TargetLowering::AddrMode AM; + + /// BaseRegs - The list of "base" registers for this use. When this is + /// non-empty, AM.HasBaseReg should be set to true. + SmallVector<const SCEV *, 2> BaseRegs; + + /// ScaledReg - The 'scaled' register for this use. This should be non-null + /// when AM.Scale is not zero. + const SCEV *ScaledReg; + + /// UnfoldedOffset - An additional constant offset which added near the + /// use. 
This requires a temporary register, but the offset itself can + /// live in an add immediate field rather than a register. + int64_t UnfoldedOffset; + + Formula() : ScaledReg(0), UnfoldedOffset(0) {} + + void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); + + unsigned getNumRegs() const; + Type *getType() const; + + void DeleteBaseReg(const SCEV *&S); + + bool referencesReg(const SCEV *S) const; + bool hasRegsUsedByUsesOtherThan(size_t LUIdx, + const RegUseTracker &RegUses) const; + + void print(raw_ostream &OS) const; + void dump() const; +}; + +} + +/// DoInitialMatch - Recursion helper for InitialMatch. +static void DoInitialMatch(const SCEV *S, Loop *L, + SmallVectorImpl<const SCEV *> &Good, + SmallVectorImpl<const SCEV *> &Bad, + ScalarEvolution &SE) { + // Collect expressions which properly dominate the loop header. + if (SE.properlyDominates(S, L->getHeader())) { + Good.push_back(S); + return; + } + + // Look at add operands. + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); + I != E; ++I) + DoInitialMatch(*I, L, Good, Bad, SE); + return; + } + + // Look at addrec operands. + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) + if (!AR->getStart()->isZero()) { + DoInitialMatch(AR->getStart(), L, Good, Bad, SE); + DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), + AR->getStepRecurrence(SE), + // FIXME: AR->getNoWrapFlags() + AR->getLoop(), SCEV::FlagAnyWrap), + L, Good, Bad, SE); + return; + } + + // Handle a multiplication by -1 (negation) if it didn't fold. + if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) + if (Mul->getOperand(0)->isAllOnesValue()) { + SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end()); + const SCEV *NewMul = SE.getMulExpr(Ops); + + SmallVector<const SCEV *, 4> MyGood; + SmallVector<const SCEV *, 4> MyBad; + DoInitialMatch(NewMul, L, MyGood, MyBad, SE); + const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue( + SE.getEffectiveSCEVType(NewMul->getType()))); + for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(), + E = MyGood.end(); I != E; ++I) + Good.push_back(SE.getMulExpr(NegOne, *I)); + for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(), + E = MyBad.end(); I != E; ++I) + Bad.push_back(SE.getMulExpr(NegOne, *I)); + return; + } + + // Ok, we can't do anything interesting. Just stuff the whole thing into a + // register and hope for the best. + Bad.push_back(S); +} + +/// InitialMatch - Incorporate loop-variant parts of S into this Formula, +/// attempting to keep all loop-invariant and loop-computable values in a +/// single base register. +void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { + SmallVector<const SCEV *, 4> Good; + SmallVector<const SCEV *, 4> Bad; + DoInitialMatch(S, L, Good, Bad, SE); + if (!Good.empty()) { + const SCEV *Sum = SE.getAddExpr(Good); + if (!Sum->isZero()) + BaseRegs.push_back(Sum); + AM.HasBaseReg = true; + } + if (!Bad.empty()) { + const SCEV *Sum = SE.getAddExpr(Bad); + if (!Sum->isZero()) + BaseRegs.push_back(Sum); + AM.HasBaseReg = true; + } +} + +/// getNumRegs - Return the total number of register operands used by this +/// formula. This does not include register uses implied by non-constant +/// addrec strides. +unsigned Formula::getNumRegs() const { + return !!ScaledReg + BaseRegs.size(); +} + +/// getType - Return the type of this formula, if it has one, or null +/// otherwise. 
This type is meaningless except for the bit size. +Type *Formula::getType() const { + return !BaseRegs.empty() ? BaseRegs.front()->getType() : + ScaledReg ? ScaledReg->getType() : + AM.BaseGV ? AM.BaseGV->getType() : + 0; +} + +/// DeleteBaseReg - Delete the given base reg from the BaseRegs list. +void Formula::DeleteBaseReg(const SCEV *&S) { + if (&S != &BaseRegs.back()) + std::swap(S, BaseRegs.back()); + BaseRegs.pop_back(); +} + +/// referencesReg - Test if this formula references the given register. +bool Formula::referencesReg(const SCEV *S) const { + return S == ScaledReg || + std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end(); +} + +/// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers +/// which are used by uses other than the use with the given index. +bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx, + const RegUseTracker &RegUses) const { + if (ScaledReg) + if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx)) + return true; + for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(), + E = BaseRegs.end(); I != E; ++I) + if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx)) + return true; + return false; +} + +void Formula::print(raw_ostream &OS) const { + bool First = true; + if (AM.BaseGV) { + if (!First) OS << " + "; else First = false; + WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false); + } + if (AM.BaseOffs != 0) { + if (!First) OS << " + "; else First = false; + OS << AM.BaseOffs; + } + for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(), + E = BaseRegs.end(); I != E; ++I) { + if (!First) OS << " + "; else First = false; + OS << "reg(" << **I << ')'; + } + if (AM.HasBaseReg && BaseRegs.empty()) { + if (!First) OS << " + "; else First = false; + OS << "**error: HasBaseReg**"; + } else if (!AM.HasBaseReg && !BaseRegs.empty()) { + if (!First) OS << " + "; else First = false; + OS << "**error: !HasBaseReg**"; + } + if (AM.Scale != 0) { + if (!First) OS << " + "; else First = false; + OS << AM.Scale << "*reg("; + if (ScaledReg) + OS << *ScaledReg; + else + OS << "<unknown>"; + OS << ')'; + } + if (UnfoldedOffset != 0) { + if (!First) OS << " + "; else First = false; + OS << "imm(" << UnfoldedOffset << ')'; + } +} + +void Formula::dump() const { + print(errs()); errs() << '\n'; +} + +/// isAddRecSExtable - Return true if the given addrec can be sign-extended +/// without changing its value. +static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { + Type *WideTy = + IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1); + return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy)); +} + +/// isAddSExtable - Return true if the given add can be sign-extended +/// without changing its value. +static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) { + Type *WideTy = + IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1); + return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy)); +} + +/// isMulSExtable - Return true if the given mul can be sign-extended +/// without changing its value. +static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) { + Type *WideTy = + IntegerType::get(SE.getContext(), + SE.getTypeSizeInBits(M->getType()) * M->getNumOperands()); + return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy)); +} + +/// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined +/// and if the remainder is known to be zero, or null otherwise. 
If +/// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified +/// to Y, ignoring that the multiplication may overflow, which is useful when +/// the result will be used in a context where the most significant bits are +/// ignored. +static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, + ScalarEvolution &SE, + bool IgnoreSignificantBits = false) { + // Handle the trivial case, which works for any SCEV type. + if (LHS == RHS) + return SE.getConstant(LHS->getType(), 1); + + // Handle a few RHS special cases. + const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS); + if (RC) { + const APInt &RA = RC->getValue()->getValue(); + // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do + // some folding. + if (RA.isAllOnesValue()) + return SE.getMulExpr(LHS, RC); + // Handle x /s 1 as x. + if (RA == 1) + return LHS; + } + + // Check for a division of a constant by a constant. + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) { + if (!RC) + return 0; + const APInt &LA = C->getValue()->getValue(); + const APInt &RA = RC->getValue()->getValue(); + if (LA.srem(RA) != 0) + return 0; + return SE.getConstant(LA.sdiv(RA)); + } + + // Distribute the sdiv over addrec operands, if the addrec doesn't overflow. + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) { + if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) { + const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE, + IgnoreSignificantBits); + if (!Step) return 0; + const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE, + IgnoreSignificantBits); + if (!Start) return 0; + // FlagNW is independent of the start value, step direction, and is + // preserved with smaller magnitude steps. + // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) + return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap); + } + return 0; + } + + // Distribute the sdiv over add operands, if the add doesn't overflow. + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) { + if (IgnoreSignificantBits || isAddSExtable(Add, SE)) { + SmallVector<const SCEV *, 8> Ops; + for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); + I != E; ++I) { + const SCEV *Op = getExactSDiv(*I, RHS, SE, + IgnoreSignificantBits); + if (!Op) return 0; + Ops.push_back(Op); + } + return SE.getAddExpr(Ops); + } + return 0; + } + + // Check for a multiply operand that we can pull RHS out of. + if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) { + if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) { + SmallVector<const SCEV *, 4> Ops; + bool Found = false; + for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end(); + I != E; ++I) { + const SCEV *S = *I; + if (!Found) + if (const SCEV *Q = getExactSDiv(S, RHS, SE, + IgnoreSignificantBits)) { + S = Q; + Found = true; + } + Ops.push_back(S); + } + return Found ? SE.getMulExpr(Ops) : 0; + } + return 0; + } + + // Otherwise we don't know. + return 0; +} + +/// ExtractImmediate - If S involves the addition of a constant integer value, +/// return that integer value, and mutate S to point to a new SCEV with that +/// value excluded. 
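+/// For example (illustrative), given S = {(4 + %x),+,1}<%L> this returns 4 and
+/// leaves S pointing at {%x,+,1}<%L>.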
+static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) { + if (C->getValue()->getValue().getMinSignedBits() <= 64) { + S = SE.getConstant(C->getType(), 0); + return C->getValue()->getSExtValue(); + } + } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end()); + int64_t Result = ExtractImmediate(NewOps.front(), SE); + if (Result != 0) + S = SE.getAddExpr(NewOps); + return Result; + } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end()); + int64_t Result = ExtractImmediate(NewOps.front(), SE); + if (Result != 0) + S = SE.getAddRecExpr(NewOps, AR->getLoop(), + // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) + SCEV::FlagAnyWrap); + return Result; + } + return 0; +} + +/// ExtractSymbol - If S involves the addition of a GlobalValue address, +/// return that symbol, and mutate S to point to a new SCEV with that +/// value excluded. +static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { + if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { + if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) { + S = SE.getConstant(GV->getType(), 0); + return GV; + } + } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end()); + GlobalValue *Result = ExtractSymbol(NewOps.back(), SE); + if (Result) + S = SE.getAddExpr(NewOps); + return Result; + } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end()); + GlobalValue *Result = ExtractSymbol(NewOps.front(), SE); + if (Result) + S = SE.getAddRecExpr(NewOps, AR->getLoop(), + // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) + SCEV::FlagAnyWrap); + return Result; + } + return 0; +} + +/// isAddressUse - Returns true if the specified instruction is using the +/// specified value as an address. +static bool isAddressUse(Instruction *Inst, Value *OperandVal) { + bool isAddress = isa<LoadInst>(Inst); + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (SI->getOperand(1) == OperandVal) + isAddress = true; + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { + // Addressing modes can also be folded into prefetches and a variety + // of intrinsics. + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::prefetch: + case Intrinsic::x86_sse_storeu_ps: + case Intrinsic::x86_sse2_storeu_pd: + case Intrinsic::x86_sse2_storeu_dq: + case Intrinsic::x86_sse2_storel_dq: + if (II->getArgOperand(0) == OperandVal) + isAddress = true; + break; + } + } + return isAddress; +} + +/// getAccessType - Return the type of the memory being accessed. +static Type *getAccessType(const Instruction *Inst) { + Type *AccessTy = Inst->getType(); + if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) + AccessTy = SI->getOperand(0)->getType(); + else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { + // Addressing modes can also be folded into prefetches and a variety + // of intrinsics. 
+ switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::x86_sse_storeu_ps: + case Intrinsic::x86_sse2_storeu_pd: + case Intrinsic::x86_sse2_storeu_dq: + case Intrinsic::x86_sse2_storel_dq: + AccessTy = II->getArgOperand(0)->getType(); + break; + } + } + + // All pointers have the same requirements, so canonicalize them to an + // arbitrary pointer type to minimize variation. + if (PointerType *PTy = dyn_cast<PointerType>(AccessTy)) + AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), + PTy->getAddressSpace()); + + return AccessTy; +} + +/// isExistingPhi - Return true if this AddRec is already a phi in its loop. +static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { + for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); + PHINode *PN = dyn_cast<PHINode>(I); ++I) { + if (SE.isSCEVable(PN->getType()) && + (SE.getEffectiveSCEVType(PN->getType()) == + SE.getEffectiveSCEVType(AR->getType())) && + SE.getSCEV(PN) == AR) + return true; + } + return false; +} + +/// Check if expanding this expression is likely to incur significant cost. This +/// is tricky because SCEV doesn't track which expressions are actually computed +/// by the current IR. +/// +/// We currently allow expansion of IV increments that involve adds, +/// multiplication by constants, and AddRecs from existing phis. +/// +/// TODO: Allow UDivExpr if we can find an existing IV increment that is an +/// obvious multiple of the UDivExpr. +static bool isHighCostExpansion(const SCEV *S, + SmallPtrSet<const SCEV*, 8> &Processed, + ScalarEvolution &SE) { + // Zero/One operand expressions + switch (S->getSCEVType()) { + case scUnknown: + case scConstant: + return false; + case scTruncate: + return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(), + Processed, SE); + case scZeroExtend: + return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(), + Processed, SE); + case scSignExtend: + return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(), + Processed, SE); + } + + if (!Processed.insert(S)) + return false; + + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); + I != E; ++I) { + if (isHighCostExpansion(*I, Processed, SE)) + return true; + } + return false; + } + + if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { + if (Mul->getNumOperands() == 2) { + // Multiplication by a constant is ok + if (isa<SCEVConstant>(Mul->getOperand(0))) + return isHighCostExpansion(Mul->getOperand(1), Processed, SE); + + // If we have the value of one operand, check if an existing + // multiplication already generates this expression. + if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) { + Value *UVal = U->getValue(); + for (Value::use_iterator UI = UVal->use_begin(), UE = UVal->use_end(); + UI != UE; ++UI) { + // If U is a constant, it may be used by a ConstantExpr. + Instruction *User = dyn_cast<Instruction>(*UI); + if (User && User->getOpcode() == Instruction::Mul + && SE.isSCEVable(User->getType())) { + return SE.getSCEV(User) == Mul; + } + } + } + } + } + + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + if (isExistingPhi(AR, SE)) + return false; + } + + // Fow now, consider any other type of expression (div/mul/min/max) high cost. 
+ return true; +} + +/// DeleteTriviallyDeadInstructions - If any of the instructions is the +/// specified set are trivially dead, delete them and see if this makes any of +/// their operands subsequently dead. +static bool +DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { + bool Changed = false; + + while (!DeadInsts.empty()) { + Instruction *I = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()); + + if (I == 0 || !isInstructionTriviallyDead(I)) + continue; + + for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) + if (Instruction *U = dyn_cast<Instruction>(*OI)) { + *OI = 0; + if (U->use_empty()) + DeadInsts.push_back(U); + } + + I->eraseFromParent(); + Changed = true; + } + + return Changed; +} + +namespace { + +/// Cost - This class is used to measure and compare candidate formulae. +class Cost { + /// TODO: Some of these could be merged. Also, a lexical ordering + /// isn't always optimal. + unsigned NumRegs; + unsigned AddRecCost; + unsigned NumIVMuls; + unsigned NumBaseAdds; + unsigned ImmCost; + unsigned SetupCost; + +public: + Cost() + : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0), + SetupCost(0) {} + + bool operator<(const Cost &Other) const; + + void Loose(); + +#ifndef NDEBUG + // Once any of the metrics loses, they must all remain losers. + bool isValid() { + return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds + | ImmCost | SetupCost) != ~0u) + || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds + & ImmCost & SetupCost) == ~0u); + } +#endif + + bool isLoser() { + assert(isValid() && "invalid cost"); + return NumRegs == ~0u; + } + + void RateFormula(const Formula &F, + SmallPtrSet<const SCEV *, 16> &Regs, + const DenseSet<const SCEV *> &VisitedRegs, + const Loop *L, + const SmallVectorImpl<int64_t> &Offsets, + ScalarEvolution &SE, DominatorTree &DT, + SmallPtrSet<const SCEV *, 16> *LoserRegs = 0); + + void print(raw_ostream &OS) const; + void dump() const; + +private: + void RateRegister(const SCEV *Reg, + SmallPtrSet<const SCEV *, 16> &Regs, + const Loop *L, + ScalarEvolution &SE, DominatorTree &DT); + void RatePrimaryRegister(const SCEV *Reg, + SmallPtrSet<const SCEV *, 16> &Regs, + const Loop *L, + ScalarEvolution &SE, DominatorTree &DT, + SmallPtrSet<const SCEV *, 16> *LoserRegs); +}; + +} + +/// RateRegister - Tally up interesting quantities from the given register. +void Cost::RateRegister(const SCEV *Reg, + SmallPtrSet<const SCEV *, 16> &Regs, + const Loop *L, + ScalarEvolution &SE, DominatorTree &DT) { + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) { + // If this is an addrec for another loop, don't second-guess its addrec phi + // nodes. LSR isn't currently smart enough to reason about more than one + // loop at a time. LSR has already run on inner loops, will not run on outer + // loops, and cannot be expected to change sibling loops. + if (AR->getLoop() != L) { + // If the AddRec exists, consider it's register free and leave it alone. + if (isExistingPhi(AR, SE)) + return; + + // Otherwise, do not consider this formula at all. + Loose(); + return; + } + AddRecCost += 1; /// TODO: This should be a function of the stride. + + // Add the step value register, if it needs one. + // TODO: The non-affine case isn't precisely modeled here. 
+ if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) { + if (!Regs.count(AR->getOperand(1))) { + RateRegister(AR->getOperand(1), Regs, L, SE, DT); + if (isLoser()) + return; + } + } + } + ++NumRegs; + + // Rough heuristic; favor registers which don't require extra setup + // instructions in the preheader. + if (!isa<SCEVUnknown>(Reg) && + !isa<SCEVConstant>(Reg) && + !(isa<SCEVAddRecExpr>(Reg) && + (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) || + isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart())))) + ++SetupCost; + + NumIVMuls += isa<SCEVMulExpr>(Reg) && + SE.hasComputableLoopEvolution(Reg, L); +} + +/// RatePrimaryRegister - Record this register in the set. If we haven't seen it +/// before, rate it. Optional LoserRegs provides a way to declare any formula +/// that refers to one of those regs an instant loser. +void Cost::RatePrimaryRegister(const SCEV *Reg, + SmallPtrSet<const SCEV *, 16> &Regs, + const Loop *L, + ScalarEvolution &SE, DominatorTree &DT, + SmallPtrSet<const SCEV *, 16> *LoserRegs) { + if (LoserRegs && LoserRegs->count(Reg)) { + Loose(); + return; + } + if (Regs.insert(Reg)) { + RateRegister(Reg, Regs, L, SE, DT); + if (isLoser()) + LoserRegs->insert(Reg); + } +} + +void Cost::RateFormula(const Formula &F, + SmallPtrSet<const SCEV *, 16> &Regs, + const DenseSet<const SCEV *> &VisitedRegs, + const Loop *L, + const SmallVectorImpl<int64_t> &Offsets, + ScalarEvolution &SE, DominatorTree &DT, + SmallPtrSet<const SCEV *, 16> *LoserRegs) { + // Tally up the registers. + if (const SCEV *ScaledReg = F.ScaledReg) { + if (VisitedRegs.count(ScaledReg)) { + Loose(); + return; + } + RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs); + if (isLoser()) + return; + } + for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), + E = F.BaseRegs.end(); I != E; ++I) { + const SCEV *BaseReg = *I; + if (VisitedRegs.count(BaseReg)) { + Loose(); + return; + } + RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs); + if (isLoser()) + return; + } + + // Determine how many (unfolded) adds we'll need inside the loop. + size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); + if (NumBaseParts > 1) + NumBaseAdds += NumBaseParts - 1; + + // Tally up the non-zero immediates. + for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), + E = Offsets.end(); I != E; ++I) { + int64_t Offset = (uint64_t)*I + F.AM.BaseOffs; + if (F.AM.BaseGV) + ImmCost += 64; // Handle symbolic values conservatively. + // TODO: This should probably be the pointer size. + else if (Offset != 0) + ImmCost += APInt(64, Offset, true).getMinSignedBits(); + } + assert(isValid() && "invalid cost"); +} + +/// Loose - Set this cost to a losing value. +void Cost::Loose() { + NumRegs = ~0u; + AddRecCost = ~0u; + NumIVMuls = ~0u; + NumBaseAdds = ~0u; + ImmCost = ~0u; + SetupCost = ~0u; +} + +/// operator< - Choose the lower cost. +bool Cost::operator<(const Cost &Other) const { + if (NumRegs != Other.NumRegs) + return NumRegs < Other.NumRegs; + if (AddRecCost != Other.AddRecCost) + return AddRecCost < Other.AddRecCost; + if (NumIVMuls != Other.NumIVMuls) + return NumIVMuls < Other.NumIVMuls; + if (NumBaseAdds != Other.NumBaseAdds) + return NumBaseAdds < Other.NumBaseAdds; + if (ImmCost != Other.ImmCost) + return ImmCost < Other.ImmCost; + if (SetupCost != Other.SetupCost) + return SetupCost < Other.SetupCost; + return false; +} + +void Cost::print(raw_ostream &OS) const { + OS << NumRegs << " reg" << (NumRegs == 1 ? 
"" : "s"); + if (AddRecCost != 0) + OS << ", with addrec cost " << AddRecCost; + if (NumIVMuls != 0) + OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s"); + if (NumBaseAdds != 0) + OS << ", plus " << NumBaseAdds << " base add" + << (NumBaseAdds == 1 ? "" : "s"); + if (ImmCost != 0) + OS << ", plus " << ImmCost << " imm cost"; + if (SetupCost != 0) + OS << ", plus " << SetupCost << " setup cost"; +} + +void Cost::dump() const { + print(errs()); errs() << '\n'; +} + +namespace { + +/// LSRFixup - An operand value in an instruction which is to be replaced +/// with some equivalent, possibly strength-reduced, replacement. +struct LSRFixup { + /// UserInst - The instruction which will be updated. + Instruction *UserInst; + + /// OperandValToReplace - The operand of the instruction which will + /// be replaced. The operand may be used more than once; every instance + /// will be replaced. + Value *OperandValToReplace; + + /// PostIncLoops - If this user is to use the post-incremented value of an + /// induction variable, this variable is non-null and holds the loop + /// associated with the induction variable. + PostIncLoopSet PostIncLoops; + + /// LUIdx - The index of the LSRUse describing the expression which + /// this fixup needs, minus an offset (below). + size_t LUIdx; + + /// Offset - A constant offset to be added to the LSRUse expression. + /// This allows multiple fixups to share the same LSRUse with different + /// offsets, for example in an unrolled loop. + int64_t Offset; + + bool isUseFullyOutsideLoop(const Loop *L) const; + + LSRFixup(); + + void print(raw_ostream &OS) const; + void dump() const; +}; + +} + +LSRFixup::LSRFixup() + : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {} + +/// isUseFullyOutsideLoop - Test whether this fixup always uses its +/// value outside of the given loop. +bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { + // PHI nodes use their value in their incoming blocks. + if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (PN->getIncomingValue(i) == OperandValToReplace && + L->contains(PN->getIncomingBlock(i))) + return false; + return true; + } + + return !L->contains(UserInst); +} + +void LSRFixup::print(raw_ostream &OS) const { + OS << "UserInst="; + // Store is common and interesting enough to be worth special-casing. + if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) { + OS << "store "; + WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false); + } else if (UserInst->getType()->isVoidTy()) + OS << UserInst->getOpcodeName(); + else + WriteAsOperand(OS, UserInst, /*PrintType=*/false); + + OS << ", OperandValToReplace="; + WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false); + + for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(), + E = PostIncLoops.end(); I != E; ++I) { + OS << ", PostIncLoop="; + WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false); + } + + if (LUIdx != ~size_t(0)) + OS << ", LUIdx=" << LUIdx; + + if (Offset != 0) + OS << ", Offset=" << Offset; +} + +void LSRFixup::dump() const { + print(errs()); errs() << '\n'; +} + +namespace { + +/// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding +/// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*. 
+struct UniquifierDenseMapInfo { + static SmallVector<const SCEV *, 2> getEmptyKey() { + SmallVector<const SCEV *, 2> V; + V.push_back(reinterpret_cast<const SCEV *>(-1)); + return V; + } + + static SmallVector<const SCEV *, 2> getTombstoneKey() { + SmallVector<const SCEV *, 2> V; + V.push_back(reinterpret_cast<const SCEV *>(-2)); + return V; + } + + static unsigned getHashValue(const SmallVector<const SCEV *, 2> &V) { + unsigned Result = 0; + for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(), + E = V.end(); I != E; ++I) + Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I); + return Result; + } + + static bool isEqual(const SmallVector<const SCEV *, 2> &LHS, + const SmallVector<const SCEV *, 2> &RHS) { + return LHS == RHS; + } +}; + +/// LSRUse - This class holds the state that LSR keeps for each use in +/// IVUsers, as well as uses invented by LSR itself. It includes information +/// about what kinds of things can be folded into the user, information about +/// the user itself, and information about how the use may be satisfied. +/// TODO: Represent multiple users of the same expression in common? +class LSRUse { + DenseSet<SmallVector<const SCEV *, 2>, UniquifierDenseMapInfo> Uniquifier; + +public: + /// KindType - An enum for a kind of use, indicating what types of + /// scaled and immediate operands it might support. + enum KindType { + Basic, ///< A normal use, with no folding. + Special, ///< A special case of basic, allowing -1 scales. + Address, ///< An address use; folding according to TargetLowering + ICmpZero ///< An equality icmp with both operands folded into one. + // TODO: Add a generic icmp too? + }; + + KindType Kind; + Type *AccessTy; + + SmallVector<int64_t, 8> Offsets; + int64_t MinOffset; + int64_t MaxOffset; + + /// AllFixupsOutsideLoop - This records whether all of the fixups using this + /// LSRUse are outside of the loop, in which case some special-case heuristics + /// may be used. + bool AllFixupsOutsideLoop; + + /// WidestFixupType - This records the widest use type for any fixup using + /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different + /// max fixup widths to be equivalent, because the narrower one may be relying + /// on the implicit truncation to truncate away bogus bits. + Type *WidestFixupType; + + /// Formulae - A list of ways to build a value that can satisfy this user. + /// After the list is populated, one of these is selected heuristically and + /// used to formulate a replacement for OperandValToReplace in UserInst. + SmallVector<Formula, 12> Formulae; + + /// Regs - The set of register candidates used by all formulae in this LSRUse. + SmallPtrSet<const SCEV *, 4> Regs; + + LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T), + MinOffset(INT64_MAX), + MaxOffset(INT64_MIN), + AllFixupsOutsideLoop(true), + WidestFixupType(0) {} + + bool HasFormulaWithSameRegs(const Formula &F) const; + bool InsertFormula(const Formula &F); + void DeleteFormula(Formula &F); + void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses); + + void print(raw_ostream &OS) const; + void dump() const; +}; + +} + +/// HasFormula - Test whether this use as a formula which has the same +/// registers as the given formula. +bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { + SmallVector<const SCEV *, 2> Key = F.BaseRegs; + if (F.ScaledReg) Key.push_back(F.ScaledReg); + // Unstable sort by host order ok, because this is only used for uniquifying. 
+ std::sort(Key.begin(), Key.end()); + return Uniquifier.count(Key); +} + +/// InsertFormula - If the given formula has not yet been inserted, add it to +/// the list, and return true. Return false otherwise. +bool LSRUse::InsertFormula(const Formula &F) { + SmallVector<const SCEV *, 2> Key = F.BaseRegs; + if (F.ScaledReg) Key.push_back(F.ScaledReg); + // Unstable sort by host order ok, because this is only used for uniquifying. + std::sort(Key.begin(), Key.end()); + + if (!Uniquifier.insert(Key).second) + return false; + + // Using a register to hold the value of 0 is not profitable. + assert((!F.ScaledReg || !F.ScaledReg->isZero()) && + "Zero allocated in a scaled register!"); +#ifndef NDEBUG + for (SmallVectorImpl<const SCEV *>::const_iterator I = + F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) + assert(!(*I)->isZero() && "Zero allocated in a base register!"); +#endif + + // Add the formula to the list. + Formulae.push_back(F); + + // Record registers now being used by this use. + Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); + + return true; +} + +/// DeleteFormula - Remove the given formula from this use's list. +void LSRUse::DeleteFormula(Formula &F) { + if (&F != &Formulae.back()) + std::swap(F, Formulae.back()); + Formulae.pop_back(); +} + +/// RecomputeRegs - Recompute the Regs field, and update RegUses. +void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { + // Now that we've filtered out some formulae, recompute the Regs set. + SmallPtrSet<const SCEV *, 4> OldRegs = Regs; + Regs.clear(); + for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(), + E = Formulae.end(); I != E; ++I) { + const Formula &F = *I; + if (F.ScaledReg) Regs.insert(F.ScaledReg); + Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); + } + + // Update the RegTracker. + for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(), + E = OldRegs.end(); I != E; ++I) + if (!Regs.count(*I)) + RegUses.DropRegister(*I, LUIdx); +} + +void LSRUse::print(raw_ostream &OS) const { + OS << "LSR Use: Kind="; + switch (Kind) { + case Basic: OS << "Basic"; break; + case Special: OS << "Special"; break; + case ICmpZero: OS << "ICmpZero"; break; + case Address: + OS << "Address of "; + if (AccessTy->isPointerTy()) + OS << "pointer"; // the full pointer type could be really verbose + else + OS << *AccessTy; + } + + OS << ", Offsets={"; + for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), + E = Offsets.end(); I != E; ++I) { + OS << *I; + if (llvm::next(I) != E) + OS << ','; + } + OS << '}'; + + if (AllFixupsOutsideLoop) + OS << ", all-fixups-outside-loop"; + + if (WidestFixupType) + OS << ", widest fixup type: " << *WidestFixupType; +} + +void LSRUse::dump() const { + print(errs()); errs() << '\n'; +} + +/// isLegalUse - Test whether the use described by AM is "legal", meaning it can +/// be completely folded into the user instruction at isel time. This includes +/// address-mode folding and special icmp tricks. +static bool isLegalUse(const TargetLowering::AddrMode &AM, + LSRUse::KindType Kind, Type *AccessTy, + const TargetLowering *TLI) { + switch (Kind) { + case LSRUse::Address: + // If we have low-level target information, ask the target if it can + // completely fold this address. + if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy); + + // Otherwise, just guess that reg+reg addressing is legal. 
+ return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1; + + case LSRUse::ICmpZero: + // There's not even a target hook for querying whether it would be legal to + // fold a GV into an ICmp. + if (AM.BaseGV) + return false; + + // ICmp only has two operands; don't allow more than two non-trivial parts. + if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0) + return false; + + // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by + // putting the scaled register in the other operand of the icmp. + if (AM.Scale != 0 && AM.Scale != -1) + return false; + + // If we have low-level target information, ask the target if it can fold an + // integer immediate on an icmp. + if (AM.BaseOffs != 0) { + if (!TLI) + return false; + // We have one of: + // ICmpZero BaseReg + Offset => ICmp BaseReg, -Offset + // ICmpZero -1*ScaleReg + Offset => ICmp ScaleReg, Offset + // Offs is the ICmp immediate. + int64_t Offs = AM.BaseOffs; + if (AM.Scale == 0) + Offs = -(uint64_t)Offs; // The cast does the right thing with INT64_MIN. + return TLI->isLegalICmpImmediate(Offs); + } + + // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg + return true; + + case LSRUse::Basic: + // Only handle single-register values. + return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0; + + case LSRUse::Special: + // Special case Basic to handle -1 scales. + return !AM.BaseGV && (AM.Scale == 0 || AM.Scale == -1) && AM.BaseOffs == 0; + } + + llvm_unreachable("Invalid LSRUse Kind!"); +} + +static bool isLegalUse(TargetLowering::AddrMode AM, + int64_t MinOffset, int64_t MaxOffset, + LSRUse::KindType Kind, Type *AccessTy, + const TargetLowering *TLI) { + // Check for overflow. + if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) != + (MinOffset > 0)) + return false; + AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset; + if (isLegalUse(AM, Kind, AccessTy, TLI)) { + AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset; + // Check for overflow. + if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) != + (MaxOffset > 0)) + return false; + AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset; + return isLegalUse(AM, Kind, AccessTy, TLI); + } + return false; +} + +static bool isAlwaysFoldable(int64_t BaseOffs, + GlobalValue *BaseGV, + bool HasBaseReg, + LSRUse::KindType Kind, Type *AccessTy, + const TargetLowering *TLI) { + // Fast-path: zero is always foldable. + if (BaseOffs == 0 && !BaseGV) return true; + + // Conservatively, create an address with an immediate and a + // base and a scale. + TargetLowering::AddrMode AM; + AM.BaseOffs = BaseOffs; + AM.BaseGV = BaseGV; + AM.HasBaseReg = HasBaseReg; + AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1; + + // Canonicalize a scale of 1 to a base register if the formula doesn't + // already have a base register. + if (!AM.HasBaseReg && AM.Scale == 1) { + AM.Scale = 0; + AM.HasBaseReg = true; + } + + return isLegalUse(AM, Kind, AccessTy, TLI); +} + +static bool isAlwaysFoldable(const SCEV *S, + int64_t MinOffset, int64_t MaxOffset, + bool HasBaseReg, + LSRUse::KindType Kind, Type *AccessTy, + const TargetLowering *TLI, + ScalarEvolution &SE) { + // Fast-path: zero is always foldable. + if (S->isZero()) return true; + + // Conservatively, create an address with an immediate and a + // base and a scale. + int64_t BaseOffs = ExtractImmediate(S, SE); + GlobalValue *BaseGV = ExtractSymbol(S, SE); + + // If there's anything else involved, it's not foldable. + if (!S->isZero()) return false; + + // Fast-path: zero is always foldable. 
+ if (BaseOffs == 0 && !BaseGV) return true; + + // Conservatively, create an address with an immediate and a + // base and a scale. + TargetLowering::AddrMode AM; + AM.BaseOffs = BaseOffs; + AM.BaseGV = BaseGV; + AM.HasBaseReg = HasBaseReg; + AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1; + + return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI); +} + +namespace { + +/// UseMapDenseMapInfo - A DenseMapInfo implementation for holding +/// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind. +struct UseMapDenseMapInfo { + static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() { + return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic); + } + + static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() { + return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic); + } + + static unsigned + getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) { + unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first); + Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second)); + return Result; + } + + static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS, + const std::pair<const SCEV *, LSRUse::KindType> &RHS) { + return LHS == RHS; + } +}; + +/// IVInc - An individual increment in a Chain of IV increments. +/// Relate an IV user to an expression that computes the IV it uses from the IV +/// used by the previous link in the Chain. +/// +/// For the head of a chain, IncExpr holds the absolute SCEV expression for the +/// original IVOperand. The head of the chain's IVOperand is only valid during +/// chain collection, before LSR replaces IV users. During chain generation, +/// IncExpr can be used to find the new IVOperand that computes the same +/// expression. +struct IVInc { + Instruction *UserInst; + Value* IVOperand; + const SCEV *IncExpr; + + IVInc(Instruction *U, Value *O, const SCEV *E): + UserInst(U), IVOperand(O), IncExpr(E) {} +}; + +// IVChain - The list of IV increments in program order. +// We typically add the head of a chain without finding subsequent links. +struct IVChain { + SmallVector<IVInc,1> Incs; + const SCEV *ExprBase; + + IVChain() : ExprBase(0) {} + + IVChain(const IVInc &Head, const SCEV *Base) + : Incs(1, Head), ExprBase(Base) {} + + typedef SmallVectorImpl<IVInc>::const_iterator const_iterator; + + // begin - return the first increment in the chain. + const_iterator begin() const { + assert(!Incs.empty()); + return llvm::next(Incs.begin()); + } + const_iterator end() const { + return Incs.end(); + } + + // hasIncs - Returns true if this chain contains any increments. + bool hasIncs() const { return Incs.size() >= 2; } + + // add - Add an IVInc to the end of this chain. + void add(const IVInc &X) { Incs.push_back(X); } + + // tailUserInst - Returns the last UserInst in the chain. + Instruction *tailUserInst() const { return Incs.back().UserInst; } + + // isProfitableIncrement - Returns true if IncExpr can be profitably added to + // this chain. + bool isProfitableIncrement(const SCEV *OperExpr, + const SCEV *IncExpr, + ScalarEvolution&); +}; + +/// ChainUsers - Helper for CollectChains to track multiple IV increment uses. +/// Distinguish between FarUsers that definitely cross IV increments and +/// NearUsers that may be used between IV increments. +struct ChainUsers { + SmallPtrSet<Instruction*, 4> FarUsers; + SmallPtrSet<Instruction*, 4> NearUsers; +}; + +/// LSRInstance - This class holds state for the main loop strength reduction +/// logic. 
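+//
+// Editor's illustration (not part of the original source): the classic shape
+// of the transformation this pass performs is replacing per-iteration address
+// arithmetic with a strength-reduced pointer induction variable, e.g.
+//
+//   for (i = 0; i < n; ++i)            for (p = a; p != a + n; ++p)
+//     sum += a[i];            ==>        sum += *p;
+//
+// so the multiply/add needed to form &a[i] on every iteration becomes a single
+// pointer increment.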
+class LSRInstance { + IVUsers &IU; + ScalarEvolution &SE; + DominatorTree &DT; + LoopInfo &LI; + const TargetLowering *const TLI; + Loop *const L; + bool Changed; + + /// IVIncInsertPos - This is the insert position that the current loop's + /// induction variable increment should be placed. In simple loops, this is + /// the latch block's terminator. But in more complicated cases, this is a + /// position which will dominate all the in-loop post-increment users. + Instruction *IVIncInsertPos; + + /// Factors - Interesting factors between use strides. + SmallSetVector<int64_t, 8> Factors; + + /// Types - Interesting use types, to facilitate truncation reuse. + SmallSetVector<Type *, 4> Types; + + /// Fixups - The list of operands which are to be replaced. + SmallVector<LSRFixup, 16> Fixups; + + /// Uses - The list of interesting uses. + SmallVector<LSRUse, 16> Uses; + + /// RegUses - Track which uses use which register candidates. + RegUseTracker RegUses; + + // Limit the number of chains to avoid quadratic behavior. We don't expect to + // have more than a few IV increment chains in a loop. Missing a Chain falls + // back to normal LSR behavior for those uses. + static const unsigned MaxChains = 8; + + /// IVChainVec - IV users can form a chain of IV increments. + SmallVector<IVChain, MaxChains> IVChainVec; + + /// IVIncSet - IV users that belong to profitable IVChains. + SmallPtrSet<Use*, MaxChains> IVIncSet; + + void OptimizeShadowIV(); + bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse); + ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse); + void OptimizeLoopTermCond(); + + void ChainInstruction(Instruction *UserInst, Instruction *IVOper, + SmallVectorImpl<ChainUsers> &ChainUsersVec); + void FinalizeChain(IVChain &Chain); + void CollectChains(); + void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, + SmallVectorImpl<WeakVH> &DeadInsts); + + void CollectInterestingTypesAndFactors(); + void CollectFixupsAndInitialFormulae(); + + LSRFixup &getNewFixup() { + Fixups.push_back(LSRFixup()); + return Fixups.back(); + } + + // Support for sharing of LSRUses between LSRFixups. 
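+  //
+  // Editor's note (illustrative, not from the original source): fixups whose
+  // expressions differ only by a constant share one LSRUse. For example, in an
+  // unrolled loop body the accesses at {A,+,4} and {(8 + A),+,4} map to the
+  // same use keyed on the base expression, with per-fixup Offsets of 0 and 8;
+  // see getUse() and reconcileNewOffset() below.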
+ typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>, + size_t, + UseMapDenseMapInfo> UseMapTy; + UseMapTy UseMap; + + bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, + LSRUse::KindType Kind, Type *AccessTy); + + std::pair<size_t, int64_t> getUse(const SCEV *&Expr, + LSRUse::KindType Kind, + Type *AccessTy); + + void DeleteUse(LSRUse &LU, size_t LUIdx); + + LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU); + + void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); + void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); + void CountRegisters(const Formula &F, size_t LUIdx); + bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F); + + void CollectLoopInvariantFixupsAndFormulae(); + + void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base, + unsigned Depth = 0); + void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base); + void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); + void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); + void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base); + void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base); + void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base); + void GenerateCrossUseConstantOffsets(); + void GenerateAllReuseFormulae(); + + void FilterOutUndesirableDedicatedRegisters(); + + size_t EstimateSearchSpaceComplexity() const; + void NarrowSearchSpaceByDetectingSupersets(); + void NarrowSearchSpaceByCollapsingUnrolledCode(); + void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); + void NarrowSearchSpaceByPickingWinnerRegs(); + void NarrowSearchSpaceUsingHeuristics(); + + void SolveRecurse(SmallVectorImpl<const Formula *> &Solution, + Cost &SolutionCost, + SmallVectorImpl<const Formula *> &Workspace, + const Cost &CurCost, + const SmallPtrSet<const SCEV *, 16> &CurRegs, + DenseSet<const SCEV *> &VisitedRegs) const; + void Solve(SmallVectorImpl<const Formula *> &Solution) const; + + BasicBlock::iterator + HoistInsertPosition(BasicBlock::iterator IP, + const SmallVectorImpl<Instruction *> &Inputs) const; + BasicBlock::iterator + AdjustInsertPositionForExpand(BasicBlock::iterator IP, + const LSRFixup &LF, + const LSRUse &LU, + SCEVExpander &Rewriter) const; + + Value *Expand(const LSRFixup &LF, + const Formula &F, + BasicBlock::iterator IP, + SCEVExpander &Rewriter, + SmallVectorImpl<WeakVH> &DeadInsts) const; + void RewriteForPHI(PHINode *PN, const LSRFixup &LF, + const Formula &F, + SCEVExpander &Rewriter, + SmallVectorImpl<WeakVH> &DeadInsts, + Pass *P) const; + void Rewrite(const LSRFixup &LF, + const Formula &F, + SCEVExpander &Rewriter, + SmallVectorImpl<WeakVH> &DeadInsts, + Pass *P) const; + void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, + Pass *P); + +public: + LSRInstance(const TargetLowering *tli, Loop *l, Pass *P); + + bool getChanged() const { return Changed; } + + void print_factors_and_types(raw_ostream &OS) const; + void print_fixups(raw_ostream &OS) const; + void print_uses(raw_ostream &OS) const; + void print(raw_ostream &OS) const; + void dump() const; +}; + +} + +/// OptimizeShadowIV - If IV is used in a int-to-float cast +/// inside the loop then try to eliminate the cast operation. 
+void LSRInstance::OptimizeShadowIV() { + const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) + return; + + for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); + UI != E; /* empty */) { + IVUsers::const_iterator CandidateUI = UI; + ++UI; + Instruction *ShadowUse = CandidateUI->getUser(); + Type *DestTy = NULL; + bool IsSigned = false; + + /* If shadow use is a int->float cast then insert a second IV + to eliminate this cast. + + for (unsigned i = 0; i < n; ++i) + foo((double)i); + + is transformed into + + double d = 0.0; + for (unsigned i = 0; i < n; ++i, ++d) + foo(d); + */ + if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) { + IsSigned = false; + DestTy = UCast->getDestTy(); + } + else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) { + IsSigned = true; + DestTy = SCast->getDestTy(); + } + if (!DestTy) continue; + + if (TLI) { + // If target does not support DestTy natively then do not apply + // this transformation. + EVT DVT = TLI->getValueType(DestTy); + if (!TLI->isTypeLegal(DVT)) continue; + } + + PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0)); + if (!PH) continue; + if (PH->getNumIncomingValues() != 2) continue; + + Type *SrcTy = PH->getType(); + int Mantissa = DestTy->getFPMantissaWidth(); + if (Mantissa == -1) continue; + if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa) + continue; + + unsigned Entry, Latch; + if (PH->getIncomingBlock(0) == L->getLoopPreheader()) { + Entry = 0; + Latch = 1; + } else { + Entry = 1; + Latch = 0; + } + + ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry)); + if (!Init) continue; + Constant *NewInit = ConstantFP::get(DestTy, IsSigned ? + (double)Init->getSExtValue() : + (double)Init->getZExtValue()); + + BinaryOperator *Incr = + dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch)); + if (!Incr) continue; + if (Incr->getOpcode() != Instruction::Add + && Incr->getOpcode() != Instruction::Sub) + continue; + + /* Initialize new IV, double d = 0.0 in above example. */ + ConstantInt *C = NULL; + if (Incr->getOperand(0) == PH) + C = dyn_cast<ConstantInt>(Incr->getOperand(1)); + else if (Incr->getOperand(1) == PH) + C = dyn_cast<ConstantInt>(Incr->getOperand(0)); + else + continue; + + if (!C) continue; + + // Ignore negative constants, as the code below doesn't handle them + // correctly. TODO: Remove this restriction. + if (!C->getValue().isStrictlyPositive()) continue; + + /* Add new PHINode. */ + PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH); + + /* create new increment. '++d' in above example. */ + Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue()); + BinaryOperator *NewIncr = + BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ? + Instruction::FAdd : Instruction::FSub, + NewPH, CFP, "IV.S.next.", Incr); + + NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry)); + NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch)); + + /* Remove cast operation */ + ShadowUse->replaceAllUsesWith(NewPH); + ShadowUse->eraseFromParent(); + Changed = true; + break; + } +} + +/// FindIVUserForCond - If Cond has an operand that is an expression of an IV, +/// set the IV user and stride information and return true, otherwise return +/// false. 
+bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { + for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) + if (UI->getUser() == Cond) { + // NOTE: we could handle setcc instructions with multiple uses here, but + // InstCombine does it as well for simple uses, it's not clear that it + // occurs enough in real life to handle. + CondUse = UI; + return true; + } + return false; +} + +/// OptimizeMax - Rewrite the loop's terminating condition if it uses +/// a max computation. +/// +/// This is a narrow solution to a specific, but acute, problem. For loops +/// like this: +/// +/// i = 0; +/// do { +/// p[i] = 0.0; +/// } while (++i < n); +/// +/// the trip count isn't just 'n', because 'n' might not be positive. And +/// unfortunately this can come up even for loops where the user didn't use +/// a C do-while loop. For example, seemingly well-behaved top-test loops +/// will commonly be lowered like this: +// +/// if (n > 0) { +/// i = 0; +/// do { +/// p[i] = 0.0; +/// } while (++i < n); +/// } +/// +/// and then it's possible for subsequent optimization to obscure the if +/// test in such a way that indvars can't find it. +/// +/// When indvars can't find the if test in loops like this, it creates a +/// max expression, which allows it to give the loop a canonical +/// induction variable: +/// +/// i = 0; +/// max = n < 1 ? 1 : n; +/// do { +/// p[i] = 0.0; +/// } while (++i != max); +/// +/// Canonical induction variables are necessary because the loop passes +/// are designed around them. The most obvious example of this is the +/// LoopInfo analysis, which doesn't remember trip count values. It +/// expects to be able to rediscover the trip count each time it is +/// needed, and it does this using a simple analysis that only succeeds if +/// the loop has a canonical induction variable. +/// +/// However, when it comes time to generate code, the maximum operation +/// can be quite costly, especially if it's inside of an outer loop. +/// +/// This function solves this problem by detecting this type of loop and +/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting +/// the instructions for the maximum computation. +/// +ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { + // Check that the loop matches the pattern we're looking for. + if (Cond->getPredicate() != CmpInst::ICMP_EQ && + Cond->getPredicate() != CmpInst::ICMP_NE) + return Cond; + + SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1)); + if (!Sel || !Sel->hasOneUse()) return Cond; + + const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) + return Cond; + const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1); + + // Add one to the backedge-taken count to get the trip count. + const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount); + if (IterationCount != SE.getSCEV(Sel)) return Cond; + + // Check for a max calculation that matches the pattern. There's no check + // for ICMP_ULE here because the comparison would be with zero, which + // isn't interesting. 
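+  //
+  // Editor's example (illustrative, not from the original source): for the
+  // guarded do-while above, ScalarEvolution typically reports
+  //   backedge-taken count = (-1 + (1 smax %n))
+  // so IterationCount is (1 smax %n); matching either form here lets us
+  // recover %n and rewrite the exit test as a direct comparison against it.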
+ CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + const SCEVNAryExpr *Max = 0; + if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) { + Pred = ICmpInst::ICMP_SLE; + Max = S; + } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) { + Pred = ICmpInst::ICMP_SLT; + Max = S; + } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) { + Pred = ICmpInst::ICMP_ULT; + Max = U; + } else { + // No match; bail. + return Cond; + } + + // To handle a max with more than two operands, this optimization would + // require additional checking and setup. + if (Max->getNumOperands() != 2) + return Cond; + + const SCEV *MaxLHS = Max->getOperand(0); + const SCEV *MaxRHS = Max->getOperand(1); + + // ScalarEvolution canonicalizes constants to the left. For < and >, look + // for a comparison with 1. For <= and >=, a comparison with zero. + if (!MaxLHS || + (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One))) + return Cond; + + // Check the relevant induction variable for conformance to + // the pattern. + const SCEV *IV = SE.getSCEV(Cond->getOperand(0)); + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV); + if (!AR || !AR->isAffine() || + AR->getStart() != One || + AR->getStepRecurrence(SE) != One) + return Cond; + + assert(AR->getLoop() == L && + "Loop condition operand is an addrec in a different loop!"); + + // Check the right operand of the select, and remember it, as it will + // be used in the new comparison instruction. + Value *NewRHS = 0; + if (ICmpInst::isTrueWhenEqual(Pred)) { + // Look for n+1, and grab n. + if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1))) + if (isa<ConstantInt>(BO->getOperand(1)) && + cast<ConstantInt>(BO->getOperand(1))->isOne() && + SE.getSCEV(BO->getOperand(0)) == MaxRHS) + NewRHS = BO->getOperand(0); + if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2))) + if (isa<ConstantInt>(BO->getOperand(1)) && + cast<ConstantInt>(BO->getOperand(1))->isOne() && + SE.getSCEV(BO->getOperand(0)) == MaxRHS) + NewRHS = BO->getOperand(0); + if (!NewRHS) + return Cond; + } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS) + NewRHS = Sel->getOperand(1); + else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS) + NewRHS = Sel->getOperand(2); + else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS)) + NewRHS = SU->getValue(); + else + // Max doesn't match expected pattern. + return Cond; + + // Determine the new comparison opcode. It may be signed or unsigned, + // and the original comparison may be either equality or inequality. + if (Cond->getPredicate() == CmpInst::ICMP_EQ) + Pred = CmpInst::getInversePredicate(Pred); + + // Ok, everything looks ok to change the condition into an SLT or SGE and + // delete the max calculation. + ICmpInst *NewCond = + new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp"); + + // Delete the max calculation instructions. + Cond->replaceAllUsesWith(NewCond); + CondUse->setUser(NewCond); + Instruction *Cmp = cast<Instruction>(Sel->getOperand(0)); + Cond->eraseFromParent(); + Sel->eraseFromParent(); + if (Cmp->use_empty()) + Cmp->eraseFromParent(); + return NewCond; +} + +/// OptimizeLoopTermCond - Change loop terminating condition to use the +/// postinc iv when possible. 
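+//
+// Editor's sketch (illustrative, not from the original source): after the
+// transformation the exit test compares the incremented value, e.g.
+//
+//   %iv.next = add i64 %iv, 1
+//   %exitcond = icmp eq i64 %iv.next, %n
+//   br i1 %exitcond, label %exit, label %loop
+//
+// so the pre-increment value need not stay live across the compare, and the
+// IV's live ranges can be coalesced into one register.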
+void +LSRInstance::OptimizeLoopTermCond() { + SmallPtrSet<Instruction *, 4> PostIncs; + + BasicBlock *LatchBlock = L->getLoopLatch(); + SmallVector<BasicBlock*, 8> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { + BasicBlock *ExitingBlock = ExitingBlocks[i]; + + // Get the terminating condition for the loop if possible. If we + // can, we want to change it to use a post-incremented version of its + // induction variable, to allow coalescing the live ranges for the IV into + // one register value. + + BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator()); + if (!TermBr) + continue; + // FIXME: Overly conservative, termination condition could be an 'or' etc.. + if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition())) + continue; + + // Search IVUsesByStride to find Cond's IVUse if there is one. + IVStrideUse *CondUse = 0; + ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition()); + if (!FindIVUserForCond(Cond, CondUse)) + continue; + + // If the trip count is computed in terms of a max (due to ScalarEvolution + // being unable to find a sufficient guard, for example), change the loop + // comparison to use SLT or ULT instead of NE. + // One consequence of doing this now is that it disrupts the count-down + // optimization. That's not always a bad thing though, because in such + // cases it may still be worthwhile to avoid a max. + Cond = OptimizeMax(Cond, CondUse); + + // If this exiting block dominates the latch block, it may also use + // the post-inc value if it won't be shared with other uses. + // Check for dominance. + if (!DT.dominates(ExitingBlock, LatchBlock)) + continue; + + // Conservatively avoid trying to use the post-inc value in non-latch + // exits if there may be pre-inc users in intervening blocks. + if (LatchBlock != ExitingBlock) + for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) + // Test if the use is reachable from the exiting block. This dominator + // query is a conservative approximation of reachability. + if (&*UI != CondUse && + !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) { + // Conservatively assume there may be reuse if the quotient of their + // strides could be a legal scale. + const SCEV *A = IU.getStride(*CondUse, L); + const SCEV *B = IU.getStride(*UI, L); + if (!A || !B) continue; + if (SE.getTypeSizeInBits(A->getType()) != + SE.getTypeSizeInBits(B->getType())) { + if (SE.getTypeSizeInBits(A->getType()) > + SE.getTypeSizeInBits(B->getType())) + B = SE.getSignExtendExpr(B, A->getType()); + else + A = SE.getSignExtendExpr(A, B->getType()); + } + if (const SCEVConstant *D = + dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) { + const ConstantInt *C = D->getValue(); + // Stride of one or negative one can have reuse with non-addresses. + if (C->isOne() || C->isAllOnesValue()) + goto decline_post_inc; + // Avoid weird situations. + if (C->getValue().getMinSignedBits() >= 64 || + C->getValue().isMinSignedValue()) + goto decline_post_inc; + // Without TLI, assume that any stride might be valid, and so any + // use might be shared. + if (!TLI) + goto decline_post_inc; + // Check for possible scaled-address reuse. 
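+            // Editor's example (illustrative, not from the original source):
+            // if the compare's IV strides by 4 and this other use strides by
+            // 16, the quotient 4 could be folded as the scale of a legal
+            // [base + 4*index] address, so keeping the shared pre-inc IV may
+            // be better than switching the compare to the post-inc value.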
+ Type *AccessTy = getAccessType(UI->getUser()); + TargetLowering::AddrMode AM; + AM.Scale = C->getSExtValue(); + if (TLI->isLegalAddressingMode(AM, AccessTy)) + goto decline_post_inc; + AM.Scale = -AM.Scale; + if (TLI->isLegalAddressingMode(AM, AccessTy)) + goto decline_post_inc; + } + } + + DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: " + << *Cond << '\n'); + + // It's possible for the setcc instruction to be anywhere in the loop, and + // possible for it to have multiple users. If it is not immediately before + // the exiting block branch, move it. + if (&*++BasicBlock::iterator(Cond) != TermBr) { + if (Cond->hasOneUse()) { + Cond->moveBefore(TermBr); + } else { + // Clone the terminating condition and insert into the loopend. + ICmpInst *OldCond = Cond; + Cond = cast<ICmpInst>(Cond->clone()); + Cond->setName(L->getHeader()->getName() + ".termcond"); + ExitingBlock->getInstList().insert(TermBr, Cond); + + // Clone the IVUse, as the old use still exists! + CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace()); + TermBr->replaceUsesOfWith(OldCond, Cond); + } + } + + // If we get to here, we know that we can transform the setcc instruction to + // use the post-incremented version of the IV, allowing us to coalesce the + // live ranges for the IV correctly. + CondUse->transformToPostInc(L); + Changed = true; + + PostIncs.insert(Cond); + decline_post_inc:; + } + + // Determine an insertion point for the loop induction variable increment. It + // must dominate all the post-inc comparisons we just set up, and it must + // dominate the loop latch edge. + IVIncInsertPos = L->getLoopLatch()->getTerminator(); + for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(), + E = PostIncs.end(); I != E; ++I) { + BasicBlock *BB = + DT.findNearestCommonDominator(IVIncInsertPos->getParent(), + (*I)->getParent()); + if (BB == (*I)->getParent()) + IVIncInsertPos = *I; + else if (BB != IVIncInsertPos->getParent()) + IVIncInsertPos = BB->getTerminator(); + } +} + +/// reconcileNewOffset - Determine if the given use can accommodate a fixup +/// at the given offset and other details. If so, update the use and +/// return true. +bool +LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, + LSRUse::KindType Kind, Type *AccessTy) { + int64_t NewMinOffset = LU.MinOffset; + int64_t NewMaxOffset = LU.MaxOffset; + Type *NewAccessTy = AccessTy; + + // Check for a mismatched kind. It's tempting to collapse mismatched kinds to + // something conservative, however this can pessimize in the case that one of + // the uses will have all its uses outside the loop, for example. + if (LU.Kind != Kind) + return false; + // Conservatively assume HasBaseReg is true for now. + if (NewOffset < LU.MinOffset) { + if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg, + Kind, AccessTy, TLI)) + return false; + NewMinOffset = NewOffset; + } else if (NewOffset > LU.MaxOffset) { + if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg, + Kind, AccessTy, TLI)) + return false; + NewMaxOffset = NewOffset; + } + // Check for a mismatched access type, and fall back conservatively as needed. + // TODO: Be less conservative when the type is similar and can use the same + // addressing modes. + if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) + NewAccessTy = Type::getVoidTy(AccessTy->getContext()); + + // Update the use. 
+ LU.MinOffset = NewMinOffset; + LU.MaxOffset = NewMaxOffset; + LU.AccessTy = NewAccessTy; + if (NewOffset != LU.Offsets.back()) + LU.Offsets.push_back(NewOffset); + return true; +} + +/// getUse - Return an LSRUse index and an offset value for a fixup which +/// needs the given expression, with the given kind and optional access type. +/// Either reuse an existing use or create a new one, as needed. +std::pair<size_t, int64_t> +LSRInstance::getUse(const SCEV *&Expr, + LSRUse::KindType Kind, Type *AccessTy) { + const SCEV *Copy = Expr; + int64_t Offset = ExtractImmediate(Expr, SE); + + // Basic uses can't accept any offset, for example. + if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) { + Expr = Copy; + Offset = 0; + } + + std::pair<UseMapTy::iterator, bool> P = + UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0)); + if (!P.second) { + // A use already existed with this base. + size_t LUIdx = P.first->second; + LSRUse &LU = Uses[LUIdx]; + if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy)) + // Reuse this use. + return std::make_pair(LUIdx, Offset); + } + + // Create a new use. + size_t LUIdx = Uses.size(); + P.first->second = LUIdx; + Uses.push_back(LSRUse(Kind, AccessTy)); + LSRUse &LU = Uses[LUIdx]; + + // We don't need to track redundant offsets, but we don't need to go out + // of our way here to avoid them. + if (LU.Offsets.empty() || Offset != LU.Offsets.back()) + LU.Offsets.push_back(Offset); + + LU.MinOffset = Offset; + LU.MaxOffset = Offset; + return std::make_pair(LUIdx, Offset); +} + +/// DeleteUse - Delete the given use from the Uses list. +void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) { + if (&LU != &Uses.back()) + std::swap(LU, Uses.back()); + Uses.pop_back(); + + // Update RegUses. + RegUses.SwapAndDropUse(LUIdx, Uses.size()); +} + +/// FindUseWithFormula - Look for a use distinct from OrigLU which is has +/// a formula that has the same registers as the given formula. +LSRUse * +LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, + const LSRUse &OrigLU) { + // Search all uses for the formula. This could be more clever. + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + // Check whether this use is close enough to OrigLU, to see whether it's + // worthwhile looking through its formulae. + // Ignore ICmpZero uses because they may contain formulae generated by + // GenerateICmpZeroScales, in which case adding fixup offsets may + // be invalid. + if (&LU != &OrigLU && + LU.Kind != LSRUse::ICmpZero && + LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy && + LU.WidestFixupType == OrigLU.WidestFixupType && + LU.HasFormulaWithSameRegs(OrigF)) { + // Scan through this use's formulae. + for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), + E = LU.Formulae.end(); I != E; ++I) { + const Formula &F = *I; + // Check to see if this formula has the same registers and symbols + // as OrigF. + if (F.BaseRegs == OrigF.BaseRegs && + F.ScaledReg == OrigF.ScaledReg && + F.AM.BaseGV == OrigF.AM.BaseGV && + F.AM.Scale == OrigF.AM.Scale && + F.UnfoldedOffset == OrigF.UnfoldedOffset) { + if (F.AM.BaseOffs == 0) + return &LU; + // This is the formula where all the registers and symbols matched; + // there aren't going to be any others. Since we declined it, we + // can skip the rest of the formulae and proceed to the next LSRUse. + break; + } + } + } + } + + // Nothing looked good. 
+ return 0; +} + +void LSRInstance::CollectInterestingTypesAndFactors() { + SmallSetVector<const SCEV *, 4> Strides; + + // Collect interesting types and strides. + SmallVector<const SCEV *, 4> Worklist; + for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { + const SCEV *Expr = IU.getExpr(*UI); + + // Collect interesting types. + Types.insert(SE.getEffectiveSCEVType(Expr->getType())); + + // Add strides for mentioned loops. + Worklist.push_back(Expr); + do { + const SCEV *S = Worklist.pop_back_val(); + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + if (AR->getLoop() == L) + Strides.insert(AR->getStepRecurrence(SE)); + Worklist.push_back(AR->getStart()); + } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + Worklist.append(Add->op_begin(), Add->op_end()); + } + } while (!Worklist.empty()); + } + + // Compute interesting factors from the set of interesting strides. + for (SmallSetVector<const SCEV *, 4>::const_iterator + I = Strides.begin(), E = Strides.end(); I != E; ++I) + for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter = + llvm::next(I); NewStrideIter != E; ++NewStrideIter) { + const SCEV *OldStride = *I; + const SCEV *NewStride = *NewStrideIter; + + if (SE.getTypeSizeInBits(OldStride->getType()) != + SE.getTypeSizeInBits(NewStride->getType())) { + if (SE.getTypeSizeInBits(OldStride->getType()) > + SE.getTypeSizeInBits(NewStride->getType())) + NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType()); + else + OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType()); + } + if (const SCEVConstant *Factor = + dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride, + SE, true))) { + if (Factor->getValue()->getValue().getMinSignedBits() <= 64) + Factors.insert(Factor->getValue()->getValue().getSExtValue()); + } else if (const SCEVConstant *Factor = + dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride, + NewStride, + SE, true))) { + if (Factor->getValue()->getValue().getMinSignedBits() <= 64) + Factors.insert(Factor->getValue()->getValue().getSExtValue()); + } + } + + // If all uses use the same type, don't bother looking for truncation-based + // reuse. + if (Types.size() == 1) + Types.clear(); + + DEBUG(print_factors_and_types(dbgs())); +} + +/// findIVOperand - Helper for CollectChains that finds an IV operand (computed +/// by an AddRec in this loop) within [OI,OE) or returns OE. If IVUsers mapped +/// Instructions to IVStrideUses, we could partially skip this. +static User::op_iterator +findIVOperand(User::op_iterator OI, User::op_iterator OE, + Loop *L, ScalarEvolution &SE) { + for(; OI != OE; ++OI) { + if (Instruction *Oper = dyn_cast<Instruction>(*OI)) { + if (!SE.isSCEVable(Oper->getType())) + continue; + + if (const SCEVAddRecExpr *AR = + dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) { + if (AR->getLoop() == L) + break; + } + } + } + return OI; +} + +/// getWideOperand - IVChain logic must consistenctly peek base TruncInst +/// operands, so wrap it in a convenient helper. +static Value *getWideOperand(Value *Oper) { + if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper)) + return Trunc->getOperand(0); + return Oper; +} + +/// isCompatibleIVType - Return true if we allow an IV chain to include both +/// types. 
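+///
+/// Editor's note (illustrative, not from the original source): this is why an
+/// i64 IV that feeds some i32 users through a (free) trunc can still form one
+/// chain -- getWideOperand() above peeks through the trunc -- while integer
+/// and pointer IVs are never mixed. Two pointer IVs with different pointee
+/// types are still considered compatible.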
+static bool isCompatibleIVType(Value *LVal, Value *RVal) {
+  Type *LType = LVal->getType();
+  Type *RType = RVal->getType();
+  return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
+}
+
+/// getExprBase - Return an approximation of this SCEV expression's "base", or
+/// NULL for any constant. Returning the expression itself is
+/// conservative. Returning a deeper subexpression is more precise and valid as
+/// long as it isn't less complex than another subexpression. For expressions
+/// involving multiple unscaled values, we need to return the pointer-type
+/// SCEVUnknown. This avoids forming chains across objects, such as:
+/// PrevOper==a[i], IVOper==b[i], IVInc==b-a.
+///
+/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
+/// SCEVUnknown, we simply return the rightmost SCEV operand.
+static const SCEV *getExprBase(const SCEV *S) {
+  switch (S->getSCEVType()) {
+  default: // including scUnknown.
+    return S;
+  case scConstant:
+    return 0;
+  case scTruncate:
+    return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
+  case scZeroExtend:
+    return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
+  case scSignExtend:
+    return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
+  case scAddExpr: {
+    // Skip over scaled operands (scMulExpr) to follow add operands as long as
+    // there's nothing more complex.
+    // FIXME: not sure if we want to recognize negation.
+    const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
+    for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
+           E(Add->op_begin()); I != E; ++I) {
+      const SCEV *SubExpr = *I;
+      if (SubExpr->getSCEVType() == scAddExpr)
+        return getExprBase(SubExpr);
+
+      if (SubExpr->getSCEVType() != scMulExpr)
+        return SubExpr;
+    }
+    return S; // all operands are scaled, be conservative.
+  }
+  case scAddRecExpr:
+    return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
+  }
+}
+
+/// Return true if the chain increment is profitable to expand into a loop
+/// invariant value, which may require its own register. A profitable chain
+/// increment will be an offset relative to the same base. We allow such
+/// offsets to potentially be used as a chain increment as long as it's not
+/// obviously expensive to expand using real instructions.
+bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
+                                    const SCEV *IncExpr,
+                                    ScalarEvolution &SE) {
+  // Aggressively form chains when -stress-ivchain.
+  if (StressIVChain)
+    return true;
+
+  // Do not replace a constant offset from IV head with a nonconstant IV
+  // increment.
+  if (!isa<SCEVConstant>(IncExpr)) {
+    const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
+    if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
+      return false;
+  }
+
+  SmallPtrSet<const SCEV*, 8> Processed;
+  return !isHighCostExpansion(IncExpr, Processed, SE);
+}
+
+/// Return true if the number of registers needed for the chain is estimated
+/// to be less than the number required for the individual IV users. First
+/// prohibit any IV users that keep the IV live across increments (the Users
+/// set should be empty). Next count the number and type of increments in the
+/// chain.
+///
+/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
+/// effectively use postinc addressing modes. Only consider it profitable if
+/// the increments can be computed in fewer registers when chained.
+///
+/// TODO: Consider IVInc free if it's already used in other chains.
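+///
+/// Editor's example (illustrative, not from the original source): a chain
+/// that is completed by the loop-header phi and whose remaining links are
+/// constant offsets scores roughly 1 (the chain base) - 1 (complete chain)
+/// - 1 (constant increments fold into addressing) = -1, and is kept; a chain
+/// whose links each need a distinct loop-invariant increment materialized in
+/// the preheader scores positive and is dropped.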
+static bool +isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users, + ScalarEvolution &SE, const TargetLowering *TLI) { + if (StressIVChain) + return true; + + if (!Chain.hasIncs()) + return false; + + if (!Users.empty()) { + DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n"; + for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(), + E = Users.end(); I != E; ++I) { + dbgs() << " " << **I << "\n"; + }); + return false; + } + assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); + + // The chain itself may require a register, so intialize cost to 1. + int cost = 1; + + // A complete chain likely eliminates the need for keeping the original IV in + // a register. LSR does not currently know how to form a complete chain unless + // the header phi already exists. + if (isa<PHINode>(Chain.tailUserInst()) + && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) { + --cost; + } + const SCEV *LastIncExpr = 0; + unsigned NumConstIncrements = 0; + unsigned NumVarIncrements = 0; + unsigned NumReusedIncrements = 0; + for (IVChain::const_iterator I = Chain.begin(), E = Chain.end(); + I != E; ++I) { + + if (I->IncExpr->isZero()) + continue; + + // Incrementing by zero or some constant is neutral. We assume constants can + // be folded into an addressing mode or an add's immediate operand. + if (isa<SCEVConstant>(I->IncExpr)) { + ++NumConstIncrements; + continue; + } + + if (I->IncExpr == LastIncExpr) + ++NumReusedIncrements; + else + ++NumVarIncrements; + + LastIncExpr = I->IncExpr; + } + // An IV chain with a single increment is handled by LSR's postinc + // uses. However, a chain with multiple increments requires keeping the IV's + // value live longer than it needs to be if chained. + if (NumConstIncrements > 1) + --cost; + + // Materializing increment expressions in the preheader that didn't exist in + // the original code may cost a register. For example, sign-extended array + // indices can produce ridiculous increments like this: + // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64))) + cost += NumVarIncrements; + + // Reusing variable increments likely saves a register to hold the multiple of + // the stride. + cost -= NumReusedIncrements; + + DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost + << "\n"); + + return cost < 0; +} + +/// ChainInstruction - Add this IV user to an existing chain or make it the head +/// of a new chain. +void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, + SmallVectorImpl<ChainUsers> &ChainUsersVec) { + // When IVs are used as types of varying widths, they are generally converted + // to a wider type with some uses remaining narrow under a (free) trunc. + Value *const NextIV = getWideOperand(IVOper); + const SCEV *const OperExpr = SE.getSCEV(NextIV); + const SCEV *const OperExprBase = getExprBase(OperExpr); + + // Visit all existing chains. Check if its IVOper can be computed as a + // profitable loop invariant increment from the last link in the Chain. + unsigned ChainIdx = 0, NChains = IVChainVec.size(); + const SCEV *LastIncExpr = 0; + for (; ChainIdx < NChains; ++ChainIdx) { + IVChain &Chain = IVChainVec[ChainIdx]; + + // Prune the solution space aggressively by checking that both IV operands + // are expressions that operate on the same unscaled SCEVUnknown. This + // "base" will be canceled by the subsequent getMinusSCEV call. Checking + // first avoids creating extra SCEV expressions. 
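+    //
+    // Editor's example (illustrative, not from the original source): if the
+    // chain so far ends at a[i] and the candidate operand is b[i], their bases
+    // are different SCEVUnknowns (a vs. b), so we skip this chain rather than
+    // form a "b - a" increment that would need its own register.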
+ if (!StressIVChain && Chain.ExprBase != OperExprBase) + continue; + + Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand); + if (!isCompatibleIVType(PrevIV, NextIV)) + continue; + + // A phi node terminates a chain. + if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst())) + continue; + + // The increment must be loop-invariant so it can be kept in a register. + const SCEV *PrevExpr = SE.getSCEV(PrevIV); + const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr); + if (!SE.isLoopInvariant(IncExpr, L)) + continue; + + if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) { + LastIncExpr = IncExpr; + break; + } + } + // If we haven't found a chain, create a new one, unless we hit the max. Don't + // bother for phi nodes, because they must be last in the chain. + if (ChainIdx == NChains) { + if (isa<PHINode>(UserInst)) + return; + if (NChains >= MaxChains && !StressIVChain) { + DEBUG(dbgs() << "IV Chain Limit\n"); + return; + } + LastIncExpr = OperExpr; + // IVUsers may have skipped over sign/zero extensions. We don't currently + // attempt to form chains involving extensions unless they can be hoisted + // into this loop's AddRec. + if (!isa<SCEVAddRecExpr>(LastIncExpr)) + return; + ++NChains; + IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr), + OperExprBase)); + ChainUsersVec.resize(NChains); + DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst + << ") IV=" << *LastIncExpr << "\n"); + } else { + DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst + << ") IV+" << *LastIncExpr << "\n"); + // Add this IV user to the end of the chain. + IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr)); + } + + SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers; + // This chain's NearUsers become FarUsers. + if (!LastIncExpr->isZero()) { + ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(), + NearUsers.end()); + NearUsers.clear(); + } + + // All other uses of IVOperand become near uses of the chain. + // We currently ignore intermediate values within SCEV expressions, assuming + // they will eventually be used be the current chain, or can be computed + // from one of the chain increments. To be more precise we could + // transitively follow its user and only add leaf IV users to the set. + for (Value::use_iterator UseIter = IVOper->use_begin(), + UseEnd = IVOper->use_end(); UseIter != UseEnd; ++UseIter) { + Instruction *OtherUse = dyn_cast<Instruction>(*UseIter); + if (!OtherUse || OtherUse == UserInst) + continue; + if (SE.isSCEVable(OtherUse->getType()) + && !isa<SCEVUnknown>(SE.getSCEV(OtherUse)) + && IU.isIVUserOrOperand(OtherUse)) { + continue; + } + NearUsers.insert(OtherUse); + } + + // Since this user is part of the chain, it's no longer considered a use + // of the chain. + ChainUsersVec[ChainIdx].FarUsers.erase(UserInst); +} + +/// CollectChains - Populate the vector of Chains. +/// +/// This decreases ILP at the architecture level. Targets with ample registers, +/// multiple memory ports, and no register renaming probably don't want +/// this. However, such targets should probably disable LSR altogether. +/// +/// The job of LSR is to make a reasonable choice of induction variables across +/// the loop. Subsequent passes can easily "unchain" computation exposing more +/// ILP *within the loop* if the target wants it. +/// +/// Finding the best IV chain is potentially a scheduling problem. 
Since LSR +/// will not reorder memory operations, it will recognize this as a chain, but +/// will generate redundant IV increments. Ideally this would be corrected later +/// by a smart scheduler: +/// = A[i] +/// = A[i+x] +/// A[i] = +/// A[i+x] = +/// +/// TODO: Walk the entire domtree within this loop, not just the path to the +/// loop latch. This will discover chains on side paths, but requires +/// maintaining multiple copies of the Chains state. +void LSRInstance::CollectChains() { + DEBUG(dbgs() << "Collecting IV Chains.\n"); + SmallVector<ChainUsers, 8> ChainUsersVec; + + SmallVector<BasicBlock *,8> LatchPath; + BasicBlock *LoopHeader = L->getHeader(); + for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch()); + Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) { + LatchPath.push_back(Rung->getBlock()); + } + LatchPath.push_back(LoopHeader); + + // Walk the instruction stream from the loop header to the loop latch. + for (SmallVectorImpl<BasicBlock *>::reverse_iterator + BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend(); + BBIter != BBEnd; ++BBIter) { + for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end(); + I != E; ++I) { + // Skip instructions that weren't seen by IVUsers analysis. + if (isa<PHINode>(I) || !IU.isIVUserOrOperand(I)) + continue; + + // Ignore users that are part of a SCEV expression. This way we only + // consider leaf IV Users. This effectively rediscovers a portion of + // IVUsers analysis but in program order this time. + if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(I))) + continue; + + // Remove this instruction from any NearUsers set it may be in. + for (unsigned ChainIdx = 0, NChains = IVChainVec.size(); + ChainIdx < NChains; ++ChainIdx) { + ChainUsersVec[ChainIdx].NearUsers.erase(I); + } + // Search for operands that can be chained. + SmallPtrSet<Instruction*, 4> UniqueOperands; + User::op_iterator IVOpEnd = I->op_end(); + User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE); + while (IVOpIter != IVOpEnd) { + Instruction *IVOpInst = cast<Instruction>(*IVOpIter); + if (UniqueOperands.insert(IVOpInst)) + ChainInstruction(I, IVOpInst, ChainUsersVec); + IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE); + } + } // Continue walking down the instructions. + } // Continue walking down the domtree. + // Visit phi backedges to determine if the chain can generate the IV postinc. + for (BasicBlock::iterator I = L->getHeader()->begin(); + PHINode *PN = dyn_cast<PHINode>(I); ++I) { + if (!SE.isSCEVable(PN->getType())) + continue; + + Instruction *IncV = + dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch())); + if (IncV) + ChainInstruction(PN, IncV, ChainUsersVec); + } + // Remove any unprofitable chains. + unsigned ChainIdx = 0; + for (unsigned UsersIdx = 0, NChains = IVChainVec.size(); + UsersIdx < NChains; ++UsersIdx) { + if (!isProfitableChain(IVChainVec[UsersIdx], + ChainUsersVec[UsersIdx].FarUsers, SE, TLI)) + continue; + // Preserve the chain at UsesIdx. 
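+    // (This compacts the surviving chains toward the front of IVChainVec;
+    // the vector is resized down to ChainIdx once the loop finishes.)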
+ if (ChainIdx != UsersIdx) + IVChainVec[ChainIdx] = IVChainVec[UsersIdx]; + FinalizeChain(IVChainVec[ChainIdx]); + ++ChainIdx; + } + IVChainVec.resize(ChainIdx); +} + +void LSRInstance::FinalizeChain(IVChain &Chain) { + assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); + DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n"); + + for (IVChain::const_iterator I = Chain.begin(), E = Chain.end(); + I != E; ++I) { + DEBUG(dbgs() << " Inc: " << *I->UserInst << "\n"); + User::op_iterator UseI = + std::find(I->UserInst->op_begin(), I->UserInst->op_end(), I->IVOperand); + assert(UseI != I->UserInst->op_end() && "cannot find IV operand"); + IVIncSet.insert(UseI); + } +} + +/// Return true if the IVInc can be folded into an addressing mode. +static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, + Value *Operand, const TargetLowering *TLI) { + const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr); + if (!IncConst || !isAddressUse(UserInst, Operand)) + return false; + + if (IncConst->getValue()->getValue().getMinSignedBits() > 64) + return false; + + int64_t IncOffset = IncConst->getValue()->getSExtValue(); + if (!isAlwaysFoldable(IncOffset, /*BaseGV=*/0, /*HaseBaseReg=*/false, + LSRUse::Address, getAccessType(UserInst), TLI)) + return false; + + return true; +} + +/// GenerateIVChains - Generate an add or subtract for each IVInc in a chain to +/// materialize the IV user's operand from the previous IV user's operand. +void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, + SmallVectorImpl<WeakVH> &DeadInsts) { + // Find the new IVOperand for the head of the chain. It may have been replaced + // by LSR. + const IVInc &Head = Chain.Incs[0]; + User::op_iterator IVOpEnd = Head.UserInst->op_end(); + User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(), + IVOpEnd, L, SE); + Value *IVSrc = 0; + while (IVOpIter != IVOpEnd) { + IVSrc = getWideOperand(*IVOpIter); + + // If this operand computes the expression that the chain needs, we may use + // it. (Check this after setting IVSrc which is used below.) + // + // Note that if Head.IncExpr is wider than IVSrc, then this phi is too + // narrow for the chain, so we can no longer use it. We do allow using a + // wider phi, assuming the LSR checked for free truncation. In that case we + // should already have a truncate on this operand such that + // getSCEV(IVSrc) == IncExpr. + if (SE.getSCEV(*IVOpIter) == Head.IncExpr + || SE.getSCEV(IVSrc) == Head.IncExpr) { + break; + } + IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE); + } + if (IVOpIter == IVOpEnd) { + // Gracefully give up on this chain. + DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n"); + return; + } + + DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n"); + Type *IVTy = IVSrc->getType(); + Type *IntTy = SE.getEffectiveSCEVType(IVTy); + const SCEV *LeftOverExpr = 0; + for (IVChain::const_iterator IncI = Chain.begin(), + IncE = Chain.end(); IncI != IncE; ++IncI) { + + Instruction *InsertPt = IncI->UserInst; + if (isa<PHINode>(InsertPt)) + InsertPt = L->getLoopLatch()->getTerminator(); + + // IVOper will replace the current IV User's operand. IVSrc is the IV + // value currently held in a register. + Value *IVOper = IVSrc; + if (!IncI->IncExpr->isZero()) { + // IncExpr was the result of subtraction of two narrow values, so must + // be signed. + const SCEV *IncExpr = SE.getNoopOrSignExtend(IncI->IncExpr, IntTy); + LeftOverExpr = LeftOverExpr ? 
+ SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr; + } + if (LeftOverExpr && !LeftOverExpr->isZero()) { + // Expand the IV increment. + Rewriter.clearPostInc(); + Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt); + const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc), + SE.getUnknown(IncV)); + IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt); + + // If an IV increment can't be folded, use it as the next IV value. + if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand, + TLI)) { + assert(IVTy == IVOper->getType() && "inconsistent IV increment type"); + IVSrc = IVOper; + LeftOverExpr = 0; + } + } + Type *OperTy = IncI->IVOperand->getType(); + if (IVTy != OperTy) { + assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) && + "cannot extend a chained IV"); + IRBuilder<> Builder(InsertPt); + IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain"); + } + IncI->UserInst->replaceUsesOfWith(IncI->IVOperand, IVOper); + DeadInsts.push_back(IncI->IVOperand); + } + // If LSR created a new, wider phi, we may also replace its postinc. We only + // do this if we also found a wide value for the head of the chain. + if (isa<PHINode>(Chain.tailUserInst())) { + for (BasicBlock::iterator I = L->getHeader()->begin(); + PHINode *Phi = dyn_cast<PHINode>(I); ++I) { + if (!isCompatibleIVType(Phi, IVSrc)) + continue; + Instruction *PostIncV = dyn_cast<Instruction>( + Phi->getIncomingValueForBlock(L->getLoopLatch())); + if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc))) + continue; + Value *IVOper = IVSrc; + Type *PostIncTy = PostIncV->getType(); + if (IVTy != PostIncTy) { + assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types"); + IRBuilder<> Builder(L->getLoopLatch()->getTerminator()); + Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc()); + IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain"); + } + Phi->replaceUsesOfWith(PostIncV, IVOper); + DeadInsts.push_back(PostIncV); + } + } +} + +void LSRInstance::CollectFixupsAndInitialFormulae() { + for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { + Instruction *UserInst = UI->getUser(); + // Skip IV users that are part of profitable IV Chains. + User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(), + UI->getOperandValToReplace()); + assert(UseI != UserInst->op_end() && "cannot find IV operand"); + if (IVIncSet.count(UseI)) + continue; + + // Record the uses. + LSRFixup &LF = getNewFixup(); + LF.UserInst = UserInst; + LF.OperandValToReplace = UI->getOperandValToReplace(); + LF.PostIncLoops = UI->getPostIncLoops(); + + LSRUse::KindType Kind = LSRUse::Basic; + Type *AccessTy = 0; + if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { + Kind = LSRUse::Address; + AccessTy = getAccessType(LF.UserInst); + } + + const SCEV *S = IU.getExpr(*UI); + + // Equality (== and !=) ICmps are special. We can rewrite (i == N) as + // (N - i == 0), and this allows (N - i) to be the expression that we work + // with rather than just N or i, so we can consider the register + // requirements for both N and i at the same time. Limiting this code to + // equality icmps is not a problem because all interesting loops use + // equality icmps, thanks to IndVarSimplify. + if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst)) + if (CI->isEquality()) { + // Swap the operands if needed to put the OperandValToReplace on the + // left, for consistency. 
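+        // (e.g., with hypothetical names, `icmp eq i64 %n, %iv` becomes
+        // `icmp eq i64 %iv, %n` when %iv is the operand being replaced.)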
+ Value *NV = CI->getOperand(1); + if (NV == LF.OperandValToReplace) { + CI->setOperand(1, CI->getOperand(0)); + CI->setOperand(0, NV); + NV = CI->getOperand(1); + Changed = true; + } + + // x == y --> x - y == 0 + const SCEV *N = SE.getSCEV(NV); + if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) { + // S is normalized, so normalize N before folding it into S + // to keep the result normalized. + N = TransformForPostIncUse(Normalize, N, CI, 0, + LF.PostIncLoops, SE, DT); + Kind = LSRUse::ICmpZero; + S = SE.getMinusSCEV(N, S); + } + + // -1 and the negations of all interesting strides (except the negation + // of -1) are now also interesting. + for (size_t i = 0, e = Factors.size(); i != e; ++i) + if (Factors[i] != -1) + Factors.insert(-(uint64_t)Factors[i]); + Factors.insert(-1); + } + + // Set up the initial formula for this use. + std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy); + LF.LUIdx = P.first; + LF.Offset = P.second; + LSRUse &LU = Uses[LF.LUIdx]; + LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + if (!LU.WidestFixupType || + SE.getTypeSizeInBits(LU.WidestFixupType) < + SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) + LU.WidestFixupType = LF.OperandValToReplace->getType(); + + // If this is the first use of this LSRUse, give it a formula. + if (LU.Formulae.empty()) { + InsertInitialFormula(S, LU, LF.LUIdx); + CountRegisters(LU.Formulae.back(), LF.LUIdx); + } + } + + DEBUG(print_fixups(dbgs())); +} + +/// InsertInitialFormula - Insert a formula for the given expression into +/// the given use, separating out loop-variant portions from loop-invariant +/// and loop-computable portions. +void +LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { + Formula F; + F.InitialMatch(S, L, SE); + bool Inserted = InsertFormula(LU, LUIdx, F); + assert(Inserted && "Initial formula already exists!"); (void)Inserted; +} + +/// InsertSupplementalFormula - Insert a simple single-register formula for +/// the given expression into the given use. +void +LSRInstance::InsertSupplementalFormula(const SCEV *S, + LSRUse &LU, size_t LUIdx) { + Formula F; + F.BaseRegs.push_back(S); + F.AM.HasBaseReg = true; + bool Inserted = InsertFormula(LU, LUIdx, F); + assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; +} + +/// CountRegisters - Note which registers are used by the given formula, +/// updating RegUses. +void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { + if (F.ScaledReg) + RegUses.CountRegister(F.ScaledReg, LUIdx); + for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), + E = F.BaseRegs.end(); I != E; ++I) + RegUses.CountRegister(*I, LUIdx); +} + +/// InsertFormula - If the given formula has not yet been inserted, add it to +/// the list, and return true. Return false otherwise. +bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { + if (!LU.InsertFormula(F)) + return false; + + CountRegisters(F, LUIdx); + return true; +} + +/// CollectLoopInvariantFixupsAndFormulae - Check for other uses of +/// loop-invariant values which we're tracking. These other uses will pin these +/// values in registers, making them less profitable for elimination. +/// TODO: This currently misses non-constant addrec step registers. +/// TODO: Should this give more weight to users inside the loop? 
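+///
+/// For instance, if a loop bound %n held in one of the tracked registers is
+/// also used by an instruction after the loop, that use keeps %n live no
+/// matter what LSR does inside the loop, so it is recorded below as an extra
+/// fixup with a simple single-register formula.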
+void +LSRInstance::CollectLoopInvariantFixupsAndFormulae() { + SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end()); + SmallPtrSet<const SCEV *, 8> Inserted; + + while (!Worklist.empty()) { + const SCEV *S = Worklist.pop_back_val(); + + if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) + Worklist.append(N->op_begin(), N->op_end()); + else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) + Worklist.push_back(C->getOperand()); + else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) { + Worklist.push_back(D->getLHS()); + Worklist.push_back(D->getRHS()); + } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { + if (!Inserted.insert(U)) continue; + const Value *V = U->getValue(); + if (const Instruction *Inst = dyn_cast<Instruction>(V)) { + // Look for instructions defined outside the loop. + if (L->contains(Inst)) continue; + } else if (isa<UndefValue>(V)) + // Undef doesn't have a live range, so it doesn't matter. + continue; + for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); + UI != UE; ++UI) { + const Instruction *UserInst = dyn_cast<Instruction>(*UI); + // Ignore non-instructions. + if (!UserInst) + continue; + // Ignore instructions in other functions (as can happen with + // Constants). + if (UserInst->getParent()->getParent() != L->getHeader()->getParent()) + continue; + // Ignore instructions not dominated by the loop. + const BasicBlock *UseBB = !isa<PHINode>(UserInst) ? + UserInst->getParent() : + cast<PHINode>(UserInst)->getIncomingBlock( + PHINode::getIncomingValueNumForOperand(UI.getOperandNo())); + if (!DT.dominates(L->getHeader(), UseBB)) + continue; + // Ignore uses which are part of other SCEV expressions, to avoid + // analyzing them multiple times. + if (SE.isSCEVable(UserInst->getType())) { + const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst)); + // If the user is a no-op, look through to its uses. + if (!isa<SCEVUnknown>(UserS)) + continue; + if (UserS == U) { + Worklist.push_back( + SE.getUnknown(const_cast<Instruction *>(UserInst))); + continue; + } + } + // Ignore icmp instructions which are already being analyzed. + if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) { + unsigned OtherIdx = !UI.getOperandNo(); + Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx)); + if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L)) + continue; + } + + LSRFixup &LF = getNewFixup(); + LF.UserInst = const_cast<Instruction *>(UserInst); + LF.OperandValToReplace = UI.getUse(); + std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0); + LF.LUIdx = P.first; + LF.Offset = P.second; + LSRUse &LU = Uses[LF.LUIdx]; + LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); + if (!LU.WidestFixupType || + SE.getTypeSizeInBits(LU.WidestFixupType) < + SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) + LU.WidestFixupType = LF.OperandValToReplace->getType(); + InsertSupplementalFormula(U, LU, LF.LUIdx); + CountRegisters(LU.Formulae.back(), Uses.size() - 1); + break; + } + } + } +} + +/// CollectSubexprs - Split S into subexpressions which can be pulled out into +/// separate registers. If C is non-null, multiply each subexpression by C. +/// +/// Return remainder expression after factoring the subexpressions captured by +/// Ops. If Ops is complete, return NULL. 
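+///
+/// Illustrative example (hypothetical values): for S = {(%a + 4),+,8}<%loop>
+/// with C null, %a and 4 are pushed onto Ops and {0,+,8}<%loop> is returned
+/// as the remainder.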
+static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, + SmallVectorImpl<const SCEV *> &Ops, + const Loop *L, + ScalarEvolution &SE, + unsigned Depth = 0) { + // Arbitrarily cap recursion to protect compile time. + if (Depth >= 3) + return S; + + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + // Break out add operands. + for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); + I != E; ++I) { + const SCEV *Remainder = CollectSubexprs(*I, C, Ops, L, SE, Depth+1); + if (Remainder) + Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); + } + return NULL; + } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + // Split a non-zero base out of an addrec. + if (AR->getStart()->isZero()) + return S; + + const SCEV *Remainder = CollectSubexprs(AR->getStart(), + C, Ops, L, SE, Depth+1); + // Split the non-zero AddRec unless it is part of a nested recurrence that + // does not pertain to this loop. + if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) { + Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); + Remainder = NULL; + } + if (Remainder != AR->getStart()) { + if (!Remainder) + Remainder = SE.getConstant(AR->getType(), 0); + return SE.getAddRecExpr(Remainder, + AR->getStepRecurrence(SE), + AR->getLoop(), + //FIXME: AR->getNoWrapFlags(SCEV::FlagNW) + SCEV::FlagAnyWrap); + } + } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { + // Break (C * (a + b + c)) into C*a + C*b + C*c. + if (Mul->getNumOperands() != 2) + return S; + if (const SCEVConstant *Op0 = + dyn_cast<SCEVConstant>(Mul->getOperand(0))) { + C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0; + const SCEV *Remainder = + CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1); + if (Remainder) + Ops.push_back(SE.getMulExpr(C, Remainder)); + return NULL; + } + } + return S; +} + +/// GenerateReassociations - Split out subexpressions from adds and the bases of +/// addrecs. +void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, + Formula Base, + unsigned Depth) { + // Arbitrarily cap recursion to protect compile time. + if (Depth >= 3) return; + + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { + const SCEV *BaseReg = Base.BaseRegs[i]; + + SmallVector<const SCEV *, 8> AddOps; + const SCEV *Remainder = CollectSubexprs(BaseReg, 0, AddOps, L, SE); + if (Remainder) + AddOps.push_back(Remainder); + + if (AddOps.size() == 1) continue; + + for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(), + JE = AddOps.end(); J != JE; ++J) { + + // Loop-variant "unknown" values are uninteresting; we won't be able to + // do anything meaningful with them. + if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L)) + continue; + + // Don't pull a constant into a register if the constant could be folded + // into an immediate field. + if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset, + Base.getNumRegs() > 1, + LU.Kind, LU.AccessTy, TLI, SE)) + continue; + + // Collect all operands except *J. + SmallVector<const SCEV *, 8> InnerAddOps + (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J); + InnerAddOps.append + (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end()); + + // Don't leave just a constant behind in a register if the constant could + // be folded into an immediate field. 
+ if (InnerAddOps.size() == 1 && + isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset, + Base.getNumRegs() > 1, + LU.Kind, LU.AccessTy, TLI, SE)) + continue; + + const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); + if (InnerSum->isZero()) + continue; + Formula F = Base; + + // Add the remaining pieces of the add back into the new formula. + const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum); + if (TLI && InnerSumSC && + SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && + TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + InnerSumSC->getValue()->getZExtValue())) { + F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + + InnerSumSC->getValue()->getZExtValue(); + F.BaseRegs.erase(F.BaseRegs.begin() + i); + } else + F.BaseRegs[i] = InnerSum; + + // Add J as its own register, or an unfolded immediate. + const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J); + if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && + TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + SC->getValue()->getZExtValue())) + F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + + SC->getValue()->getZExtValue(); + else + F.BaseRegs.push_back(*J); + + if (InsertFormula(LU, LUIdx, F)) + // If that formula hadn't been seen before, recurse to find more like + // it. + GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1); + } + } +} + +/// GenerateCombinations - Generate a formula consisting of all of the +/// loop-dominating registers added into a single register. +void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, + Formula Base) { + // This method is only interesting on a plurality of registers. + if (Base.BaseRegs.size() <= 1) return; + + Formula F = Base; + F.BaseRegs.clear(); + SmallVector<const SCEV *, 4> Ops; + for (SmallVectorImpl<const SCEV *>::const_iterator + I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) { + const SCEV *BaseReg = *I; + if (SE.properlyDominates(BaseReg, L->getHeader()) && + !SE.hasComputableLoopEvolution(BaseReg, L)) + Ops.push_back(BaseReg); + else + F.BaseRegs.push_back(BaseReg); + } + if (Ops.size() > 1) { + const SCEV *Sum = SE.getAddExpr(Ops); + // TODO: If Sum is zero, it probably means ScalarEvolution missed an + // opportunity to fold something. For now, just ignore such cases + // rather than proceed with zero in a register. + if (!Sum->isZero()) { + F.BaseRegs.push_back(Sum); + (void)InsertFormula(LU, LUIdx, F); + } + } +} + +/// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets. +void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, + Formula Base) { + // We can't add a symbolic offset if the address already contains one. + if (Base.AM.BaseGV) return; + + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { + const SCEV *G = Base.BaseRegs[i]; + GlobalValue *GV = ExtractSymbol(G, SE); + if (G->isZero() || !GV) + continue; + Formula F = Base; + F.AM.BaseGV = GV; + if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset, + LU.Kind, LU.AccessTy, TLI)) + continue; + F.BaseRegs[i] = G; + (void)InsertFormula(LU, LUIdx, F); + } +} + +/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets. +void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, + Formula Base) { + // TODO: For now, just add the min and max offset, because it usually isn't + // worthwhile looking at everything inbetween. 
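+  // E.g. if this use's fixups have offsets {0, 8, 16}, only 0 and 16 are
+  // tried below.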
+ SmallVector<int64_t, 2> Worklist; + Worklist.push_back(LU.MinOffset); + if (LU.MaxOffset != LU.MinOffset) + Worklist.push_back(LU.MaxOffset); + + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { + const SCEV *G = Base.BaseRegs[i]; + + for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(), + E = Worklist.end(); I != E; ++I) { + Formula F = Base; + F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I; + if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I, + LU.Kind, LU.AccessTy, TLI)) { + // Add the offset to the base register. + const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); + // If it cancelled out, drop the base register, otherwise update it. + if (NewG->isZero()) { + std::swap(F.BaseRegs[i], F.BaseRegs.back()); + F.BaseRegs.pop_back(); + } else + F.BaseRegs[i] = NewG; + + (void)InsertFormula(LU, LUIdx, F); + } + } + + int64_t Imm = ExtractImmediate(G, SE); + if (G->isZero() || Imm == 0) + continue; + Formula F = Base; + F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm; + if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset, + LU.Kind, LU.AccessTy, TLI)) + continue; + F.BaseRegs[i] = G; + (void)InsertFormula(LU, LUIdx, F); + } +} + +/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up +/// the comparison. For example, x == y -> x*c == y*c. +void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, + Formula Base) { + if (LU.Kind != LSRUse::ICmpZero) return; + + // Determine the integer type for the base formula. + Type *IntTy = Base.getType(); + if (!IntTy) return; + if (SE.getTypeSizeInBits(IntTy) > 64) return; + + // Don't do this if there is more than one offset. + if (LU.MinOffset != LU.MaxOffset) return; + + assert(!Base.AM.BaseGV && "ICmpZero use is not legal!"); + + // Check each interesting stride. + for (SmallSetVector<int64_t, 8>::const_iterator + I = Factors.begin(), E = Factors.end(); I != E; ++I) { + int64_t Factor = *I; + + // Check that the multiplication doesn't overflow. + if (Base.AM.BaseOffs == INT64_MIN && Factor == -1) + continue; + int64_t NewBaseOffs = (uint64_t)Base.AM.BaseOffs * Factor; + if (NewBaseOffs / Factor != Base.AM.BaseOffs) + continue; + + // Check that multiplying with the use offset doesn't overflow. + int64_t Offset = LU.MinOffset; + if (Offset == INT64_MIN && Factor == -1) + continue; + Offset = (uint64_t)Offset * Factor; + if (Offset / Factor != LU.MinOffset) + continue; + + Formula F = Base; + F.AM.BaseOffs = NewBaseOffs; + + // Check that this scale is legal. + if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI)) + continue; + + // Compensate for the use having MinOffset built into it. + F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset; + + const SCEV *FactorS = SE.getConstant(IntTy, Factor); + + // Check that multiplying with each base register doesn't overflow. + for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) { + F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS); + if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i]) + goto next; + } + + // Check that multiplying with the scaled register doesn't overflow. + if (F.ScaledReg) { + F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS); + if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg) + continue; + } + + // Check that multiplying with the unfolded offset doesn't overflow. 
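+    // (As with the offsets above, wrapping is detected by checking that
+    // dividing the product by Factor round-trips to the original value.)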
+ if (F.UnfoldedOffset != 0) { + if (F.UnfoldedOffset == INT64_MIN && Factor == -1) + continue; + F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor; + if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset) + continue; + } + + // If we make it here and it's legal, add it. + (void)InsertFormula(LU, LUIdx, F); + next:; + } +} + +/// GenerateScales - Generate stride factor reuse formulae by making use of +/// scaled-offset address modes, for example. +void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { + // Determine the integer type for the base formula. + Type *IntTy = Base.getType(); + if (!IntTy) return; + + // If this Formula already has a scaled register, we can't add another one. + if (Base.AM.Scale != 0) return; + + // Check each interesting stride. + for (SmallSetVector<int64_t, 8>::const_iterator + I = Factors.begin(), E = Factors.end(); I != E; ++I) { + int64_t Factor = *I; + + Base.AM.Scale = Factor; + Base.AM.HasBaseReg = Base.BaseRegs.size() > 1; + // Check whether this scale is going to be legal. + if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset, + LU.Kind, LU.AccessTy, TLI)) { + // As a special-case, handle special out-of-loop Basic users specially. + // TODO: Reconsider this special case. + if (LU.Kind == LSRUse::Basic && + isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset, + LSRUse::Special, LU.AccessTy, TLI) && + LU.AllFixupsOutsideLoop) + LU.Kind = LSRUse::Special; + else + continue; + } + // For an ICmpZero, negating a solitary base register won't lead to + // new solutions. + if (LU.Kind == LSRUse::ICmpZero && + !Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV) + continue; + // For each addrec base reg, apply the scale, if possible. + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) + if (const SCEVAddRecExpr *AR = + dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) { + const SCEV *FactorS = SE.getConstant(IntTy, Factor); + if (FactorS->isZero()) + continue; + // Divide out the factor, ignoring high bits, since we'll be + // scaling the value back up in the end. + if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) { + // TODO: This could be optimized to avoid all the copying. + Formula F = Base; + F.ScaledReg = Quotient; + F.DeleteBaseReg(F.BaseRegs[i]); + (void)InsertFormula(LU, LUIdx, F); + } + } + } +} + +/// GenerateTruncates - Generate reuse formulae from different IV types. +void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { + // This requires TargetLowering to tell us which truncates are free. + if (!TLI) return; + + // Don't bother truncating symbolic values. + if (Base.AM.BaseGV) return; + + // Determine the integer type for the base formula. + Type *DstTy = Base.getType(); + if (!DstTy) return; + DstTy = SE.getEffectiveSCEVType(DstTy); + + for (SmallSetVector<Type *, 4>::const_iterator + I = Types.begin(), E = Types.end(); I != E; ++I) { + Type *SrcTy = *I; + if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) { + Formula F = Base; + + if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I); + for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(), + JE = F.BaseRegs.end(); J != JE; ++J) + *J = SE.getAnyExtendExpr(*J, SrcTy); + + // TODO: This assumes we've done basic processing on all uses and + // have an idea what the register usage is. + if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses)) + continue; + + (void)InsertFormula(LU, LUIdx, F); + } + } +} + +namespace { + +/// WorkItem - Helper class for GenerateCrossUseConstantOffsets. 
It's used to +/// defer modifications so that the search phase doesn't have to worry about +/// the data structures moving underneath it. +struct WorkItem { + size_t LUIdx; + int64_t Imm; + const SCEV *OrigReg; + + WorkItem(size_t LI, int64_t I, const SCEV *R) + : LUIdx(LI), Imm(I), OrigReg(R) {} + + void print(raw_ostream &OS) const; + void dump() const; +}; + +} + +void WorkItem::print(raw_ostream &OS) const { + OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx + << " , add offset " << Imm; +} + +void WorkItem::dump() const { + print(errs()); errs() << '\n'; +} + +/// GenerateCrossUseConstantOffsets - Look for registers which are a constant +/// distance apart and try to form reuse opportunities between them. +void LSRInstance::GenerateCrossUseConstantOffsets() { + // Group the registers by their value without any added constant offset. + typedef std::map<int64_t, const SCEV *> ImmMapTy; + typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy; + RegMapTy Map; + DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap; + SmallVector<const SCEV *, 8> Sequence; + for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end(); + I != E; ++I) { + const SCEV *Reg = *I; + int64_t Imm = ExtractImmediate(Reg, SE); + std::pair<RegMapTy::iterator, bool> Pair = + Map.insert(std::make_pair(Reg, ImmMapTy())); + if (Pair.second) + Sequence.push_back(Reg); + Pair.first->second.insert(std::make_pair(Imm, *I)); + UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I); + } + + // Now examine each set of registers with the same base value. Build up + // a list of work to do and do the work in a separate step so that we're + // not adding formulae and register counts while we're searching. + SmallVector<WorkItem, 32> WorkItems; + SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems; + for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(), + E = Sequence.end(); I != E; ++I) { + const SCEV *Reg = *I; + const ImmMapTy &Imms = Map.find(Reg)->second; + + // It's not worthwhile looking for reuse if there's only one offset. + if (Imms.size() == 1) + continue; + + DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':'; + for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end(); + J != JE; ++J) + dbgs() << ' ' << J->first; + dbgs() << '\n'); + + // Examine each offset. + for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end(); + J != JE; ++J) { + const SCEV *OrigReg = J->second; + + int64_t JImm = J->first; + const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg); + + if (!isa<SCEVConstant>(OrigReg) && + UsedByIndicesMap[Reg].count() == 1) { + DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n'); + continue; + } + + // Conservatively examine offsets between this orig reg a few selected + // other orig regs. + ImmMapTy::const_iterator OtherImms[] = { + Imms.begin(), prior(Imms.end()), + Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2) + }; + for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) { + ImmMapTy::const_iterator M = OtherImms[i]; + if (M == J || M == JE) continue; + + // Compute the difference between the two. + int64_t Imm = (uint64_t)JImm - M->first; + for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1; + LUIdx = UsedByIndices.find_next(LUIdx)) + // Make a memo of this use, offset, and register tuple. 
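+          // (Each WorkItem records that, in use LUIdx, a formula referencing
+          // OrigReg may instead use OrigReg shifted by -Imm with the
+          // difference folded into the formula's immediate, letting uses
+          // whose registers differ only by a constant share one register.)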
+ if (UniqueItems.insert(std::make_pair(LUIdx, Imm))) + WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg)); + } + } + } + + Map.clear(); + Sequence.clear(); + UsedByIndicesMap.clear(); + UniqueItems.clear(); + + // Now iterate through the worklist and add new formulae. + for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(), + E = WorkItems.end(); I != E; ++I) { + const WorkItem &WI = *I; + size_t LUIdx = WI.LUIdx; + LSRUse &LU = Uses[LUIdx]; + int64_t Imm = WI.Imm; + const SCEV *OrigReg = WI.OrigReg; + + Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); + const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm)); + unsigned BitWidth = SE.getTypeSizeInBits(IntTy); + + // TODO: Use a more targeted data structure. + for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) { + const Formula &F = LU.Formulae[L]; + // Use the immediate in the scaled register. + if (F.ScaledReg == OrigReg) { + int64_t Offs = (uint64_t)F.AM.BaseOffs + + Imm * (uint64_t)F.AM.Scale; + // Don't create 50 + reg(-50). + if (F.referencesReg(SE.getSCEV( + ConstantInt::get(IntTy, -(uint64_t)Offs)))) + continue; + Formula NewF = F; + NewF.AM.BaseOffs = Offs; + if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset, + LU.Kind, LU.AccessTy, TLI)) + continue; + NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg); + + // If the new scale is a constant in a register, and adding the constant + // value to the immediate would produce a value closer to zero than the + // immediate itself, then the formula isn't worthwhile. + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) + if (C->getValue()->isNegative() != + (NewF.AM.BaseOffs < 0) && + (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale)) + .ule(abs64(NewF.AM.BaseOffs))) + continue; + + // OK, looks good. + (void)InsertFormula(LU, LUIdx, NewF); + } else { + // Use the immediate in a base register. + for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) { + const SCEV *BaseReg = F.BaseRegs[N]; + if (BaseReg != OrigReg) + continue; + Formula NewF = F; + NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm; + if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset, + LU.Kind, LU.AccessTy, TLI)) { + if (!TLI || + !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) + continue; + NewF = F; + NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm; + } + NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg); + + // If the new formula has a constant in a register, and adding the + // constant value to the immediate would produce a value closer to + // zero than the immediate itself, then the formula isn't worthwhile. + for (SmallVectorImpl<const SCEV *>::const_iterator + J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end(); + J != JE; ++J) + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J)) + if ((C->getValue()->getValue() + NewF.AM.BaseOffs).abs().slt( + abs64(NewF.AM.BaseOffs)) && + (C->getValue()->getValue() + + NewF.AM.BaseOffs).countTrailingZeros() >= + CountTrailingZeros_64(NewF.AM.BaseOffs)) + goto skip_formula; + + // Ok, looks good. + (void)InsertFormula(LU, LUIdx, NewF); + break; + skip_formula:; + } + } + } + } +} + +/// GenerateAllReuseFormulae - Generate formulae for each use. +void +LSRInstance::GenerateAllReuseFormulae() { + // This is split into multiple loops so that hasRegsUsedByUsesOtherThan + // queries are more precise. 
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateReassociations(LU, LUIdx, LU.Formulae[i]); + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateCombinations(LU, LUIdx, LU.Formulae[i]); + } + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]); + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]); + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]); + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateScales(LU, LUIdx, LU.Formulae[i]); + } + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) + GenerateTruncates(LU, LUIdx, LU.Formulae[i]); + } + + GenerateCrossUseConstantOffsets(); + + DEBUG(dbgs() << "\n" + "After generating reuse formulae:\n"; + print_uses(dbgs())); +} + +/// If there are multiple formulae with the same set of registers used +/// by other uses, pick the best one and delete the others. +void LSRInstance::FilterOutUndesirableDedicatedRegisters() { + DenseSet<const SCEV *> VisitedRegs; + SmallPtrSet<const SCEV *, 16> Regs; + SmallPtrSet<const SCEV *, 16> LoserRegs; +#ifndef NDEBUG + bool ChangedFormulae = false; +#endif + + // Collect the best formula for each unique set of shared registers. This + // is reset for each use. + typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo> + BestFormulaeTy; + BestFormulaeTy BestFormulae; + + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n'); + + bool Any = false; + for (size_t FIdx = 0, NumForms = LU.Formulae.size(); + FIdx != NumForms; ++FIdx) { + Formula &F = LU.Formulae[FIdx]; + + // Some formulas are instant losers. For example, they may depend on + // nonexistent AddRecs from other loops. These need to be filtered + // immediately, otherwise heuristics could choose them over others leading + // to an unsatisfactory solution. Passing LoserRegs into RateFormula here + // avoids the need to recompute this information across formulae using the + // same bad AddRec. Passing LoserRegs is also essential unless we remove + // the corresponding bad register from the Regs set. + Cost CostF; + Regs.clear(); + CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, + &LoserRegs); + if (CostF.isLoser()) { + // During initial formula generation, undesirable formulae are generated + // by uses within other loops that have some non-trivial address mode or + // use the postinc form of the IV. LSR needs to provide these formulae + // as the basis of rediscovering the desired formula that uses an AddRec + // corresponding to the existing phi. Once all formulae have been + // generated, these initial losers may be pruned. 
+ DEBUG(dbgs() << " Filtering loser "; F.print(dbgs()); + dbgs() << "\n"); + } + else { + SmallVector<const SCEV *, 2> Key; + for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(), + JE = F.BaseRegs.end(); J != JE; ++J) { + const SCEV *Reg = *J; + if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx)) + Key.push_back(Reg); + } + if (F.ScaledReg && + RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx)) + Key.push_back(F.ScaledReg); + // Unstable sort by host order ok, because this is only used for + // uniquifying. + std::sort(Key.begin(), Key.end()); + + std::pair<BestFormulaeTy::const_iterator, bool> P = + BestFormulae.insert(std::make_pair(Key, FIdx)); + if (P.second) + continue; + + Formula &Best = LU.Formulae[P.first->second]; + + Cost CostBest; + Regs.clear(); + CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); + if (CostF < CostBest) + std::swap(F, Best); + DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); + dbgs() << "\n" + " in favor of formula "; Best.print(dbgs()); + dbgs() << '\n'); + } +#ifndef NDEBUG + ChangedFormulae = true; +#endif + LU.DeleteFormula(F); + --FIdx; + --NumForms; + Any = true; + } + + // Now that we've filtered out some formulae, recompute the Regs set. + if (Any) + LU.RecomputeRegs(LUIdx, RegUses); + + // Reset this to prepare for the next use. + BestFormulae.clear(); + } + + DEBUG(if (ChangedFormulae) { + dbgs() << "\n" + "After filtering out undesirable candidates:\n"; + print_uses(dbgs()); + }); +} + +// This is a rough guess that seems to work fairly well. +static const size_t ComplexityLimit = UINT16_MAX; + +/// EstimateSearchSpaceComplexity - Estimate the worst-case number of +/// solutions the solver might have to consider. It almost never considers +/// this many solutions because it prune the search space, but the pruning +/// isn't always sufficient. +size_t LSRInstance::EstimateSearchSpaceComplexity() const { + size_t Power = 1; + for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), + E = Uses.end(); I != E; ++I) { + size_t FSize = I->Formulae.size(); + if (FSize >= ComplexityLimit) { + Power = ComplexityLimit; + break; + } + Power *= FSize; + if (Power >= ComplexityLimit) + break; + } + return Power; +} + +/// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset +/// of the registers of another formula, it won't help reduce register +/// pressure (though it may not necessarily hurt register pressure); remove +/// it to simplify the system. +void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { + if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { + DEBUG(dbgs() << "The search space is too complex.\n"); + + DEBUG(dbgs() << "Narrowing the search space by eliminating formulae " + "which use a superset of registers used by other " + "formulae.\n"); + + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + bool Any = false; + for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { + Formula &F = LU.Formulae[i]; + // Look for a formula with a constant or GV in a register. If the use + // also has a formula with that same value in an immediate field, + // delete the one that uses a register. 
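+        // (e.g. given both reg(%x) + reg(16) and reg(%x) with a base offset
+        // of 16, the first formula is the one deleted.)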
+ for (SmallVectorImpl<const SCEV *>::const_iterator + I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) { + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) { + Formula NewF = F; + NewF.AM.BaseOffs += C->getValue()->getSExtValue(); + NewF.BaseRegs.erase(NewF.BaseRegs.begin() + + (I - F.BaseRegs.begin())); + if (LU.HasFormulaWithSameRegs(NewF)) { + DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); + LU.DeleteFormula(F); + --i; + --e; + Any = true; + break; + } + } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) { + if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) + if (!F.AM.BaseGV) { + Formula NewF = F; + NewF.AM.BaseGV = GV; + NewF.BaseRegs.erase(NewF.BaseRegs.begin() + + (I - F.BaseRegs.begin())); + if (LU.HasFormulaWithSameRegs(NewF)) { + DEBUG(dbgs() << " Deleting "; F.print(dbgs()); + dbgs() << '\n'); + LU.DeleteFormula(F); + --i; + --e; + Any = true; + break; + } + } + } + } + } + if (Any) + LU.RecomputeRegs(LUIdx, RegUses); + } + + DEBUG(dbgs() << "After pre-selection:\n"; + print_uses(dbgs())); + } +} + +/// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers +/// for expressions like A, A+1, A+2, etc., allocate a single register for +/// them. +void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { + if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { + DEBUG(dbgs() << "The search space is too complex.\n"); + + DEBUG(dbgs() << "Narrowing the search space by assuming that uses " + "separated by a constant offset will use the same " + "registers.\n"); + + // This is especially useful for unrolled loops. + + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), + E = LU.Formulae.end(); I != E; ++I) { + const Formula &F = *I; + if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) { + if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) { + if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs, + /*HasBaseReg=*/false, + LU.Kind, LU.AccessTy)) { + DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); + dbgs() << '\n'); + + LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; + + // Update the relocs to reference the new use. + for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(), + E = Fixups.end(); I != E; ++I) { + LSRFixup &Fixup = *I; + if (Fixup.LUIdx == LUIdx) { + Fixup.LUIdx = LUThatHas - &Uses.front(); + Fixup.Offset += F.AM.BaseOffs; + // Add the new offset to LUThatHas' offset list. + if (LUThatHas->Offsets.back() != Fixup.Offset) { + LUThatHas->Offsets.push_back(Fixup.Offset); + if (Fixup.Offset > LUThatHas->MaxOffset) + LUThatHas->MaxOffset = Fixup.Offset; + if (Fixup.Offset < LUThatHas->MinOffset) + LUThatHas->MinOffset = Fixup.Offset; + } + DEBUG(dbgs() << "New fixup has offset " + << Fixup.Offset << '\n'); + } + if (Fixup.LUIdx == NumUses-1) + Fixup.LUIdx = LUIdx; + } + + // Delete formulae from the new use which are no longer legal. + bool Any = false; + for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { + Formula &F = LUThatHas->Formulae[i]; + if (!isLegalUse(F.AM, + LUThatHas->MinOffset, LUThatHas->MaxOffset, + LUThatHas->Kind, LUThatHas->AccessTy, TLI)) { + DEBUG(dbgs() << " Deleting "; F.print(dbgs()); + dbgs() << '\n'); + LUThatHas->DeleteFormula(F); + --i; + --e; + Any = true; + } + } + if (Any) + LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses); + + // Delete the old use. 
+ DeleteUse(LU, LUIdx); + --LUIdx; + --NumUses; + break; + } + } + } + } + } + + DEBUG(dbgs() << "After pre-selection:\n"; + print_uses(dbgs())); + } +} + +/// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call +/// FilterOutUndesirableDedicatedRegisters again, if necessary, now that +/// we've done more filtering, as it may be able to find more formulae to +/// eliminate. +void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ + if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { + DEBUG(dbgs() << "The search space is too complex.\n"); + + DEBUG(dbgs() << "Narrowing the search space by re-filtering out " + "undesirable dedicated registers.\n"); + + FilterOutUndesirableDedicatedRegisters(); + + DEBUG(dbgs() << "After pre-selection:\n"; + print_uses(dbgs())); + } +} + +/// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely +/// to be profitable, and then in any use which has any reference to that +/// register, delete all formulae which do not reference that register. +void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { + // With all other options exhausted, loop until the system is simple + // enough to handle. + SmallPtrSet<const SCEV *, 4> Taken; + while (EstimateSearchSpaceComplexity() >= ComplexityLimit) { + // Ok, we have too many of formulae on our hands to conveniently handle. + // Use a rough heuristic to thin out the list. + DEBUG(dbgs() << "The search space is too complex.\n"); + + // Pick the register which is used by the most LSRUses, which is likely + // to be a good reuse register candidate. + const SCEV *Best = 0; + unsigned BestNum = 0; + for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end(); + I != E; ++I) { + const SCEV *Reg = *I; + if (Taken.count(Reg)) + continue; + if (!Best) + Best = Reg; + else { + unsigned Count = RegUses.getUsedByIndices(Reg).count(); + if (Count > BestNum) { + Best = Reg; + BestNum = Count; + } + } + } + + DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best + << " will yield profitable reuse.\n"); + Taken.insert(Best); + + // In any use with formulae which references this register, delete formulae + // which don't reference it. + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + if (!LU.Regs.count(Best)) continue; + + bool Any = false; + for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { + Formula &F = LU.Formulae[i]; + if (!F.referencesReg(Best)) { + DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); + LU.DeleteFormula(F); + --e; + --i; + Any = true; + assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?"); + continue; + } + } + + if (Any) + LU.RecomputeRegs(LUIdx, RegUses); + } + + DEBUG(dbgs() << "After pre-selection:\n"; + print_uses(dbgs())); + } +} + +/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of +/// formulae to choose from, use some rough heuristics to prune down the number +/// of formulae. This keeps the main solver from taking an extraordinary amount +/// of time in some worst-case scenarios. +void LSRInstance::NarrowSearchSpaceUsingHeuristics() { + NarrowSearchSpaceByDetectingSupersets(); + NarrowSearchSpaceByCollapsingUnrolledCode(); + NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); + NarrowSearchSpaceByPickingWinnerRegs(); +} + +/// SolveRecurse - This is the recursive solver. 
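The recursion below is essentially a depth-first branch-and-bound search: one formula is chosen per use, the accumulated register set and cost travel down the call stack, and any partial assignment whose cost already reaches the best complete solution is pruned. A minimal standalone sketch of that shape, using simplified stand-in types (SimpleUse, SimpleFormula, and a register-count cost, none of which exist in this file) rather than LSR's real classes:

#include <set>
#include <string>
#include <vector>

// Simplified stand-ins for LSR's Formula and LSRUse; illustrative only.
struct SimpleFormula { std::set<std::string> Regs; };
struct SimpleUse { std::vector<SimpleFormula> Formulae; };

// Toy cost model: the number of distinct registers used so far.
static unsigned costOf(const std::set<std::string> &Regs) {
  return static_cast<unsigned>(Regs.size());
}

// Depth-first search choosing one formula per use, pruning any partial
// assignment whose cost already reaches the best complete solution.
static void solveRecurse(const std::vector<SimpleUse> &Uses, unsigned Idx,
                         const std::set<std::string> &CurRegs,
                         std::vector<const SimpleFormula *> &Workspace,
                         std::vector<const SimpleFormula *> &Solution,
                         unsigned &BestCost) {
  if (Idx == Uses.size()) {
    // A complete assignment; keep it if it beats the best seen so far.
    if (costOf(CurRegs) < BestCost) {
      BestCost = costOf(CurRegs);
      Solution = Workspace;
    }
    return;
  }
  for (const SimpleFormula &F : Uses[Idx].Formulae) {
    std::set<std::string> NewRegs = CurRegs;
    NewRegs.insert(F.Regs.begin(), F.Regs.end());
    if (costOf(NewRegs) >= BestCost)
      continue;                         // prune: cannot improve on BestCost
    Workspace.push_back(&F);
    solveRecurse(Uses, Idx + 1, NewRegs, Workspace, Solution, BestCost);
    Workspace.pop_back();
  }
}

The real solver additionally restricts each use to formulae that reference any registers already committed by earlier uses (the ReqRegs filtering below), which prunes far more aggressively than the cost bound alone.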
+void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, + Cost &SolutionCost, + SmallVectorImpl<const Formula *> &Workspace, + const Cost &CurCost, + const SmallPtrSet<const SCEV *, 16> &CurRegs, + DenseSet<const SCEV *> &VisitedRegs) const { + // Some ideas: + // - prune more: + // - use more aggressive filtering + // - sort the formula so that the most profitable solutions are found first + // - sort the uses too + // - search faster: + // - don't compute a cost, and then compare. compare while computing a cost + // and bail early. + // - track register sets with SmallBitVector + + const LSRUse &LU = Uses[Workspace.size()]; + + // If this use references any register that's already a part of the + // in-progress solution, consider it a requirement that a formula must + // reference that register in order to be considered. This prunes out + // unprofitable searching. + SmallSetVector<const SCEV *, 4> ReqRegs; + for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(), + E = CurRegs.end(); I != E; ++I) + if (LU.Regs.count(*I)) + ReqRegs.insert(*I); + + SmallPtrSet<const SCEV *, 16> NewRegs; + Cost NewCost; + for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), + E = LU.Formulae.end(); I != E; ++I) { + const Formula &F = *I; + + // Ignore formulae which do not use any of the required registers. + bool SatisfiedReqReg = true; + for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(), + JE = ReqRegs.end(); J != JE; ++J) { + const SCEV *Reg = *J; + if ((!F.ScaledReg || F.ScaledReg != Reg) && + std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) == + F.BaseRegs.end()) { + SatisfiedReqReg = false; + break; + } + } + if (!SatisfiedReqReg) { + // If none of the formulae satisfied the required registers, then we could + // clear ReqRegs and try again. Currently, we simply give up in this case. + continue; + } + + // Evaluate the cost of the current formula. If it's already worse than + // the current best, prune the search at that point. + NewCost = CurCost; + NewRegs = CurRegs; + NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT); + if (NewCost < SolutionCost) { + Workspace.push_back(&F); + if (Workspace.size() != Uses.size()) { + SolveRecurse(Solution, SolutionCost, Workspace, NewCost, + NewRegs, VisitedRegs); + if (F.getNumRegs() == 1 && Workspace.size() == 1) + VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]); + } else { + DEBUG(dbgs() << "New best at "; NewCost.print(dbgs()); + dbgs() << ".\n Regs:"; + for (SmallPtrSet<const SCEV *, 16>::const_iterator + I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I) + dbgs() << ' ' << **I; + dbgs() << '\n'); + + SolutionCost = NewCost; + Solution = Workspace; + } + Workspace.pop_back(); + } + } +} + +/// Solve - Choose one formula from each use. Return the results in the given +/// Solution vector. +void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { + SmallVector<const Formula *, 8> Workspace; + Cost SolutionCost; + SolutionCost.Loose(); + Cost CurCost; + SmallPtrSet<const SCEV *, 16> CurRegs; + DenseSet<const SCEV *> VisitedRegs; + Workspace.reserve(Uses.size()); + + // SolveRecurse does all the work. + SolveRecurse(Solution, SolutionCost, Workspace, CurCost, + CurRegs, VisitedRegs); + if (Solution.empty()) { + DEBUG(dbgs() << "\nNo Satisfactory Solution\n"); + return; + } + + // Ok, we've now made all our decisions. 
+ DEBUG(dbgs() << "\n" + "The chosen solution requires "; SolutionCost.print(dbgs()); + dbgs() << ":\n"; + for (size_t i = 0, e = Uses.size(); i != e; ++i) { + dbgs() << " "; + Uses[i].print(dbgs()); + dbgs() << "\n" + " "; + Solution[i]->print(dbgs()); + dbgs() << '\n'; + }); + + assert(Solution.size() == Uses.size() && "Malformed solution!"); +} + +/// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up +/// the dominator tree far as we can go while still being dominated by the +/// input positions. This helps canonicalize the insert position, which +/// encourages sharing. +BasicBlock::iterator +LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, + const SmallVectorImpl<Instruction *> &Inputs) + const { + for (;;) { + const Loop *IPLoop = LI.getLoopFor(IP->getParent()); + unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0; + + BasicBlock *IDom; + for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) { + if (!Rung) return IP; + Rung = Rung->getIDom(); + if (!Rung) return IP; + IDom = Rung->getBlock(); + + // Don't climb into a loop though. + const Loop *IDomLoop = LI.getLoopFor(IDom); + unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0; + if (IDomDepth <= IPLoopDepth && + (IDomDepth != IPLoopDepth || IDomLoop == IPLoop)) + break; + } + + bool AllDominate = true; + Instruction *BetterPos = 0; + Instruction *Tentative = IDom->getTerminator(); + for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(), + E = Inputs.end(); I != E; ++I) { + Instruction *Inst = *I; + if (Inst == Tentative || !DT.dominates(Inst, Tentative)) { + AllDominate = false; + break; + } + // Attempt to find an insert position in the middle of the block, + // instead of at the end, so that it can be used for other expansions. + if (IDom == Inst->getParent() && + (!BetterPos || !DT.dominates(Inst, BetterPos))) + BetterPos = llvm::next(BasicBlock::iterator(Inst)); + } + if (!AllDominate) + break; + if (BetterPos) + IP = BetterPos; + else + IP = Tentative; + } + + return IP; +} + +/// AdjustInsertPositionForExpand - Determine an input position which will be +/// dominated by the operands and which will dominate the result. +BasicBlock::iterator +LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, + const LSRFixup &LF, + const LSRUse &LU, + SCEVExpander &Rewriter) const { + // Collect some instructions which must be dominated by the + // expanding replacement. These must be dominated by any operands that + // will be required in the expansion. + SmallVector<Instruction *, 4> Inputs; + if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace)) + Inputs.push_back(I); + if (LU.Kind == LSRUse::ICmpZero) + if (Instruction *I = + dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1))) + Inputs.push_back(I); + if (LF.PostIncLoops.count(L)) { + if (LF.isUseFullyOutsideLoop(L)) + Inputs.push_back(L->getLoopLatch()->getTerminator()); + else + Inputs.push_back(IVIncInsertPos); + } + // The expansion must also be dominated by the increment positions of any + // loops it for which it is using post-inc mode. + for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(), + E = LF.PostIncLoops.end(); I != E; ++I) { + const Loop *PIL = *I; + if (PIL == L) continue; + + // Be dominated by the loop exit. 
+ SmallVector<BasicBlock *, 4> ExitingBlocks; + PIL->getExitingBlocks(ExitingBlocks); + if (!ExitingBlocks.empty()) { + BasicBlock *BB = ExitingBlocks[0]; + for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i) + BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]); + Inputs.push_back(BB->getTerminator()); + } + } + + assert(!isa<PHINode>(LowestIP) && !isa<LandingPadInst>(LowestIP) + && !isa<DbgInfoIntrinsic>(LowestIP) && + "Insertion point must be a normal instruction"); + + // Then, climb up the immediate dominator tree as far as we can go while + // still being dominated by the input positions. + BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs); + + // Don't insert instructions before PHI nodes. + while (isa<PHINode>(IP)) ++IP; + + // Ignore landingpad instructions. + while (isa<LandingPadInst>(IP)) ++IP; + + // Ignore debug intrinsics. + while (isa<DbgInfoIntrinsic>(IP)) ++IP; + + // Set IP below instructions recently inserted by SCEVExpander. This keeps the + // IP consistent across expansions and allows the previously inserted + // instructions to be reused by subsequent expansion. + while (Rewriter.isInsertedInstruction(IP) && IP != LowestIP) ++IP; + + return IP; +} + +/// Expand - Emit instructions for the leading candidate expression for this +/// LSRUse (this is called "expanding"). +Value *LSRInstance::Expand(const LSRFixup &LF, + const Formula &F, + BasicBlock::iterator IP, + SCEVExpander &Rewriter, + SmallVectorImpl<WeakVH> &DeadInsts) const { + const LSRUse &LU = Uses[LF.LUIdx]; + + // Determine an input position which will be dominated by the operands and + // which will dominate the result. + IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter); + + // Inform the Rewriter if we have a post-increment use, so that it can + // perform an advantageous expansion. + Rewriter.setPostInc(LF.PostIncLoops); + + // This is the type that the user actually needs. + Type *OpTy = LF.OperandValToReplace->getType(); + // This will be the type that we'll initially expand to. + Type *Ty = F.getType(); + if (!Ty) + // No type known; just expand directly to the ultimate type. + Ty = OpTy; + else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy)) + // Expand directly to the ultimate type if it's the right size. + Ty = OpTy; + // This is the type to do integer arithmetic in. + Type *IntTy = SE.getEffectiveSCEVType(Ty); + + // Build up a list of operands to add together to form the full base. + SmallVector<const SCEV *, 8> Ops; + + // Expand the BaseRegs portion. + for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), + E = F.BaseRegs.end(); I != E; ++I) { + const SCEV *Reg = *I; + assert(!Reg->isZero() && "Zero allocated in a base register!"); + + // If we're expanding for a post-inc user, make the post-inc adjustment. + PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); + Reg = TransformForPostIncUse(Denormalize, Reg, + LF.UserInst, LF.OperandValToReplace, + Loops, SE, DT); + + Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP))); + } + + // Expand the ScaledReg portion. + Value *ICmpScaledV = 0; + if (F.AM.Scale != 0) { + const SCEV *ScaledS = F.ScaledReg; + + // If we're expanding for a post-inc user, make the post-inc adjustment. 
+ PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); + ScaledS = TransformForPostIncUse(Denormalize, ScaledS, + LF.UserInst, LF.OperandValToReplace, + Loops, SE, DT); + + if (LU.Kind == LSRUse::ICmpZero) { + // An interesting way of "folding" with an icmp is to use a negated + // scale, which we'll implement by inserting it into the other operand + // of the icmp. + assert(F.AM.Scale == -1 && + "The only scale supported by ICmpZero uses is -1!"); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP); + } else { + // Otherwise just expand the scaled register and an explicit scale, + // which is expected to be matched as part of the address. + + // Flush the operand list to suppress SCEVExpander hoisting address modes. + if (!Ops.empty() && LU.Kind == LSRUse::Address) { + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Ops.clear(); + Ops.push_back(SE.getUnknown(FullV)); + } + ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); + ScaledS = SE.getMulExpr(ScaledS, + SE.getConstant(ScaledS->getType(), F.AM.Scale)); + Ops.push_back(ScaledS); + } + } + + // Expand the GV portion. + if (F.AM.BaseGV) { + // Flush the operand list to suppress SCEVExpander hoisting. + if (!Ops.empty()) { + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Ops.clear(); + Ops.push_back(SE.getUnknown(FullV)); + } + Ops.push_back(SE.getUnknown(F.AM.BaseGV)); + } + + // Flush the operand list to suppress SCEVExpander hoisting of both folded and + // unfolded offsets. LSR assumes they both live next to their uses. + if (!Ops.empty()) { + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Ops.clear(); + Ops.push_back(SE.getUnknown(FullV)); + } + + // Expand the immediate portion. + int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset; + if (Offset != 0) { + if (LU.Kind == LSRUse::ICmpZero) { + // The other interesting way of "folding" with an ICmpZero is to use a + // negated immediate. + if (!ICmpScaledV) + ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset); + else { + Ops.push_back(SE.getUnknown(ICmpScaledV)); + ICmpScaledV = ConstantInt::get(IntTy, Offset); + } + } else { + // Just add the immediate values. These again are expected to be matched + // as part of the address. + Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset))); + } + } + + // Expand the unfolded offset portion. + int64_t UnfoldedOffset = F.UnfoldedOffset; + if (UnfoldedOffset != 0) { + // Just add the immediate values. + Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, + UnfoldedOffset))); + } + + // Emit instructions summing all the operands. + const SCEV *FullS = Ops.empty() ? + SE.getConstant(IntTy, 0) : + SE.getAddExpr(Ops); + Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP); + + // We're done expanding now, so reset the rewriter. + Rewriter.clearPostInc(); + + // An ICmpZero Formula represents an ICmp which we're handling as a + // comparison against zero. Now that we've expanded an expression for that + // form, update the ICmp's other operand. 
+ if (LU.Kind == LSRUse::ICmpZero) { + ICmpInst *CI = cast<ICmpInst>(LF.UserInst); + DeadInsts.push_back(CI->getOperand(1)); + assert(!F.AM.BaseGV && "ICmp does not support folding a global value and " + "a scale at the same time!"); + if (F.AM.Scale == -1) { + if (ICmpScaledV->getType() != OpTy) { + Instruction *Cast = + CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false, + OpTy, false), + ICmpScaledV, OpTy, "tmp", CI); + ICmpScaledV = Cast; + } + CI->setOperand(1, ICmpScaledV); + } else { + assert(F.AM.Scale == 0 && + "ICmp does not support folding a global value and " + "a scale at the same time!"); + Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), + -(uint64_t)Offset); + if (C->getType() != OpTy) + C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, + OpTy, false), + C, OpTy); + + CI->setOperand(1, C); + } + } + + return FullV; +} + +/// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use +/// of their operands effectively happens in their predecessor blocks, so the +/// expression may need to be expanded in multiple places. +void LSRInstance::RewriteForPHI(PHINode *PN, + const LSRFixup &LF, + const Formula &F, + SCEVExpander &Rewriter, + SmallVectorImpl<WeakVH> &DeadInsts, + Pass *P) const { + DenseMap<BasicBlock *, Value *> Inserted; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (PN->getIncomingValue(i) == LF.OperandValToReplace) { + BasicBlock *BB = PN->getIncomingBlock(i); + + // If this is a critical edge, split the edge so that we do not insert + // the code on all predecessor/successor paths. We do this unless this + // is the canonical backedge for this loop, which complicates post-inc + // users. + if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 && + !isa<IndirectBrInst>(BB->getTerminator())) { + BasicBlock *Parent = PN->getParent(); + Loop *PNLoop = LI.getLoopFor(Parent); + if (!PNLoop || Parent != PNLoop->getHeader()) { + // Split the critical edge. + BasicBlock *NewBB = 0; + if (!Parent->isLandingPad()) { + NewBB = SplitCriticalEdge(BB, Parent, P, + /*MergeIdenticalEdges=*/true, + /*DontDeleteUselessPhis=*/true); + } else { + SmallVector<BasicBlock*, 2> NewBBs; + SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs); + NewBB = NewBBs[0]; + } + + // If PN is outside of the loop and BB is in the loop, we want to + // move the block to be immediately before the PHI block, not + // immediately after BB. + if (L->contains(BB) && !L->contains(PN)) + NewBB->moveBefore(PN->getParent()); + + // Splitting the edge can reduce the number of PHI entries we have. + e = PN->getNumIncomingValues(); + BB = NewBB; + i = PN->getBasicBlockIndex(BB); + } + } + + std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair = + Inserted.insert(std::make_pair(BB, static_cast<Value *>(0))); + if (!Pair.second) + PN->setIncomingValue(i, Pair.first->second); + else { + Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts); + + // If this is reuse-by-noop-cast, insert the noop cast. + Type *OpTy = LF.OperandValToReplace->getType(); + if (FullV->getType() != OpTy) + FullV = + CastInst::Create(CastInst::getCastOpcode(FullV, false, + OpTy, false), + FullV, LF.OperandValToReplace->getType(), + "tmp", BB->getTerminator()); + + PN->setIncomingValue(i, FullV); + Pair.first->second = FullV; + } + } +} + +/// Rewrite - Emit instructions for the leading candidate expression for this +/// LSRUse (this is called "expanding"), and update the UserInst to reference +/// the newly expanded value. 
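A note on RewriteForPHI above: because a PHI's use of a value effectively occurs at the end of the corresponding predecessor block, the expression is expanded once per incoming block, and the Inserted map guarantees that several incoming edges from the same block share a single expansion. A minimal sketch of just that caching pattern, with hypothetical Pred/Val stand-ins rather than the LLVM classes:

#include <cstddef>
#include <map>
#include <utility>
#include <vector>

struct Pred {};                             // hypothetical stand-in for a predecessor block
struct Val {};                              // hypothetical stand-in for an expanded value

// Toy stub standing in for "expand the chosen expression at the end of P".
Val *expandAtEndOf(Pred *) { return new Val(); }

// Rewrite the incoming values of a PHI-like node. IncomingValues must have the
// same length as IncomingBlocks; repeated incoming blocks share one expansion.
void rewriteIncoming(const std::vector<Pred *> &IncomingBlocks,
                     std::vector<Val *> &IncomingValues) {
  std::map<Pred *, Val *> Inserted;
  for (size_t i = 0, e = IncomingBlocks.size(); i != e; ++i) {
    Pred *BB = IncomingBlocks[i];
    std::pair<std::map<Pred *, Val *>::iterator, bool> Ins =
        Inserted.insert(std::make_pair(BB, static_cast<Val *>(0)));
    if (!Ins.second) {
      IncomingValues[i] = Ins.first->second;   // block seen before: reuse the expansion
    } else {
      Val *FullV = expandAtEndOf(BB);          // first visit: expand and cache
      IncomingValues[i] = FullV;
      Ins.first->second = FullV;
    }
  }
}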
+void LSRInstance::Rewrite(const LSRFixup &LF, + const Formula &F, + SCEVExpander &Rewriter, + SmallVectorImpl<WeakVH> &DeadInsts, + Pass *P) const { + // First, find an insertion point that dominates UserInst. For PHI nodes, + // find the nearest block which dominates all the relevant uses. + if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) { + RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P); + } else { + Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts); + + // If this is reuse-by-noop-cast, insert the noop cast. + Type *OpTy = LF.OperandValToReplace->getType(); + if (FullV->getType() != OpTy) { + Instruction *Cast = + CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false), + FullV, OpTy, "tmp", LF.UserInst); + FullV = Cast; + } + + // Update the user. ICmpZero is handled specially here (for now) because + // Expand may have updated one of the operands of the icmp already, and + // its new value may happen to be equal to LF.OperandValToReplace, in + // which case doing replaceUsesOfWith leads to replacing both operands + // with the same value. TODO: Reorganize this. + if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero) + LF.UserInst->setOperand(0, FullV); + else + LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV); + } + + DeadInsts.push_back(LF.OperandValToReplace); +} + +/// ImplementSolution - Rewrite all the fixup locations with new values, +/// following the chosen solution. +void +LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, + Pass *P) { + // Keep track of instructions we may have made dead, so that + // we can remove them after we are done working. + SmallVector<WeakVH, 16> DeadInsts; + + SCEVExpander Rewriter(SE, "lsr"); +#ifndef NDEBUG + Rewriter.setDebugType(DEBUG_TYPE); +#endif + Rewriter.disableCanonicalMode(); + Rewriter.enableLSRMode(); + Rewriter.setIVIncInsertPos(L, IVIncInsertPos); + + // Mark phi nodes that terminate chains so the expander tries to reuse them. + for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(), + ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) { + if (PHINode *PN = dyn_cast<PHINode>(ChainI->tailUserInst())) + Rewriter.setChainedPhi(PN); + } + + // Expand the new value definitions and update the users. + for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(), + E = Fixups.end(); I != E; ++I) { + const LSRFixup &Fixup = *I; + + Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P); + + Changed = true; + } + + for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(), + ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) { + GenerateIVChain(*ChainI, Rewriter, DeadInsts); + Changed = true; + } + // Clean up after ourselves. This must be done before deleting any + // instructions. + Rewriter.clear(); + + Changed |= DeleteTriviallyDeadInstructions(DeadInsts); +} + +LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) + : IU(P->getAnalysis<IVUsers>()), + SE(P->getAnalysis<ScalarEvolution>()), + DT(P->getAnalysis<DominatorTree>()), + LI(P->getAnalysis<LoopInfo>()), + TLI(tli), L(l), Changed(false), IVIncInsertPos(0) { + + // If LoopSimplify form is not available, stay out of trouble. + if (!L->isLoopSimplifyForm()) + return; + + // If there's no interesting work to be done, bail early. + if (IU.empty()) return; + + // If there's too much analysis to be done, bail early. We won't be able to + // model the problem anyway. 
+ unsigned NumUsers = 0; + for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { + if (++NumUsers > MaxIVUsers) { + DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << *L + << "\n"); + return; + } + } + +#ifndef NDEBUG + // All dominating loops must have preheaders, or SCEVExpander may not be able + // to materialize an AddRecExpr whose Start is an outer AddRecExpr. + // + // IVUsers analysis should only create users that are dominated by simple loop + // headers. Since this loop should dominate all of its users, its user list + // should be empty if this loop itself is not within a simple loop nest. + for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader()); + Rung; Rung = Rung->getIDom()) { + BasicBlock *BB = Rung->getBlock(); + const Loop *DomLoop = LI.getLoopFor(BB); + if (DomLoop && DomLoop->getHeader() == BB) { + assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest"); + } + } +#endif // DEBUG + + DEBUG(dbgs() << "\nLSR on loop "; + WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false); + dbgs() << ":\n"); + + // First, perform some low-level loop optimizations. + OptimizeShadowIV(); + OptimizeLoopTermCond(); + + // If loop preparation eliminates all interesting IV users, bail. + if (IU.empty()) return; + + // Skip nested loops until we can model them better with formulae. + if (!L->empty()) { + DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n"); + return; + } + + // Start collecting data and preparing for the solver. + CollectChains(); + CollectInterestingTypesAndFactors(); + CollectFixupsAndInitialFormulae(); + CollectLoopInvariantFixupsAndFormulae(); + + assert(!Uses.empty() && "IVUsers reported at least one use"); + DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n"; + print_uses(dbgs())); + + // Now use the reuse data to generate a bunch of interesting ways + // to formulate the values needed for the uses. + GenerateAllReuseFormulae(); + + FilterOutUndesirableDedicatedRegisters(); + NarrowSearchSpaceUsingHeuristics(); + + SmallVector<const Formula *, 8> Solution; + Solve(Solution); + + // Release memory that is no longer needed. + Factors.clear(); + Types.clear(); + RegUses.clear(); + + if (Solution.empty()) + return; + +#ifndef NDEBUG + // Formulae should be legal. + for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), + E = Uses.end(); I != E; ++I) { + const LSRUse &LU = *I; + for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), + JE = LU.Formulae.end(); J != JE; ++J) + assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset, + LU.Kind, LU.AccessTy, TLI) && + "Illegal formula generated!"); + }; +#endif + + // Now that we've decided what we want, make it so. 
+ ImplementSolution(Solution, P); +} + +void LSRInstance::print_factors_and_types(raw_ostream &OS) const { + if (Factors.empty() && Types.empty()) return; + + OS << "LSR has identified the following interesting factors and types: "; + bool First = true; + + for (SmallSetVector<int64_t, 8>::const_iterator + I = Factors.begin(), E = Factors.end(); I != E; ++I) { + if (!First) OS << ", "; + First = false; + OS << '*' << *I; + } + + for (SmallSetVector<Type *, 4>::const_iterator + I = Types.begin(), E = Types.end(); I != E; ++I) { + if (!First) OS << ", "; + First = false; + OS << '(' << **I << ')'; + } + OS << '\n'; +} + +void LSRInstance::print_fixups(raw_ostream &OS) const { + OS << "LSR is examining the following fixup sites:\n"; + for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(), + E = Fixups.end(); I != E; ++I) { + dbgs() << " "; + I->print(OS); + OS << '\n'; + } +} + +void LSRInstance::print_uses(raw_ostream &OS) const { + OS << "LSR is examining the following uses:\n"; + for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), + E = Uses.end(); I != E; ++I) { + const LSRUse &LU = *I; + dbgs() << " "; + LU.print(OS); + OS << '\n'; + for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), + JE = LU.Formulae.end(); J != JE; ++J) { + OS << " "; + J->print(OS); + OS << '\n'; + } + } +} + +void LSRInstance::print(raw_ostream &OS) const { + print_factors_and_types(OS); + print_fixups(OS); + print_uses(OS); +} + +void LSRInstance::dump() const { + print(errs()); errs() << '\n'; +} + +namespace { + +class LoopStrengthReduce : public LoopPass { + /// TLI - Keep a pointer of a TargetLowering to consult for determining + /// transformation profitability. + const TargetLowering *const TLI; + +public: + static char ID; // Pass ID, replacement for typeid + explicit LoopStrengthReduce(const TargetLowering *tli = 0); + +private: + bool runOnLoop(Loop *L, LPPassManager &LPM); + void getAnalysisUsage(AnalysisUsage &AU) const; +}; + +} + +char LoopStrengthReduce::ID = 0; +INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", + "Loop Strength Reduction", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(IVUsers) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce", + "Loop Strength Reduction", false, false) + + +Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { + return new LoopStrengthReduce(TLI); +} + +LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli) + : LoopPass(ID), TLI(tli) { + initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry()); + } + +void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { + // We split critical edges, so we change the CFG. However, we do update + // many analyses if they are around. + AU.addPreservedID(LoopSimplifyID); + + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + AU.addRequired<ScalarEvolution>(); + AU.addPreserved<ScalarEvolution>(); + // Requiring LoopSimplify a second time here prevents IVUsers from running + // twice, since LoopSimplify was invalidated by running ScalarEvolution. 
+ AU.addRequiredID(LoopSimplifyID); + AU.addRequired<IVUsers>(); + AU.addPreserved<IVUsers>(); +} + +bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { + bool Changed = false; + + // Run the main LSR transformation. + Changed |= LSRInstance(TLI, L, this).getChanged(); + + // Remove any extra phis created by processing inner loops. + Changed |= DeleteDeadPHIs(L->getHeader()); + if (EnablePhiElim) { + SmallVector<WeakVH, 16> DeadInsts; + SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr"); +#ifndef NDEBUG + Rewriter.setDebugType(DEBUG_TYPE); +#endif + unsigned numFolded = Rewriter. + replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), DeadInsts, TLI); + if (numFolded) { + Changed = true; + DeleteTriviallyDeadInstructions(DeadInsts); + DeleteDeadPHIs(L->getHeader()); + } + } + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp new file mode 100644 index 0000000..09a186f --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -0,0 +1,224 @@ +//===-- LoopUnroll.cpp - Loop unroller pass -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements a simple loop unroller. It works best when loops have +// been canonicalized by the -indvars pass, allowing it to determine the trip +// counts of loops easily. +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-unroll" +#include "llvm/IntrinsicInst.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Target/TargetData.h" +#include <climits> + +using namespace llvm; + +static cl::opt<unsigned> +UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden, + cl::desc("The cut-off point for automatic loop unrolling")); + +static cl::opt<unsigned> +UnrollCount("unroll-count", cl::init(0), cl::Hidden, + cl::desc("Use this unroll count for all loops, for testing purposes")); + +static cl::opt<bool> +UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden, + cl::desc("Allows loops to be partially unrolled until " + "-unroll-threshold loop size is reached.")); + +static cl::opt<bool> +UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::init(false), cl::Hidden, + cl::desc("Unroll loops with run-time trip counts")); + +namespace { + class LoopUnroll : public LoopPass { + public: + static char ID; // Pass ID, replacement for typeid + LoopUnroll(int T = -1, int C = -1, int P = -1) : LoopPass(ID) { + CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T); + CurrentCount = (C == -1) ? UnrollCount : unsigned(C); + CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P; + + UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); + + initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); + } + + /// A magic value for use with the Threshold parameter to indicate + /// that the loop unroll should be performed regardless of how much + /// code expansion would result. 
+ static const unsigned NoThreshold = UINT_MAX; + + // Threshold to use when optsize is specified (and there is no + // explicit -unroll-threshold). + static const unsigned OptSizeUnrollThreshold = 50; + + // Default unroll count for loops with run-time trip count if + // -unroll-count is not set + static const unsigned UnrollRuntimeCount = 8; + + unsigned CurrentCount; + unsigned CurrentThreshold; + bool CurrentAllowPartial; + bool UserThreshold; // CurrentThreshold is user-specified. + + bool runOnLoop(Loop *L, LPPassManager &LPM); + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG... + /// + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addRequired<ScalarEvolution>(); + AU.addPreserved<ScalarEvolution>(); + // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. + // If loop unroll does not preserve dom info then LCSSA pass on next + // loop will receive invalid dom info. + // For now, recreate dom info, if loop is unrolled. + AU.addPreserved<DominatorTree>(); + } + }; +} + +char LoopUnroll::ID = 0; +INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) + +Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { + return new LoopUnroll(Threshold, Count, AllowPartial); +} + +/// ApproximateLoopSize - Approximate the size of the loop. +static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, + const TargetData *TD) { + CodeMetrics Metrics; + for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); + I != E; ++I) + Metrics.analyzeBasicBlock(*I, TD); + NumCalls = Metrics.NumInlineCandidates; + + unsigned LoopSize = Metrics.NumInsts; + + // Don't allow an estimate of size zero. This would allows unrolling of loops + // with huge iteration counts, which is a compile time problem even if it's + // not a problem for code quality. + if (LoopSize == 0) LoopSize = 1; + + return LoopSize; +} + +bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { + LoopInfo *LI = &getAnalysis<LoopInfo>(); + ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); + + BasicBlock *Header = L->getHeader(); + DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() + << "] Loop %" << Header->getName() << "\n"); + (void)Header; + + // Determine the current unrolling threshold. While this is normally set + // from UnrollThreshold, it is overridden to a smaller value if the current + // function is marked as optimize-for-size, and the unroll threshold was + // not user specified. + unsigned Threshold = CurrentThreshold; + if (!UserThreshold && + Header->getParent()->hasFnAttr(Attribute::OptimizeForSize)) + Threshold = OptSizeUnrollThreshold; + + // Find trip count and trip multiple if count is not available + unsigned TripCount = 0; + unsigned TripMultiple = 1; + // Find "latch trip count". UnrollLoop assumes that control cannot exit + // via the loop latch on any iteration prior to TripCount. The loop may exit + // early via an earlier branch. 
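The count-reduction logic a little further below is the heart of partial and runtime unrolling and is worth seeing with plain integers: for a compile-time trip count the pass searches downward for the largest count that divides the trip count while keeping LoopSize * Count within the threshold, and for a runtime trip count it halves the count until the size fits. The helpers below are only an illustrative restatement of that arithmetic, not part of this pass.

// Pick a partial unroll count for a compile-time trip count: the largest count
// that divides TripCount while keeping the unrolled size within Threshold.
// Assumes LoopSize >= 1 (the size estimate above never returns 0). Returns 0
// if no count of at least 2 fits.
unsigned partialUnrollCount(unsigned TripCount, unsigned LoopSize,
                            unsigned Threshold) {
  unsigned Count = Threshold / LoopSize;
  while (Count != 0 && TripCount % Count != 0)
    --Count;
  return Count >= 2 ? Count : 0;
}

// Pick an unroll count for a runtime trip count: halve the requested count
// until the unrolled size fits under the threshold. Returns 0 if no count of
// at least 2 fits.
unsigned runtimeUnrollCount(unsigned Count, unsigned LoopSize,
                            unsigned Threshold) {
  unsigned long long Size = 1ULL * LoopSize * Count;
  while (Count != 0 && Size > Threshold) {
    Count >>= 1;
    Size = 1ULL * LoopSize * Count;
  }
  return Count >= 2 ? Count : 0;
}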
+ BasicBlock *LatchBlock = L->getLoopLatch(); + if (LatchBlock) { + TripCount = SE->getSmallConstantTripCount(L, LatchBlock); + TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); + } + // Use a default unroll-count if the user doesn't specify a value + // and the trip count is a run-time value. The default is different + // for run-time or compile-time trip count loops. + unsigned Count = CurrentCount; + if (UnrollRuntime && CurrentCount == 0 && TripCount == 0) + Count = UnrollRuntimeCount; + + if (Count == 0) { + // Conservative heuristic: if we know the trip count, see if we can + // completely unroll (subject to the threshold, checked below); otherwise + // try to find greatest modulo of the trip count which is still under + // threshold value. + if (TripCount == 0) + return false; + Count = TripCount; + } + + // Enforce the threshold. + if (Threshold != NoThreshold) { + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + unsigned NumInlineCandidates; + unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, TD); + DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); + if (NumInlineCandidates != 0) { + DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); + return false; + } + uint64_t Size = (uint64_t)LoopSize*Count; + if (TripCount != 1 && Size > Threshold) { + DEBUG(dbgs() << " Too large to fully unroll with count: " << Count + << " because size: " << Size << ">" << Threshold << "\n"); + if (!CurrentAllowPartial && !(UnrollRuntime && TripCount == 0)) { + DEBUG(dbgs() << " will not try to unroll partially because " + << "-unroll-allow-partial not given\n"); + return false; + } + if (TripCount) { + // Reduce unroll count to be modulo of TripCount for partial unrolling + Count = Threshold / LoopSize; + while (Count != 0 && TripCount%Count != 0) + Count--; + } + else if (UnrollRuntime) { + // Reduce unroll count to be a lower power-of-two value + while (Count != 0 && Size > Threshold) { + Count >>= 1; + Size = LoopSize*Count; + } + } + if (Count < 2) { + DEBUG(dbgs() << " could not unroll partially\n"); + return false; + } + DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n"); + } + } + + // Unroll the loop. + if (!UnrollLoop(L, Count, TripCount, UnrollRuntime, TripMultiple, LI, &LPM)) + return false; + + return true; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp new file mode 100644 index 0000000..58f7739 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -0,0 +1,1281 @@ +//===-- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass transforms loops that contain branches on loop-invariant conditions +// to have multiple loops. For example, it turns the left into the right code: +// +// for (...) if (lic) +// A for (...) +// if (lic) A; B; C +// B else +// C for (...) +// A; C +// +// This can increase the size of the code exponentially (doubling it every time +// a loop is unswitched) so we only unswitch if the resultant code will be +// smaller than a threshold. +// +// This pass expects LICM to be run before it to hoist invariant conditions out +// of the loop, to make the unswitching opportunity obvious. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-unswitch" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <map> +#include <set> +using namespace llvm; + +STATISTIC(NumBranches, "Number of branches unswitched"); +STATISTIC(NumSwitches, "Number of switches unswitched"); +STATISTIC(NumSelects , "Number of selects unswitched"); +STATISTIC(NumTrivial , "Number of unswitches that are trivial"); +STATISTIC(NumSimplify, "Number of simplifications of unswitched code"); +STATISTIC(TotalInsts, "Total number of instructions analyzed"); + +// The specific value of 100 here was chosen based only on intuition and a +// few specific examples. +static cl::opt<unsigned> +Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), + cl::init(100), cl::Hidden); + +namespace { + + class LUAnalysisCache { + + typedef DenseMap<const SwitchInst*, SmallPtrSet<const Value *, 8> > + UnswitchedValsMap; + + typedef UnswitchedValsMap::iterator UnswitchedValsIt; + + struct LoopProperties { + unsigned CanBeUnswitchedCount; + unsigned SizeEstimation; + UnswitchedValsMap UnswitchedVals; + }; + + // Here we use std::map instead of DenseMap, since we need to keep valid + // LoopProperties pointer for current loop for better performance. + typedef std::map<const Loop*, LoopProperties> LoopPropsMap; + typedef LoopPropsMap::iterator LoopPropsMapIt; + + LoopPropsMap LoopsProperties; + UnswitchedValsMap* CurLoopInstructions; + LoopProperties* CurrentLoopProperties; + + // Max size of code we can produce on remained iterations. + unsigned MaxSize; + + public: + + LUAnalysisCache() : + CurLoopInstructions(NULL), CurrentLoopProperties(NULL), + MaxSize(Threshold) + {} + + // Analyze loop. Check its size, calculate is it possible to unswitch + // it. Returns true if we can unswitch this loop. + bool countLoop(const Loop* L); + + // Clean all data related to given loop. + void forgetLoop(const Loop* L); + + // Mark case value as unswitched. + // Since SI instruction can be partly unswitched, in order to avoid + // extra unswitching in cloned loops keep track all unswitched values. + void setUnswitched(const SwitchInst* SI, const Value* V); + + // Check was this case value unswitched before or not. + bool isUnswitched(const SwitchInst* SI, const Value* V); + + // Clone all loop-unswitch related loop properties. + // Redistribute unswitching quotas. + // Note, that new loop data is stored inside the VMap. 
+ void cloneData(const Loop* NewLoop, const Loop* OldLoop, + const ValueToValueMapTy& VMap); + }; + + class LoopUnswitch : public LoopPass { + LoopInfo *LI; // Loop information + LPPassManager *LPM; + + // LoopProcessWorklist - Used to check if second loop needs processing + // after RewriteLoopBodyWithConditionConstant rewrites first loop. + std::vector<Loop*> LoopProcessWorklist; + + LUAnalysisCache BranchesInfo; + + bool OptimizeForSize; + bool redoLoop; + + Loop *currentLoop; + DominatorTree *DT; + BasicBlock *loopHeader; + BasicBlock *loopPreheader; + + // LoopBlocks contains all of the basic blocks of the loop, including the + // preheader of the loop, the body of the loop, and the exit blocks of the + // loop, in that order. + std::vector<BasicBlock*> LoopBlocks; + // NewBlocks contained cloned copy of basic blocks from LoopBlocks. + std::vector<BasicBlock*> NewBlocks; + + public: + static char ID; // Pass ID, replacement for typeid + explicit LoopUnswitch(bool Os = false) : + LoopPass(ID), OptimizeForSize(Os), redoLoop(false), + currentLoop(NULL), DT(NULL), loopHeader(NULL), + loopPreheader(NULL) { + initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM); + bool processCurrentLoop(); + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG. + /// + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addPreserved<DominatorTree>(); + AU.addPreserved<ScalarEvolution>(); + } + + private: + + virtual void releaseMemory() { + BranchesInfo.forgetLoop(currentLoop); + } + + /// RemoveLoopFromWorklist - If the specified loop is on the loop worklist, + /// remove it. + void RemoveLoopFromWorklist(Loop *L) { + std::vector<Loop*>::iterator I = std::find(LoopProcessWorklist.begin(), + LoopProcessWorklist.end(), L); + if (I != LoopProcessWorklist.end()) + LoopProcessWorklist.erase(I); + } + + void initLoopData() { + loopHeader = currentLoop->getHeader(); + loopPreheader = currentLoop->getLoopPreheader(); + } + + /// Split all of the edges from inside the loop to their exit blocks. + /// Update the appropriate Phi nodes as we do so. + void SplitExitEdges(Loop *L, const SmallVector<BasicBlock *, 8> &ExitBlocks); + + bool UnswitchIfProfitable(Value *LoopCond, Constant *Val); + void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, + BasicBlock *ExitBlock); + void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L); + + void RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, + Constant *Val, bool isEqual); + + void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, + BasicBlock *TrueDest, + BasicBlock *FalseDest, + Instruction *InsertPt); + + void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L); + void RemoveBlockIfDead(BasicBlock *BB, + std::vector<Instruction*> &Worklist, Loop *l); + void RemoveLoopFromHierarchy(Loop *L); + bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0, + BasicBlock **LoopExit = 0); + + }; +} + +// Analyze loop. Check its size, calculate is it possible to unswitch +// it. Returns true if we can unswitch this loop. 
+bool LUAnalysisCache::countLoop(const Loop* L) { + + std::pair<LoopPropsMapIt, bool> InsertRes = + LoopsProperties.insert(std::make_pair(L, LoopProperties())); + + LoopProperties& Props = InsertRes.first->second; + + if (InsertRes.second) { + // New loop. + + // Limit the number of instructions to avoid causing significant code + // expansion, and the number of basic blocks, to avoid loops with + // large numbers of branches which cause loop unswitching to go crazy. + // This is a very ad-hoc heuristic. + + // FIXME: This is overly conservative because it does not take into + // consideration code simplification opportunities and code that can + // be shared by the resultant unswitched loops. + CodeMetrics Metrics; + for (Loop::block_iterator I = L->block_begin(), + E = L->block_end(); + I != E; ++I) + Metrics.analyzeBasicBlock(*I); + + Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5); + Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation); + MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount; + } + + if (!Props.CanBeUnswitchedCount) { + DEBUG(dbgs() << "NOT unswitching loop %" + << L->getHeader()->getName() << ", cost too high: " + << L->getBlocks().size() << "\n"); + + return false; + } + + // Be careful. This links are good only before new loop addition. + CurrentLoopProperties = &Props; + CurLoopInstructions = &Props.UnswitchedVals; + + return true; +} + +// Clean all data related to given loop. +void LUAnalysisCache::forgetLoop(const Loop* L) { + + LoopPropsMapIt LIt = LoopsProperties.find(L); + + if (LIt != LoopsProperties.end()) { + LoopProperties& Props = LIt->second; + MaxSize += Props.CanBeUnswitchedCount * Props.SizeEstimation; + LoopsProperties.erase(LIt); + } + + CurrentLoopProperties = NULL; + CurLoopInstructions = NULL; +} + +// Mark case value as unswitched. +// Since SI instruction can be partly unswitched, in order to avoid +// extra unswitching in cloned loops keep track all unswitched values. +void LUAnalysisCache::setUnswitched(const SwitchInst* SI, const Value* V) { + (*CurLoopInstructions)[SI].insert(V); +} + +// Check was this case value unswitched before or not. +bool LUAnalysisCache::isUnswitched(const SwitchInst* SI, const Value* V) { + return (*CurLoopInstructions)[SI].count(V); +} + +// Clone all loop-unswitch related loop properties. +// Redistribute unswitching quotas. +// Note, that new loop data is stored inside the VMap. +void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop, + const ValueToValueMapTy& VMap) { + + LoopProperties& NewLoopProps = LoopsProperties[NewLoop]; + LoopProperties& OldLoopProps = *CurrentLoopProperties; + UnswitchedValsMap& Insts = OldLoopProps.UnswitchedVals; + + // Reallocate "can-be-unswitched quota" + + --OldLoopProps.CanBeUnswitchedCount; + unsigned Quota = OldLoopProps.CanBeUnswitchedCount; + NewLoopProps.CanBeUnswitchedCount = Quota / 2; + OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2; + + NewLoopProps.SizeEstimation = OldLoopProps.SizeEstimation; + + // Clone unswitched values info: + // for new loop switches we clone info about values that was + // already unswitched and has redundant successors. 
+ for (UnswitchedValsIt I = Insts.begin(); I != Insts.end(); ++I) { + const SwitchInst* OldInst = I->first; + Value* NewI = VMap.lookup(OldInst); + const SwitchInst* NewInst = cast_or_null<SwitchInst>(NewI); + assert(NewInst && "All instructions that are in SrcBB must be in VMap."); + + NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst]; + } +} + +char LoopUnswitch::ID = 0; +INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops", + false, false) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops", + false, false) + +Pass *llvm::createLoopUnswitchPass(bool Os) { + return new LoopUnswitch(Os); +} + +/// FindLIVLoopCondition - Cond is a condition that occurs in L. If it is +/// invariant in the loop, or has an invariant piece, return the invariant. +/// Otherwise, return null. +static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { + + // We started analyze new instruction, increment scanned instructions counter. + ++TotalInsts; + + // We can never unswitch on vector conditions. + if (Cond->getType()->isVectorTy()) + return 0; + + // Constants should be folded, not unswitched on! + if (isa<Constant>(Cond)) return 0; + + // TODO: Handle: br (VARIANT|INVARIANT). + + // Hoist simple values out. + if (L->makeLoopInvariant(Cond, Changed)) + return Cond; + + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond)) + if (BO->getOpcode() == Instruction::And || + BO->getOpcode() == Instruction::Or) { + // If either the left or right side is invariant, we can unswitch on this, + // which will cause the branch to go away in one loop and the condition to + // simplify in the other one. + if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed)) + return LHS; + if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed)) + return RHS; + } + + return 0; +} + +bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { + LI = &getAnalysis<LoopInfo>(); + LPM = &LPM_Ref; + DT = getAnalysisIfAvailable<DominatorTree>(); + currentLoop = L; + Function *F = currentLoop->getHeader()->getParent(); + bool Changed = false; + do { + assert(currentLoop->isLCSSAForm(*DT)); + redoLoop = false; + Changed |= processCurrentLoop(); + } while(redoLoop); + + if (Changed) { + // FIXME: Reconstruct dom info, because it is not preserved properly. + if (DT) + DT->runOnFunction(*F); + } + return Changed; +} + +/// processCurrentLoop - Do actual work and unswitch loop if possible +/// and profitable. +bool LoopUnswitch::processCurrentLoop() { + bool Changed = false; + + initLoopData(); + + // If LoopSimplify was unable to form a preheader, don't do any unswitching. + if (!loopPreheader) + return false; + + // Loops with indirectbr cannot be cloned. + if (!currentLoop->isSafeToClone()) + return false; + + // Without dedicated exits, splitting the exit edge may fail. + if (!currentLoop->hasDedicatedExits()) + return false; + + LLVMContext &Context = loopHeader->getContext(); + + // Probably we reach the quota of branches for this loop. If so + // stop unswitching. + if (!BranchesInfo.countLoop(currentLoop)) + return false; + + // Loop over all of the basic blocks in the loop. If we find an interior + // block that is branching on a loop-invariant condition, we can unswitch this + // loop. 
+ for (Loop::block_iterator I = currentLoop->block_begin(), + E = currentLoop->block_end(); I != E; ++I) { + TerminatorInst *TI = (*I)->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + // If this isn't branching on an invariant condition, we can't unswitch + // it. + if (BI->isConditional()) { + // See if this, or some part of it, is loop invariant. If so, we can + // unswitch on it if we desire. + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), + currentLoop, Changed); + if (LoopCond && UnswitchIfProfitable(LoopCond, + ConstantInt::getTrue(Context))) { + ++NumBranches; + return true; + } + } + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), + currentLoop, Changed); + unsigned NumCases = SI->getNumCases(); + if (LoopCond && NumCases) { + // Find a value to unswitch on: + // FIXME: this should chose the most expensive case! + // FIXME: scan for a case with a non-critical edge? + Constant *UnswitchVal = NULL; + + // Do not process same value again and again. + // At this point we have some cases already unswitched and + // some not yet unswitched. Let's find the first not yet unswitched one. + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + Constant* UnswitchValCandidate = i.getCaseValue(); + if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) { + UnswitchVal = UnswitchValCandidate; + break; + } + } + + if (!UnswitchVal) + continue; + + if (UnswitchIfProfitable(LoopCond, UnswitchVal)) { + ++NumSwitches; + return true; + } + } + } + + // Scan the instructions to check for unswitchable values. + for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); + BBI != E; ++BBI) + if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) { + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), + currentLoop, Changed); + if (LoopCond && UnswitchIfProfitable(LoopCond, + ConstantInt::getTrue(Context))) { + ++NumSelects; + return true; + } + } + } + return Changed; +} + +/// isTrivialLoopExitBlock - Check to see if all paths from BB exit the +/// loop with no side effects (including infinite loops). +/// +/// If true, we return true and set ExitBB to the block we +/// exit through. +/// +static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, + BasicBlock *&ExitBB, + std::set<BasicBlock*> &Visited) { + if (!Visited.insert(BB).second) { + // Already visited. Without more analysis, this could indicate an infinite + // loop. + return false; + } else if (!L->contains(BB)) { + // Otherwise, this is a loop exit, this is fine so long as this is the + // first exit. + if (ExitBB != 0) return false; + ExitBB = BB; + return true; + } + + // Otherwise, this is an unvisited intra-loop node. Check all successors. + for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) { + // Check to see if the successor is a trivial loop exit. + if (!isTrivialLoopExitBlockHelper(L, *SI, ExitBB, Visited)) + return false; + } + + // Okay, everything after this looks good, check to make sure that this block + // doesn't include any side effects. + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + if (I->mayHaveSideEffects()) + return false; + + return true; +} + +/// isTrivialLoopExitBlock - Return true if the specified block unconditionally +/// leads to an exit from the specified loop, and has no side-effects in the +/// process. If so, return the block that is exited to, otherwise return null. 
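Stripped of the LLVM types, the recursion in isTrivialLoopExitBlockHelper above reduces to the sketch below, where Node is a hypothetical stand-in for a basic block: every path from the starting block must leave the loop through one and the same exit block, visiting only side-effect-free blocks, and any revisit is conservatively treated as a possible infinite loop.

#include <cstddef>
#include <set>
#include <vector>

// Hypothetical stand-in for a CFG block; not llvm::BasicBlock.
struct Node {
  bool InLoop;                    // does the block belong to the loop?
  bool HasSideEffects;            // does it contain side-effecting instructions?
  std::vector<Node *> Succs;      // successor blocks
};

// Return true if every path from N exits the loop through a single block,
// recorded in Exit, without executing any side effects inside the loop.
bool allPathsExitTrivially(Node *N, Node *&Exit, std::set<Node *> &Visited) {
  if (!Visited.insert(N).second)
    return false;                 // already visited: could be an infinite loop
  if (!N->InLoop) {
    if (Exit) return false;       // a second, different exit block
    Exit = N;
    return true;
  }
  for (size_t i = 0, e = N->Succs.size(); i != e; ++i)
    if (!allPathsExitTrivially(N->Succs[i], Exit, Visited))
      return false;
  return !N->HasSideEffects;      // the intra-loop block itself must be side-effect free
}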
+static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { + std::set<BasicBlock*> Visited; + Visited.insert(L->getHeader()); // Branches to header make infinite loops. + BasicBlock *ExitBB = 0; + if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited)) + return ExitBB; + return 0; +} + +/// IsTrivialUnswitchCondition - Check to see if this unswitch condition is +/// trivial: that is, that the condition controls whether or not the loop does +/// anything at all. If this is a trivial condition, unswitching produces no +/// code duplications (equivalently, it produces a simpler loop and a new empty +/// loop, which gets deleted). +/// +/// If this is a trivial condition, return true, otherwise return false. When +/// returning true, this sets Cond and Val to the condition that controls the +/// trivial condition: when Cond dynamically equals Val, the loop is known to +/// exit. Finally, this sets LoopExit to the BB that the loop exits to when +/// Cond == Val. +/// +bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, + BasicBlock **LoopExit) { + BasicBlock *Header = currentLoop->getHeader(); + TerminatorInst *HeaderTerm = Header->getTerminator(); + LLVMContext &Context = Header->getContext(); + + BasicBlock *LoopExitBB = 0; + if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) { + // If the header block doesn't end with a conditional branch on Cond, we + // can't handle it. + if (!BI->isConditional() || BI->getCondition() != Cond) + return false; + + // Check to see if a successor of the branch is guaranteed to + // exit through a unique exit block without having any + // side-effects. If so, determine the value of Cond that causes it to do + // this. + if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + BI->getSuccessor(0)))) { + if (Val) *Val = ConstantInt::getTrue(Context); + } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + BI->getSuccessor(1)))) { + if (Val) *Val = ConstantInt::getFalse(Context); + } + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) { + // If this isn't a switch on Cond, we can't handle it. + if (SI->getCondition() != Cond) return false; + + // Check to see if a successor of the switch is guaranteed to go to the + // latch block or exit through a one exit block without having any + // side-effects. If so, determine the value of Cond that causes it to do + // this. + // Note that we can't trivially unswitch on the default case or + // on already unswitched cases. + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + BasicBlock* LoopExitCandidate; + if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, + i.getCaseSuccessor()))) { + // Okay, we found a trivial case, remember the value that is trivial. + ConstantInt* CaseVal = i.getCaseValue(); + + // Check that it was not unswitched before, since already unswitched + // trivial vals are looks trivial too. + if (BranchesInfo.isUnswitched(SI, CaseVal)) + continue; + LoopExitBB = LoopExitCandidate; + if (Val) *Val = CaseVal; + break; + } + } + } + + // If we didn't find a single unique LoopExit block, or if the loop exit block + // contains phi nodes, this isn't trivial. + if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) + return false; // Can't handle this. + + if (LoopExit) *LoopExit = LoopExitBB; + + // We already know that nothing uses any scalar values defined inside of this + // loop. As such, we just have to check to see if this loop will execute any + // side-effecting instructions (e.g. 
stores, calls, volatile loads) in the + // part of the loop that the code *would* execute. We already checked the + // tail, check the header now. + for (BasicBlock::iterator I = Header->begin(), E = Header->end(); I != E; ++I) + if (I->mayHaveSideEffects()) + return false; + return true; +} + +/// UnswitchIfProfitable - We have found that we can unswitch currentLoop when +/// LoopCond == Val to simplify the loop. If we decide that this is profitable, +/// unswitch the loop, reprocess the pieces, then return true. +bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { + Function *F = loopHeader->getParent(); + Constant *CondVal = 0; + BasicBlock *ExitBlock = 0; + + if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) { + // If the condition is trivial, always unswitch. There is no code growth + // for this case. + UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, ExitBlock); + return true; + } + + // Check to see if it would be profitable to unswitch current loop. + + // Do not do non-trivial unswitch while optimizing for size. + if (OptimizeForSize || F->hasFnAttr(Attribute::OptimizeForSize)) + return false; + + UnswitchNontrivialCondition(LoopCond, Val, currentLoop); + return true; +} + +/// CloneLoop - Recursively clone the specified loop and all of its children, +/// mapping the blocks with the specified map. +static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, + LoopInfo *LI, LPPassManager *LPM) { + Loop *New = new Loop(); + LPM->insertLoop(New, PL); + + // Add all of the blocks in L to the new loop. + for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); + I != E; ++I) + if (LI->getLoopFor(*I) == L) + New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), LI->getBase()); + + // Add all of the subloops to the new loop. + for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) + CloneLoop(*I, New, VM, LI, LPM); + + return New; +} + +/// EmitPreheaderBranchOnCondition - Emit a conditional branch on two values +/// if LIC == Val, branch to TrueDst, otherwise branch to FalseDest. Insert the +/// code immediately before InsertPt. +void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, + BasicBlock *TrueDest, + BasicBlock *FalseDest, + Instruction *InsertPt) { + // Insert a conditional branch on LIC to the two preheaders. The original + // code is the true version and the new code is the false version. + Value *BranchVal = LIC; + if (!isa<ConstantInt>(Val) || + Val->getType() != Type::getInt1Ty(LIC->getContext())) + BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val); + else if (Val != ConstantInt::getTrue(Val->getContext())) + // We want to enter the new loop when the condition is true. + std::swap(TrueDest, FalseDest); + + // Insert the new branch. + BranchInst *BI = BranchInst::Create(TrueDest, FalseDest, BranchVal, InsertPt); + + // If either edge is critical, split it. This helps preserve LoopSimplify + // form for enclosing loops. + SplitCriticalEdge(BI, 0, this, false, false, true); + SplitCriticalEdge(BI, 1, this, false, false, true); +} + +/// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable +/// condition in it (a cond branch from its header block to its latch block, +/// where the path through the loop that doesn't execute its body has no +/// side-effects), unswitch it. This doesn't involve any code duplication, just +/// moving the conditional branch outside of the loop and updating loop info. 
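UnswitchTrivialCondition, documented just above and defined below, handles the case where the loop-invariant condition only decides whether the loop does anything at all. At the source level the transformation is roughly the following hand-written illustration (not output of the pass): the test moves in front of the loop and the body is not duplicated.

// Shape before trivial unswitching: the invariant test guards the whole body,
// and the exit path taken when it fails has no side effects.
int sumIfEnabled(const int *A, int N, bool Enabled) {
  int S = 0;
  for (int i = 0; i < N; ++i) {
    if (!Enabled)
      break;                      // Enabled is loop-invariant
    S += A[i];
  }
  return S;
}

// Equivalent shape after trivial unswitching: one copy of the body, guarded by
// a single test hoisted into what was the preheader.
int sumIfEnabledUnswitched(const int *A, int N, bool Enabled) {
  int S = 0;
  if (Enabled)
    for (int i = 0; i < N; ++i)
      S += A[i];
  return S;
}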
+void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, + Constant *Val, + BasicBlock *ExitBlock) { + DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %" + << loopHeader->getName() << " [" << L->getBlocks().size() + << " blocks] in Function " << L->getHeader()->getParent()->getName() + << " on cond: " << *Val << " == " << *Cond << "\n"); + + // First step, split the preheader, so that we know that there is a safe place + // to insert the conditional branch. We will change loopPreheader to have a + // conditional branch on Cond. + BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, this); + + // Now that we have a place to insert the conditional branch, create a place + // to branch to: this is the exit block out of the loop that we should + // short-circuit to. + + // Split this block now, so that the loop maintains its exit block, and so + // that the jump from the preheader can execute the contents of the exit block + // without actually branching to it (the exit block should be dominated by the + // loop header, not the preheader). + assert(!L->contains(ExitBlock) && "Exit block is in the loop?"); + BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), this); + + // Okay, now we have a position to branch from and a position to branch to, + // insert the new conditional branch. + EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, + loopPreheader->getTerminator()); + LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L); + loopPreheader->getTerminator()->eraseFromParent(); + + // We need to reprocess this loop, it could be unswitched again. + redoLoop = true; + + // Now that we know that the loop is never entered when this condition is a + // particular value, rewrite the loop with this info. We know that this will + // at least eliminate the old branch. + RewriteLoopBodyWithConditionConstant(L, Cond, Val, false); + ++NumTrivial; +} + +/// SplitExitEdges - Split all of the edges from inside the loop to their exit +/// blocks. Update the appropriate Phi nodes as we do so. +void LoopUnswitch::SplitExitEdges(Loop *L, + const SmallVector<BasicBlock *, 8> &ExitBlocks){ + + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { + BasicBlock *ExitBlock = ExitBlocks[i]; + SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock), + pred_end(ExitBlock)); + + // Although SplitBlockPredecessors doesn't preserve loop-simplify in + // general, if we call it on all predecessors of all exits then it does. + if (!ExitBlock->isLandingPad()) { + SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", this); + } else { + SmallVector<BasicBlock*, 2> NewBBs; + SplitLandingPadPredecessors(ExitBlock, Preds, ".us-lcssa", ".us-lcssa", + this, NewBBs); + } + } +} + +/// UnswitchNontrivialCondition - We determined that the loop is profitable +/// to unswitch when LIC equal Val. Split it into loop versions and test the +/// condition outside of either loop. Return the loops created as Out1/Out2. 
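UnswitchNontrivialCondition, whose documentation ends just above, is the code-duplicating counterpart: both versions of the loop are kept and the invariant test selects between them. As a hand-written source-level illustration (again, not output of the pass):

// Shape before non-trivial unswitching: the invariant condition is tested on
// every iteration and both arms do real work.
int accumulate(const int *A, int N, bool Add) {
  int S = 0;
  for (int i = 0; i < N; ++i) {
    if (Add)                      // Add is loop-invariant
      S += A[i];
    else
      S -= A[i];
  }
  return S;
}

// Equivalent shape after unswitching: the condition is tested once, and the
// loop body exists in two specialized copies.
int accumulateUnswitched(const int *A, int N, bool Add) {
  int S = 0;
  if (Add)
    for (int i = 0; i < N; ++i) S += A[i];
  else
    for (int i = 0; i < N; ++i) S -= A[i];
  return S;
}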
+void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, + Loop *L) { + Function *F = loopHeader->getParent(); + DEBUG(dbgs() << "loop-unswitch: Unswitching loop %" + << loopHeader->getName() << " [" << L->getBlocks().size() + << " blocks] in Function " << F->getName() + << " when '" << *Val << "' == " << *LIC << "\n"); + + if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) + SE->forgetLoop(L); + + LoopBlocks.clear(); + NewBlocks.clear(); + + // First step, split the preheader and exit blocks, and add these blocks to + // the LoopBlocks list. + BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, this); + LoopBlocks.push_back(NewPreheader); + + // We want the loop to come after the preheader, but before the exit blocks. + LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end()); + + SmallVector<BasicBlock*, 8> ExitBlocks; + L->getUniqueExitBlocks(ExitBlocks); + + // Split all of the edges from inside the loop to their exit blocks. Update + // the appropriate Phi nodes as we do so. + SplitExitEdges(L, ExitBlocks); + + // The exit blocks may have been changed due to edge splitting, recompute. + ExitBlocks.clear(); + L->getUniqueExitBlocks(ExitBlocks); + + // Add exit blocks to the loop blocks. + LoopBlocks.insert(LoopBlocks.end(), ExitBlocks.begin(), ExitBlocks.end()); + + // Next step, clone all of the basic blocks that make up the loop (including + // the loop preheader and exit blocks), keeping track of the mapping between + // the instructions and blocks. + NewBlocks.reserve(LoopBlocks.size()); + ValueToValueMapTy VMap; + for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) { + BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F); + + NewBlocks.push_back(NewBB); + VMap[LoopBlocks[i]] = NewBB; // Keep the BB mapping. + LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L); + } + + // Splice the newly inserted blocks into the function right before the + // original preheader. + F->getBasicBlockList().splice(NewPreheader, F->getBasicBlockList(), + NewBlocks[0], F->end()); + + // Now we create the new Loop object for the versioned loop. + Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM); + + // Recalculate unswitching quota, inherit simplified switches info for NewBB, + // Probably clone more loop-unswitch related loop properties. + BranchesInfo.cloneData(NewLoop, L, VMap); + + Loop *ParentLoop = L->getParentLoop(); + if (ParentLoop) { + // Make sure to add the cloned preheader and exit blocks to the parent loop + // as well. + ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase()); + } + + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { + BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]); + // The new exit block should be in the same loop as the old one. + if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i])) + ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase()); + + assert(NewExit->getTerminator()->getNumSuccessors() == 1 && + "Exit block should have been split to have one successor!"); + BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0); + + // If the successor of the exit block had PHI nodes, add an entry for + // NewExit. 
+ PHINode *PN; + for (BasicBlock::iterator I = ExitSucc->begin(); isa<PHINode>(I); ++I) { + PN = cast<PHINode>(I); + Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]); + ValueToValueMapTy::iterator It = VMap.find(V); + if (It != VMap.end()) V = It->second; + PN->addIncoming(V, NewExit); + } + + if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { + PN = PHINode::Create(LPad->getType(), 0, "", + ExitSucc->getFirstInsertionPt()); + + for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc); + I != E; ++I) { + BasicBlock *BB = *I; + LandingPadInst *LPI = BB->getLandingPadInst(); + LPI->replaceAllUsesWith(PN); + PN->addIncoming(LPI, BB); + } + } + } + + // Rewrite the code to refer to itself. + for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) + for (BasicBlock::iterator I = NewBlocks[i]->begin(), + E = NewBlocks[i]->end(); I != E; ++I) + RemapInstruction(I, VMap,RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); + + // Rewrite the original preheader to select between versions of the loop. + BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator()); + assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] && + "Preheader splitting did not work correctly!"); + + // Emit the new branch that selects between the two versions of this loop. + EmitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR); + LPM->deleteSimpleAnalysisValue(OldBR, L); + OldBR->eraseFromParent(); + + LoopProcessWorklist.push_back(NewLoop); + redoLoop = true; + + // Keep a WeakVH holding onto LIC. If the first call to RewriteLoopBody + // deletes the instruction (for example by simplifying a PHI that feeds into + // the condition that we're unswitching on), we don't rewrite the second + // iteration. + WeakVH LICHandle(LIC); + + // Now we rewrite the original code to know that the condition is true and the + // new code to know that the condition is false. + RewriteLoopBodyWithConditionConstant(L, LIC, Val, false); + + // It's possible that simplifying one loop could cause the other to be + // changed to another value or a constant. If its a constant, don't simplify + // it. + if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop && + LICHandle && !isa<Constant>(LICHandle)) + RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true); +} + +/// RemoveFromWorklist - Remove all instances of I from the worklist vector +/// specified. +static void RemoveFromWorklist(Instruction *I, + std::vector<Instruction*> &Worklist) { + std::vector<Instruction*>::iterator WI = std::find(Worklist.begin(), + Worklist.end(), I); + while (WI != Worklist.end()) { + unsigned Offset = WI-Worklist.begin(); + Worklist.erase(WI); + WI = std::find(Worklist.begin()+Offset, Worklist.end(), I); + } +} + +/// ReplaceUsesOfWith - When we find that I really equals V, remove I from the +/// program, replacing all uses with V and update the worklist. +static void ReplaceUsesOfWith(Instruction *I, Value *V, + std::vector<Instruction*> &Worklist, + Loop *L, LPPassManager *LPM) { + DEBUG(dbgs() << "Replace with '" << *V << "': " << *I); + + // Add uses to the worklist, which may be dead now. + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) + if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i))) + Worklist.push_back(Use); + + // Add users to the worklist which may be simplified now. 
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); + UI != E; ++UI) + Worklist.push_back(cast<Instruction>(*UI)); + LPM->deleteSimpleAnalysisValue(I, L); + RemoveFromWorklist(I, Worklist); + I->replaceAllUsesWith(V); + I->eraseFromParent(); + ++NumSimplify; +} + +/// RemoveBlockIfDead - If the specified block is dead, remove it, update loop +/// information, and remove any dead successors it has. +/// +void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, + std::vector<Instruction*> &Worklist, + Loop *L) { + if (pred_begin(BB) != pred_end(BB)) { + // This block isn't dead, since an edge to BB was just removed, see if there + // are any easy simplifications we can do now. + if (BasicBlock *Pred = BB->getSinglePredecessor()) { + // If it has one pred, fold phi nodes in BB. + while (isa<PHINode>(BB->begin())) + ReplaceUsesOfWith(BB->begin(), + cast<PHINode>(BB->begin())->getIncomingValue(0), + Worklist, L, LPM); + + // If this is the header of a loop and the only pred is the latch, we now + // have an unreachable loop. + if (Loop *L = LI->getLoopFor(BB)) + if (loopHeader == BB && L->contains(Pred)) { + // Remove the branch from the latch to the header block, this makes + // the header dead, which will make the latch dead (because the header + // dominates the latch). + LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L); + Pred->getTerminator()->eraseFromParent(); + new UnreachableInst(BB->getContext(), Pred); + + // The loop is now broken, remove it from LI. + RemoveLoopFromHierarchy(L); + + // Reprocess the header, which now IS dead. + RemoveBlockIfDead(BB, Worklist, L); + return; + } + + // If pred ends in a uncond branch, add uncond branch to worklist so that + // the two blocks will get merged. + if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator())) + if (BI->isUnconditional()) + Worklist.push_back(BI); + } + return; + } + + DEBUG(dbgs() << "Nuking dead block: " << *BB); + + // Remove the instructions in the basic block from the worklist. + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + RemoveFromWorklist(I, Worklist); + + // Anything that uses the instructions in this basic block should have their + // uses replaced with undefs. + // If I is not void type then replaceAllUsesWith undef. + // This allows ValueHandlers and custom metadata to adjust itself. + if (!I->getType()->isVoidTy()) + I->replaceAllUsesWith(UndefValue::get(I->getType())); + } + + // If this is the edge to the header block for a loop, remove the loop and + // promote all subloops. + if (Loop *BBLoop = LI->getLoopFor(BB)) { + if (BBLoop->getLoopLatch() == BB) { + RemoveLoopFromHierarchy(BBLoop); + if (currentLoop == BBLoop) { + currentLoop = 0; + redoLoop = false; + } + } + } + + // Remove the block from the loop info, which removes it from any loops it + // was in. + LI->removeBlock(BB); + + + // Remove phi node entries in successors for this block. + TerminatorInst *TI = BB->getTerminator(); + SmallVector<BasicBlock*, 4> Succs; + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { + Succs.push_back(TI->getSuccessor(i)); + TI->getSuccessor(i)->removePredecessor(BB); + } + + // Unique the successors, remove anything with multiple uses. + array_pod_sort(Succs.begin(), Succs.end()); + Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end()); + + // Remove the basic block, including all of the instructions contained in it. 
+ LPM->deleteSimpleAnalysisValue(BB, L); + BB->eraseFromParent(); + // Remove successor blocks here that are not dead, so that we know we only + // have dead blocks in this list. Nondead blocks have a way of becoming dead, + // then getting removed before we revisit them, which is badness. + // + for (unsigned i = 0; i != Succs.size(); ++i) + if (pred_begin(Succs[i]) != pred_end(Succs[i])) { + // One exception is loop headers. If this block was the preheader for a + // loop, then we DO want to visit the loop so the loop gets deleted. + // We know that if the successor is a loop header, that this loop had to + // be the preheader: the case where this was the latch block was handled + // above and headers can only have two predecessors. + if (!LI->isLoopHeader(Succs[i])) { + Succs.erase(Succs.begin()+i); + --i; + } + } + + for (unsigned i = 0, e = Succs.size(); i != e; ++i) + RemoveBlockIfDead(Succs[i], Worklist, L); +} + +/// RemoveLoopFromHierarchy - We have discovered that the specified loop has +/// become unwrapped, either because the backedge was deleted, or because the +/// edge into the header was removed. If the edge into the header from the +/// latch block was removed, the loop is unwrapped but subloops are still alive, +/// so they just reparent loops. If the loops are actually dead, they will be +/// removed later. +void LoopUnswitch::RemoveLoopFromHierarchy(Loop *L) { + LPM->deleteLoopFromQueue(L); + RemoveLoopFromWorklist(L); +} + +// RewriteLoopBodyWithConditionConstant - We know either that the value LIC has +// the value specified by Val in the specified loop, or we know it does NOT have +// that value. Rewrite any uses of LIC or of properties correlated to it. +void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, + Constant *Val, + bool IsEqual) { + assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?"); + + // FIXME: Support correlated properties, like: + // for (...) + // if (li1 < li2) + // ... + // if (li1 > li2) + // ... + + // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches, + // selects, switches. + std::vector<Instruction*> Worklist; + LLVMContext &Context = Val->getContext(); + + + // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC + // in the loop with the appropriate one directly. + if (IsEqual || (isa<ConstantInt>(Val) && + Val->getType()->isIntegerTy(1))) { + Value *Replacement; + if (IsEqual) + Replacement = Val; + else + Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()), + !cast<ConstantInt>(Val)->getZExtValue()); + + for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end(); + UI != E; ++UI) { + Instruction *U = dyn_cast<Instruction>(*UI); + if (!U || !L->contains(U)) + continue; + Worklist.push_back(U); + } + + for (std::vector<Instruction*>::iterator UI = Worklist.begin(); + UI != Worklist.end(); ++UI) + (*UI)->replaceUsesOfWith(LIC, Replacement); + + SimplifyCode(Worklist, L); + return; + } + + // Otherwise, we don't know the precise value of LIC, but we do know that it + // is certainly NOT "Val". As such, simplify any uses in the loop that we + // can. This case occurs when we unswitch switch statements. + for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end(); + UI != E; ++UI) { + Instruction *U = dyn_cast<Instruction>(*UI); + if (!U || !L->contains(U)) + continue; + + Worklist.push_back(U); + + // TODO: We could do other simplifications, for example, turning + // 'icmp eq LIC, Val' -> false. 
+ + // If we know that LIC is not Val, use this info to simplify code. + SwitchInst *SI = dyn_cast<SwitchInst>(U); + if (SI == 0 || !isa<ConstantInt>(Val)) continue; + + SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val)); + // Default case is live for multiple values. + if (DeadCase == SI->case_default()) continue; + + // Found a dead case value. Don't remove PHI nodes in the + // successor if they become single-entry, those PHI nodes may + // be in the Users list. + + BasicBlock *Switch = SI->getParent(); + BasicBlock *SISucc = DeadCase.getCaseSuccessor(); + BasicBlock *Latch = L->getLoopLatch(); + + BranchesInfo.setUnswitched(SI, Val); + + if (!SI->findCaseDest(SISucc)) continue; // Edge is critical. + // If the DeadCase successor dominates the loop latch, then the + // transformation isn't safe since it will delete the sole predecessor edge + // to the latch. + if (Latch && DT->dominates(SISucc, Latch)) + continue; + + // FIXME: This is a hack. We need to keep the successor around + // and hooked up so as to preserve the loop structure, because + // trying to update it is complicated. So instead we preserve the + // loop structure and put the block on a dead code path. + SplitEdge(Switch, SISucc, this); + // Compute the successors instead of relying on the return value + // of SplitEdge, since it may have split the switch successor + // after PHI nodes. + BasicBlock *NewSISucc = DeadCase.getCaseSuccessor(); + BasicBlock *OldSISucc = *succ_begin(NewSISucc); + // Create an "unreachable" destination. + BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable", + Switch->getParent(), + OldSISucc); + new UnreachableInst(Context, Abort); + // Force the new case destination to branch to the "unreachable" + // block while maintaining a (dead) CFG edge to the old block. + NewSISucc->getTerminator()->eraseFromParent(); + BranchInst::Create(Abort, OldSISucc, + ConstantInt::getTrue(Context), NewSISucc); + // Release the PHI operands for this edge. + for (BasicBlock::iterator II = NewSISucc->begin(); + PHINode *PN = dyn_cast<PHINode>(II); ++II) + PN->setIncomingValue(PN->getBasicBlockIndex(Switch), + UndefValue::get(PN->getType())); + // Tell the domtree about the new block. We don't fully update the + // domtree here -- instead we force it to do a full recomputation + // after the pass is complete -- but we do need to inform it of + // new blocks. + if (DT) + DT->addNewBlock(Abort, NewSISucc); + } + + SimplifyCode(Worklist, L); +} + +/// SimplifyCode - Okay, now that we have simplified some instructions in the +/// loop, walk over it and constant prop, dce, and fold control flow where +/// possible. Note that this is effectively a very simple loop-structure-aware +/// optimizer. During processing of this loop, L could very well be deleted, so +/// it must not be used. +/// +/// FIXME: When the loop optimizer is more mature, separate this out to a new +/// pass. +/// +void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { + while (!Worklist.empty()) { + Instruction *I = Worklist.back(); + Worklist.pop_back(); + + // Simple DCE. + if (isInstructionTriviallyDead(I)) { + DEBUG(dbgs() << "Remove dead instruction '" << *I); + + // Add uses to the worklist, which may be dead now. 
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) + if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i))) + Worklist.push_back(Use); + LPM->deleteSimpleAnalysisValue(I, L); + RemoveFromWorklist(I, Worklist); + I->eraseFromParent(); + ++NumSimplify; + continue; + } + + // See if instruction simplification can hack this up. This is common for + // things like "select false, X, Y" after unswitching made the condition be + // 'false'. TODO: update the domtree properly so we can pass it here. + if (Value *V = SimplifyInstruction(I)) + if (LI->replacementPreservesLCSSAForm(I, V)) { + ReplaceUsesOfWith(I, V, Worklist, L, LPM); + continue; + } + + // Special case hacks that appear commonly in unswitched code. + if (BranchInst *BI = dyn_cast<BranchInst>(I)) { + if (BI->isUnconditional()) { + // If BI's parent is the only pred of the successor, fold the two blocks + // together. + BasicBlock *Pred = BI->getParent(); + BasicBlock *Succ = BI->getSuccessor(0); + BasicBlock *SinglePred = Succ->getSinglePredecessor(); + if (!SinglePred) continue; // Nothing to do. + assert(SinglePred == Pred && "CFG broken"); + + DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- " + << Succ->getName() << "\n"); + + // Resolve any single entry PHI nodes in Succ. + while (PHINode *PN = dyn_cast<PHINode>(Succ->begin())) + ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM); + + // If Succ has any successors with PHI nodes, update them to have + // entries coming from Pred instead of Succ. + Succ->replaceAllUsesWith(Pred); + + // Move all of the successor contents from Succ to Pred. + Pred->getInstList().splice(BI, Succ->getInstList(), Succ->begin(), + Succ->end()); + LPM->deleteSimpleAnalysisValue(BI, L); + BI->eraseFromParent(); + RemoveFromWorklist(BI, Worklist); + + // Remove Succ from the loop tree. + LI->removeBlock(Succ); + LPM->deleteSimpleAnalysisValue(Succ, L); + Succ->eraseFromParent(); + ++NumSimplify; + continue; + } + + if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){ + // Conditional branch. Turn it into an unconditional branch, then + // remove dead blocks. + continue; // FIXME: Enable. + + DEBUG(dbgs() << "Folded branch: " << *BI); + BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue()); + BasicBlock *LiveSucc = BI->getSuccessor(!CB->getZExtValue()); + DeadSucc->removePredecessor(BI->getParent(), true); + Worklist.push_back(BranchInst::Create(LiveSucc, BI)); + LPM->deleteSimpleAnalysisValue(BI, L); + BI->eraseFromParent(); + RemoveFromWorklist(BI, Worklist); + ++NumSimplify; + + RemoveBlockIfDead(DeadSucc, Worklist, L); + } + continue; + } + } +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp new file mode 100644 index 0000000..7419a65 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp @@ -0,0 +1,142 @@ +//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass lowers atomic intrinsics to non-atomic form for use in a known +// non-preemptible environment. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loweratomic" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Pass.h" +using namespace llvm; + +static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { + IRBuilder<> Builder(CXI->getParent(), CXI); + Value *Ptr = CXI->getPointerOperand(); + Value *Cmp = CXI->getCompareOperand(); + Value *Val = CXI->getNewValOperand(); + + LoadInst *Orig = Builder.CreateLoad(Ptr); + Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); + Value *Res = Builder.CreateSelect(Equal, Val, Orig); + Builder.CreateStore(Res, Ptr); + + CXI->replaceAllUsesWith(Orig); + CXI->eraseFromParent(); + return true; +} + +static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) { + IRBuilder<> Builder(RMWI->getParent(), RMWI); + Value *Ptr = RMWI->getPointerOperand(); + Value *Val = RMWI->getValOperand(); + + LoadInst *Orig = Builder.CreateLoad(Ptr); + Value *Res = NULL; + + switch (RMWI->getOperation()) { + default: llvm_unreachable("Unexpected RMW operation"); + case AtomicRMWInst::Xchg: + Res = Val; + break; + case AtomicRMWInst::Add: + Res = Builder.CreateAdd(Orig, Val); + break; + case AtomicRMWInst::Sub: + Res = Builder.CreateSub(Orig, Val); + break; + case AtomicRMWInst::And: + Res = Builder.CreateAnd(Orig, Val); + break; + case AtomicRMWInst::Nand: + Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val)); + break; + case AtomicRMWInst::Or: + Res = Builder.CreateOr(Orig, Val); + break; + case AtomicRMWInst::Xor: + Res = Builder.CreateXor(Orig, Val); + break; + case AtomicRMWInst::Max: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val), + Val, Orig); + break; + case AtomicRMWInst::Min: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val), + Orig, Val); + break; + case AtomicRMWInst::UMax: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val), + Val, Orig); + break; + case AtomicRMWInst::UMin: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val), + Orig, Val); + break; + } + Builder.CreateStore(Res, Ptr); + RMWI->replaceAllUsesWith(Orig); + RMWI->eraseFromParent(); + return true; +} + +static bool LowerFenceInst(FenceInst *FI) { + FI->eraseFromParent(); + return true; +} + +static bool LowerLoadInst(LoadInst *LI) { + LI->setAtomic(NotAtomic); + return true; +} + +static bool LowerStoreInst(StoreInst *SI) { + SI->setAtomic(NotAtomic); + return true; +} + +namespace { + struct LowerAtomic : public BasicBlockPass { + static char ID; + LowerAtomic() : BasicBlockPass(ID) { + initializeLowerAtomicPass(*PassRegistry::getPassRegistry()); + } + bool runOnBasicBlock(BasicBlock &BB) { + bool Changed = false; + for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { + Instruction *Inst = DI++; + if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) + Changed |= LowerFenceInst(FI); + else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst)) + Changed |= LowerAtomicCmpXchgInst(CXI); + else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst)) + Changed |= LowerAtomicRMWInst(RMWI); + else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + if (LI->isAtomic()) + LowerLoadInst(LI); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (SI->isAtomic()) + LowerStoreInst(SI); + } + } + return Changed; + } + }; +} + +char LowerAtomic::ID = 0; +INITIALIZE_PASS(LowerAtomic, "loweratomic", + "Lower atomic intrinsics to non-atomic form", + false, false) + +Pass 
*llvm::createLowerAtomicPass() { return new LowerAtomic(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp new file mode 100644 index 0000000..2a5ee33 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -0,0 +1,994 @@ +//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass performs various transformations related to eliminating memcpy +// calls, or transforming sets of stores into memset's. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "memcpyopt" +#include "llvm/Transforms/Scalar.h" +#include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" +#include <list> +using namespace llvm; + +STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted"); +STATISTIC(NumMemSetInfer, "Number of memsets inferred"); +STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); +STATISTIC(NumCpyToSet, "Number of memcpys converted to memset"); + +static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, + bool &VariableIdxFound, const TargetData &TD){ + // Skip over the first indices. + gep_type_iterator GTI = gep_type_begin(GEP); + for (unsigned i = 1; i != Idx; ++i, ++GTI) + /*skip along*/; + + // Compute the offset implied by the rest of the indices. + int64_t Offset = 0; + for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { + ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i)); + if (OpC == 0) + return VariableIdxFound = true; + if (OpC->isZero()) continue; // No offset. + + // Handle struct indices, which add their field offset to the pointer. + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); + continue; + } + + // Otherwise, we have a sequential type like an array or vector. Multiply + // the index by the ElementSize. + uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()); + Offset += Size*OpC->getSExtValue(); + } + + return Offset; +} + +/// IsPointerOffset - Return true if Ptr1 is provably equal to Ptr2 plus a +/// constant offset, and return that constant offset. For example, Ptr1 might +/// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8. 
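Editorial sketch of the arithmetic in GetOffsetFromIndex above; the struct and its layout are assumptions for illustration, and the exact numbers come from the target's TargetData.

struct S { int a; double b; };   // on a typical 64-bit target: sizeof(S) == 16, field b at byte offset 8

// A GEP equivalent to &P[3].b accumulates
//     3 * 16   (sequential index scaled by the element's alloc size)
//   +      8   (struct field index adds the field's layout offset)
//   = 56 bytes,
// mirroring the two cases in the loop above. Any non-constant index makes the
// helper give up by setting VariableIdxFound.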
+static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, + const TargetData &TD) { + Ptr1 = Ptr1->stripPointerCasts(); + Ptr2 = Ptr2->stripPointerCasts(); + GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1); + GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2); + + bool VariableIdxFound = false; + + // If one pointer is a GEP and the other isn't, then see if the GEP is a + // constant offset from the base, as in "P" and "gep P, 1". + if (GEP1 && GEP2 == 0 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { + Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, TD); + return !VariableIdxFound; + } + + if (GEP2 && GEP1 == 0 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { + Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD); + return !VariableIdxFound; + } + + // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical + // base. After that base, they may have some number of common (and + // potentially variable) indices. After that they handle some constant + // offset, which determines their offset from each other. At this point, we + // handle no other case. + if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0)) + return false; + + // Skip any common indices and track the GEP types. + unsigned Idx = 1; + for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx) + if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx)) + break; + + int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD); + int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD); + if (VariableIdxFound) return false; + + Offset = Offset2-Offset1; + return true; +} + + +/// MemsetRange - Represents a range of memset'd bytes with the ByteVal value. +/// This allows us to analyze stores like: +/// store 0 -> P+1 +/// store 0 -> P+0 +/// store 0 -> P+3 +/// store 0 -> P+2 +/// which sometimes happens with stores to arrays of structs etc. When we see +/// the first store, we make a range [1, 2). The second store extends the range +/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the +/// two ranges into [0, 3) which is memset'able. +namespace { +struct MemsetRange { + // Start/End - A semi range that describes the span that this range covers. + // The range is closed at the start and open at the end: [Start, End). + int64_t Start, End; + + /// StartPtr - The getelementptr instruction that points to the start of the + /// range. + Value *StartPtr; + + /// Alignment - The known alignment of the first store. + unsigned Alignment; + + /// TheStores - The actual stores that make up this range. + SmallVector<Instruction*, 16> TheStores; + + bool isProfitableToUseMemset(const TargetData &TD) const; + +}; +} // end anon namespace + +bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { + // If we found more than 4 stores to merge or 16 bytes, use memset. + if (TheStores.size() >= 4 || End-Start >= 16) return true; + + // If there is nothing to merge, don't do anything. + if (TheStores.size() < 2) return false; + + // If any of the stores are a memset, then it is always good to extend the + // memset. + for (unsigned i = 0, e = TheStores.size(); i != e; ++i) + if (!isa<StoreInst>(TheStores[i])) + return true; + + // Assume that the code generator is capable of merging pairs of stores + // together if it wants to. + if (TheStores.size() == 2) return false; + + // If we have fewer than 8 stores, it can still be worthwhile to do this. 
+ // For example, merging 4 i8 stores into an i32 store is useful almost always. + // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the + // memset will be split into 2 32-bit stores anyway) and doing so can + // pessimize the llvm optimizer. + // + // Since we don't have perfect knowledge here, make some assumptions: assume + // the maximum GPR width is the same size as the pointer size and assume that + // this width can be stored. If so, check to see whether we will end up + // actually reducing the number of stores used. + unsigned Bytes = unsigned(End-Start); + unsigned NumPointerStores = Bytes/TD.getPointerSize(); + + // Assume the remaining bytes if any are done a byte at a time. + unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize(); + + // If we will reduce the # stores (according to this heuristic), do the + // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 + // etc. + return TheStores.size() > NumPointerStores+NumByteStores; +} + + +namespace { +class MemsetRanges { + /// Ranges - A sorted list of the memset ranges. We use std::list here + /// because each element is relatively large and expensive to copy. + std::list<MemsetRange> Ranges; + typedef std::list<MemsetRange>::iterator range_iterator; + const TargetData &TD; +public: + MemsetRanges(const TargetData &td) : TD(td) {} + + typedef std::list<MemsetRange>::const_iterator const_iterator; + const_iterator begin() const { return Ranges.begin(); } + const_iterator end() const { return Ranges.end(); } + bool empty() const { return Ranges.empty(); } + + void addInst(int64_t OffsetFromFirst, Instruction *Inst) { + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + addStore(OffsetFromFirst, SI); + else + addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst)); + } + + void addStore(int64_t OffsetFromFirst, StoreInst *SI) { + int64_t StoreSize = TD.getTypeStoreSize(SI->getOperand(0)->getType()); + + addRange(OffsetFromFirst, StoreSize, + SI->getPointerOperand(), SI->getAlignment(), SI); + } + + void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) { + int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue(); + addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI); + } + + void addRange(int64_t Start, int64_t Size, Value *Ptr, + unsigned Alignment, Instruction *Inst); + +}; + +} // end anon namespace + + +/// addRange - Add a new store to the MemsetRanges data structure. This adds a +/// new range for the specified store at the specified offset, merging into +/// existing ranges as appropriate. +/// +/// Do a linear search of the ranges to see if this can be joined and/or to +/// find the insertion point in the list. We keep the ranges sorted for +/// simplicity here. This is a linear search of a linked list, which is ugly, +/// however the number of ranges is limited, so this won't get crazy slow. +void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, + unsigned Alignment, Instruction *Inst) { + int64_t End = Start+Size; + range_iterator I = Ranges.begin(), E = Ranges.end(); + + while (I != E && Start > I->End) + ++I; + + // We now know that I == E, in which case we didn't find anything to merge + // with, or that Start <= I->End. If End < I->Start or I == E, then we need + // to insert a new range. Handle this now. 
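// (Editorial worked example.) Suppose Ranges currently holds [0,2) and [4,6)
// and addRange is called with Start=2, Size=2, i.e. the span [2,4). The scan
// above stops at [0,2) because 2 > 2 is false; the disjointness test below
// fails because 4 < 0 is false; so the store is attached to [0,2), that range
// grows to [0,4), and the merge loop at the end folds [4,6) in, leaving the
// single memset-able range [0,6).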
+ if (I == E || End < I->Start) { + MemsetRange &R = *Ranges.insert(I, MemsetRange()); + R.Start = Start; + R.End = End; + R.StartPtr = Ptr; + R.Alignment = Alignment; + R.TheStores.push_back(Inst); + return; + } + + // This store overlaps with I, add it. + I->TheStores.push_back(Inst); + + // At this point, we may have an interval that completely contains our store. + // If so, just add it to the interval and return. + if (I->Start <= Start && I->End >= End) + return; + + // Now we know that Start <= I->End and End >= I->Start so the range overlaps + // but is not entirely contained within the range. + + // See if the range extends the start of the range. In this case, it couldn't + // possibly cause it to join the prior range, because otherwise we would have + // stopped on *it*. + if (Start < I->Start) { + I->Start = Start; + I->StartPtr = Ptr; + I->Alignment = Alignment; + } + + // Now we know that Start <= I->End and Start >= I->Start (so the startpoint + // is in or right at the end of I), and that End >= I->Start. Extend I out to + // End. + if (End > I->End) { + I->End = End; + range_iterator NextI = I; + while (++NextI != E && End >= NextI->Start) { + // Merge the range in. + I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end()); + if (NextI->End > I->End) + I->End = NextI->End; + Ranges.erase(NextI); + NextI = I; + } + } +} + +//===----------------------------------------------------------------------===// +// MemCpyOpt Pass +//===----------------------------------------------------------------------===// + +namespace { + class MemCpyOpt : public FunctionPass { + MemoryDependenceAnalysis *MD; + TargetLibraryInfo *TLI; + const TargetData *TD; + public: + static char ID; // Pass identification, replacement for typeid + MemCpyOpt() : FunctionPass(ID) { + initializeMemCpyOptPass(*PassRegistry::getPassRegistry()); + MD = 0; + TLI = 0; + TD = 0; + } + + bool runOnFunction(Function &F); + + private: + // This transformation requires dominator postdominator info + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<DominatorTree>(); + AU.addRequired<MemoryDependenceAnalysis>(); + AU.addRequired<AliasAnalysis>(); + AU.addRequired<TargetLibraryInfo>(); + AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<MemoryDependenceAnalysis>(); + } + + // Helper fuctions + bool processStore(StoreInst *SI, BasicBlock::iterator &BBI); + bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI); + bool processMemCpy(MemCpyInst *M); + bool processMemMove(MemMoveInst *M); + bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc, + uint64_t cpyLen, CallInst *C); + bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, + uint64_t MSize); + bool processByValArgument(CallSite CS, unsigned ArgNo); + Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr, + Value *ByteVal); + + bool iterateOnFunction(Function &F); + }; + + char MemCpyOpt::ID = 0; +} + +// createMemCpyOptPass - The public interface to this file... 
+FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } + +INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", + false, false) + +/// tryMergingIntoMemset - When scanning forward over instructions, we look for +/// some other patterns to fold away. In particular, this looks for stores to +/// neighboring locations of memory. If it sees enough consecutive ones, it +/// attempts to merge them together into a memcpy/memset. +Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, + Value *StartPtr, Value *ByteVal) { + if (TD == 0) return 0; + + // Okay, so we now have a single store that can be splatable. Scan to find + // all subsequent stores of the same value to offset from the same pointer. + // Join these together into ranges, so we can decide whether contiguous blocks + // are stored. + MemsetRanges Ranges(*TD); + + BasicBlock::iterator BI = StartInst; + for (++BI; !isa<TerminatorInst>(BI); ++BI) { + if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) { + // If the instruction is readnone, ignore it, otherwise bail out. We + // don't even allow readonly here because we don't want something like: + // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A). + if (BI->mayWriteToMemory() || BI->mayReadFromMemory()) + break; + continue; + } + + if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) { + // If this is a store, see if we can merge it in. + if (!NextStore->isSimple()) break; + + // Check to see if this stored value is of the same byte-splattable value. + if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) + break; + + // Check to see if this store is to a constant offset from the start ptr. + int64_t Offset; + if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), + Offset, *TD)) + break; + + Ranges.addStore(Offset, NextStore); + } else { + MemSetInst *MSI = cast<MemSetInst>(BI); + + if (MSI->isVolatile() || ByteVal != MSI->getValue() || + !isa<ConstantInt>(MSI->getLength())) + break; + + // Check to see if this store is to a constant offset from the start ptr. + int64_t Offset; + if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *TD)) + break; + + Ranges.addMemSet(Offset, MSI); + } + } + + // If we have no ranges, then we just had a single store with nothing that + // could be merged in. This is a very common case of course. + if (Ranges.empty()) + return 0; + + // If we had at least one store that could be merged in, add the starting + // store as well. We try to avoid this unless there is at least something + // interesting as a small compile-time optimization. + Ranges.addInst(0, StartInst); + + // If we create any memsets, we put it right before the first instruction that + // isn't part of the memset block. This ensure that the memset is dominated + // by any addressing instruction needed by the start of the block. + IRBuilder<> Builder(BI); + + // Now that we have full information about ranges, loop over the ranges and + // emit memset's for anything big enough to be worthwhile. + Instruction *AMemSet = 0; + for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); + I != E; ++I) { + const MemsetRange &Range = *I; + + if (Range.TheStores.size() == 1) continue; + + // If it is profitable to lower this range to memset, do so now. 
+ if (!Range.isProfitableToUseMemset(*TD)) + continue; + + // Otherwise, we do want to transform this! Create a new memset. + // Get the starting pointer of the block. + StartPtr = Range.StartPtr; + + // Determine alignment + unsigned Alignment = Range.Alignment; + if (Alignment == 0) { + Type *EltType = + cast<PointerType>(StartPtr->getType())->getElementType(); + Alignment = TD->getABITypeAlignment(EltType); + } + + AMemSet = + Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); + + DEBUG(dbgs() << "Replace stores:\n"; + for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) + dbgs() << *Range.TheStores[i] << '\n'; + dbgs() << "With: " << *AMemSet << '\n'); + + if (!Range.TheStores.empty()) + AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); + + // Zap all the stores. + for (SmallVector<Instruction*, 16>::const_iterator + SI = Range.TheStores.begin(), + SE = Range.TheStores.end(); SI != SE; ++SI) { + MD->removeInstruction(*SI); + (*SI)->eraseFromParent(); + } + ++NumMemSetInfer; + } + + return AMemSet; +} + + +bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { + if (!SI->isSimple()) return false; + + if (TD == 0) return false; + + // Detect cases where we're performing call slot forwarding, but + // happen to be using a load-store pair to implement it, rather than + // a memcpy. + if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { + if (LI->isSimple() && LI->hasOneUse() && + LI->getParent() == SI->getParent()) { + MemDepResult ldep = MD->getDependency(LI); + CallInst *C = 0; + if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst())) + C = dyn_cast<CallInst>(ldep.getInst()); + + if (C) { + // Check that nothing touches the dest of the "copy" between + // the call and the store. + AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis::Location StoreLoc = AA.getLocation(SI); + for (BasicBlock::iterator I = --BasicBlock::iterator(SI), + E = C; I != E; --I) { + if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) { + C = 0; + break; + } + } + } + + if (C) { + bool changed = performCallSlotOptzn(LI, + SI->getPointerOperand()->stripPointerCasts(), + LI->getPointerOperand()->stripPointerCasts(), + TD->getTypeStoreSize(SI->getOperand(0)->getType()), C); + if (changed) { + MD->removeInstruction(SI); + SI->eraseFromParent(); + MD->removeInstruction(LI); + LI->eraseFromParent(); + ++NumMemCpyInstr; + return true; + } + } + } + } + + // There are two cases that are interesting for this code to handle: memcpy + // and memset. Right now we only handle memset. + + // Ensure that the value being stored is something that can be memset'able a + // byte at a time like "0" or "-1" or any width, as well as things like + // 0xA0A0A0A0 and 0.0. + if (Value *ByteVal = isBytewiseValue(SI->getOperand(0))) + if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), + ByteVal)) { + BBI = I; // Don't invalidate iterator. + return true; + } + + return false; +} + +bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { + // See if there is another memset or store neighboring this memset which + // allows us to widen out the memset to do a single larger store. + if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile()) + if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(), + MSI->getValue())) { + BBI = I; // Don't invalidate iterator. 
+ return true; + } + return false; +} + + +/// performCallSlotOptzn - takes a memcpy and a call that it depends on, +/// and checks for the possibility of a call slot optimization by having +/// the call write its result directly into the destination of the memcpy. +bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, + Value *cpyDest, Value *cpySrc, + uint64_t cpyLen, CallInst *C) { + // The general transformation to keep in mind is + // + // call @func(..., src, ...) + // memcpy(dest, src, ...) + // + // -> + // + // memcpy(dest, src, ...) + // call @func(..., dest, ...) + // + // Since moving the memcpy is technically awkward, we additionally check that + // src only holds uninitialized values at the moment of the call, meaning that + // the memcpy can be discarded rather than moved. + + // Deliberately get the source and destination with bitcasts stripped away, + // because we'll need to do type comparisons based on the underlying type. + CallSite CS(C); + + // Require that src be an alloca. This simplifies the reasoning considerably. + AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc); + if (!srcAlloca) + return false; + + // Check that all of src is copied to dest. + if (TD == 0) return false; + + ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize()); + if (!srcArraySize) + return false; + + uint64_t srcSize = TD->getTypeAllocSize(srcAlloca->getAllocatedType()) * + srcArraySize->getZExtValue(); + + if (cpyLen < srcSize) + return false; + + // Check that accessing the first srcSize bytes of dest will not cause a + // trap. Otherwise the transform is invalid since it might cause a trap + // to occur earlier than it otherwise would. + if (AllocaInst *A = dyn_cast<AllocaInst>(cpyDest)) { + // The destination is an alloca. Check it is larger than srcSize. + ConstantInt *destArraySize = dyn_cast<ConstantInt>(A->getArraySize()); + if (!destArraySize) + return false; + + uint64_t destSize = TD->getTypeAllocSize(A->getAllocatedType()) * + destArraySize->getZExtValue(); + + if (destSize < srcSize) + return false; + } else if (Argument *A = dyn_cast<Argument>(cpyDest)) { + // If the destination is an sret parameter then only accesses that are + // outside of the returned struct type can trap. + if (!A->hasStructRetAttr()) + return false; + + Type *StructTy = cast<PointerType>(A->getType())->getElementType(); + uint64_t destSize = TD->getTypeAllocSize(StructTy); + + if (destSize < srcSize) + return false; + } else { + return false; + } + + // Check that src is not accessed except via the call and the memcpy. This + // guarantees that it holds only undefined values when passed in (so the final + // memcpy can be dropped), that it is not read or written between the call and + // the memcpy, and that writing beyond the end of it is undefined. + SmallVector<User*, 8> srcUseList(srcAlloca->use_begin(), + srcAlloca->use_end()); + while (!srcUseList.empty()) { + User *UI = srcUseList.pop_back_val(); + + if (isa<BitCastInst>(UI)) { + for (User::use_iterator I = UI->use_begin(), E = UI->use_end(); + I != E; ++I) + srcUseList.push_back(*I); + } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(UI)) { + if (G->hasAllZeroIndices()) + for (User::use_iterator I = UI->use_begin(), E = UI->use_end(); + I != E; ++I) + srcUseList.push_back(*I); + else + return false; + } else if (UI != C && UI != cpy) { + return false; + } + } + + // Since we're changing the parameter to the callsite, we need to make sure + // that what would be the new parameter dominates the callsite. 
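// (Editorial illustration; names invented.) Concretely, in IR shaped like
//   %src = alloca %T
//   call void @init(%T* %src)        ; the call we want to retarget
//   %dst = getelementptr ...         ; destination only computed after the call
//   memcpy(%dst, %src, N)
// %dst does not dominate the call, so @init cannot be rewritten to store
// directly into it; the dominance query below rejects this case.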
+ DominatorTree &DT = getAnalysis<DominatorTree>(); + if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest)) + if (!DT.dominates(cpyDestInst, C)) + return false; + + // In addition to knowing that the call does not access src in some + // unexpected manner, for example via a global, which we deduce from + // the use analysis, we also need to know that it does not sneakily + // access dest. We rely on AA to figure this out for us. + AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis::ModRefResult MR = AA.getModRefInfo(C, cpyDest, srcSize); + // If necessary, perform additional analysis. + if (MR != AliasAnalysis::NoModRef) + MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT); + if (MR != AliasAnalysis::NoModRef) + return false; + + // All the checks have passed, so do the transformation. + bool changedArgument = false; + for (unsigned i = 0; i < CS.arg_size(); ++i) + if (CS.getArgument(i)->stripPointerCasts() == cpySrc) { + if (cpySrc->getType() != cpyDest->getType()) + cpyDest = CastInst::CreatePointerCast(cpyDest, cpySrc->getType(), + cpyDest->getName(), C); + changedArgument = true; + if (CS.getArgument(i)->getType() == cpyDest->getType()) + CS.setArgument(i, cpyDest); + else + CS.setArgument(i, CastInst::CreatePointerCast(cpyDest, + CS.getArgument(i)->getType(), cpyDest->getName(), C)); + } + + if (!changedArgument) + return false; + + // Drop any cached information about the call, because we may have changed + // its dependence information by changing its parameter. + MD->removeInstruction(C); + + // Remove the memcpy. + MD->removeInstruction(cpy); + ++NumMemCpyInstr; + + return true; +} + +/// processMemCpyMemCpyDependence - We've found that the (upward scanning) +/// memory dependence of memcpy 'M' is the memcpy 'MDep'. Try to simplify M to +/// copy from MDep's input if we can. MSize is the size of M's copy. +/// +bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, + uint64_t MSize) { + // We can only transforms memcpy's where the dest of one is the source of the + // other. + if (M->getSource() != MDep->getDest() || MDep->isVolatile()) + return false; + + // If dep instruction is reading from our current input, then it is a noop + // transfer and substituting the input won't change this instruction. Just + // ignore the input and let someone else zap MDep. This handles cases like: + // memcpy(a <- a) + // memcpy(b <- a) + if (M->getSource() == MDep->getSource()) + return false; + + // Second, the length of the memcpy's must be the same, or the preceding one + // must be larger than the following one. + ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength()); + ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength()); + if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) + return false; + + AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + + // Verify that the copied-from memory doesn't change in between the two + // transfers. For example, in: + // memcpy(a <- b) + // *b = 42; + // memcpy(c <- a) + // It would be invalid to transform the second memcpy into memcpy(c <- b). + // + // TODO: If the code between M and MDep is transparent to the destination "c", + // then we could still perform the xform by moving M up to the first memcpy. + // + // NOTE: This is conservative, it will stop on any read from the source loc, + // not just the defining memcpy. 
+ MemDepResult SourceDep = + MD->getPointerDependencyFrom(AA.getLocationForSource(MDep), + false, M, M->getParent()); + if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) + return false; + + // If the dest of the second might alias the source of the first, then the + // source and dest might overlap. We still want to eliminate the intermediate + // value, but we have to generate a memmove instead of memcpy. + bool UseMemMove = false; + if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(MDep))) + UseMemMove = true; + + // If all checks passed, then we can transform M. + + // Make sure to use the lesser of the alignment of the source and the dest + // since we're changing where we're reading from, but don't want to increase + // the alignment past what can be read from or written to. + // TODO: Is this worth it if we're creating a less aligned memcpy? For + // example we could be moving from movaps -> movq on x86. + unsigned Align = std::min(MDep->getAlignment(), M->getAlignment()); + + IRBuilder<> Builder(M); + if (UseMemMove) + Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(), + Align, M->isVolatile()); + else + Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(), + Align, M->isVolatile()); + + // Remove the instruction we're replacing. + MD->removeInstruction(M); + M->eraseFromParent(); + ++NumMemCpyInstr; + return true; +} + + +/// processMemCpy - perform simplification of memcpy's. If we have memcpy A +/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite +/// B to be a memcpy from X to Z (or potentially a memmove, depending on +/// circumstances). This allows later passes to remove the first memcpy +/// altogether. +bool MemCpyOpt::processMemCpy(MemCpyInst *M) { + // We can only optimize statically-sized memcpy's that are non-volatile. + ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength()); + if (CopySize == 0 || M->isVolatile()) return false; + + // If the source and destination of the memcpy are the same, then zap it. + if (M->getSource() == M->getDest()) { + MD->removeInstruction(M); + M->eraseFromParent(); + return false; + } + + // If copying from a constant, try to turn the memcpy into a memset. + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource())) + if (GV->isConstant() && GV->hasDefinitiveInitializer()) + if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) { + IRBuilder<> Builder(M); + Builder.CreateMemSet(M->getRawDest(), ByteVal, CopySize, + M->getAlignment(), false); + MD->removeInstruction(M); + M->eraseFromParent(); + ++NumCpyToSet; + return true; + } + + // The are two possible optimizations we can do for memcpy: + // a) memcpy-memcpy xform which exposes redundance for DSE. + // b) call-memcpy xform for return slot optimization. 
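// (Editorial illustration of case (a); names invented.) Given
//   memcpy(B, A, 64);   ; MDep
//   memcpy(C, B, 64);   ; M
// processMemCpyMemCpyDependence rewrites M to copy from A instead of B:
//   memcpy(B, A, 64);
//   memcpy(C, A, 64);   ; the copy into B may now be dead-store-eliminated
// provided A is not modified between the two copies, MDep's length covers M's,
// and a memmove is emitted instead if C may alias A.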
+ MemDepResult DepInfo = MD->getDependency(M); + if (DepInfo.isClobber()) { + if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { + if (performCallSlotOptzn(M, M->getDest(), M->getSource(), + CopySize->getZExtValue(), C)) { + MD->removeInstruction(M); + M->eraseFromParent(); + return true; + } + } + } + + AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M); + MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true, + M, M->getParent()); + if (SrcDepInfo.isClobber()) { + if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) + return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); + } + + return false; +} + +/// processMemMove - Transforms memmove calls to memcpy calls when the src/dst +/// are guaranteed not to alias. +bool MemCpyOpt::processMemMove(MemMoveInst *M) { + AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + + if (!TLI->has(LibFunc::memmove)) + return false; + + // See if the pointers alias. + if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(M))) + return false; + + DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n"); + + // If not, then we know we can transform this. + Module *Mod = M->getParent()->getParent()->getParent(); + Type *ArgTys[3] = { M->getRawDest()->getType(), + M->getRawSource()->getType(), + M->getLength()->getType() }; + M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, + ArgTys)); + + // MemDep may have over conservative information about this instruction, just + // conservatively flush it from the cache. + MD->removeInstruction(M); + + ++NumMoveToCpy; + return true; +} + +/// processByValArgument - This is called on every byval argument in call sites. +bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { + if (TD == 0) return false; + + // Find out what feeds this byval argument. + Value *ByValArg = CS.getArgument(ArgNo); + Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); + uint64_t ByValSize = TD->getTypeAllocSize(ByValTy); + MemDepResult DepInfo = + MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize), + true, CS.getInstruction(), + CS.getInstruction()->getParent()); + if (!DepInfo.isClobber()) + return false; + + // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by + // a memcpy, see if we can byval from the source of the memcpy instead of the + // result. + MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst()); + if (MDep == 0 || MDep->isVolatile() || + ByValArg->stripPointerCasts() != MDep->getDest()) + return false; + + // The length of the memcpy must be larger or equal to the size of the byval. + ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); + if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize) + return false; + + // Get the alignment of the byval. If the call doesn't specify the alignment, + // then it is some target specific value that we can't know. + unsigned ByValAlign = CS.getParamAlignment(ArgNo+1); + if (ByValAlign == 0) return false; + + // If it is greater than the memcpy, then we check to see if we can force the + // source of the memcpy to the alignment we need. If we fail, we bail out. + if (MDep->getAlignment() < ByValAlign && + getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign) + return false; + + // Verify that the copied-from memory doesn't change in between the memcpy and + // the byval call. 
+ // memcpy(a <- b) + // *b = 42; + // foo(*a) + // It would be invalid to transform the second memcpy into foo(*b). + // + // NOTE: This is conservative, it will stop on any read from the source loc, + // not just the defining memcpy. + MemDepResult SourceDep = + MD->getPointerDependencyFrom(AliasAnalysis::getLocationForSource(MDep), + false, CS.getInstruction(), MDep->getParent()); + if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) + return false; + + Value *TmpCast = MDep->getSource(); + if (MDep->getSource()->getType() != ByValArg->getType()) + TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(), + "tmpcast", CS.getInstruction()); + + DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n" + << " " << *MDep << "\n" + << " " << *CS.getInstruction() << "\n"); + + // Otherwise we're good! Update the byval argument. + CS.setArgument(ArgNo, TmpCast); + ++NumMemCpyInstr; + return true; +} + +/// iterateOnFunction - Executes one iteration of MemCpyOpt. +bool MemCpyOpt::iterateOnFunction(Function &F) { + bool MadeChange = false; + + // Walk all instruction in the function. + for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) { + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { + // Avoid invalidating the iterator. + Instruction *I = BI++; + + bool RepeatInstruction = false; + + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + MadeChange |= processStore(SI, BI); + else if (MemSetInst *M = dyn_cast<MemSetInst>(I)) + RepeatInstruction = processMemSet(M, BI); + else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I)) + RepeatInstruction = processMemCpy(M); + else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I)) + RepeatInstruction = processMemMove(M); + else if (CallSite CS = (Value*)I) { + for (unsigned i = 0, e = CS.arg_size(); i != e; ++i) + if (CS.isByValArgument(i)) + MadeChange |= processByValArgument(CS, i); + } + + // Reprocess the instruction if desired. + if (RepeatInstruction) { + if (BI != BB->begin()) --BI; + MadeChange = true; + } + } + } + + return MadeChange; +} + +// MemCpyOpt::runOnFunction - This is the main transformation entry point for a +// function. +// +bool MemCpyOpt::runOnFunction(Function &F) { + bool MadeChange = false; + MD = &getAnalysis<MemoryDependenceAnalysis>(); + TD = getAnalysisIfAvailable<TargetData>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + + // If we don't have at least memset and memcpy, there is little point of doing + // anything here. These are required by a freestanding implementation, so if + // even they are disabled, there is no point in trying hard. + if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy)) + return false; + + while (1) { + if (!iterateOnFunction(F)) + break; + MadeChange = true; + } + + MD = 0; + return MadeChange; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp b/contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp new file mode 100644 index 0000000..3222f20 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp @@ -0,0 +1,4200 @@ +//===- ObjCARC.cpp - ObjC ARC Optimization --------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines ObjC ARC optimizations. ARC stands for +// Automatic Reference Counting and is a system for managing reference counts +// for objects in Objective C. 
+// +// The optimizations performed include elimination of redundant, partially +// redundant, and inconsequential reference count operations, elimination of +// redundant weak pointer operations, pattern-matching and replacement of +// low-level operations into higher-level operations, and numerous minor +// simplifications. +// +// This file also defines a simple ARC-aware AliasAnalysis. +// +// WARNING: This file knows about certain library functions. It recognizes them +// by name, and hardwires knowledge of their semantics. +// +// WARNING: This file knows about how certain Objective-C library functions are +// used. Naive LLVM IR transformations which would otherwise be +// behavior-preserving may break these assumptions. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "objc-arc" +#include "llvm/Support/CommandLine.h" +#include "llvm/ADT/DenseMap.h" +using namespace llvm; + +// A handy option to enable/disable all optimizations in this file. +static cl::opt<bool> EnableARCOpts("enable-objc-arc-opts", cl::init(true)); + +//===----------------------------------------------------------------------===// +// Misc. Utilities +//===----------------------------------------------------------------------===// + +namespace { + /// MapVector - An associative container with fast insertion-order + /// (deterministic) iteration over its elements. Plus the special + /// blot operation. + template<class KeyT, class ValueT> + class MapVector { + /// Map - Map keys to indices in Vector. + typedef DenseMap<KeyT, size_t> MapTy; + MapTy Map; + + /// Vector - Keys and values. + typedef std::vector<std::pair<KeyT, ValueT> > VectorTy; + VectorTy Vector; + + public: + typedef typename VectorTy::iterator iterator; + typedef typename VectorTy::const_iterator const_iterator; + iterator begin() { return Vector.begin(); } + iterator end() { return Vector.end(); } + const_iterator begin() const { return Vector.begin(); } + const_iterator end() const { return Vector.end(); } + +#ifdef XDEBUG + ~MapVector() { + assert(Vector.size() >= Map.size()); // May differ due to blotting. 
+ for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); + I != E; ++I) { + assert(I->second < Vector.size()); + assert(Vector[I->second].first == I->first); + } + for (typename VectorTy::const_iterator I = Vector.begin(), + E = Vector.end(); I != E; ++I) + assert(!I->first || + (Map.count(I->first) && + Map[I->first] == size_t(I - Vector.begin()))); + } +#endif + + ValueT &operator[](const KeyT &Arg) { + std::pair<typename MapTy::iterator, bool> Pair = + Map.insert(std::make_pair(Arg, size_t(0))); + if (Pair.second) { + size_t Num = Vector.size(); + Pair.first->second = Num; + Vector.push_back(std::make_pair(Arg, ValueT())); + return Vector[Num].second; + } + return Vector[Pair.first->second].second; + } + + std::pair<iterator, bool> + insert(const std::pair<KeyT, ValueT> &InsertPair) { + std::pair<typename MapTy::iterator, bool> Pair = + Map.insert(std::make_pair(InsertPair.first, size_t(0))); + if (Pair.second) { + size_t Num = Vector.size(); + Pair.first->second = Num; + Vector.push_back(InsertPair); + return std::make_pair(Vector.begin() + Num, true); + } + return std::make_pair(Vector.begin() + Pair.first->second, false); + } + + const_iterator find(const KeyT &Key) const { + typename MapTy::const_iterator It = Map.find(Key); + if (It == Map.end()) return Vector.end(); + return Vector.begin() + It->second; + } + + /// blot - This is similar to erase, but instead of removing the element + /// from the vector, it just zeros out the key in the vector. This leaves + /// iterators intact, but clients must be prepared for zeroed-out keys when + /// iterating. + void blot(const KeyT &Key) { + typename MapTy::iterator It = Map.find(Key); + if (It == Map.end()) return; + Vector[It->second].first = KeyT(); + Map.erase(It); + } + + void clear() { + Map.clear(); + Vector.clear(); + } + }; +} + +//===----------------------------------------------------------------------===// +// ARC Utilities. +//===----------------------------------------------------------------------===// + +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Support/CallSite.h" +#include "llvm/ADT/StringSwitch.h" + +namespace { + /// InstructionClass - A simple classification for instructions. + enum InstructionClass { + IC_Retain, ///< objc_retain + IC_RetainRV, ///< objc_retainAutoreleasedReturnValue + IC_RetainBlock, ///< objc_retainBlock + IC_Release, ///< objc_release + IC_Autorelease, ///< objc_autorelease + IC_AutoreleaseRV, ///< objc_autoreleaseReturnValue + IC_AutoreleasepoolPush, ///< objc_autoreleasePoolPush + IC_AutoreleasepoolPop, ///< objc_autoreleasePoolPop + IC_NoopCast, ///< objc_retainedObject, etc. + IC_FusedRetainAutorelease, ///< objc_retainAutorelease + IC_FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue + IC_LoadWeakRetained, ///< objc_loadWeakRetained (primitive) + IC_StoreWeak, ///< objc_storeWeak (primitive) + IC_InitWeak, ///< objc_initWeak (derived) + IC_LoadWeak, ///< objc_loadWeak (derived) + IC_MoveWeak, ///< objc_moveWeak (derived) + IC_CopyWeak, ///< objc_copyWeak (derived) + IC_DestroyWeak, ///< objc_destroyWeak (derived) + IC_StoreStrong, ///< objc_storeStrong (derived) + IC_CallOrUser, ///< could call objc_release and/or "use" pointers + IC_Call, ///< could call objc_release + IC_User, ///< could "use" a pointer + IC_None ///< anything else + }; +} + +/// IsPotentialUse - Test whether the given value is possible a +/// reference-counted pointer. 
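A note on the MapVector defined above, since its blot() operation is the unusual part: blotting a key keeps iteration order and existing iterators valid, but leaves an entry behind whose key has been reset to KeyT(), so loops over the container must be prepared to skip such entries. A small usage sketch under that assumption (the helper name and the key/value types are made up for illustration):

    // Count the entries that are still live after blotting one key out.
    static unsigned countLiveEntries(MapVector<const Value *, unsigned> &MV,
                                     const Value *DeadKey) {
      MV.blot(DeadKey);       // the slot stays in the vector, its key is zeroed
      unsigned N = 0;
      for (MapVector<const Value *, unsigned>::const_iterator I = MV.begin(),
             E = MV.end(); I != E; ++I)
        if (I->first)         // skip blotted (zeroed-out) keys
          ++N;
      return N;
    }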
+static bool IsPotentialUse(const Value *Op) { + // Pointers to static or stack storage are not reference-counted pointers. + if (isa<Constant>(Op) || isa<AllocaInst>(Op)) + return false; + // Special arguments are not reference-counted. + if (const Argument *Arg = dyn_cast<Argument>(Op)) + if (Arg->hasByValAttr() || + Arg->hasNestAttr() || + Arg->hasStructRetAttr()) + return false; + // Only consider values with pointer types. + // It seemes intuitive to exclude function pointer types as well, since + // functions are never reference-counted, however clang occasionally + // bitcasts reference-counted pointers to function-pointer type + // temporarily. + PointerType *Ty = dyn_cast<PointerType>(Op->getType()); + if (!Ty) + return false; + // Conservatively assume anything else is a potential use. + return true; +} + +/// GetCallSiteClass - Helper for GetInstructionClass. Determines what kind +/// of construct CS is. +static InstructionClass GetCallSiteClass(ImmutableCallSite CS) { + for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); + I != E; ++I) + if (IsPotentialUse(*I)) + return CS.onlyReadsMemory() ? IC_User : IC_CallOrUser; + + return CS.onlyReadsMemory() ? IC_None : IC_Call; +} + +/// GetFunctionClass - Determine if F is one of the special known Functions. +/// If it isn't, return IC_CallOrUser. +static InstructionClass GetFunctionClass(const Function *F) { + Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); + + // No arguments. + if (AI == AE) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_autoreleasePoolPush", IC_AutoreleasepoolPush) + .Default(IC_CallOrUser); + + // One argument. + const Argument *A0 = AI++; + if (AI == AE) + // Argument is a pointer. + if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { + Type *ETy = PTy->getElementType(); + // Argument is i8*. + if (ETy->isIntegerTy(8)) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_retain", IC_Retain) + .Case("objc_retainAutoreleasedReturnValue", IC_RetainRV) + .Case("objc_retainBlock", IC_RetainBlock) + .Case("objc_release", IC_Release) + .Case("objc_autorelease", IC_Autorelease) + .Case("objc_autoreleaseReturnValue", IC_AutoreleaseRV) + .Case("objc_autoreleasePoolPop", IC_AutoreleasepoolPop) + .Case("objc_retainedObject", IC_NoopCast) + .Case("objc_unretainedObject", IC_NoopCast) + .Case("objc_unretainedPointer", IC_NoopCast) + .Case("objc_retain_autorelease", IC_FusedRetainAutorelease) + .Case("objc_retainAutorelease", IC_FusedRetainAutorelease) + .Case("objc_retainAutoreleaseReturnValue",IC_FusedRetainAutoreleaseRV) + .Default(IC_CallOrUser); + + // Argument is i8** + if (PointerType *Pte = dyn_cast<PointerType>(ETy)) + if (Pte->getElementType()->isIntegerTy(8)) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_loadWeakRetained", IC_LoadWeakRetained) + .Case("objc_loadWeak", IC_LoadWeak) + .Case("objc_destroyWeak", IC_DestroyWeak) + .Default(IC_CallOrUser); + } + + // Two arguments, first is i8**. 
+ const Argument *A1 = AI++; + if (AI == AE) + if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) + if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) + if (Pte->getElementType()->isIntegerTy(8)) + if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { + Type *ETy1 = PTy1->getElementType(); + // Second argument is i8* + if (ETy1->isIntegerTy(8)) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_storeWeak", IC_StoreWeak) + .Case("objc_initWeak", IC_InitWeak) + .Case("objc_storeStrong", IC_StoreStrong) + .Default(IC_CallOrUser); + // Second argument is i8**. + if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) + if (Pte1->getElementType()->isIntegerTy(8)) + return StringSwitch<InstructionClass>(F->getName()) + .Case("objc_moveWeak", IC_MoveWeak) + .Case("objc_copyWeak", IC_CopyWeak) + .Default(IC_CallOrUser); + } + + // Anything else. + return IC_CallOrUser; +} + +/// GetInstructionClass - Determine what kind of construct V is. +static InstructionClass GetInstructionClass(const Value *V) { + if (const Instruction *I = dyn_cast<Instruction>(V)) { + // Any instruction other than bitcast and gep with a pointer operand have a + // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer + // to a subsequent use, rather than using it themselves, in this sense. + // As a short cut, several other opcodes are known to have no pointer + // operands of interest. And ret is never followed by a release, so it's + // not interesting to examine. + switch (I->getOpcode()) { + case Instruction::Call: { + const CallInst *CI = cast<CallInst>(I); + // Check for calls to special functions. + if (const Function *F = CI->getCalledFunction()) { + InstructionClass Class = GetFunctionClass(F); + if (Class != IC_CallOrUser) + return Class; + + // None of the intrinsic functions do objc_release. For intrinsics, the + // only question is whether or not they may be users. + switch (F->getIntrinsicID()) { + case Intrinsic::returnaddress: case Intrinsic::frameaddress: + case Intrinsic::stacksave: case Intrinsic::stackrestore: + case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend: + case Intrinsic::objectsize: case Intrinsic::prefetch: + case Intrinsic::stackprotector: + case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64: + case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa: + case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext: + case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline: + case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + case Intrinsic::invariant_start: case Intrinsic::invariant_end: + // Don't let dbg info affect our results. + case Intrinsic::dbg_declare: case Intrinsic::dbg_value: + // Short cut: Some intrinsics obviously don't use ObjC pointers. 
+          return IC_None;
+        default:
+          break;
+        }
+      }
+      return GetCallSiteClass(CI);
+    }
+    case Instruction::Invoke:
+      return GetCallSiteClass(cast<InvokeInst>(I));
+    case Instruction::BitCast:
+    case Instruction::GetElementPtr:
+    case Instruction::Select: case Instruction::PHI:
+    case Instruction::Ret: case Instruction::Br:
+    case Instruction::Switch: case Instruction::IndirectBr:
+    case Instruction::Alloca: case Instruction::VAArg:
+    case Instruction::Add: case Instruction::FAdd:
+    case Instruction::Sub: case Instruction::FSub:
+    case Instruction::Mul: case Instruction::FMul:
+    case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv:
+    case Instruction::SRem: case Instruction::URem: case Instruction::FRem:
+    case Instruction::Shl: case Instruction::LShr: case Instruction::AShr:
+    case Instruction::And: case Instruction::Or: case Instruction::Xor:
+    case Instruction::SExt: case Instruction::ZExt: case Instruction::Trunc:
+    case Instruction::IntToPtr: case Instruction::FCmp:
+    case Instruction::FPTrunc: case Instruction::FPExt:
+    case Instruction::FPToUI: case Instruction::FPToSI:
+    case Instruction::UIToFP: case Instruction::SIToFP:
+    case Instruction::InsertElement: case Instruction::ExtractElement:
+    case Instruction::ShuffleVector:
+    case Instruction::ExtractValue:
+      break;
+    case Instruction::ICmp:
+      // Comparing a pointer with null, or any other constant, isn't an
+      // interesting use, because we don't care what the pointer points to, or
+      // about the values of any other dynamic reference-counted pointers.
+      if (IsPotentialUse(I->getOperand(1)))
+        return IC_User;
+      break;
+    default:
+      // For anything else, check all the operands.
+      // Note that this includes both operands of a Store: while the first
+      // operand isn't actually being dereferenced, it is being stored to
+      // memory where we can no longer track who might read it and dereference
+      // it, so we have to consider it potentially used.
+      for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end();
+           OI != OE; ++OI)
+        if (IsPotentialUse(*OI))
+          return IC_User;
+    }
+  }
+
+  // Otherwise, it's totally inert for ARC purposes.
+  return IC_None;
+}
+
+/// GetBasicInstructionClass - Determine what kind of construct V is. This is
+/// similar to GetInstructionClass except that it only detects objc runtime
+/// calls. This allows it to be faster.
+static InstructionClass GetBasicInstructionClass(const Value *V) {
+  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+    if (const Function *F = CI->getCalledFunction())
+      return GetFunctionClass(F);
+    // Otherwise, be conservative.
+    return IC_CallOrUser;
+  }
+
+  // Otherwise, be conservative.
+  return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User;
+}
+
+/// IsRetain - Test if the given class is objc_retain or
+/// equivalent.
+static bool IsRetain(InstructionClass Class) {
+  return Class == IC_Retain ||
+         Class == IC_RetainRV;
+}
+
+/// IsAutorelease - Test if the given class is objc_autorelease or
+/// equivalent.
+static bool IsAutorelease(InstructionClass Class) {
+  return Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV;
+}
+
+/// IsForwarding - Test if the given class represents instructions which return
+/// their argument verbatim.
+static bool IsForwarding(InstructionClass Class) {
+  // objc_retainBlock technically doesn't always return its argument
+  // verbatim, but it doesn't matter for our purposes here.
+ return Class == IC_Retain || + Class == IC_RetainRV || + Class == IC_Autorelease || + Class == IC_AutoreleaseRV || + Class == IC_RetainBlock || + Class == IC_NoopCast; +} + +/// IsNoopOnNull - Test if the given class represents instructions which do +/// nothing if passed a null pointer. +static bool IsNoopOnNull(InstructionClass Class) { + return Class == IC_Retain || + Class == IC_RetainRV || + Class == IC_Release || + Class == IC_Autorelease || + Class == IC_AutoreleaseRV || + Class == IC_RetainBlock; +} + +/// IsAlwaysTail - Test if the given class represents instructions which are +/// always safe to mark with the "tail" keyword. +static bool IsAlwaysTail(InstructionClass Class) { + // IC_RetainBlock may be given a stack argument. + return Class == IC_Retain || + Class == IC_RetainRV || + Class == IC_Autorelease || + Class == IC_AutoreleaseRV; +} + +/// IsNoThrow - Test if the given class represents instructions which are always +/// safe to mark with the nounwind attribute.. +static bool IsNoThrow(InstructionClass Class) { + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. + return Class == IC_Retain || + Class == IC_RetainRV || + Class == IC_Release || + Class == IC_Autorelease || + Class == IC_AutoreleaseRV || + Class == IC_AutoreleasepoolPush || + Class == IC_AutoreleasepoolPop; +} + +/// EraseInstruction - Erase the given instruction. Many ObjC calls return their +/// argument verbatim, so if it's such a call and the return value has users, +/// replace them with the argument value. +static void EraseInstruction(Instruction *CI) { + Value *OldArg = cast<CallInst>(CI)->getArgOperand(0); + + bool Unused = CI->use_empty(); + + if (!Unused) { + // Replace the return value with the argument. + assert(IsForwarding(GetBasicInstructionClass(CI)) && + "Can't delete non-forwarding instruction with users!"); + CI->replaceAllUsesWith(OldArg); + } + + CI->eraseFromParent(); + + if (Unused) + RecursivelyDeleteTriviallyDeadInstructions(OldArg); +} + +/// GetUnderlyingObjCPtr - This is a wrapper around getUnderlyingObject which +/// also knows how to look through objc_retain and objc_autorelease calls, which +/// we know to return their argument verbatim. +static const Value *GetUnderlyingObjCPtr(const Value *V) { + for (;;) { + V = GetUnderlyingObject(V); + if (!IsForwarding(GetBasicInstructionClass(V))) + break; + V = cast<CallInst>(V)->getArgOperand(0); + } + + return V; +} + +/// StripPointerCastsAndObjCCalls - This is a wrapper around +/// Value::stripPointerCasts which also knows how to look through objc_retain +/// and objc_autorelease calls, which we know to return their argument verbatim. +static const Value *StripPointerCastsAndObjCCalls(const Value *V) { + for (;;) { + V = V->stripPointerCasts(); + if (!IsForwarding(GetBasicInstructionClass(V))) + break; + V = cast<CallInst>(V)->getArgOperand(0); + } + return V; +} + +/// StripPointerCastsAndObjCCalls - This is a wrapper around +/// Value::stripPointerCasts which also knows how to look through objc_retain +/// and objc_autorelease calls, which we know to return their argument verbatim. 
+static Value *StripPointerCastsAndObjCCalls(Value *V) { + for (;;) { + V = V->stripPointerCasts(); + if (!IsForwarding(GetBasicInstructionClass(V))) + break; + V = cast<CallInst>(V)->getArgOperand(0); + } + return V; +} + +/// GetObjCArg - Assuming the given instruction is one of the special calls such +/// as objc_retain or objc_release, return the argument value, stripped of no-op +/// casts and forwarding calls. +static Value *GetObjCArg(Value *Inst) { + return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0)); +} + +/// IsObjCIdentifiedObject - This is similar to AliasAnalysis' +/// isObjCIdentifiedObject, except that it uses special knowledge of +/// ObjC conventions... +static bool IsObjCIdentifiedObject(const Value *V) { + // Assume that call results and arguments have their own "provenance". + // Constants (including GlobalVariables) and Allocas are never + // reference-counted. + if (isa<CallInst>(V) || isa<InvokeInst>(V) || + isa<Argument>(V) || isa<Constant>(V) || + isa<AllocaInst>(V)) + return true; + + if (const LoadInst *LI = dyn_cast<LoadInst>(V)) { + const Value *Pointer = + StripPointerCastsAndObjCCalls(LI->getPointerOperand()); + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { + // A constant pointer can't be pointing to an object on the heap. It may + // be reference-counted, but it won't be deleted. + if (GV->isConstant()) + return true; + StringRef Name = GV->getName(); + // These special variables are known to hold values which are not + // reference-counted pointers. + if (Name.startswith("\01L_OBJC_SELECTOR_REFERENCES_") || + Name.startswith("\01L_OBJC_CLASSLIST_REFERENCES_") || + Name.startswith("\01L_OBJC_CLASSLIST_SUP_REFS_$_") || + Name.startswith("\01L_OBJC_METH_VAR_NAME_") || + Name.startswith("\01l_objc_msgSend_fixup_")) + return true; + } + } + + return false; +} + +/// FindSingleUseIdentifiedObject - This is similar to +/// StripPointerCastsAndObjCCalls but it stops as soon as it finds a value +/// with multiple uses. +static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { + if (Arg->hasOneUse()) { + if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg)) + return FindSingleUseIdentifiedObject(BC->getOperand(0)); + if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg)) + if (GEP->hasAllZeroIndices()) + return FindSingleUseIdentifiedObject(GEP->getPointerOperand()); + if (IsForwarding(GetBasicInstructionClass(Arg))) + return FindSingleUseIdentifiedObject( + cast<CallInst>(Arg)->getArgOperand(0)); + if (!IsObjCIdentifiedObject(Arg)) + return 0; + return Arg; + } + + // If we found an identifiable object but it has multiple uses, but they are + // trivial uses, we can still consider this to be a single-use value. + if (IsObjCIdentifiedObject(Arg)) { + for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); + UI != UE; ++UI) { + const User *U = *UI; + if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg) + return 0; + } + + return Arg; + } + + return 0; +} + +/// ModuleHasARC - Test if the given module looks interesting to run ARC +/// optimization on. 
+static bool ModuleHasARC(const Module &M) { + return + M.getNamedValue("objc_retain") || + M.getNamedValue("objc_release") || + M.getNamedValue("objc_autorelease") || + M.getNamedValue("objc_retainAutoreleasedReturnValue") || + M.getNamedValue("objc_retainBlock") || + M.getNamedValue("objc_autoreleaseReturnValue") || + M.getNamedValue("objc_autoreleasePoolPush") || + M.getNamedValue("objc_loadWeakRetained") || + M.getNamedValue("objc_loadWeak") || + M.getNamedValue("objc_destroyWeak") || + M.getNamedValue("objc_storeWeak") || + M.getNamedValue("objc_initWeak") || + M.getNamedValue("objc_moveWeak") || + M.getNamedValue("objc_copyWeak") || + M.getNamedValue("objc_retainedObject") || + M.getNamedValue("objc_unretainedObject") || + M.getNamedValue("objc_unretainedPointer"); +} + +/// DoesObjCBlockEscape - Test whether the given pointer, which is an +/// Objective C block pointer, does not "escape". This differs from regular +/// escape analysis in that a use as an argument to a call is not considered +/// an escape. +static bool DoesObjCBlockEscape(const Value *BlockPtr) { + // Walk the def-use chains. + SmallVector<const Value *, 4> Worklist; + Worklist.push_back(BlockPtr); + do { + const Value *V = Worklist.pop_back_val(); + for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); + UI != UE; ++UI) { + const User *UUser = *UI; + // Special - Use by a call (callee or argument) is not considered + // to be an escape. + switch (GetBasicInstructionClass(UUser)) { + case IC_StoreWeak: + case IC_InitWeak: + case IC_StoreStrong: + case IC_Autorelease: + case IC_AutoreleaseRV: + // These special functions make copies of their pointer arguments. + return true; + case IC_User: + case IC_None: + // Use by an instruction which copies the value is an escape if the + // result is an escape. + if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) || + isa<PHINode>(UUser) || isa<SelectInst>(UUser)) { + Worklist.push_back(UUser); + continue; + } + // Use by a load is not an escape. + if (isa<LoadInst>(UUser)) + continue; + // Use by a store is not an escape if the use is the address. + if (const StoreInst *SI = dyn_cast<StoreInst>(UUser)) + if (V != SI->getValueOperand()) + continue; + break; + default: + // Regular calls and other stuff are not considered escapes. + continue; + } + // Otherwise, conservatively assume an escape. + return true; + } + } while (!Worklist.empty()); + + // No escapes found. + return false; +} + +//===----------------------------------------------------------------------===// +// ARC AliasAnalysis. +//===----------------------------------------------------------------------===// + +#include "llvm/Pass.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/Passes.h" + +namespace { + /// ObjCARCAliasAnalysis - This is a simple alias analysis + /// implementation that uses knowledge of ARC constructs to answer queries. + /// + /// TODO: This class could be generalized to know about other ObjC-specific + /// tricks. Such as knowing that ivars in the non-fragile ABI are non-aliasing + /// even though their offsets are dynamic. 
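Before the alias-analysis class that follows, one observation about DoesObjCBlockEscape above (and isStoredObjCPointer later in this file): both are worklist walks over a value's use graph; the latter also carries a visited set so the walk terminates on cyclic graphs. A standalone sketch of that guarded worklist pattern, with plain integers standing in for values and an explicit edge list (all names here are hypothetical):

    #include <map>
    #include <set>
    #include <utility>
    #include <vector>

    // Return true if any node reachable from Root via Uses satisfies Bad.
    static bool anyReachable(int Root, const std::multimap<int, int> &Uses,
                             bool (*Bad)(int)) {
      std::vector<int> Worklist(1, Root);
      std::set<int> Visited;
      Visited.insert(Root);
      do {
        int V = Worklist.back();
        Worklist.pop_back();
        if (Bad(V))
          return true;
        typedef std::multimap<int, int>::const_iterator EdgeIt;
        for (std::pair<EdgeIt, EdgeIt> R = Uses.equal_range(V);
             R.first != R.second; ++R.first)
          if (Visited.insert(R.first->second).second) // not visited yet
            Worklist.push_back(R.first->second);
      } while (!Worklist.empty());
      return false;
    }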
+ class ObjCARCAliasAnalysis : public ImmutablePass, + public AliasAnalysis { + public: + static char ID; // Class identification, replacement for typeinfo + ObjCARCAliasAnalysis() : ImmutablePass(ID) { + initializeObjCARCAliasAnalysisPass(*PassRegistry::getPassRegistry()); + } + + private: + virtual void initializePass() { + InitializeAliasAnalysis(this); + } + + /// getAdjustedAnalysisPointer - This method is used when a pass implements + /// an analysis interface through multiple inheritance. If needed, it + /// should override this to adjust the this pointer as needed for the + /// specified pass info. + virtual void *getAdjustedAnalysisPointer(const void *PI) { + if (PI == &AliasAnalysis::ID) + return static_cast<AliasAnalysis *>(this); + return this; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual AliasResult alias(const Location &LocA, const Location &LocB); + virtual bool pointsToConstantMemory(const Location &Loc, bool OrLocal); + virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS); + virtual ModRefBehavior getModRefBehavior(const Function *F); + virtual ModRefResult getModRefInfo(ImmutableCallSite CS, + const Location &Loc); + virtual ModRefResult getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2); + }; +} // End of anonymous namespace + +// Register this pass... +char ObjCARCAliasAnalysis::ID = 0; +INITIALIZE_AG_PASS(ObjCARCAliasAnalysis, AliasAnalysis, "objc-arc-aa", + "ObjC-ARC-Based Alias Analysis", false, true, false) + +ImmutablePass *llvm::createObjCARCAliasAnalysisPass() { + return new ObjCARCAliasAnalysis(); +} + +void +ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AliasAnalysis::getAnalysisUsage(AU); +} + +AliasAnalysis::AliasResult +ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) { + if (!EnableARCOpts) + return AliasAnalysis::alias(LocA, LocB); + + // First, strip off no-ops, including ObjC-specific no-ops, and try making a + // precise alias query. + const Value *SA = StripPointerCastsAndObjCCalls(LocA.Ptr); + const Value *SB = StripPointerCastsAndObjCCalls(LocB.Ptr); + AliasResult Result = + AliasAnalysis::alias(Location(SA, LocA.Size, LocA.TBAATag), + Location(SB, LocB.Size, LocB.TBAATag)); + if (Result != MayAlias) + return Result; + + // If that failed, climb to the underlying object, including climbing through + // ObjC-specific no-ops, and try making an imprecise alias query. + const Value *UA = GetUnderlyingObjCPtr(SA); + const Value *UB = GetUnderlyingObjCPtr(SB); + if (UA != SA || UB != SB) { + Result = AliasAnalysis::alias(Location(UA), Location(UB)); + // We can't use MustAlias or PartialAlias results here because + // GetUnderlyingObjCPtr may return an offsetted pointer value. + if (Result == NoAlias) + return NoAlias; + } + + // If that failed, fail. We don't need to chain here, since that's covered + // by the earlier precise query. + return MayAlias; +} + +bool +ObjCARCAliasAnalysis::pointsToConstantMemory(const Location &Loc, + bool OrLocal) { + if (!EnableARCOpts) + return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal); + + // First, strip off no-ops, including ObjC-specific no-ops, and try making + // a precise alias query. 
+ const Value *S = StripPointerCastsAndObjCCalls(Loc.Ptr); + if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.TBAATag), + OrLocal)) + return true; + + // If that failed, climb to the underlying object, including climbing through + // ObjC-specific no-ops, and try making an imprecise alias query. + const Value *U = GetUnderlyingObjCPtr(S); + if (U != S) + return AliasAnalysis::pointsToConstantMemory(Location(U), OrLocal); + + // If that failed, fail. We don't need to chain here, since that's covered + // by the earlier precise query. + return false; +} + +AliasAnalysis::ModRefBehavior +ObjCARCAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) { + // We have nothing to do. Just chain to the next AliasAnalysis. + return AliasAnalysis::getModRefBehavior(CS); +} + +AliasAnalysis::ModRefBehavior +ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) { + if (!EnableARCOpts) + return AliasAnalysis::getModRefBehavior(F); + + switch (GetFunctionClass(F)) { + case IC_NoopCast: + return DoesNotAccessMemory; + default: + break; + } + + return AliasAnalysis::getModRefBehavior(F); +} + +AliasAnalysis::ModRefResult +ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { + if (!EnableARCOpts) + return AliasAnalysis::getModRefInfo(CS, Loc); + + switch (GetBasicInstructionClass(CS.getInstruction())) { + case IC_Retain: + case IC_RetainRV: + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_NoopCast: + case IC_AutoreleasepoolPush: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: + // These functions don't access any memory visible to the compiler. + // Note that this doesn't include objc_retainBlock, because it updates + // pointers when it copies block data. + return NoModRef; + default: + break; + } + + return AliasAnalysis::getModRefInfo(CS, Loc); +} + +AliasAnalysis::ModRefResult +ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS1, + ImmutableCallSite CS2) { + // TODO: Theoretically we could check for dependencies between objc_* calls + // and OnlyAccessesArgumentPointees calls or other well-behaved calls. + return AliasAnalysis::getModRefInfo(CS1, CS2); +} + +//===----------------------------------------------------------------------===// +// ARC expansion. +//===----------------------------------------------------------------------===// + +#include "llvm/Support/InstIterator.h" +#include "llvm/Transforms/Scalar.h" + +namespace { + /// ObjCARCExpand - Early ARC transformations. + class ObjCARCExpand : public FunctionPass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + + /// Run - A flag indicating whether this optimization pass should run. + bool Run; + + public: + static char ID; + ObjCARCExpand() : FunctionPass(ID) { + initializeObjCARCExpandPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCExpand::ID = 0; +INITIALIZE_PASS(ObjCARCExpand, + "objc-arc-expand", "ObjC ARC expansion", false, false) + +Pass *llvm::createObjCARCExpandPass() { + return new ObjCARCExpand(); +} + +void ObjCARCExpand::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +bool ObjCARCExpand::doInitialization(Module &M) { + Run = ModuleHasARC(M); + return false; +} + +bool ObjCARCExpand::runOnFunction(Function &F) { + if (!EnableARCOpts) + return false; + + // If nothing in the Module uses ARC, don't do anything. 
+ if (!Run) + return false; + + bool Changed = false; + + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) { + Instruction *Inst = &*I; + + switch (GetBasicInstructionClass(Inst)) { + case IC_Retain: + case IC_RetainRV: + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: + // These calls return their argument verbatim, as a low-level + // optimization. However, this makes high-level optimizations + // harder. Undo any uses of this optimization that the front-end + // emitted here. We'll redo them in the contract pass. + Changed = true; + Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0)); + break; + default: + break; + } + } + + return Changed; +} + +//===----------------------------------------------------------------------===// +// ARC autorelease pool elimination. +//===----------------------------------------------------------------------===// + +#include "llvm/Constants.h" +#include "llvm/ADT/STLExtras.h" + +namespace { + /// ObjCARCAPElim - Autorelease pool elimination. + class ObjCARCAPElim : public ModulePass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool runOnModule(Module &M); + + static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0); + static bool OptimizeBB(BasicBlock *BB); + + public: + static char ID; + ObjCARCAPElim() : ModulePass(ID) { + initializeObjCARCAPElimPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCAPElim::ID = 0; +INITIALIZE_PASS(ObjCARCAPElim, + "objc-arc-apelim", + "ObjC ARC autorelease pool elimination", + false, false) + +Pass *llvm::createObjCARCAPElimPass() { + return new ObjCARCAPElim(); +} + +void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +/// MayAutorelease - Interprocedurally determine if calls made by the +/// given call site can possibly produce autoreleases. +bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) { + if (const Function *Callee = CS.getCalledFunction()) { + if (Callee->isDeclaration() || Callee->mayBeOverridden()) + return true; + for (Function::const_iterator I = Callee->begin(), E = Callee->end(); + I != E; ++I) { + const BasicBlock *BB = I; + for (BasicBlock::const_iterator J = BB->begin(), F = BB->end(); + J != F; ++J) + if (ImmutableCallSite JCS = ImmutableCallSite(J)) + // This recursion depth limit is arbitrary. It's just great + // enough to cover known interesting testcases. + if (Depth < 3 && + !JCS.onlyReadsMemory() && + MayAutorelease(JCS, Depth + 1)) + return true; + } + return false; + } + + return true; +} + +bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { + bool Changed = false; + + Instruction *Push = 0; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { + Instruction *Inst = I++; + switch (GetBasicInstructionClass(Inst)) { + case IC_AutoreleasepoolPush: + Push = Inst; + break; + case IC_AutoreleasepoolPop: + // If this pop matches a push and nothing in between can autorelease, + // zap the pair. + if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) { + Changed = true; + Inst->eraseFromParent(); + Push->eraseFromParent(); + } + Push = 0; + break; + case IC_CallOrUser: + if (MayAutorelease(ImmutableCallSite(Inst))) + Push = 0; + break; + default: + break; + } + } + + return Changed; +} + +bool ObjCARCAPElim::runOnModule(Module &M) { + if (!EnableARCOpts) + return false; + + // If nothing in the Module uses ARC, don't do anything. 
+  if (!ModuleHasARC(M))
+    return false;
+
+  // Find the llvm.global_ctors variable, as the first step in
+  // identifying the global constructors. In theory, unnecessary autorelease
+  // pools could occur anywhere, but in practice it's pretty rare. Global
+  // ctors are a place where autorelease pools get inserted automatically,
+  // so it's pretty common for them to be unnecessary, and it's pretty
+  // profitable to eliminate them.
+  GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+  if (!GV)
+    return false;
+
+  assert(GV->hasDefinitiveInitializer() &&
+         "llvm.global_ctors is uncooperative!");
+
+  bool Changed = false;
+
+  // Dig the constructor functions out of GV's initializer.
+  ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
+  for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end();
+       OI != OE; ++OI) {
+    Value *Op = *OI;
+    // llvm.global_ctors is an array of pairs where the second members
+    // are constructor functions.
+    Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
+    // If the user used a constructor function with the wrong signature and
+    // it got bitcasted or whatever, look the other way.
+    if (!F)
+      continue;
+    // Only look at function definitions.
+    if (F->isDeclaration())
+      continue;
+    // Only look at functions with one basic block.
+    if (llvm::next(F->begin()) != F->end())
+      continue;
+    // Ok, a single-block constructor function definition. Try to optimize it.
+    Changed |= OptimizeBB(F->begin());
+  }
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// ARC optimization.
+//===----------------------------------------------------------------------===//
+
+// TODO: On code like this:
+//
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+// stuff_that_cannot_release()
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+//
+// The second retain and autorelease can be deleted.
+
+// TODO: It should be possible to delete
+// objc_autoreleasePoolPush and objc_autoreleasePoolPop
+// pairs if nothing is actually autoreleased between them. Also, autorelease
+// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
+// after inlining) can be turned into plain release calls.
+
+// TODO: Critical-edge splitting. If the optimal insertion point is
+// a critical edge, the current algorithm has to fail, because it doesn't
+// know how to split edges. It should be possible to make the optimizer
+// think in terms of edges, rather than blocks, and then split critical
+// edges on demand.
+
+// TODO: OptimizeSequences could be generalized to be interprocedural.
+
+// TODO: Recognize that a bunch of other objc runtime calls have
+// non-escaping arguments and non-releasing arguments, and may be
+// non-autoreleasing.
+
+// TODO: Sink autorelease calls as far as possible. Unfortunately we
+// usually can't sink them past other calls, which would be the main
+// case where it would be useful.
+
+// TODO: The pointer returned from objc_loadWeakRetained is retained.
+
+// TODO: Delete release+retain pairs (rare).
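A side note on the push/pop matching in ObjCARCAPElim::OptimizeBB above: within a single block it only ever has to remember the most recent unmatched push, and any intervening call that may autorelease simply forgets it. The same logic over an abstract event sequence, ignoring the pop-token check the real code performs, as a hedged standalone sketch (the enum and function are invented for illustration and are not part of this file):

    #include <cstddef>
    #include <utility>
    #include <vector>

    enum Ev { Push, Pop, MayAutorelease, Other };

    // Collect (push, pop) index pairs that could be removed: a Pop cancels the
    // most recent unmatched Push, provided nothing in between may autorelease.
    static std::vector<std::pair<std::size_t, std::size_t> >
    findRemovablePairs(const std::vector<Ev> &Events) {
      std::vector<std::pair<std::size_t, std::size_t> > Pairs;
      bool HavePush = false;
      std::size_t PushIdx = 0;
      for (std::size_t i = 0, e = Events.size(); i != e; ++i) {
        switch (Events[i]) {
        case Push:
          HavePush = true;
          PushIdx = i;
          break;
        case Pop:
          if (HavePush)
            Pairs.push_back(std::make_pair(PushIdx, i));
          HavePush = false;
          break;
        case MayAutorelease:
          HavePush = false;   // an intervening autorelease blocks the pairing
          break;
        case Other:
          break;
        }
      }
      return Pairs;
    }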
+ +#include "llvm/LLVMContext.h" +#include "llvm/Support/CFG.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallPtrSet.h" + +STATISTIC(NumNoops, "Number of no-op objc calls eliminated"); +STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated"); +STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases"); +STATISTIC(NumRets, "Number of return value forwarding " + "retain+autoreleaes eliminated"); +STATISTIC(NumRRs, "Number of retain+release paths eliminated"); +STATISTIC(NumPeeps, "Number of calls peephole-optimized"); + +namespace { + /// ProvenanceAnalysis - This is similar to BasicAliasAnalysis, and it + /// uses many of the same techniques, except it uses special ObjC-specific + /// reasoning about pointer relationships. + class ProvenanceAnalysis { + AliasAnalysis *AA; + + typedef std::pair<const Value *, const Value *> ValuePairTy; + typedef DenseMap<ValuePairTy, bool> CachedResultsTy; + CachedResultsTy CachedResults; + + bool relatedCheck(const Value *A, const Value *B); + bool relatedSelect(const SelectInst *A, const Value *B); + bool relatedPHI(const PHINode *A, const Value *B); + + // Do not implement. + void operator=(const ProvenanceAnalysis &); + ProvenanceAnalysis(const ProvenanceAnalysis &); + + public: + ProvenanceAnalysis() {} + + void setAA(AliasAnalysis *aa) { AA = aa; } + + AliasAnalysis *getAA() const { return AA; } + + bool related(const Value *A, const Value *B); + + void clear() { + CachedResults.clear(); + } + }; +} + +bool ProvenanceAnalysis::relatedSelect(const SelectInst *A, const Value *B) { + // If the values are Selects with the same condition, we can do a more precise + // check: just check for relations between the values on corresponding arms. + if (const SelectInst *SB = dyn_cast<SelectInst>(B)) + if (A->getCondition() == SB->getCondition()) + return related(A->getTrueValue(), SB->getTrueValue()) || + related(A->getFalseValue(), SB->getFalseValue()); + + // Check both arms of the Select node individually. + return related(A->getTrueValue(), B) || + related(A->getFalseValue(), B); +} + +bool ProvenanceAnalysis::relatedPHI(const PHINode *A, const Value *B) { + // If the values are PHIs in the same block, we can do a more precise as well + // as efficient check: just check for relations between the values on + // corresponding edges. + if (const PHINode *PNB = dyn_cast<PHINode>(B)) + if (PNB->getParent() == A->getParent()) { + for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) + if (related(A->getIncomingValue(i), + PNB->getIncomingValueForBlock(A->getIncomingBlock(i)))) + return true; + return false; + } + + // Check each unique source of the PHI node against B. + SmallPtrSet<const Value *, 4> UniqueSrc; + for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) { + const Value *PV1 = A->getIncomingValue(i); + if (UniqueSrc.insert(PV1) && related(PV1, B)) + return true; + } + + // All of the arms checked out. + return false; +} + +/// isStoredObjCPointer - Test if the value of P, or any value covered by its +/// provenance, is ever stored within the function (not counting callees). 
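One shape worth calling out before the remaining ProvenanceAnalysis member functions: the CachedResults map declared above is used in ProvenanceAnalysis::related (later in this file) with a canonically ordered key, and a conservative placeholder is inserted before the real computation so that a recursive query on the same pair terminates with a safe answer. A hedged standalone sketch of that memoization-with-recursion-guard idiom, using std::map and hypothetical names rather than the types in this file:

    #include <algorithm>
    #include <map>
    #include <utility>

    typedef std::pair<const void *, const void *> KeyTy;

    // 'compute' may re-enter relatedCached on other pairs; the placeholder
    // inserted below answers a recursive query on the same pair conservatively.
    static bool relatedCached(const void *A, const void *B,
                              std::map<KeyTy, bool> &Cache,
                              bool (*compute)(const void *, const void *)) {
      if (A > B)
        std::swap(A, B);                  // canonicalize the unordered pair
      std::pair<std::map<KeyTy, bool>::iterator, bool> Ins =
          Cache.insert(std::make_pair(KeyTy(A, B), true)); // conservative default
      if (!Ins.second)
        return Ins.first->second;         // cached, or currently being computed
      bool Result = compute(A, B);
      Cache[KeyTy(A, B)] = Result;        // overwrite the placeholder
      return Result;
    }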
+static bool isStoredObjCPointer(const Value *P) { + SmallPtrSet<const Value *, 8> Visited; + SmallVector<const Value *, 8> Worklist; + Worklist.push_back(P); + Visited.insert(P); + do { + P = Worklist.pop_back_val(); + for (Value::const_use_iterator UI = P->use_begin(), UE = P->use_end(); + UI != UE; ++UI) { + const User *Ur = *UI; + if (isa<StoreInst>(Ur)) { + if (UI.getOperandNo() == 0) + // The pointer is stored. + return true; + // The pointed is stored through. + continue; + } + if (isa<CallInst>(Ur)) + // The pointer is passed as an argument, ignore this. + continue; + if (isa<PtrToIntInst>(P)) + // Assume the worst. + return true; + if (Visited.insert(Ur)) + Worklist.push_back(Ur); + } + } while (!Worklist.empty()); + + // Everything checked out. + return false; +} + +bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B) { + // Skip past provenance pass-throughs. + A = GetUnderlyingObjCPtr(A); + B = GetUnderlyingObjCPtr(B); + + // Quick check. + if (A == B) + return true; + + // Ask regular AliasAnalysis, for a first approximation. + switch (AA->alias(A, B)) { + case AliasAnalysis::NoAlias: + return false; + case AliasAnalysis::MustAlias: + case AliasAnalysis::PartialAlias: + return true; + case AliasAnalysis::MayAlias: + break; + } + + bool AIsIdentified = IsObjCIdentifiedObject(A); + bool BIsIdentified = IsObjCIdentifiedObject(B); + + // An ObjC-Identified object can't alias a load if it is never locally stored. + if (AIsIdentified) { + if (BIsIdentified) { + // If both pointers have provenance, they can be directly compared. + if (A != B) + return false; + } else { + if (isa<LoadInst>(B)) + return isStoredObjCPointer(A); + } + } else { + if (BIsIdentified && isa<LoadInst>(A)) + return isStoredObjCPointer(B); + } + + // Special handling for PHI and Select. + if (const PHINode *PN = dyn_cast<PHINode>(A)) + return relatedPHI(PN, B); + if (const PHINode *PN = dyn_cast<PHINode>(B)) + return relatedPHI(PN, A); + if (const SelectInst *S = dyn_cast<SelectInst>(A)) + return relatedSelect(S, B); + if (const SelectInst *S = dyn_cast<SelectInst>(B)) + return relatedSelect(S, A); + + // Conservative. + return true; +} + +bool ProvenanceAnalysis::related(const Value *A, const Value *B) { + // Begin by inserting a conservative value into the map. If the insertion + // fails, we have the answer already. If it succeeds, leave it there until we + // compute the real answer to guard against recursive queries. + if (A > B) std::swap(A, B); + std::pair<CachedResultsTy::iterator, bool> Pair = + CachedResults.insert(std::make_pair(ValuePairTy(A, B), true)); + if (!Pair.second) + return Pair.first->second; + + bool Result = relatedCheck(A, B); + CachedResults[ValuePairTy(A, B)] = Result; + return Result; +} + +namespace { + // Sequence - A sequence of states that a pointer may go through in which an + // objc_retain and objc_release are actually needed. + enum Sequence { + S_None, + S_Retain, ///< objc_retain(x) + S_CanRelease, ///< foo(x) -- x could possibly see a ref count decrement + S_Use, ///< any use of x + S_Stop, ///< like S_Release, but code motion is stopped + S_Release, ///< objc_release(x) + S_MovableRelease ///< objc_release(x), !clang.imprecise_release + }; +} + +static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) { + // The easy cases. + if (A == B) + return A; + if (A == S_None || B == S_None) + return S_None; + + if (A > B) std::swap(A, B); + if (TopDown) { + // Choose the side which is further along in the sequence. 
+    if ((A == S_Retain || A == S_CanRelease) &&
+        (B == S_CanRelease || B == S_Use))
+      return B;
+  } else {
+    // Choose the side which is further along in the sequence.
+    if ((A == S_Use || A == S_CanRelease) &&
+        (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
+      return A;
+    // If both sides are releases, choose the more conservative one.
+    if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
+      return A;
+    if (A == S_Release && B == S_MovableRelease)
+      return A;
+  }
+
+  return S_None;
+}
+
+namespace {
+  /// RRInfo - Unidirectional information about either a
+  /// retain-decrement-use-release sequence or release-use-decrement-retain
+  /// reverse sequence.
+  struct RRInfo {
+    /// KnownSafe - After an objc_retain, the reference count of the referenced
+    /// object is known to be positive. Similarly, before an objc_release, the
+    /// reference count of the referenced object is known to be positive. If
+    /// there are retain-release pairs in code regions where the retain count
+    /// is known to be positive, they can be eliminated, regardless of any side
+    /// effects between them.
+    ///
+    /// Also, a retain+release pair nested within another retain+release
+    /// pair, all on the same known pointer value, can be eliminated,
+    /// regardless of any intervening side effects.
+    ///
+    /// KnownSafe is true when either of these conditions is satisfied.
+    bool KnownSafe;
+
+    /// IsRetainBlock - True if the Calls are objc_retainBlock calls (as
+    /// opposed to objc_retain calls).
+    bool IsRetainBlock;
+
+    /// IsTailCallRelease - True if the objc_release calls are all marked
+    /// with the "tail" keyword.
+    bool IsTailCallRelease;
+
+    /// ReleaseMetadata - If the Calls are objc_release calls and they all have
+    /// a clang.imprecise_release tag, this is the metadata tag.
+    MDNode *ReleaseMetadata;
+
+    /// Calls - For a top-down sequence, the set of objc_retains or
+    /// objc_retainBlocks. For bottom-up, the set of objc_releases.
+    SmallPtrSet<Instruction *, 2> Calls;
+
+    /// ReverseInsertPts - The set of optimal insert positions for
+    /// moving calls in the opposite sequence.
+    SmallPtrSet<Instruction *, 2> ReverseInsertPts;
+
+    RRInfo() :
+      KnownSafe(false), IsRetainBlock(false),
+      IsTailCallRelease(false),
+      ReleaseMetadata(0) {}
+
+    void clear();
+  };
+}
+
+void RRInfo::clear() {
+  KnownSafe = false;
+  IsRetainBlock = false;
+  IsTailCallRelease = false;
+  ReleaseMetadata = 0;
+  Calls.clear();
+  ReverseInsertPts.clear();
+}
+
+namespace {
+  /// PtrState - This class summarizes several per-pointer runtime properties
+  /// which are propagated through the flow graph.
+  class PtrState {
+    /// NestCount - The known minimum level of retain+release nesting.
+    unsigned NestCount;
+
+    /// KnownPositiveRefCount - True if the reference count is known to
+    /// be incremented.
+    bool KnownPositiveRefCount;
+
+    /// Partial - True if we've seen an opportunity for partial RR elimination,
+    /// such as pushing calls into a CFG triangle or into one side of a
+    /// CFG diamond.
+    bool Partial;
+
+    /// Seq - The current position in the sequence.
+    Sequence Seq : 8;
+
+  public:
+    /// RRI - Unidirectional information about the current sequence.
+    /// TODO: Encapsulate this better.
+ RRInfo RRI; + + PtrState() : NestCount(0), KnownPositiveRefCount(false), Partial(false), + Seq(S_None) {} + + void SetKnownPositiveRefCount() { + KnownPositiveRefCount = true; + } + + void ClearRefCount() { + KnownPositiveRefCount = false; + } + + bool IsKnownIncremented() const { + return KnownPositiveRefCount; + } + + void IncrementNestCount() { + if (NestCount != UINT_MAX) ++NestCount; + } + + void DecrementNestCount() { + if (NestCount != 0) --NestCount; + } + + bool IsKnownNested() const { + return NestCount > 0; + } + + void SetSeq(Sequence NewSeq) { + Seq = NewSeq; + } + + Sequence GetSeq() const { + return Seq; + } + + void ClearSequenceProgress() { + ResetSequenceProgress(S_None); + } + + void ResetSequenceProgress(Sequence NewSeq) { + Seq = NewSeq; + Partial = false; + RRI.clear(); + } + + void Merge(const PtrState &Other, bool TopDown); + }; +} + +void +PtrState::Merge(const PtrState &Other, bool TopDown) { + Seq = MergeSeqs(Seq, Other.Seq, TopDown); + KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount; + NestCount = std::min(NestCount, Other.NestCount); + + // We can't merge a plain objc_retain with an objc_retainBlock. + if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock) + Seq = S_None; + + // If we're not in a sequence (anymore), drop all associated state. + if (Seq == S_None) { + Partial = false; + RRI.clear(); + } else if (Partial || Other.Partial) { + // If we're doing a merge on a path that's previously seen a partial + // merge, conservatively drop the sequence, to avoid doing partial + // RR elimination. If the branch predicates for the two merge differ, + // mixing them is unsafe. + ClearSequenceProgress(); + } else { + // Conservatively merge the ReleaseMetadata information. + if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata) + RRI.ReleaseMetadata = 0; + + RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe; + RRI.IsTailCallRelease = RRI.IsTailCallRelease && + Other.RRI.IsTailCallRelease; + RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end()); + + // Merge the insert point sets. If there are any differences, + // that makes this a partial merge. + Partial = RRI.ReverseInsertPts.size() != Other.RRI.ReverseInsertPts.size(); + for (SmallPtrSet<Instruction *, 2>::const_iterator + I = Other.RRI.ReverseInsertPts.begin(), + E = Other.RRI.ReverseInsertPts.end(); I != E; ++I) + Partial |= RRI.ReverseInsertPts.insert(*I); + } +} + +namespace { + /// BBState - Per-BasicBlock state. + class BBState { + /// TopDownPathCount - The number of unique control paths from the entry + /// which can reach this block. + unsigned TopDownPathCount; + + /// BottomUpPathCount - The number of unique control paths to exits + /// from this block. + unsigned BottomUpPathCount; + + /// MapTy - A type for PerPtrTopDown and PerPtrBottomUp. + typedef MapVector<const Value *, PtrState> MapTy; + + /// PerPtrTopDown - The top-down traversal uses this to record information + /// known about a pointer at the bottom of each block. + MapTy PerPtrTopDown; + + /// PerPtrBottomUp - The bottom-up traversal uses this to record information + /// known about a pointer at the top of each block. + MapTy PerPtrBottomUp; + + /// Preds, Succs - Effective successors and predecessors of the current + /// block (this ignores ignorable edges and ignored backedges). 
+ SmallVector<BasicBlock *, 2> Preds; + SmallVector<BasicBlock *, 2> Succs; + + public: + BBState() : TopDownPathCount(0), BottomUpPathCount(0) {} + + typedef MapTy::iterator ptr_iterator; + typedef MapTy::const_iterator ptr_const_iterator; + + ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); } + ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); } + ptr_const_iterator top_down_ptr_begin() const { + return PerPtrTopDown.begin(); + } + ptr_const_iterator top_down_ptr_end() const { + return PerPtrTopDown.end(); + } + + ptr_iterator bottom_up_ptr_begin() { return PerPtrBottomUp.begin(); } + ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); } + ptr_const_iterator bottom_up_ptr_begin() const { + return PerPtrBottomUp.begin(); + } + ptr_const_iterator bottom_up_ptr_end() const { + return PerPtrBottomUp.end(); + } + + /// SetAsEntry - Mark this block as being an entry block, which has one + /// path from the entry by definition. + void SetAsEntry() { TopDownPathCount = 1; } + + /// SetAsExit - Mark this block as being an exit block, which has one + /// path to an exit by definition. + void SetAsExit() { BottomUpPathCount = 1; } + + PtrState &getPtrTopDownState(const Value *Arg) { + return PerPtrTopDown[Arg]; + } + + PtrState &getPtrBottomUpState(const Value *Arg) { + return PerPtrBottomUp[Arg]; + } + + void clearBottomUpPointers() { + PerPtrBottomUp.clear(); + } + + void clearTopDownPointers() { + PerPtrTopDown.clear(); + } + + void InitFromPred(const BBState &Other); + void InitFromSucc(const BBState &Other); + void MergePred(const BBState &Other); + void MergeSucc(const BBState &Other); + + /// GetAllPathCount - Return the number of possible unique paths from an + /// entry to an exit which pass through this block. This is only valid + /// after both the top-down and bottom-up traversals are complete. + unsigned GetAllPathCount() const { + assert(TopDownPathCount != 0); + assert(BottomUpPathCount != 0); + return TopDownPathCount * BottomUpPathCount; + } + + // Specialized CFG utilities. + typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator; + edge_iterator pred_begin() { return Preds.begin(); } + edge_iterator pred_end() { return Preds.end(); } + edge_iterator succ_begin() { return Succs.begin(); } + edge_iterator succ_end() { return Succs.end(); } + + void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); } + void addPred(BasicBlock *Pred) { Preds.push_back(Pred); } + + bool isExit() const { return Succs.empty(); } + }; +} + +void BBState::InitFromPred(const BBState &Other) { + PerPtrTopDown = Other.PerPtrTopDown; + TopDownPathCount = Other.TopDownPathCount; +} + +void BBState::InitFromSucc(const BBState &Other) { + PerPtrBottomUp = Other.PerPtrBottomUp; + BottomUpPathCount = Other.BottomUpPathCount; +} + +/// MergePred - The top-down traversal uses this to merge information about +/// predecessors to form the initial state for a new block. +void BBState::MergePred(const BBState &Other) { + // Other.TopDownPathCount can be 0, in which case it is either dead or a + // loop backedge. Loop backedges are special. + TopDownPathCount += Other.TopDownPathCount; + + // For each entry in the other set, if our set has an entry with the same key, + // merge the entries. Otherwise, copy the entry and merge it with an empty + // entry. + for (ptr_const_iterator MI = Other.top_down_ptr_begin(), + ME = Other.top_down_ptr_end(); MI != ME; ++MI) { + std::pair<ptr_iterator, bool> Pair = PerPtrTopDown.insert(*MI); + Pair.first->second.Merge(Pair.second ? 
PtrState() : MI->second,
+                             /*TopDown=*/true);
+  }
+
+  // For each entry in our set, if the other set doesn't have an entry with the
+  // same key, force it to merge with an empty entry.
+  for (ptr_iterator MI = top_down_ptr_begin(),
+       ME = top_down_ptr_end(); MI != ME; ++MI)
+    if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end())
+      MI->second.Merge(PtrState(), /*TopDown=*/true);
+}
+
+/// MergeSucc - The bottom-up traversal uses this to merge information about
+/// successors to form the initial state for a new block.
+void BBState::MergeSucc(const BBState &Other) {
+  // Other.BottomUpPathCount can be 0, in which case it is either dead or a
+  // loop backedge. Loop backedges are special.
+  BottomUpPathCount += Other.BottomUpPathCount;
+
+  // For each entry in the other set, if our set has an entry with the
+  // same key, merge the entries. Otherwise, copy the entry and merge
+  // it with an empty entry.
+  for (ptr_const_iterator MI = Other.bottom_up_ptr_begin(),
+       ME = Other.bottom_up_ptr_end(); MI != ME; ++MI) {
+    std::pair<ptr_iterator, bool> Pair = PerPtrBottomUp.insert(*MI);
+    Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
+                             /*TopDown=*/false);
+  }
+
+  // For each entry in our set, if the other set doesn't have an entry
+  // with the same key, force it to merge with an empty entry.
+  for (ptr_iterator MI = bottom_up_ptr_begin(),
+       ME = bottom_up_ptr_end(); MI != ME; ++MI)
+    if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end())
+      MI->second.Merge(PtrState(), /*TopDown=*/false);
+}
+
+namespace {
+  /// ObjCARCOpt - The main ARC optimization pass.
+  class ObjCARCOpt : public FunctionPass {
+    bool Changed;
+    ProvenanceAnalysis PA;
+
+    /// Run - A flag indicating whether this optimization pass should run.
+    bool Run;
+
+    /// RetainRVCallee, etc. - Declarations for ObjC runtime
+    /// functions, for use in creating calls to them. These are initialized
+    /// lazily to avoid cluttering up the Module with unused declarations.
+    Constant *RetainRVCallee, *AutoreleaseRVCallee, *ReleaseCallee,
+             *RetainCallee, *RetainBlockCallee, *AutoreleaseCallee;
+
+    /// UsedInThisFunction - Flags which determine whether each of the
+    /// interesting runtime functions is in fact used in the current function.
+    unsigned UsedInThisFunction;
+
+    /// ImpreciseReleaseMDKind - The Metadata Kind for clang.imprecise_release
+    /// metadata.
+    unsigned ImpreciseReleaseMDKind;
+
+    /// CopyOnEscapeMDKind - The Metadata Kind for clang.arc.copy_on_escape
+    /// metadata.
+    unsigned CopyOnEscapeMDKind;
+
+    /// NoObjCARCExceptionsMDKind - The Metadata Kind for
+    /// clang.arc.no_objc_arc_exceptions metadata.
+ unsigned NoObjCARCExceptionsMDKind; + + Constant *getRetainRVCallee(Module *M); + Constant *getAutoreleaseRVCallee(Module *M); + Constant *getReleaseCallee(Module *M); + Constant *getRetainCallee(Module *M); + Constant *getRetainBlockCallee(Module *M); + Constant *getAutoreleaseCallee(Module *M); + + bool IsRetainBlockOptimizable(const Instruction *Inst); + + void OptimizeRetainCall(Function &F, Instruction *Retain); + bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV); + void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV); + void OptimizeIndividualCalls(Function &F); + + void CheckForCFGHazards(const BasicBlock *BB, + DenseMap<const BasicBlock *, BBState> &BBStates, + BBState &MyStates) const; + bool VisitInstructionBottomUp(Instruction *Inst, + BasicBlock *BB, + MapVector<Value *, RRInfo> &Retains, + BBState &MyStates); + bool VisitBottomUp(BasicBlock *BB, + DenseMap<const BasicBlock *, BBState> &BBStates, + MapVector<Value *, RRInfo> &Retains); + bool VisitInstructionTopDown(Instruction *Inst, + DenseMap<Value *, RRInfo> &Releases, + BBState &MyStates); + bool VisitTopDown(BasicBlock *BB, + DenseMap<const BasicBlock *, BBState> &BBStates, + DenseMap<Value *, RRInfo> &Releases); + bool Visit(Function &F, + DenseMap<const BasicBlock *, BBState> &BBStates, + MapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases); + + void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove, + MapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases, + SmallVectorImpl<Instruction *> &DeadInsts, + Module *M); + + bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates, + MapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases, + Module *M); + + void OptimizeWeakCalls(Function &F); + + bool OptimizeSequences(Function &F); + + void OptimizeReturns(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + virtual void releaseMemory(); + + public: + static char ID; + ObjCARCOpt() : FunctionPass(ID) { + initializeObjCARCOptPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCOpt::ID = 0; +INITIALIZE_PASS_BEGIN(ObjCARCOpt, + "objc-arc", "ObjC ARC optimization", false, false) +INITIALIZE_PASS_DEPENDENCY(ObjCARCAliasAnalysis) +INITIALIZE_PASS_END(ObjCARCOpt, + "objc-arc", "ObjC ARC optimization", false, false) + +Pass *llvm::createObjCARCOptPass() { + return new ObjCARCOpt(); +} + +void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<ObjCARCAliasAnalysis>(); + AU.addRequired<AliasAnalysis>(); + // ARC optimization doesn't currently split critical edges. + AU.setPreservesCFG(); +} + +bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) { + // Without the magic metadata tag, we have to assume this might be an + // objc_retainBlock call inserted to convert a block pointer to an id, + // in which case it really is needed. + if (!Inst->getMetadata(CopyOnEscapeMDKind)) + return false; + + // If the pointer "escapes" (not including being used in a call), + // the copy may be needed. + if (DoesObjCBlockEscape(Inst)) + return false; + + // Otherwise, it's not needed. 
+ return true; +} + +Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { + if (!RetainRVCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + RetainRVCallee = + M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy, + Attributes); + } + return RetainRVCallee; +} + +Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { + if (!AutoreleaseRVCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AutoreleaseRVCallee = + M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy, + Attributes); + } + return AutoreleaseRVCallee; +} + +Constant *ObjCARCOpt::getReleaseCallee(Module *M) { + if (!ReleaseCallee) { + LLVMContext &C = M->getContext(); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + ReleaseCallee = + M->getOrInsertFunction( + "objc_release", + FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), + Attributes); + } + return ReleaseCallee; +} + +Constant *ObjCARCOpt::getRetainCallee(Module *M) { + if (!RetainCallee) { + LLVMContext &C = M->getContext(); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + RetainCallee = + M->getOrInsertFunction( + "objc_retain", + FunctionType::get(Params[0], Params, /*isVarArg=*/false), + Attributes); + } + return RetainCallee; +} + +Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { + if (!RetainBlockCallee) { + LLVMContext &C = M->getContext(); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. + RetainBlockCallee = + M->getOrInsertFunction( + "objc_retainBlock", + FunctionType::get(Params[0], Params, /*isVarArg=*/false), + AttrListPtr()); + } + return RetainBlockCallee; +} + +Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { + if (!AutoreleaseCallee) { + LLVMContext &C = M->getContext(); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AutoreleaseCallee = + M->getOrInsertFunction( + "objc_autorelease", + FunctionType::get(Params[0], Params, /*isVarArg=*/false), + Attributes); + } + return AutoreleaseCallee; +} + +/// CanAlterRefCount - Test whether the given instruction can result in a +/// reference count modification (positive or negative) for the pointer's +/// object. +static bool +CanAlterRefCount(const Instruction *Inst, const Value *Ptr, + ProvenanceAnalysis &PA, InstructionClass Class) { + switch (Class) { + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_User: + // These operations never directly modify a reference count. + return false; + default: break; + } + + ImmutableCallSite CS = static_cast<const Value *>(Inst); + assert(CS && "Only calls can alter reference counts!"); + + // See if AliasAnalysis can help us with the call. 
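+ // For instance, a call that AliasAnalysis proves only reads memory cannot
+ // perform a retain or release at all, and a call that only accesses its
+ // pointer arguments can only affect the counts of objects related to one
+ // of those arguments; those are the two cases checked below.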
+ AliasAnalysis::ModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS); + if (AliasAnalysis::onlyReadsMemory(MRB)) + return false; + if (AliasAnalysis::onlyAccessesArgPointees(MRB)) { + for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); + I != E; ++I) { + const Value *Op = *I; + if (IsPotentialUse(Op) && PA.related(Ptr, Op)) + return true; + } + return false; + } + + // Assume the worst. + return true; +} + +/// CanUse - Test whether the given instruction can "use" the given pointer's +/// object in a way that requires the reference count to be positive. +static bool +CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, + InstructionClass Class) { + // IC_Call operations (as opposed to IC_CallOrUser) never "use" objc pointers. + if (Class == IC_Call) + return false; + + // Consider various instructions which may have pointer arguments which are + // not "uses". + if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) { + // Comparing a pointer with null, or any other constant, isn't really a use, + // because we don't care what the pointer points to, or about the values + // of any other dynamic reference-counted pointers. + if (!IsPotentialUse(ICI->getOperand(1))) + return false; + } else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) { + // For calls, just check the arguments (and not the callee operand). + for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(), + OE = CS.arg_end(); OI != OE; ++OI) { + const Value *Op = *OI; + if (IsPotentialUse(Op) && PA.related(Ptr, Op)) + return true; + } + return false; + } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + // Special-case stores, because we don't care about the stored value, just + // the store address. + const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand()); + // If we can't tell what the underlying object was, assume there is a + // dependence. + return IsPotentialUse(Op) && PA.related(Op, Ptr); + } + + // Check each operand for a match. + for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end(); + OI != OE; ++OI) { + const Value *Op = *OI; + if (IsPotentialUse(Op) && PA.related(Ptr, Op)) + return true; + } + return false; +} + +/// CanInterruptRV - Test whether the given instruction can autorelease +/// any pointer or cause an autoreleasepool pop. +static bool +CanInterruptRV(InstructionClass Class) { + switch (Class) { + case IC_AutoreleasepoolPop: + case IC_CallOrUser: + case IC_Call: + case IC_Autorelease: + case IC_AutoreleaseRV: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: + return true; + default: + return false; + } +} + +namespace { + /// DependenceKind - There are several kinds of dependence-like concepts in + /// use here. + enum DependenceKind { + NeedsPositiveRetainCount, + AutoreleasePoolBoundary, + CanChangeRetainCount, + RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease. + RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue. + RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue. + }; +} + +/// Depends - Test if there can be dependencies on Inst through Arg. This +/// function only tests dependencies relevant for removing pairs of calls. +static bool +Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg, + ProvenanceAnalysis &PA) { + // If we've reached the definition of Arg, stop. 
+ if (Inst == Arg) + return true; + + switch (Flavor) { + case NeedsPositiveRetainCount: { + InstructionClass Class = GetInstructionClass(Inst); + switch (Class) { + case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: + case IC_None: + return false; + default: + return CanUse(Inst, Arg, PA, Class); + } + } + + case AutoreleasePoolBoundary: { + InstructionClass Class = GetInstructionClass(Inst); + switch (Class) { + case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: + // These mark the end and begin of an autorelease pool scope. + return true; + default: + // Nothing else does this. + return false; + } + } + + case CanChangeRetainCount: { + InstructionClass Class = GetInstructionClass(Inst); + switch (Class) { + case IC_AutoreleasepoolPop: + // Conservatively assume this can decrement any count. + return true; + case IC_AutoreleasepoolPush: + case IC_None: + return false; + default: + return CanAlterRefCount(Inst, Arg, PA, Class); + } + } + + case RetainAutoreleaseDep: + switch (GetBasicInstructionClass(Inst)) { + case IC_AutoreleasepoolPop: + case IC_AutoreleasepoolPush: + // Don't merge an objc_autorelease with an objc_retain inside a different + // autoreleasepool scope. + return true; + case IC_Retain: + case IC_RetainRV: + // Check for a retain of the same pointer for merging. + return GetObjCArg(Inst) == Arg; + default: + // Nothing else matters for objc_retainAutorelease formation. + return false; + } + + case RetainAutoreleaseRVDep: { + InstructionClass Class = GetBasicInstructionClass(Inst); + switch (Class) { + case IC_Retain: + case IC_RetainRV: + // Check for a retain of the same pointer for merging. + return GetObjCArg(Inst) == Arg; + default: + // Anything that can autorelease interrupts + // retainAutoreleaseReturnValue formation. + return CanInterruptRV(Class); + } + } + + case RetainRVDep: + return CanInterruptRV(GetBasicInstructionClass(Inst)); + } + + llvm_unreachable("Invalid dependence flavor"); +} + +/// FindDependencies - Walk up the CFG from StartPos (which is in StartBB) and +/// find local and non-local dependencies on Arg. +/// TODO: Cache results? +static void +FindDependencies(DependenceKind Flavor, + const Value *Arg, + BasicBlock *StartBB, Instruction *StartInst, + SmallPtrSet<Instruction *, 4> &DependingInstructions, + SmallPtrSet<const BasicBlock *, 4> &Visited, + ProvenanceAnalysis &PA) { + BasicBlock::iterator StartPos = StartInst; + + SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist; + Worklist.push_back(std::make_pair(StartBB, StartPos)); + do { + std::pair<BasicBlock *, BasicBlock::iterator> Pair = + Worklist.pop_back_val(); + BasicBlock *LocalStartBB = Pair.first; + BasicBlock::iterator LocalStartPos = Pair.second; + BasicBlock::iterator StartBBBegin = LocalStartBB->begin(); + for (;;) { + if (LocalStartPos == StartBBBegin) { + pred_iterator PI(LocalStartBB), PE(LocalStartBB, false); + if (PI == PE) + // If we've reached the function entry, produce a null dependence. + DependingInstructions.insert(0); + else + // Add the predecessors to the worklist. + do { + BasicBlock *PredBB = *PI; + if (Visited.insert(PredBB)) + Worklist.push_back(std::make_pair(PredBB, PredBB->end())); + } while (++PI != PE); + break; + } + + Instruction *Inst = --LocalStartPos; + if (Depends(Flavor, Inst, Arg, PA)) { + DependingInstructions.insert(Inst); + break; + } + } + } while (!Worklist.empty()); + + // Determine whether the original StartBB post-dominates all of the blocks we + // visited. 
If not, insert a sentinal indicating that most optimizations are + // not safe. + for (SmallPtrSet<const BasicBlock *, 4>::const_iterator I = Visited.begin(), + E = Visited.end(); I != E; ++I) { + const BasicBlock *BB = *I; + if (BB == StartBB) + continue; + const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); + for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { + const BasicBlock *Succ = *SI; + if (Succ != StartBB && !Visited.count(Succ)) { + DependingInstructions.insert(reinterpret_cast<Instruction *>(-1)); + return; + } + } + } +} + +static bool isNullOrUndef(const Value *V) { + return isa<ConstantPointerNull>(V) || isa<UndefValue>(V); +} + +static bool isNoopInstruction(const Instruction *I) { + return isa<BitCastInst>(I) || + (isa<GetElementPtrInst>(I) && + cast<GetElementPtrInst>(I)->hasAllZeroIndices()); +} + +/// OptimizeRetainCall - Turn objc_retain into +/// objc_retainAutoreleasedReturnValue if the operand is a return value. +void +ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { + ImmutableCallSite CS(GetObjCArg(Retain)); + const Instruction *Call = CS.getInstruction(); + if (!Call) return; + if (Call->getParent() != Retain->getParent()) return; + + // Check that the call is next to the retain. + BasicBlock::const_iterator I = Call; + ++I; + while (isNoopInstruction(I)) ++I; + if (&*I != Retain) + return; + + // Turn it to an objc_retainAutoreleasedReturnValue.. + Changed = true; + ++NumPeeps; + cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent())); +} + +/// OptimizeRetainRVCall - Turn objc_retainAutoreleasedReturnValue into +/// objc_retain if the operand is not a return value. Or, if it can be paired +/// with an objc_autoreleaseReturnValue, delete the pair and return true. +bool +ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { + // Check for the argument being from an immediately preceding call or invoke. + const Value *Arg = GetObjCArg(RetainRV); + ImmutableCallSite CS(Arg); + if (const Instruction *Call = CS.getInstruction()) { + if (Call->getParent() == RetainRV->getParent()) { + BasicBlock::const_iterator I = Call; + ++I; + while (isNoopInstruction(I)) ++I; + if (&*I == RetainRV) + return false; + } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) { + BasicBlock *RetainRVParent = RetainRV->getParent(); + if (II->getNormalDest() == RetainRVParent) { + BasicBlock::const_iterator I = RetainRVParent->begin(); + while (isNoopInstruction(I)) ++I; + if (&*I == RetainRV) + return false; + } + } + } + + // Check for being preceded by an objc_autoreleaseReturnValue on the same + // pointer. In this case, we can delete the pair. + BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin(); + if (I != Begin) { + do --I; while (I != Begin && isNoopInstruction(I)); + if (GetBasicInstructionClass(I) == IC_AutoreleaseRV && + GetObjCArg(I) == Arg) { + Changed = true; + ++NumPeeps; + EraseInstruction(I); + EraseInstruction(RetainRV); + return true; + } + } + + // Turn it to a plain objc_retain. + Changed = true; + ++NumPeeps; + cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent())); + return false; +} + +/// OptimizeAutoreleaseRVCall - Turn objc_autoreleaseReturnValue into +/// objc_autorelease if the result is not used as a return value. +void +ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV) { + // Check for a return of the pointer value. 
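+ // Illustrative sketch: if neither the argument nor any bitcast of it ever
+ // reaches a ret or an objc_retainAutoreleasedReturnValue, e.g.
+ //   %0 = call i8* @objc_autoreleaseReturnValue(i8* %x)
+ //   store i8* %x, i8** %slot        ; %x is never returned
+ // then the call is rewritten below to use plain objc_autorelease instead.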
+ const Value *Ptr = GetObjCArg(AutoreleaseRV); + SmallVector<const Value *, 2> Users; + Users.push_back(Ptr); + do { + Ptr = Users.pop_back_val(); + for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); + UI != UE; ++UI) { + const User *I = *UI; + if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV) + return; + if (isa<BitCastInst>(I)) + Users.push_back(I); + } + } while (!Users.empty()); + + Changed = true; + ++NumPeeps; + cast<CallInst>(AutoreleaseRV)-> + setCalledFunction(getAutoreleaseCallee(F.getParent())); +} + +/// OptimizeIndividualCalls - Visit each call, one at a time, and make +/// simplifications without doing any additional analysis. +void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { + // Reset all the flags in preparation for recomputing them. + UsedInThisFunction = 0; + + // Visit all objc_* calls in F. + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { + Instruction *Inst = &*I++; + InstructionClass Class = GetBasicInstructionClass(Inst); + + switch (Class) { + default: break; + + // Delete no-op casts. These function calls have special semantics, but + // the semantics are entirely implemented via lowering in the front-end, + // so by the time they reach the optimizer, they are just no-op calls + // which return their argument. + // + // There are gray areas here, as the ability to cast reference-counted + // pointers to raw void* and back allows code to break ARC assumptions, + // however these are currently considered to be unimportant. + case IC_NoopCast: + Changed = true; + ++NumNoops; + EraseInstruction(Inst); + continue; + + // If the pointer-to-weak-pointer is null, it's undefined behavior. + case IC_StoreWeak: + case IC_LoadWeak: + case IC_LoadWeakRetained: + case IC_InitWeak: + case IC_DestroyWeak: { + CallInst *CI = cast<CallInst>(Inst); + if (isNullOrUndef(CI->getArgOperand(0))) { + Changed = true; + Type *Ty = CI->getArgOperand(0)->getType(); + new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), + Constant::getNullValue(Ty), + CI); + CI->replaceAllUsesWith(UndefValue::get(CI->getType())); + CI->eraseFromParent(); + continue; + } + break; + } + case IC_CopyWeak: + case IC_MoveWeak: { + CallInst *CI = cast<CallInst>(Inst); + if (isNullOrUndef(CI->getArgOperand(0)) || + isNullOrUndef(CI->getArgOperand(1))) { + Changed = true; + Type *Ty = CI->getArgOperand(0)->getType(); + new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), + Constant::getNullValue(Ty), + CI); + CI->replaceAllUsesWith(UndefValue::get(CI->getType())); + CI->eraseFromParent(); + continue; + } + break; + } + case IC_Retain: + OptimizeRetainCall(F, Inst); + break; + case IC_RetainRV: + if (OptimizeRetainRVCall(F, Inst)) + continue; + break; + case IC_AutoreleaseRV: + OptimizeAutoreleaseRVCall(F, Inst); + break; + } + + // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. + if (IsAutorelease(Class) && Inst->use_empty()) { + CallInst *Call = cast<CallInst>(Inst); + const Value *Arg = Call->getArgOperand(0); + Arg = FindSingleUseIdentifiedObject(Arg); + if (Arg) { + Changed = true; + ++NumAutoreleases; + + // Create the declaration lazily. 
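+ // A rough IR sketch of this rewrite (illustrative only):
+ //   %0 = call i8* @objc_autorelease(i8* %x)   ; result unused
+ // becomes
+ //   call void @objc_release(i8* %x), !clang.imprecise_release !0
+ // since the deferred release of a single-use object that is never used
+ // again can be performed immediately, marked as imprecise.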
+ LLVMContext &C = Inst->getContext(); + CallInst *NewCall = + CallInst::Create(getReleaseCallee(F.getParent()), + Call->getArgOperand(0), "", Call); + NewCall->setMetadata(ImpreciseReleaseMDKind, + MDNode::get(C, ArrayRef<Value *>())); + EraseInstruction(Call); + Inst = NewCall; + Class = IC_Release; + } + } + + // For functions which can never be passed stack arguments, add + // a tail keyword. + if (IsAlwaysTail(Class)) { + Changed = true; + cast<CallInst>(Inst)->setTailCall(); + } + + // Set nounwind as needed. + if (IsNoThrow(Class)) { + Changed = true; + cast<CallInst>(Inst)->setDoesNotThrow(); + } + + if (!IsNoopOnNull(Class)) { + UsedInThisFunction |= 1 << Class; + continue; + } + + const Value *Arg = GetObjCArg(Inst); + + // ARC calls with null are no-ops. Delete them. + if (isNullOrUndef(Arg)) { + Changed = true; + ++NumNoops; + EraseInstruction(Inst); + continue; + } + + // Keep track of which of retain, release, autorelease, and retain_block + // are actually present in this function. + UsedInThisFunction |= 1 << Class; + + // If Arg is a PHI, and one or more incoming values to the + // PHI are null, and the call is control-equivalent to the PHI, and there + // are no relevant side effects between the PHI and the call, the call + // could be pushed up to just those paths with non-null incoming values. + // For now, don't bother splitting critical edges for this. + SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist; + Worklist.push_back(std::make_pair(Inst, Arg)); + do { + std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val(); + Inst = Pair.first; + Arg = Pair.second; + + const PHINode *PN = dyn_cast<PHINode>(Arg); + if (!PN) continue; + + // Determine if the PHI has any null operands, or any incoming + // critical edges. + bool HasNull = false; + bool HasCriticalEdges = false; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = + StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); + if (isNullOrUndef(Incoming)) + HasNull = true; + else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back()) + .getNumSuccessors() != 1) { + HasCriticalEdges = true; + break; + } + } + // If we have null operands and no critical edges, optimize. + if (!HasCriticalEdges && HasNull) { + SmallPtrSet<Instruction *, 4> DependingInstructions; + SmallPtrSet<const BasicBlock *, 4> Visited; + + // Check that there is nothing that cares about the reference + // count between the call and the phi. + switch (Class) { + case IC_Retain: + case IC_RetainBlock: + // These can always be moved up. + break; + case IC_Release: + // These can't be moved across things that care about the retain + // count. + FindDependencies(NeedsPositiveRetainCount, Arg, + Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case IC_Autorelease: + // These can't be moved across autorelease pool scope boundaries. + FindDependencies(AutoreleasePoolBoundary, Arg, + Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case IC_RetainRV: + case IC_AutoreleaseRV: + // Don't move these; the RV optimization depends on the autoreleaseRV + // being tail called, and the retainRV being immediately after a call + // (which might still happen if we get lucky with codegen layout, but + // it's not worth taking the chance). 
+ continue; + default: + llvm_unreachable("Invalid dependence flavor"); + } + + if (DependingInstructions.size() == 1 && + *DependingInstructions.begin() == PN) { + Changed = true; + ++NumPartialNoops; + // Clone the call into each predecessor that has a non-null value. + CallInst *CInst = cast<CallInst>(Inst); + Type *ParamTy = CInst->getArgOperand(0)->getType(); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = + StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); + if (!isNullOrUndef(Incoming)) { + CallInst *Clone = cast<CallInst>(CInst->clone()); + Value *Op = PN->getIncomingValue(i); + Instruction *InsertPos = &PN->getIncomingBlock(i)->back(); + if (Op->getType() != ParamTy) + Op = new BitCastInst(Op, ParamTy, "", InsertPos); + Clone->setArgOperand(0, Op); + Clone->insertBefore(InsertPos); + Worklist.push_back(std::make_pair(Clone, Incoming)); + } + } + // Erase the original call. + EraseInstruction(CInst); + continue; + } + } + } while (!Worklist.empty()); + } +} + +/// CheckForCFGHazards - Check for critical edges, loop boundaries, irreducible +/// control flow, or other CFG structures where moving code across the edge +/// would result in it being executed more. +void +ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, + DenseMap<const BasicBlock *, BBState> &BBStates, + BBState &MyStates) const { + // If any top-down local-use or possible-dec has a succ which is earlier in + // the sequence, forget it. + for (BBState::ptr_iterator I = MyStates.top_down_ptr_begin(), + E = MyStates.top_down_ptr_end(); I != E; ++I) + switch (I->second.GetSeq()) { + default: break; + case S_Use: { + const Value *Arg = I->first; + const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); + bool SomeSuccHasSame = false; + bool AllSuccsHaveSame = true; + PtrState &S = I->second; + succ_const_iterator SI(TI), SE(TI, false); + + // If the terminator is an invoke marked with the + // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be + // ignored, for ARC purposes. + if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind)) + --SE; + + for (; SI != SE; ++SI) { + Sequence SuccSSeq = S_None; + bool SuccSRRIKnownSafe = false; + // If VisitBottomUp has pointer information for this successor, take + // what we know about it. + DenseMap<const BasicBlock *, BBState>::iterator BBI = + BBStates.find(*SI); + assert(BBI != BBStates.end()); + const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); + SuccSSeq = SuccS.GetSeq(); + SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; + switch (SuccSSeq) { + case S_None: + case S_CanRelease: { + if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) { + S.ClearSequenceProgress(); + break; + } + continue; + } + case S_Use: + SomeSuccHasSame = true; + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) + AllSuccsHaveSame = false; + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + } + // If the state at the other end of any of the successor edges + // matches the current state, require all edges to match. This + // guards against loops in the middle of a sequence. 
+ if (SomeSuccHasSame && !AllSuccsHaveSame) + S.ClearSequenceProgress(); + break; + } + case S_CanRelease: { + const Value *Arg = I->first; + const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); + bool SomeSuccHasSame = false; + bool AllSuccsHaveSame = true; + PtrState &S = I->second; + succ_const_iterator SI(TI), SE(TI, false); + + // If the terminator is an invoke marked with the + // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be + // ignored, for ARC purposes. + if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind)) + --SE; + + for (; SI != SE; ++SI) { + Sequence SuccSSeq = S_None; + bool SuccSRRIKnownSafe = false; + // If VisitBottomUp has pointer information for this successor, take + // what we know about it. + DenseMap<const BasicBlock *, BBState>::iterator BBI = + BBStates.find(*SI); + assert(BBI != BBStates.end()); + const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); + SuccSSeq = SuccS.GetSeq(); + SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; + switch (SuccSSeq) { + case S_None: { + if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) { + S.ClearSequenceProgress(); + break; + } + continue; + } + case S_CanRelease: + SomeSuccHasSame = true; + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + case S_Use: + if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) + AllSuccsHaveSame = false; + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + } + // If the state at the other end of any of the successor edges + // matches the current state, require all edges to match. This + // guards against loops in the middle of a sequence. + if (SomeSuccHasSame && !AllSuccsHaveSame) + S.ClearSequenceProgress(); + break; + } + } +} + +bool +ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, + BasicBlock *BB, + MapVector<Value *, RRInfo> &Retains, + BBState &MyStates) { + bool NestingDetected = false; + InstructionClass Class = GetInstructionClass(Inst); + const Value *Arg = 0; + + switch (Class) { + case IC_Release: { + Arg = GetObjCArg(Inst); + + PtrState &S = MyStates.getPtrBottomUpState(Arg); + + // If we see two releases in a row on the same pointer. If so, make + // a note, and we'll cicle back to revisit it after we've + // hopefully eliminated the second release, which may allow us to + // eliminate the first release too. + // Theoretically we could implement removal of nested retain+release + // pairs by making PtrState hold a stack of states, but this is + // simple and avoids adding overhead for the non-nested case. + if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) + NestingDetected = true; + + MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); + S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release); + S.RRI.ReleaseMetadata = ReleaseMetadata; + S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented(); + S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); + S.RRI.Calls.insert(Inst); + + S.IncrementNestCount(); + break; + } + case IC_RetainBlock: + // An objc_retainBlock call with just a use may need to be kept, + // because it may be copying a block from the stack to the heap. 
+ if (!IsRetainBlockOptimizable(Inst)) + break; + // FALLTHROUGH + case IC_Retain: + case IC_RetainRV: { + Arg = GetObjCArg(Inst); + + PtrState &S = MyStates.getPtrBottomUpState(Arg); + S.SetKnownPositiveRefCount(); + S.DecrementNestCount(); + + switch (S.GetSeq()) { + case S_Stop: + case S_Release: + case S_MovableRelease: + case S_Use: + S.RRI.ReverseInsertPts.clear(); + // FALL THROUGH + case S_CanRelease: + // Don't do retain+release tracking for IC_RetainRV, because it's + // better to let it remain as the first instruction after a call. + if (Class != IC_RetainRV) { + S.RRI.IsRetainBlock = Class == IC_RetainBlock; + Retains[Inst] = S.RRI; + } + S.ClearSequenceProgress(); + break; + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + return NestingDetected; + } + case IC_AutoreleasepoolPop: + // Conservatively, clear MyStates for all known pointers. + MyStates.clearBottomUpPointers(); + return NestingDetected; + case IC_AutoreleasepoolPush: + case IC_None: + // These are irrelevant. + return NestingDetected; + default: + break; + } + + // Consider any other possible effects of this instruction on each + // pointer being tracked. + for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(), + ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) { + const Value *Ptr = MI->first; + if (Ptr == Arg) + continue; // Handled above. + PtrState &S = MI->second; + Sequence Seq = S.GetSeq(); + + // Check for possible releases. + if (CanAlterRefCount(Inst, Ptr, PA, Class)) { + S.ClearRefCount(); + switch (Seq) { + case S_Use: + S.SetSeq(S_CanRelease); + continue; + case S_CanRelease: + case S_Release: + case S_MovableRelease: + case S_Stop: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + } + + // Check for possible direct uses. + switch (Seq) { + case S_Release: + case S_MovableRelease: + if (CanUse(Inst, Ptr, PA, Class)) { + assert(S.RRI.ReverseInsertPts.empty()); + // If this is an invoke instruction, we're scanning it as part of + // one of its successor blocks, since we can't insert code after it + // in its own block, and we don't want to split critical edges. + if (isa<InvokeInst>(Inst)) + S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt()); + else + S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst))); + S.SetSeq(S_Use); + } else if (Seq == S_Release && + (Class == IC_User || Class == IC_CallOrUser)) { + // Non-movable releases depend on any possible objc pointer use. + S.SetSeq(S_Stop); + assert(S.RRI.ReverseInsertPts.empty()); + // As above; handle invoke specially. + if (isa<InvokeInst>(Inst)) + S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt()); + else + S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst))); + } + break; + case S_Stop: + if (CanUse(Inst, Ptr, PA, Class)) + S.SetSeq(S_Use); + break; + case S_CanRelease: + case S_Use: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } + } + + return NestingDetected; +} + +bool +ObjCARCOpt::VisitBottomUp(BasicBlock *BB, + DenseMap<const BasicBlock *, BBState> &BBStates, + MapVector<Value *, RRInfo> &Retains) { + bool NestingDetected = false; + BBState &MyStates = BBStates[BB]; + + // Merge the states from each successor to compute the initial state + // for the current block. 
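+ // (The first successor found seeds the state via InitFromSucc; the states
+ // of any remaining successors are then folded in with MergeSucc.)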
+ for (BBState::edge_iterator SI(MyStates.succ_begin()),
+ SE(MyStates.succ_end()); SI != SE; ++SI) {
+ const BasicBlock *Succ = *SI;
+ DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
+ assert(I != BBStates.end());
+ MyStates.InitFromSucc(I->second);
+ ++SI;
+ for (; SI != SE; ++SI) {
+ Succ = *SI;
+ I = BBStates.find(Succ);
+ assert(I != BBStates.end());
+ MyStates.MergeSucc(I->second);
+ }
+ break;
+ }
+
+ // Visit all the instructions, bottom-up.
+ for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
+ Instruction *Inst = llvm::prior(I);
+
+ // Invoke instructions are visited as part of their successors (below).
+ if (isa<InvokeInst>(Inst))
+ continue;
+
+ NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
+ }
+
+ // If there's a predecessor with an invoke, visit the invoke as if it were
+ // part of this block, since we can't insert code after an invoke in its own
+ // block, and we don't want to split critical edges.
+ for (BBState::edge_iterator PI(MyStates.pred_begin()),
+ PE(MyStates.pred_end()); PI != PE; ++PI) {
+ BasicBlock *Pred = *PI;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
+ NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
+ }
+
+ return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
+ DenseMap<Value *, RRInfo> &Releases,
+ BBState &MyStates) {
+ bool NestingDetected = false;
+ InstructionClass Class = GetInstructionClass(Inst);
+ const Value *Arg = 0;
+
+ switch (Class) {
+ case IC_RetainBlock:
+ // An objc_retainBlock call with just a use may need to be kept,
+ // because it may be copying a block from the stack to the heap.
+ if (!IsRetainBlockOptimizable(Inst))
+ break;
+ // FALLTHROUGH
+ case IC_Retain:
+ case IC_RetainRV: {
+ Arg = GetObjCArg(Inst);
+
+ PtrState &S = MyStates.getPtrTopDownState(Arg);
+
+ // Don't do retain+release tracking for IC_RetainRV, because it's
+ // better to let it remain as the first instruction after a call.
+ if (Class != IC_RetainRV) {
+ // If we see two retains in a row on the same pointer, make
+ // a note, and we'll circle back to revisit it after we've
+ // hopefully eliminated the second retain, which may allow us to
+ // eliminate the first retain too.
+ // Theoretically we could implement removal of nested retain+release
+ // pairs by making PtrState hold a stack of states, but this is
+ // simple and avoids adding overhead for the non-nested case.
+ if (S.GetSeq() == S_Retain)
+ NestingDetected = true;
+
+ S.ResetSequenceProgress(S_Retain);
+ S.RRI.IsRetainBlock = Class == IC_RetainBlock;
+ // Don't check S.IsKnownIncremented() here because it's not sufficient.
+ S.RRI.KnownSafe = S.IsKnownNested();
+ S.RRI.Calls.insert(Inst);
+ }
+
+ S.IncrementNestCount();
+
+ // A retain can be a potential use; proceed to the generic checking
+ // code below.
+ break; + } + case IC_Release: { + Arg = GetObjCArg(Inst); + + PtrState &S = MyStates.getPtrTopDownState(Arg); + S.DecrementNestCount(); + + switch (S.GetSeq()) { + case S_Retain: + case S_CanRelease: + S.RRI.ReverseInsertPts.clear(); + // FALL THROUGH + case S_Use: + S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); + S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); + Releases[Inst] = S.RRI; + S.ClearSequenceProgress(); + break; + case S_None: + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); + } + break; + } + case IC_AutoreleasepoolPop: + // Conservatively, clear MyStates for all known pointers. + MyStates.clearTopDownPointers(); + return NestingDetected; + case IC_AutoreleasepoolPush: + case IC_None: + // These are irrelevant. + return NestingDetected; + default: + break; + } + + // Consider any other possible effects of this instruction on each + // pointer being tracked. + for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(), + ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) { + const Value *Ptr = MI->first; + if (Ptr == Arg) + continue; // Handled above. + PtrState &S = MI->second; + Sequence Seq = S.GetSeq(); + + // Check for possible releases. + if (CanAlterRefCount(Inst, Ptr, PA, Class)) { + S.ClearRefCount(); + switch (Seq) { + case S_Retain: + S.SetSeq(S_CanRelease); + assert(S.RRI.ReverseInsertPts.empty()); + S.RRI.ReverseInsertPts.insert(Inst); + + // One call can't cause a transition from S_Retain to S_CanRelease + // and S_CanRelease to S_Use. If we've made the first transition, + // we're done. + continue; + case S_Use: + case S_CanRelease: + case S_None: + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); + } + } + + // Check for possible direct uses. + switch (Seq) { + case S_CanRelease: + if (CanUse(Inst, Ptr, PA, Class)) + S.SetSeq(S_Use); + break; + case S_Retain: + case S_Use: + case S_None: + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + llvm_unreachable("top-down pointer in release state!"); + } + } + + return NestingDetected; +} + +bool +ObjCARCOpt::VisitTopDown(BasicBlock *BB, + DenseMap<const BasicBlock *, BBState> &BBStates, + DenseMap<Value *, RRInfo> &Releases) { + bool NestingDetected = false; + BBState &MyStates = BBStates[BB]; + + // Merge the states from each predecessor to compute the initial state + // for the current block. + for (BBState::edge_iterator PI(MyStates.pred_begin()), + PE(MyStates.pred_end()); PI != PE; ++PI) { + const BasicBlock *Pred = *PI; + DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); + assert(I != BBStates.end()); + MyStates.InitFromPred(I->second); + ++PI; + for (; PI != PE; ++PI) { + Pred = *PI; + I = BBStates.find(Pred); + assert(I != BBStates.end()); + MyStates.MergePred(I->second); + } + break; + } + + // Visit all the instructions, top-down. + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + Instruction *Inst = I; + NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates); + } + + CheckForCFGHazards(BB, BBStates, MyStates); + return NestingDetected; +} + +static void +ComputePostOrders(Function &F, + SmallVectorImpl<BasicBlock *> &PostOrder, + SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder, + unsigned NoObjCARCExceptionsMDKind, + DenseMap<const BasicBlock *, BBState> &BBStates) { + /// Visited - The visited set, for doing DFS walks. 
+ SmallPtrSet<BasicBlock *, 16> Visited; + + // Do DFS, computing the PostOrder. + SmallPtrSet<BasicBlock *, 16> OnStack; + SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack; + + // Functions always have exactly one entry block, and we don't have + // any other block that we treat like an entry block. + BasicBlock *EntryBB = &F.getEntryBlock(); + BBState &MyStates = BBStates[EntryBB]; + MyStates.SetAsEntry(); + TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back()); + SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI))); + Visited.insert(EntryBB); + OnStack.insert(EntryBB); + do { + dfs_next_succ: + BasicBlock *CurrBB = SuccStack.back().first; + TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back()); + succ_iterator SE(TI, false); + + // If the terminator is an invoke marked with the + // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be + // ignored, for ARC purposes. + if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind)) + --SE; + + while (SuccStack.back().second != SE) { + BasicBlock *SuccBB = *SuccStack.back().second++; + if (Visited.insert(SuccBB)) { + TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back()); + SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI))); + BBStates[CurrBB].addSucc(SuccBB); + BBState &SuccStates = BBStates[SuccBB]; + SuccStates.addPred(CurrBB); + OnStack.insert(SuccBB); + goto dfs_next_succ; + } + + if (!OnStack.count(SuccBB)) { + BBStates[CurrBB].addSucc(SuccBB); + BBStates[SuccBB].addPred(CurrBB); + } + } + OnStack.erase(CurrBB); + PostOrder.push_back(CurrBB); + SuccStack.pop_back(); + } while (!SuccStack.empty()); + + Visited.clear(); + + // Do reverse-CFG DFS, computing the reverse-CFG PostOrder. + // Functions may have many exits, and there also blocks which we treat + // as exits due to ignored edges. + SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *ExitBB = I; + BBState &MyStates = BBStates[ExitBB]; + if (!MyStates.isExit()) + continue; + + MyStates.SetAsExit(); + + PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin())); + Visited.insert(ExitBB); + while (!PredStack.empty()) { + reverse_dfs_next_succ: + BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end(); + while (PredStack.back().second != PE) { + BasicBlock *BB = *PredStack.back().second++; + if (Visited.insert(BB)) { + PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin())); + goto reverse_dfs_next_succ; + } + } + ReverseCFGPostOrder.push_back(PredStack.pop_back_val().first); + } + } +} + +// Visit - Visit the function both top-down and bottom-up. +bool +ObjCARCOpt::Visit(Function &F, + DenseMap<const BasicBlock *, BBState> &BBStates, + MapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases) { + + // Use reverse-postorder traversals, because we magically know that loops + // will be well behaved, i.e. they won't repeatedly call retain on a single + // pointer without doing a release. We can't use the ReversePostOrderTraversal + // class here because we want the reverse-CFG postorder to consider each + // function exit point, and we want to ignore selected cycle edges. + SmallVector<BasicBlock *, 16> PostOrder; + SmallVector<BasicBlock *, 16> ReverseCFGPostOrder; + ComputePostOrders(F, PostOrder, ReverseCFGPostOrder, + NoObjCARCExceptionsMDKind, + BBStates); + + // Use reverse-postorder on the reverse CFG for bottom-up. 
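+ // (ReverseCFGPostOrder is a postorder of the reverse CFG, so walking it in
+ // reverse visits each block only after all of its CFG successors, modulo
+ // loop back edges, which is the order the bottom-up analysis wants.)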
+ bool BottomUpNestingDetected = false; + for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I = + ReverseCFGPostOrder.rbegin(), E = ReverseCFGPostOrder.rend(); + I != E; ++I) + BottomUpNestingDetected |= VisitBottomUp(*I, BBStates, Retains); + + // Use reverse-postorder for top-down. + bool TopDownNestingDetected = false; + for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I = + PostOrder.rbegin(), E = PostOrder.rend(); + I != E; ++I) + TopDownNestingDetected |= VisitTopDown(*I, BBStates, Releases); + + return TopDownNestingDetected && BottomUpNestingDetected; +} + +/// MoveCalls - Move the calls in RetainsToMove and ReleasesToMove. +void ObjCARCOpt::MoveCalls(Value *Arg, + RRInfo &RetainsToMove, + RRInfo &ReleasesToMove, + MapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases, + SmallVectorImpl<Instruction *> &DeadInsts, + Module *M) { + Type *ArgTy = Arg->getType(); + Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext())); + + // Insert the new retain and release calls. + for (SmallPtrSet<Instruction *, 2>::const_iterator + PI = ReleasesToMove.ReverseInsertPts.begin(), + PE = ReleasesToMove.ReverseInsertPts.end(); PI != PE; ++PI) { + Instruction *InsertPt = *PI; + Value *MyArg = ArgTy == ParamTy ? Arg : + new BitCastInst(Arg, ParamTy, "", InsertPt); + CallInst *Call = + CallInst::Create(RetainsToMove.IsRetainBlock ? + getRetainBlockCallee(M) : getRetainCallee(M), + MyArg, "", InsertPt); + Call->setDoesNotThrow(); + if (RetainsToMove.IsRetainBlock) + Call->setMetadata(CopyOnEscapeMDKind, + MDNode::get(M->getContext(), ArrayRef<Value *>())); + else + Call->setTailCall(); + } + for (SmallPtrSet<Instruction *, 2>::const_iterator + PI = RetainsToMove.ReverseInsertPts.begin(), + PE = RetainsToMove.ReverseInsertPts.end(); PI != PE; ++PI) { + Instruction *InsertPt = *PI; + Value *MyArg = ArgTy == ParamTy ? Arg : + new BitCastInst(Arg, ParamTy, "", InsertPt); + CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg, + "", InsertPt); + // Attach a clang.imprecise_release metadata tag, if appropriate. + if (MDNode *M = ReleasesToMove.ReleaseMetadata) + Call->setMetadata(ImpreciseReleaseMDKind, M); + Call->setDoesNotThrow(); + if (ReleasesToMove.IsTailCallRelease) + Call->setTailCall(); + } + + // Delete the original retain and release calls. + for (SmallPtrSet<Instruction *, 2>::const_iterator + AI = RetainsToMove.Calls.begin(), + AE = RetainsToMove.Calls.end(); AI != AE; ++AI) { + Instruction *OrigRetain = *AI; + Retains.blot(OrigRetain); + DeadInsts.push_back(OrigRetain); + } + for (SmallPtrSet<Instruction *, 2>::const_iterator + AI = ReleasesToMove.Calls.begin(), + AE = ReleasesToMove.Calls.end(); AI != AE; ++AI) { + Instruction *OrigRelease = *AI; + Releases.erase(OrigRelease); + DeadInsts.push_back(OrigRelease); + } +} + +/// PerformCodePlacement - Identify pairings between the retains and releases, +/// and delete and/or move them. +bool +ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> + &BBStates, + MapVector<Value *, RRInfo> &Retains, + DenseMap<Value *, RRInfo> &Releases, + Module *M) { + bool AnyPairsCompletelyEliminated = false; + RRInfo RetainsToMove; + RRInfo ReleasesToMove; + SmallVector<Instruction *, 4> NewRetains; + SmallVector<Instruction *, 4> NewReleases; + SmallVector<Instruction *, 8> DeadInsts; + + // Visit each retain. 
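+ // An illustrative sketch of the kind of pairing sought here:
+ //   %x = call i8* @objc_retain(i8* %p)
+ //   ...                                 ; nothing uses or decrements %p
+ //   call void @objc_release(i8* %p)
+ // Starting from one retain, the loop below pulls in the releases recorded
+ // for it by the bottom-up walk, then the retains recorded for those
+ // releases by the top-down walk, and repeats until both sets stop growing;
+ // path counts are then used to check that the result stays balanced.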
+ for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(), + E = Retains.end(); I != E; ++I) { + Value *V = I->first; + if (!V) continue; // blotted + + Instruction *Retain = cast<Instruction>(V); + Value *Arg = GetObjCArg(Retain); + + // If the object being released is in static or stack storage, we know it's + // not being managed by ObjC reference counting, so we can delete pairs + // regardless of what possible decrements or uses lie between them. + bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg); + + // A constant pointer can't be pointing to an object on the heap. It may + // be reference-counted, but it won't be deleted. + if (const LoadInst *LI = dyn_cast<LoadInst>(Arg)) + if (const GlobalVariable *GV = + dyn_cast<GlobalVariable>( + StripPointerCastsAndObjCCalls(LI->getPointerOperand()))) + if (GV->isConstant()) + KnownSafe = true; + + // If a pair happens in a region where it is known that the reference count + // is already incremented, we can similarly ignore possible decrements. + bool KnownSafeTD = true, KnownSafeBU = true; + + // Connect the dots between the top-down-collected RetainsToMove and + // bottom-up-collected ReleasesToMove to form sets of related calls. + // This is an iterative process so that we connect multiple releases + // to multiple retains if needed. + unsigned OldDelta = 0; + unsigned NewDelta = 0; + unsigned OldCount = 0; + unsigned NewCount = 0; + bool FirstRelease = true; + bool FirstRetain = true; + NewRetains.push_back(Retain); + for (;;) { + for (SmallVectorImpl<Instruction *>::const_iterator + NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) { + Instruction *NewRetain = *NI; + MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain); + assert(It != Retains.end()); + const RRInfo &NewRetainRRI = It->second; + KnownSafeTD &= NewRetainRRI.KnownSafe; + for (SmallPtrSet<Instruction *, 2>::const_iterator + LI = NewRetainRRI.Calls.begin(), + LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) { + Instruction *NewRetainRelease = *LI; + DenseMap<Value *, RRInfo>::const_iterator Jt = + Releases.find(NewRetainRelease); + if (Jt == Releases.end()) + goto next_retain; + const RRInfo &NewRetainReleaseRRI = Jt->second; + assert(NewRetainReleaseRRI.Calls.count(NewRetain)); + if (ReleasesToMove.Calls.insert(NewRetainRelease)) { + OldDelta -= + BBStates[NewRetainRelease->getParent()].GetAllPathCount(); + + // Merge the ReleaseMetadata and IsTailCallRelease values. + if (FirstRelease) { + ReleasesToMove.ReleaseMetadata = + NewRetainReleaseRRI.ReleaseMetadata; + ReleasesToMove.IsTailCallRelease = + NewRetainReleaseRRI.IsTailCallRelease; + FirstRelease = false; + } else { + if (ReleasesToMove.ReleaseMetadata != + NewRetainReleaseRRI.ReleaseMetadata) + ReleasesToMove.ReleaseMetadata = 0; + if (ReleasesToMove.IsTailCallRelease != + NewRetainReleaseRRI.IsTailCallRelease) + ReleasesToMove.IsTailCallRelease = false; + } + + // Collect the optimal insertion points. + if (!KnownSafe) + for (SmallPtrSet<Instruction *, 2>::const_iterator + RI = NewRetainReleaseRRI.ReverseInsertPts.begin(), + RE = NewRetainReleaseRRI.ReverseInsertPts.end(); + RI != RE; ++RI) { + Instruction *RIP = *RI; + if (ReleasesToMove.ReverseInsertPts.insert(RIP)) + NewDelta -= BBStates[RIP->getParent()].GetAllPathCount(); + } + NewReleases.push_back(NewRetainRelease); + } + } + } + NewRetains.clear(); + if (NewReleases.empty()) break; + + // Back the other way. 
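+ // (For each release just collected, pull in the retains that the top-down
+ // walk associated with it, so the two sets grow together.)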
+ for (SmallVectorImpl<Instruction *>::const_iterator + NI = NewReleases.begin(), NE = NewReleases.end(); NI != NE; ++NI) { + Instruction *NewRelease = *NI; + DenseMap<Value *, RRInfo>::const_iterator It = + Releases.find(NewRelease); + assert(It != Releases.end()); + const RRInfo &NewReleaseRRI = It->second; + KnownSafeBU &= NewReleaseRRI.KnownSafe; + for (SmallPtrSet<Instruction *, 2>::const_iterator + LI = NewReleaseRRI.Calls.begin(), + LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) { + Instruction *NewReleaseRetain = *LI; + MapVector<Value *, RRInfo>::const_iterator Jt = + Retains.find(NewReleaseRetain); + if (Jt == Retains.end()) + goto next_retain; + const RRInfo &NewReleaseRetainRRI = Jt->second; + assert(NewReleaseRetainRRI.Calls.count(NewRelease)); + if (RetainsToMove.Calls.insert(NewReleaseRetain)) { + unsigned PathCount = + BBStates[NewReleaseRetain->getParent()].GetAllPathCount(); + OldDelta += PathCount; + OldCount += PathCount; + + // Merge the IsRetainBlock values. + if (FirstRetain) { + RetainsToMove.IsRetainBlock = NewReleaseRetainRRI.IsRetainBlock; + FirstRetain = false; + } else if (ReleasesToMove.IsRetainBlock != + NewReleaseRetainRRI.IsRetainBlock) + // It's not possible to merge the sequences if one uses + // objc_retain and the other uses objc_retainBlock. + goto next_retain; + + // Collect the optimal insertion points. + if (!KnownSafe) + for (SmallPtrSet<Instruction *, 2>::const_iterator + RI = NewReleaseRetainRRI.ReverseInsertPts.begin(), + RE = NewReleaseRetainRRI.ReverseInsertPts.end(); + RI != RE; ++RI) { + Instruction *RIP = *RI; + if (RetainsToMove.ReverseInsertPts.insert(RIP)) { + PathCount = BBStates[RIP->getParent()].GetAllPathCount(); + NewDelta += PathCount; + NewCount += PathCount; + } + } + NewRetains.push_back(NewReleaseRetain); + } + } + } + NewReleases.clear(); + if (NewRetains.empty()) break; + } + + // If the pointer is known incremented or nested, we can safely delete the + // pair regardless of what's between them. + if (KnownSafeTD || KnownSafeBU) { + RetainsToMove.ReverseInsertPts.clear(); + ReleasesToMove.ReverseInsertPts.clear(); + NewCount = 0; + } else { + // Determine whether the new insertion points we computed preserve the + // balance of retain and release calls through the program. + // TODO: If the fully aggressive solution isn't valid, try to find a + // less aggressive solution which is. + if (NewDelta != 0) + goto next_retain; + } + + // Determine whether the original call points are balanced in the retain and + // release calls through the program. If not, conservatively don't touch + // them. + // TODO: It's theoretically possible to do code motion in this case, as + // long as the existing imbalances are maintained. + if (OldDelta != 0) + goto next_retain; + + // Ok, everything checks out and we're all set. Let's move some code! + Changed = true; + assert(OldCount != 0 && "Unreachable code?"); + AnyPairsCompletelyEliminated = NewCount == 0; + NumRRs += OldCount - NewCount; + MoveCalls(Arg, RetainsToMove, ReleasesToMove, + Retains, Releases, DeadInsts, M); + + next_retain: + NewReleases.clear(); + NewRetains.clear(); + RetainsToMove.clear(); + ReleasesToMove.clear(); + } + + // Now that we're done moving everything, we can delete the newly dead + // instructions, as we no longer need them as insert points. + while (!DeadInsts.empty()) + EraseInstruction(DeadInsts.pop_back_val()); + + return AnyPairsCompletelyEliminated; +} + +/// OptimizeWeakCalls - Weak pointer optimizations. 
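+///
+/// As an illustrative example, a redundant load of the same __weak variable
+/// can be reused when nothing in between may modify it:
+///
+///   %1 = call i8* @objc_loadWeakRetained(i8** %p)
+///   %2 = call i8* @objc_loadWeak(i8** %p)
+///
+/// Here %2 can be replaced with %1 if the two pointer operands must alias.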
+void ObjCARCOpt::OptimizeWeakCalls(Function &F) { + // First, do memdep-style RLE and S2L optimizations. We can't use memdep + // itself because it uses AliasAnalysis and we need to do provenance + // queries instead. + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { + Instruction *Inst = &*I++; + InstructionClass Class = GetBasicInstructionClass(Inst); + if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained) + continue; + + // Delete objc_loadWeak calls with no users. + if (Class == IC_LoadWeak && Inst->use_empty()) { + Inst->eraseFromParent(); + continue; + } + + // TODO: For now, just look for an earlier available version of this value + // within the same block. Theoretically, we could do memdep-style non-local + // analysis too, but that would want caching. A better approach would be to + // use the technique that EarlyCSE uses. + inst_iterator Current = llvm::prior(I); + BasicBlock *CurrentBB = Current.getBasicBlockIterator(); + for (BasicBlock::iterator B = CurrentBB->begin(), + J = Current.getInstructionIterator(); + J != B; --J) { + Instruction *EarlierInst = &*llvm::prior(J); + InstructionClass EarlierClass = GetInstructionClass(EarlierInst); + switch (EarlierClass) { + case IC_LoadWeak: + case IC_LoadWeakRetained: { + // If this is loading from the same pointer, replace this load's value + // with that one. + CallInst *Call = cast<CallInst>(Inst); + CallInst *EarlierCall = cast<CallInst>(EarlierInst); + Value *Arg = Call->getArgOperand(0); + Value *EarlierArg = EarlierCall->getArgOperand(0); + switch (PA.getAA()->alias(Arg, EarlierArg)) { + case AliasAnalysis::MustAlias: + Changed = true; + // If the load has a builtin retain, insert a plain retain for it. + if (Class == IC_LoadWeakRetained) { + CallInst *CI = + CallInst::Create(getRetainCallee(F.getParent()), EarlierCall, + "", Call); + CI->setTailCall(); + } + // Zap the fully redundant load. + Call->replaceAllUsesWith(EarlierCall); + Call->eraseFromParent(); + goto clobbered; + case AliasAnalysis::MayAlias: + case AliasAnalysis::PartialAlias: + goto clobbered; + case AliasAnalysis::NoAlias: + break; + } + break; + } + case IC_StoreWeak: + case IC_InitWeak: { + // If this is storing to the same pointer and has the same size etc. + // replace this load's value with the stored value. + CallInst *Call = cast<CallInst>(Inst); + CallInst *EarlierCall = cast<CallInst>(EarlierInst); + Value *Arg = Call->getArgOperand(0); + Value *EarlierArg = EarlierCall->getArgOperand(0); + switch (PA.getAA()->alias(Arg, EarlierArg)) { + case AliasAnalysis::MustAlias: + Changed = true; + // If the load has a builtin retain, insert a plain retain for it. + if (Class == IC_LoadWeakRetained) { + CallInst *CI = + CallInst::Create(getRetainCallee(F.getParent()), EarlierCall, + "", Call); + CI->setTailCall(); + } + // Zap the fully redundant load. + Call->replaceAllUsesWith(EarlierCall->getArgOperand(1)); + Call->eraseFromParent(); + goto clobbered; + case AliasAnalysis::MayAlias: + case AliasAnalysis::PartialAlias: + goto clobbered; + case AliasAnalysis::NoAlias: + break; + } + break; + } + case IC_MoveWeak: + case IC_CopyWeak: + // TOOD: Grab the copied value. + goto clobbered; + case IC_AutoreleasepoolPush: + case IC_None: + case IC_User: + // Weak pointers are only modified through the weak entry points + // (and arbitrary calls, which could call the weak entry points). + break; + default: + // Anything else could modify the weak pointer. 
+ goto clobbered; + } + } + clobbered:; + } + + // Then, for each destroyWeak with an alloca operand, check to see if + // the alloca and all its users can be zapped. + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { + Instruction *Inst = &*I++; + InstructionClass Class = GetBasicInstructionClass(Inst); + if (Class != IC_DestroyWeak) + continue; + + CallInst *Call = cast<CallInst>(Inst); + Value *Arg = Call->getArgOperand(0); + if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) { + for (Value::use_iterator UI = Alloca->use_begin(), + UE = Alloca->use_end(); UI != UE; ++UI) { + const Instruction *UserInst = cast<Instruction>(*UI); + switch (GetBasicInstructionClass(UserInst)) { + case IC_InitWeak: + case IC_StoreWeak: + case IC_DestroyWeak: + continue; + default: + goto done; + } + } + Changed = true; + for (Value::use_iterator UI = Alloca->use_begin(), + UE = Alloca->use_end(); UI != UE; ) { + CallInst *UserInst = cast<CallInst>(*UI++); + switch (GetBasicInstructionClass(UserInst)) { + case IC_InitWeak: + case IC_StoreWeak: + // These functions return their second argument. + UserInst->replaceAllUsesWith(UserInst->getArgOperand(1)); + break; + case IC_DestroyWeak: + // No return value. + break; + default: + llvm_unreachable("alloca really is used!"); + } + UserInst->eraseFromParent(); + } + Alloca->eraseFromParent(); + done:; + } + } +} + +/// OptimizeSequences - Identify program paths which execute sequences of +/// retains and releases which can be eliminated. +bool ObjCARCOpt::OptimizeSequences(Function &F) { + /// Releases, Retains - These are used to store the results of the main flow + /// analysis. These use Value* as the key instead of Instruction* so that the + /// map stays valid when we get around to rewriting code and calls get + /// replaced by arguments. + DenseMap<Value *, RRInfo> Releases; + MapVector<Value *, RRInfo> Retains; + + /// BBStates, This is used during the traversal of the function to track the + /// states for each identified object at each block. + DenseMap<const BasicBlock *, BBState> BBStates; + + // Analyze the CFG of the function, and all instructions. + bool NestingDetected = Visit(F, BBStates, Retains, Releases); + + // Transform. + return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) && + NestingDetected; +} + +/// OptimizeReturns - Look for this pattern: +/// +/// %call = call i8* @something(...) +/// %2 = call i8* @objc_retain(i8* %call) +/// %3 = call i8* @objc_autorelease(i8* %2) +/// ret i8* %3 +/// +/// And delete the retain and autorelease. +/// +/// Otherwise if it's just this: +/// +/// %3 = call i8* @objc_autorelease(i8* %2) +/// ret i8* %3 +/// +/// convert the autorelease to autoreleaseRV. 
+void ObjCARCOpt::OptimizeReturns(Function &F) { + if (!F.getReturnType()->isPointerTy()) + return; + + SmallPtrSet<Instruction *, 4> DependingInstructions; + SmallPtrSet<const BasicBlock *, 4> Visited; + for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { + BasicBlock *BB = FI; + ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back()); + if (!Ret) continue; + + const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0)); + FindDependencies(NeedsPositiveRetainCount, Arg, + BB, Ret, DependingInstructions, Visited, PA); + if (DependingInstructions.size() != 1) + goto next_block; + + { + CallInst *Autorelease = + dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); + if (!Autorelease) + goto next_block; + InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease); + if (!IsAutorelease(AutoreleaseClass)) + goto next_block; + if (GetObjCArg(Autorelease) != Arg) + goto next_block; + + DependingInstructions.clear(); + Visited.clear(); + + // Check that there is nothing that can affect the reference + // count between the autorelease and the retain. + FindDependencies(CanChangeRetainCount, Arg, + BB, Autorelease, DependingInstructions, Visited, PA); + if (DependingInstructions.size() != 1) + goto next_block; + + { + CallInst *Retain = + dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); + + // Check that we found a retain with the same argument. + if (!Retain || + !IsRetain(GetBasicInstructionClass(Retain)) || + GetObjCArg(Retain) != Arg) + goto next_block; + + DependingInstructions.clear(); + Visited.clear(); + + // Convert the autorelease to an autoreleaseRV, since it's + // returning the value. + if (AutoreleaseClass == IC_Autorelease) { + Autorelease->setCalledFunction(getAutoreleaseRVCallee(F.getParent())); + AutoreleaseClass = IC_AutoreleaseRV; + } + + // Check that there is nothing that can affect the reference + // count between the retain and the call. + // Note that Retain need not be in BB. + FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain, + DependingInstructions, Visited, PA); + if (DependingInstructions.size() != 1) + goto next_block; + + { + CallInst *Call = + dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); + + // Check that the pointer is the return value of the call. + if (!Call || Arg != Call) + goto next_block; + + // Check that the call is a regular call. + InstructionClass Class = GetBasicInstructionClass(Call); + if (Class != IC_CallOrUser && Class != IC_Call) + goto next_block; + + // If so, we can zap the retain and autorelease. + Changed = true; + ++NumRets; + EraseInstruction(Retain); + EraseInstruction(Autorelease); + } + } + } + + next_block: + DependingInstructions.clear(); + Visited.clear(); + } +} + +bool ObjCARCOpt::doInitialization(Module &M) { + if (!EnableARCOpts) + return false; + + // If nothing in the Module uses ARC, don't do anything. + Run = ModuleHasARC(M); + if (!Run) + return false; + + // Identify the imprecise release metadata kind. + ImpreciseReleaseMDKind = + M.getContext().getMDKindID("clang.imprecise_release"); + CopyOnEscapeMDKind = + M.getContext().getMDKindID("clang.arc.copy_on_escape"); + NoObjCARCExceptionsMDKind = + M.getContext().getMDKindID("clang.arc.no_objc_arc_exceptions"); + + // Intuitively, objc_retain and others are nocapture, however in practice + // they are not, because they return their argument value. And objc_release + // calls finalizers which can have arbitrary side effects. + + // These are initialized lazily. 
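+ // (A null entry means "not yet created"; each get*Callee helper
+ // materializes its declaration on first use.)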
+ RetainRVCallee = 0; + AutoreleaseRVCallee = 0; + ReleaseCallee = 0; + RetainCallee = 0; + RetainBlockCallee = 0; + AutoreleaseCallee = 0; + + return false; +} + +bool ObjCARCOpt::runOnFunction(Function &F) { + if (!EnableARCOpts) + return false; + + // If nothing in the Module uses ARC, don't do anything. + if (!Run) + return false; + + Changed = false; + + PA.setAA(&getAnalysis<AliasAnalysis>()); + + // This pass performs several distinct transformations. As a compile-time aid + // when compiling code that isn't ObjC, skip these if the relevant ObjC + // library functions aren't declared. + + // Preliminary optimizations. This also computes UsedInThisFunction. + OptimizeIndividualCalls(F); + + // Optimizations for weak pointers. + if (UsedInThisFunction & ((1 << IC_LoadWeak) | + (1 << IC_LoadWeakRetained) | + (1 << IC_StoreWeak) | + (1 << IC_InitWeak) | + (1 << IC_CopyWeak) | + (1 << IC_MoveWeak) | + (1 << IC_DestroyWeak))) + OptimizeWeakCalls(F); + + // Optimizations for retain+release pairs. + if (UsedInThisFunction & ((1 << IC_Retain) | + (1 << IC_RetainRV) | + (1 << IC_RetainBlock))) + if (UsedInThisFunction & (1 << IC_Release)) + // Run OptimizeSequences until it either stops making changes or + // no retain+release pair nesting is detected. + while (OptimizeSequences(F)) {} + + // Optimizations if objc_autorelease is used. + if (UsedInThisFunction & ((1 << IC_Autorelease) | + (1 << IC_AutoreleaseRV))) + OptimizeReturns(F); + + return Changed; +} + +void ObjCARCOpt::releaseMemory() { + PA.clear(); +} + +//===----------------------------------------------------------------------===// +// ARC contraction. +//===----------------------------------------------------------------------===// + +// TODO: ObjCARCContract could insert PHI nodes when uses aren't +// dominated by single calls. + +#include "llvm/Operator.h" +#include "llvm/InlineAsm.h" +#include "llvm/Analysis/Dominators.h" + +STATISTIC(NumStoreStrongs, "Number of objc_storeStrong calls formed"); + +namespace { + /// ObjCARCContract - Late ARC optimizations. These change the IR in a way + /// that makes it difficult for ObjCARCOpt to analyze, so it's run late. + class ObjCARCContract : public FunctionPass { + bool Changed; + AliasAnalysis *AA; + DominatorTree *DT; + ProvenanceAnalysis PA; + + /// Run - A flag indicating whether this optimization pass should run. + bool Run; + + /// StoreStrongCallee, etc. - Declarations for ObjC runtime + /// functions, for use in creating calls to them. These are initialized + /// lazily to avoid cluttering up the Module with unused declarations. + Constant *StoreStrongCallee, + *RetainAutoreleaseCallee, *RetainAutoreleaseRVCallee; + + /// RetainRVMarker - The inline asm string to insert between calls and + /// RetainRV calls to make the optimization work on targets which need it. + const MDString *RetainRVMarker; + + /// StoreStrongCalls - The set of inserted objc_storeStrong calls. If + /// at the end of walking the function we have found no alloca + /// instructions, these calls can be marked "tail".
+ SmallPtrSet<CallInst *, 8> StoreStrongCalls; + + Constant *getStoreStrongCallee(Module *M); + Constant *getRetainAutoreleaseCallee(Module *M); + Constant *getRetainAutoreleaseRVCallee(Module *M); + + bool ContractAutorelease(Function &F, Instruction *Autorelease, + InstructionClass Class, + SmallPtrSet<Instruction *, 4> + &DependingInstructions, + SmallPtrSet<const BasicBlock *, 4> + &Visited); + + void ContractRelease(Instruction *Release, + inst_iterator &Iter); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + + public: + static char ID; + ObjCARCContract() : FunctionPass(ID) { + initializeObjCARCContractPass(*PassRegistry::getPassRegistry()); + } + }; +} + +char ObjCARCContract::ID = 0; +INITIALIZE_PASS_BEGIN(ObjCARCContract, + "objc-arc-contract", "ObjC ARC contraction", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_END(ObjCARCContract, + "objc-arc-contract", "ObjC ARC contraction", false, false) + +Pass *llvm::createObjCARCContractPass() { + return new ObjCARCContract(); +} + +void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AliasAnalysis>(); + AU.addRequired<DominatorTree>(); + AU.setPreservesCFG(); +} + +Constant *ObjCARCContract::getStoreStrongCallee(Module *M) { + if (!StoreStrongCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *I8XX = PointerType::getUnqual(I8X); + Type *Params[] = { I8XX, I8X }; + + AttrListPtr Attributes = AttrListPtr() + .addAttr(~0u, Attribute::NoUnwind) + .addAttr(1, Attribute::NoCapture); + + StoreStrongCallee = + M->getOrInsertFunction( + "objc_storeStrong", + FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false), + Attributes); + } + return StoreStrongCallee; +} + +Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { + if (!RetainAutoreleaseCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + RetainAutoreleaseCallee = + M->getOrInsertFunction("objc_retainAutorelease", FTy, Attributes); + } + return RetainAutoreleaseCallee; +} + +Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { + if (!RetainAutoreleaseRVCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + RetainAutoreleaseRVCallee = + M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy, + Attributes); + } + return RetainAutoreleaseRVCallee; +} + +/// ContractAutorelease - Merge an autorelease with a retain into a fused call. +bool +ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, + InstructionClass Class, + SmallPtrSet<Instruction *, 4> + &DependingInstructions, + SmallPtrSet<const BasicBlock *, 4> + &Visited) { + const Value *Arg = GetObjCArg(Autorelease); + + // Check that there are no instructions between the retain and the autorelease + // (such as an autorelease_pop) which may change the count. 
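+ // As an illustrative sketch (not from the original source), when the
+ // dependency check below succeeds the pair
+ //   %0 = call i8* @objc_retain(i8* %x)
+ //   %1 = call i8* @objc_autorelease(i8* %0)
+ // is contracted by retargeting the retain at objc_retainAutorelease (or
+ // objc_retainAutoreleaseReturnValue for the RV form) and erasing the
+ // autorelease call.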
+ CallInst *Retain = 0; + if (Class == IC_AutoreleaseRV) + FindDependencies(RetainAutoreleaseRVDep, Arg, + Autorelease->getParent(), Autorelease, + DependingInstructions, Visited, PA); + else + FindDependencies(RetainAutoreleaseDep, Arg, + Autorelease->getParent(), Autorelease, + DependingInstructions, Visited, PA); + + Visited.clear(); + if (DependingInstructions.size() != 1) { + DependingInstructions.clear(); + return false; + } + + Retain = dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); + DependingInstructions.clear(); + + if (!Retain || + GetBasicInstructionClass(Retain) != IC_Retain || + GetObjCArg(Retain) != Arg) + return false; + + Changed = true; + ++NumPeeps; + + if (Class == IC_AutoreleaseRV) + Retain->setCalledFunction(getRetainAutoreleaseRVCallee(F.getParent())); + else + Retain->setCalledFunction(getRetainAutoreleaseCallee(F.getParent())); + + EraseInstruction(Autorelease); + return true; +} + +/// ContractRelease - Attempt to merge an objc_release with a store, load, and +/// objc_retain to form an objc_storeStrong. This can be a little tricky because +/// the instructions don't always appear in order, and there may be unrelated +/// intervening instructions. +void ObjCARCContract::ContractRelease(Instruction *Release, + inst_iterator &Iter) { + LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release)); + if (!Load || !Load->isSimple()) return; + + // For now, require everything to be in one basic block. + BasicBlock *BB = Release->getParent(); + if (Load->getParent() != BB) return; + + // Walk down to find the store and the release, which may be in either order. + BasicBlock::iterator I = Load, End = BB->end(); + ++I; + AliasAnalysis::Location Loc = AA->getLocation(Load); + StoreInst *Store = 0; + bool SawRelease = false; + for (; !Store || !SawRelease; ++I) { + if (I == End) + return; + + Instruction *Inst = I; + if (Inst == Release) { + SawRelease = true; + continue; + } + + InstructionClass Class = GetBasicInstructionClass(Inst); + + // Unrelated retains are harmless. + if (IsRetain(Class)) + continue; + + if (Store) { + // The store is the point where we're going to put the objc_storeStrong, + // so make sure there are no uses after it. + if (CanUse(Inst, Load, PA, Class)) + return; + } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) { + // We are moving the load down to the store, so check for anything + // else which writes to the memory between the load and the store. + Store = dyn_cast<StoreInst>(Inst); + if (!Store || !Store->isSimple()) return; + if (Store->getPointerOperand() != Loc.Ptr) return; + } + } + + Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); + + // Walk up to find the retain. 
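+ // (Illustrative sketch, not from the original source: the overall shape
+ // being contracted is
+ //   %old = load i8** %ptr
+ //   %new = call i8* @objc_retain(i8* %val)
+ //   store i8* %new, i8** %ptr
+ //   call void @objc_release(i8* %old)
+ // which becomes the single call
+ //   call void @objc_storeStrong(i8** %ptr, i8* %val)
+ // once the retain feeding the stored value is found.)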
+ I = Store; + BasicBlock::iterator Begin = BB->begin(); + while (I != Begin && GetBasicInstructionClass(I) != IC_Retain) + --I; + Instruction *Retain = I; + if (GetBasicInstructionClass(Retain) != IC_Retain) return; + if (GetObjCArg(Retain) != New) return; + + Changed = true; + ++NumStoreStrongs; + + LLVMContext &C = Release->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *I8XX = PointerType::getUnqual(I8X); + + Value *Args[] = { Load->getPointerOperand(), New }; + if (Args[0]->getType() != I8XX) + Args[0] = new BitCastInst(Args[0], I8XX, "", Store); + if (Args[1]->getType() != I8X) + Args[1] = new BitCastInst(Args[1], I8X, "", Store); + CallInst *StoreStrong = + CallInst::Create(getStoreStrongCallee(BB->getParent()->getParent()), + Args, "", Store); + StoreStrong->setDoesNotThrow(); + StoreStrong->setDebugLoc(Store->getDebugLoc()); + + // We can't set the tail flag yet, because we haven't yet determined + // whether there are any escaping allocas. Remember this call, so that + // we can set the tail flag once we know it's safe. + StoreStrongCalls.insert(StoreStrong); + + if (&*Iter == Store) ++Iter; + Store->eraseFromParent(); + Release->eraseFromParent(); + EraseInstruction(Retain); + if (Load->use_empty()) + Load->eraseFromParent(); +} + +bool ObjCARCContract::doInitialization(Module &M) { + // If nothing in the Module uses ARC, don't do anything. + Run = ModuleHasARC(M); + if (!Run) + return false; + + // These are initialized lazily. + StoreStrongCallee = 0; + RetainAutoreleaseCallee = 0; + RetainAutoreleaseRVCallee = 0; + + // Initialize RetainRVMarker. + RetainRVMarker = 0; + if (NamedMDNode *NMD = + M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker")) + if (NMD->getNumOperands() == 1) { + const MDNode *N = NMD->getOperand(0); + if (N->getNumOperands() == 1) + if (const MDString *S = dyn_cast<MDString>(N->getOperand(0))) + RetainRVMarker = S; + } + + return false; +} + +bool ObjCARCContract::runOnFunction(Function &F) { + if (!EnableARCOpts) + return false; + + // If nothing in the Module uses ARC, don't do anything. + if (!Run) + return false; + + Changed = false; + AA = &getAnalysis<AliasAnalysis>(); + DT = &getAnalysis<DominatorTree>(); + + PA.setAA(&getAnalysis<AliasAnalysis>()); + + // Track whether it's ok to mark objc_storeStrong calls with the "tail" + // keyword. Be conservative if the function has variadic arguments. + // It seems that functions which "return twice" are also unsafe for the + // "tail" argument, because they are setjmp, which could need to + // return to an earlier stack state. + bool TailOkForStoreStrongs = !F.isVarArg() && + !F.callsFunctionThatReturnsTwice(); + + // For ObjC library calls which return their argument, replace uses of the + // argument with uses of the call return value, if it dominates the use. This + // reduces register pressure. + SmallPtrSet<Instruction *, 4> DependingInstructions; + SmallPtrSet<const BasicBlock *, 4> Visited; + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { + Instruction *Inst = &*I++; + + // Only these library routines return their argument. In particular, + // objc_retainBlock does not necessarily return its argument. 
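+ // (That is, objc_retain, objc_autorelease, and their RV and fused
+ // retainAutorelease variants, as dispatched by the switch below.)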
+ InstructionClass Class = GetBasicInstructionClass(Inst); + switch (Class) { + case IC_Retain: + case IC_FusedRetainAutorelease: + case IC_FusedRetainAutoreleaseRV: + break; + case IC_Autorelease: + case IC_AutoreleaseRV: + if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited)) + continue; + break; + case IC_RetainRV: { + // If we're compiling for a target which needs a special inline-asm + // marker to do the retainAutoreleasedReturnValue optimization, + // insert it now. + if (!RetainRVMarker) + break; + BasicBlock::iterator BBI = Inst; + BasicBlock *InstParent = Inst->getParent(); + + // Step up to see if the call immediately precedes the RetainRV call. + // If it's an invoke, we have to cross a block boundary. And we have + // to carefully dodge no-op instructions. + do { + if (&*BBI == InstParent->begin()) { + BasicBlock *Pred = InstParent->getSinglePredecessor(); + if (!Pred) + goto decline_rv_optimization; + BBI = Pred->getTerminator(); + break; + } + --BBI; + } while (isNoopInstruction(BBI)); + + if (&*BBI == GetObjCArg(Inst)) { + Changed = true; + InlineAsm *IA = + InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()), + /*isVarArg=*/false), + RetainRVMarker->getString(), + /*Constraints=*/"", /*hasSideEffects=*/true); + CallInst::Create(IA, "", Inst); + } + decline_rv_optimization: + break; + } + case IC_InitWeak: { + // objc_initWeak(p, null) => *p = null + CallInst *CI = cast<CallInst>(Inst); + if (isNullOrUndef(CI->getArgOperand(1))) { + Value *Null = + ConstantPointerNull::get(cast<PointerType>(CI->getType())); + Changed = true; + new StoreInst(Null, CI->getArgOperand(0), CI); + CI->replaceAllUsesWith(Null); + CI->eraseFromParent(); + } + continue; + } + case IC_Release: + ContractRelease(Inst, I); + continue; + case IC_User: + // Be conservative if the function has any alloca instructions. + // Technically we only care about escaping alloca instructions, + // but this is sufficient to handle some interesting cases. + if (isa<AllocaInst>(Inst)) + TailOkForStoreStrongs = false; + continue; + default: + continue; + } + + // Don't use GetObjCArg because we don't want to look through bitcasts + // and such; to do the replacement, the argument must have type i8*. + const Value *Arg = cast<CallInst>(Inst)->getArgOperand(0); + for (;;) { + // If we're compiling bugpointed code, don't get in trouble. + if (!isa<Instruction>(Arg) && !isa<Argument>(Arg)) + break; + // Look through the uses of the pointer. + for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); + UI != UE; ) { + Use &U = UI.getUse(); + unsigned OperandNo = UI.getOperandNo(); + ++UI; // Increment UI now, because we may unlink its element. + + // If the call's return value dominates a use of the call's argument + // value, rewrite the use to use the return value. We check for + // reachability here because an unreachable call is considered to + // trivially dominate itself, which would lead us to rewriting its + // argument in terms of its return value, which would lead to + // infinite loops in GetObjCArg. + if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) { + Changed = true; + Instruction *Replacement = Inst; + Type *UseTy = U.get()->getType(); + if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) { + // For PHI nodes, insert the bitcast in the predecessor block. 
+ unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo); + BasicBlock *BB = PHI->getIncomingBlock(ValNo); + if (Replacement->getType() != UseTy) + Replacement = new BitCastInst(Replacement, UseTy, "", + &BB->back()); + // While we're here, rewrite all edges for this PHI, rather + // than just one use at a time, to minimize the number of + // bitcasts we emit. + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) + if (PHI->getIncomingBlock(i) == BB) { + // Keep the UI iterator valid. + if (&PHI->getOperandUse( + PHINode::getOperandNumForIncomingValue(i)) == + &UI.getUse()) + ++UI; + PHI->setIncomingValue(i, Replacement); + } + } else { + if (Replacement->getType() != UseTy) + Replacement = new BitCastInst(Replacement, UseTy, "", + cast<Instruction>(U.getUser())); + U.set(Replacement); + } + } + } + + // If Arg is a no-op casted pointer, strip one level of casts and iterate. + if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg)) + Arg = BI->getOperand(0); + else if (isa<GEPOperator>(Arg) && + cast<GEPOperator>(Arg)->hasAllZeroIndices()) + Arg = cast<GEPOperator>(Arg)->getPointerOperand(); + else if (isa<GlobalAlias>(Arg) && + !cast<GlobalAlias>(Arg)->mayBeOverridden()) + Arg = cast<GlobalAlias>(Arg)->getAliasee(); + else + break; + } + } + + // If this function has no escaping allocas or suspicious vararg usage, + // objc_storeStrong calls can be marked with the "tail" keyword. + if (TailOkForStoreStrongs) + for (SmallPtrSet<CallInst *, 8>::iterator I = StoreStrongCalls.begin(), + E = StoreStrongCalls.end(); I != E; ++I) + (*I)->setTailCall(); + StoreStrongCalls.clear(); + + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp new file mode 100644 index 0000000..09687d8 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -0,0 +1,1704 @@ +//===- Reassociate.cpp - Reassociate binary expressions -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass reassociates commutative expressions in an order that is designed +// to promote better constant propagation, GCSE, LICM, PRE, etc. +// +// For example: 4 + (x + 5) -> x + (4 + 5) +// +// In the implementation of this algorithm, constants are assigned rank = 0, +// function arguments are rank = 1, and other values are assigned ranks +// corresponding to the reverse post order traversal of current function +// (starting at 2), which effectively gives values in deep loops higher rank +// than values not in loops. 
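+//
+// For illustration (a sketch, not from the original source): given
+//   %t = add i32 %a, %b     ; %a and %b are incoming arguments
+//   %u = add i32 %t, 4
+// the constant 4 has rank 0, the arguments have small fixed ranks, and %t
+// receives a rank one higher than the larger of its operands' ranks, so
+// rewriting tends to group the cheapest (lowest-rank) values together.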
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "reassociate" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ValueHandle.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +using namespace llvm; + +STATISTIC(NumChanged, "Number of insts reassociated"); +STATISTIC(NumAnnihil, "Number of expr tree annihilated"); +STATISTIC(NumFactor , "Number of multiplies factored"); + +namespace { + struct ValueEntry { + unsigned Rank; + Value *Op; + ValueEntry(unsigned R, Value *O) : Rank(R), Op(O) {} + }; + inline bool operator<(const ValueEntry &LHS, const ValueEntry &RHS) { + return LHS.Rank > RHS.Rank; // Sort so that highest rank goes to start. + } +} + +#ifndef NDEBUG +/// PrintOps - Print out the expression identified in the Ops list. +/// +static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { + Module *M = I->getParent()->getParent()->getParent(); + dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " " + << *Ops[0].Op->getType() << '\t'; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + dbgs() << "[ "; + WriteAsOperand(dbgs(), Ops[i].Op, false, M); + dbgs() << ", #" << Ops[i].Rank << "] "; + } +} +#endif + +namespace { + /// \brief Utility class representing a base and exponent pair which form one + /// factor of some product. + struct Factor { + Value *Base; + unsigned Power; + + Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {} + + /// \brief Sort factors by their Base. + struct BaseSorter { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Base < RHS.Base; + } + }; + + /// \brief Compare factors for equal bases. + struct BaseEqual { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Base == RHS.Base; + } + }; + + /// \brief Sort factors in descending order by their power. + struct PowerDescendingSorter { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Power > RHS.Power; + } + }; + + /// \brief Compare factors for equal powers. 
+ struct PowerEqual { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Power == RHS.Power; + } + }; + }; +} + +namespace { + class Reassociate : public FunctionPass { + DenseMap<BasicBlock*, unsigned> RankMap; + DenseMap<AssertingVH<Value>, unsigned> ValueRankMap; + SetVector<AssertingVH<Instruction> > RedoInsts; + bool MadeChange; + public: + static char ID; // Pass identification, replacement for typeid + Reassociate() : FunctionPass(ID) { + initializeReassociatePass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + } + private: + void BuildRankMap(Function &F); + unsigned getRank(Value *V); + void ReassociateExpression(BinaryOperator *I); + void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); + Value *OptimizeExpression(BinaryOperator *I, + SmallVectorImpl<ValueEntry> &Ops); + Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops); + bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, + SmallVectorImpl<Factor> &Factors); + Value *buildMinimalMultiplyDAG(IRBuilder<> &Builder, + SmallVectorImpl<Factor> &Factors); + Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); + Value *RemoveFactorFromExpression(Value *V, Value *Factor); + void EraseInst(Instruction *I); + void OptimizeInst(Instruction *I); + }; +} + +char Reassociate::ID = 0; +INITIALIZE_PASS(Reassociate, "reassociate", + "Reassociate expressions", false, false) + +// Public interface to the Reassociate pass +FunctionPass *llvm::createReassociatePass() { return new Reassociate(); } + +/// isReassociableOp - Return true if V is an instruction of the specified +/// opcode and if it only has one use. +static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { + if (V->hasOneUse() && isa<Instruction>(V) && + cast<Instruction>(V)->getOpcode() == Opcode) + return cast<BinaryOperator>(V); + return 0; +} + +static bool isUnmovableInstruction(Instruction *I) { + if (I->getOpcode() == Instruction::PHI || + I->getOpcode() == Instruction::LandingPad || + I->getOpcode() == Instruction::Alloca || + I->getOpcode() == Instruction::Load || + I->getOpcode() == Instruction::Invoke || + (I->getOpcode() == Instruction::Call && + !isa<DbgInfoIntrinsic>(I)) || + I->getOpcode() == Instruction::UDiv || + I->getOpcode() == Instruction::SDiv || + I->getOpcode() == Instruction::FDiv || + I->getOpcode() == Instruction::URem || + I->getOpcode() == Instruction::SRem || + I->getOpcode() == Instruction::FRem) + return true; + return false; +} + +void Reassociate::BuildRankMap(Function &F) { + unsigned i = 2; + + // Assign distinct ranks to function arguments + for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) + ValueRankMap[&*I] = ++i; + + ReversePostOrderTraversal<Function*> RPOT(&F); + for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(), + E = RPOT.end(); I != E; ++I) { + BasicBlock *BB = *I; + unsigned BBRank = RankMap[BB] = ++i << 16; + + // Walk the basic block, adding precomputed ranks for any instructions that + // we cannot move. This ensures that the ranks for these instructions are + // all different in the block. 
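+ // (The block's base rank was shifted left by 16 bits above, so these
+ // per-instruction ranks stay within their block's range.)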
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + if (isUnmovableInstruction(I)) + ValueRankMap[&*I] = ++BBRank; + } +} + +unsigned Reassociate::getRank(Value *V) { + Instruction *I = dyn_cast<Instruction>(V); + if (I == 0) { + if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument. + return 0; // Otherwise it's a global or constant, rank 0. + } + + if (unsigned Rank = ValueRankMap[I]) + return Rank; // Rank already known? + + // If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that + // we can reassociate expressions for code motion! Since we do not recurse + // for PHI nodes, we cannot have infinite recursion here, because there + // cannot be loops in the value graph that do not go through PHI nodes. + unsigned Rank = 0, MaxRank = RankMap[I->getParent()]; + for (unsigned i = 0, e = I->getNumOperands(); + i != e && Rank != MaxRank; ++i) + Rank = std::max(Rank, getRank(I->getOperand(i))); + + // If this is a not or neg instruction, do not count it for rank. This + // assures us that X and ~X will have the same rank. + if (!I->getType()->isIntegerTy() || + (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I))) + ++Rank; + + //DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " + // << Rank << "\n"); + + return ValueRankMap[I] = Rank; +} + +/// LowerNegateToMultiply - Replace 0-X with X*-1. +/// +static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) { + Constant *Cst = Constant::getAllOnesValue(Neg->getType()); + + BinaryOperator *Res = + BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg); + Neg->setOperand(1, Constant::getNullValue(Neg->getType())); // Drop use of op. + Res->takeName(Neg); + Neg->replaceAllUsesWith(Res); + Res->setDebugLoc(Neg->getDebugLoc()); + return Res; +} + +/// CarmichaelShift - Returns k such that lambda(2^Bitwidth) = 2^k, where lambda +/// is the Carmichael function. This means that x^(2^k) === 1 mod 2^Bitwidth for +/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic. +/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every +/// even x in Bitwidth-bit arithmetic. +static unsigned CarmichaelShift(unsigned Bitwidth) { + if (Bitwidth < 3) + return Bitwidth - 1; + return Bitwidth - 2; +} + +/// IncorporateWeight - Add the extra weight 'RHS' to the existing weight 'LHS', +/// reducing the combined weight using any special properties of the operation. +/// The existing weight LHS represents the computation X op X op ... op X where +/// X occurs LHS times. The combined weight represents X op X op ... op X with +/// X occurring LHS + RHS times. If op is "Xor" for example then the combined +/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even; +/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second. +static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { + // If we were working with infinite precision arithmetic then the combined + // weight would be LHS + RHS. But we are using finite precision arithmetic, + // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct + // for nilpotent operations and addition, but not for idempotent operations + // and multiplication), so it is important to correctly reduce the combined + // weight back into range if wrapping would be wrong. + + // If RHS is zero then the weight didn't change. + if (RHS.isMinValue()) + return; + // If LHS is zero then the combined weight is RHS. 
+ if (LHS.isMinValue()) { + LHS = RHS; + return; + } + // From this point on we know that neither LHS nor RHS is zero. + + if (Instruction::isIdempotent(Opcode)) { + // Idempotent means X op X === X, so any non-zero weight is equivalent to a + // weight of 1. Keeping weights at zero or one also means that wrapping is + // not a problem. + assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); + return; // Return a weight of 1. + } + if (Instruction::isNilpotent(Opcode)) { + // Nilpotent means X op X === 0, so reduce weights modulo 2. + assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); + LHS = 0; // 1 + 1 === 0 modulo 2. + return; + } + if (Opcode == Instruction::Add) { + // TODO: Reduce the weight by exploiting nsw/nuw? + LHS += RHS; + return; + } + + assert(Opcode == Instruction::Mul && "Unknown associative operation!"); + unsigned Bitwidth = LHS.getBitWidth(); + // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth + // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth + // bit number x, since either x is odd in which case x^CM = 1, or x is even in + // which case both x^W and x^(W - CM) are zero. By subtracting off multiples + // of CM like this weights can always be reduced to the range [0, CM+Bitwidth) + // which by a happy accident means that they can always be represented using + // Bitwidth bits. + // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than + // the Carmichael number). + if (Bitwidth > 3) { + /// CM - The value of Carmichael's lambda function. + APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth)); + // Any weight W >= Threshold can be replaced with W - CM. + APInt Threshold = CM + Bitwidth; + assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!"); + // For Bitwidth 4 or more the following sum does not overflow. + LHS += RHS; + while (LHS.uge(Threshold)) + LHS -= CM; + } else { + // To avoid problems with overflow do everything the same as above but using + // a larger type. + unsigned CM = 1U << CarmichaelShift(Bitwidth); + unsigned Threshold = CM + Bitwidth; + assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold && + "Weights not reduced!"); + unsigned Total = LHS.getZExtValue() + RHS.getZExtValue(); + while (Total >= Threshold) + Total -= CM; + LHS = Total; + } +} + +/// EvaluateRepeatedConstant - Compute C op C op ... op C where the constant C +/// is repeated Weight times. +static Constant *EvaluateRepeatedConstant(unsigned Opcode, Constant *C, + APInt Weight) { + // For addition the result can be efficiently computed as the product of the + // constant and the weight. + if (Opcode == Instruction::Add) + return ConstantExpr::getMul(C, ConstantInt::get(C->getContext(), Weight)); + + // The weight might be huge, so compute by repeated squaring to ensure that + // compile time is proportional to the logarithm of the weight. + Constant *Result = 0; + Constant *Power = C; // Successively C, C op C, (C op C) op (C op C) etc. + // Visit the bits in Weight. + while (Weight != 0) { + // If the current bit in Weight is non-zero do Result = Result op Power. + if (Weight[0]) + Result = Result ? ConstantExpr::get(Opcode, Result, Power) : Power; + // Move on to the next bit if any more are non-zero. + Weight = Weight.lshr(1); + if (Weight.isMinValue()) + break; + // Square the power. 
+ Power = ConstantExpr::get(Opcode, Power, Power); + } + + assert(Result && "Only positive weights supported!"); + return Result; +} + +typedef std::pair<Value*, APInt> RepeatedValue; + +/// LinearizeExprTree - Given an associative binary expression, return the leaf +/// nodes in Ops along with their weights (how many times the leaf occurs). The +/// original expression is the same as +/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times +/// op +/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times +/// op +/// ... +/// op +/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times +/// +/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct, and +/// they are all non-constant except possibly for the last one, which if it is +/// constant will have weight one (Ops[N].second === 1). +/// +/// This routine may modify the function, in which case it returns 'true'. The +/// changes it makes may well be destructive, changing the value computed by 'I' +/// to something completely different. Thus if the routine returns 'true' then +/// you MUST either replace I with a new expression computed from the Ops array, +/// or use RewriteExprTree to put the values back in. +/// +/// A leaf node is either not a binary operation of the same kind as the root +/// node 'I' (i.e. is not a binary operator at all, or is, but with a different +/// opcode), or is the same kind of binary operator but has a use which either +/// does not belong to the expression, or does belong to the expression but is +/// a leaf node. Every leaf node has at least one use that is a non-leaf node +/// of the expression, while for non-leaf nodes (except for the root 'I') every +/// use is a non-leaf node of the expression. +/// +/// For example: +/// expression graph node names +/// +/// + | I +/// / \ | +/// + + | A, B +/// / \ / \ | +/// * + * | C, D, E +/// / \ / \ / \ | +/// + * | F, G +/// +/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in +/// that order) (C, 1), (E, 1), (F, 2), (G, 2). +/// +/// The expression is maximal: if some instruction is a binary operator of the +/// same kind as 'I', and all of its uses are non-leaf nodes of the expression, +/// then the instruction also belongs to the expression, is not a leaf node of +/// it, and its operands also belong to the expression (but may be leaf nodes). +/// +/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in +/// order to ensure that every non-root node in the expression has *exactly one* +/// use by a non-leaf node of the expression. This destruction means that the +/// caller MUST either replace 'I' with a new expression or use something like +/// RewriteExprTree to put the values back in if the routine indicates that it +/// made a change by returning 'true'. +/// +/// In the above example either the right operand of A or the left operand of B +/// will be replaced by undef. If it is B's operand then this gives: +/// +/// + | I +/// / \ | +/// + + | A, B - operand of B replaced with undef +/// / \ \ | +/// * + * | C, D, E +/// / \ / \ / \ | +/// + * | F, G +/// +/// Note that such undef operands can only be reached by passing through 'I'. +/// For example, if you visit operands recursively starting from a leaf node +/// then you will never see such an undef operand unless you get back to 'I', +/// which requires passing through a phi node. 
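+///
+/// As a small illustration (not from the original source), linearizing the
+/// expression (X + X) + Y produces the pairs (X, 2) and (Y, 1): X is
+/// reachable from the root along two paths, so its weight is 2.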
+/// +/// Note that this routine may also mutate binary operators of the wrong type +/// that have all uses inside the expression (i.e. only used by non-leaf nodes +/// of the expression) if it can turn them into binary operators of the right +/// type and thus make the expression bigger. + +static bool LinearizeExprTree(BinaryOperator *I, + SmallVectorImpl<RepeatedValue> &Ops) { + DEBUG(dbgs() << "LINEARIZE: " << *I << '\n'); + unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits(); + unsigned Opcode = I->getOpcode(); + assert(Instruction::isAssociative(Opcode) && + Instruction::isCommutative(Opcode) && + "Expected an associative and commutative operation!"); + // If we see an absorbing element then the entire expression must be equal to + // it. For example, if this is a multiplication expression and zero occurs as + // an operand somewhere in it then the result of the expression must be zero. + Constant *Absorber = ConstantExpr::getBinOpAbsorber(Opcode, I->getType()); + + // Visit all operands of the expression, keeping track of their weight (the + // number of paths from the expression root to the operand, or if you like + // the number of times that operand occurs in the linearized expression). + // For example, if I = X + A, where X = A + B, then I, X and B have weight 1 + // while A has weight two. + + // Worklist of non-leaf nodes (their operands are in the expression too) along + // with their weights, representing a certain number of paths to the operator. + // If an operator occurs in the worklist multiple times then we found multiple + // ways to get to it. + SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight) + Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1))); + bool MadeChange = false; + + // Leaves of the expression are values that either aren't the right kind of + // operation (eg: a constant, or a multiply in an add tree), or are, but have + // some uses that are not inside the expression. For example, in I = X + X, + // X = A + B, the value X has two uses (by I) that are in the expression. If + // X has any other uses, for example in a return instruction, then we consider + // X to be a leaf, and won't analyze it further. When we first visit a value, + // if it has more than one use then at first we conservatively consider it to + // be a leaf. Later, as the expression is explored, we may discover some more + // uses of the value from inside the expression. If all uses turn out to be + // from within the expression (and the value is a binary operator of the right + // kind) then the value is no longer considered to be a leaf, and its operands + // are explored. + + // Leaves - Keeps track of the set of putative leaves as well as the number of + // paths to each leaf seen so far. + typedef DenseMap<Value*, APInt> LeafMap; + LeafMap Leaves; // Leaf -> Total weight so far. + SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order. + +#ifndef NDEBUG + SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme. +#endif + while (!Worklist.empty()) { + std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val(); + I = P.first; // We examine the operands of this binary operator. + + for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands. + Value *Op = I->getOperand(OpIdx); + APInt Weight = P.second; // Number of paths to this operand. 
+ DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n"); + assert(!Op->use_empty() && "No uses, so how did we get to it?!"); + + // If the expression contains an absorbing element then there is no need + // to analyze it further: it must evaluate to the absorbing element. + if (Op == Absorber && !Weight.isMinValue()) { + Ops.push_back(std::make_pair(Absorber, APInt(Bitwidth, 1))); + return MadeChange; + } + + // If this is a binary operation of the right kind with only one use then + // add its operands to the expression. + if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { + assert(Visited.insert(Op) && "Not first visit!"); + DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n"); + Worklist.push_back(std::make_pair(BO, Weight)); + continue; + } + + // Appears to be a leaf. Is the operand already in the set of leaves? + LeafMap::iterator It = Leaves.find(Op); + if (It == Leaves.end()) { + // Not in the leaf map. Must be the first time we saw this operand. + assert(Visited.insert(Op) && "Not first visit!"); + if (!Op->hasOneUse()) { + // This value has uses not accounted for by the expression, so it is + // not safe to modify. Mark it as being a leaf. + DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n"); + LeafOrder.push_back(Op); + Leaves[Op] = Weight; + continue; + } + // No uses outside the expression, try morphing it. + } else if (It != Leaves.end()) { + // Already in the leaf map. + assert(Visited.count(Op) && "In leaf map but not visited!"); + + // Update the number of paths to the leaf. + IncorporateWeight(It->second, Weight, Opcode); + +#if 0 // TODO: Re-enable once PR13021 is fixed. + // The leaf already has one use from inside the expression. As we want + // exactly one such use, drop this new use of the leaf. + assert(!Op->hasOneUse() && "Only one use, but we got here twice!"); + I->setOperand(OpIdx, UndefValue::get(I->getType())); + MadeChange = true; + + // If the leaf is a binary operation of the right kind and we now see + // that its multiple original uses were in fact all by nodes belonging + // to the expression, then no longer consider it to be a leaf and add + // its operands to the expression. + if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { + DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n"); + Worklist.push_back(std::make_pair(BO, It->second)); + Leaves.erase(It); + continue; + } +#endif + + // If we still have uses that are not accounted for by the expression + // then it is not safe to modify the value. + if (!Op->hasOneUse()) + continue; + + // No uses outside the expression, try morphing it. + Weight = It->second; + Leaves.erase(It); // Since the value may be morphed below. + } + + // At this point we have a value which, first of all, is not a binary + // expression of the right kind, and secondly, is only used inside the + // expression. This means that it can safely be modified. See if we + // can usefully morph it into an expression of the right kind. + assert((!isa<Instruction>(Op) || + cast<Instruction>(Op)->getOpcode() != Opcode) && + "Should have been handled above!"); + assert(Op->hasOneUse() && "Has uses outside the expression tree!"); + + // If this is a multiply expression, turn any internal negations into + // multiplies by -1 so they can be reassociated. 
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(Op); + if (Opcode == Instruction::Mul && BO && BinaryOperator::isNeg(BO)) { + DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO "); + BO = LowerNegateToMultiply(BO); + DEBUG(dbgs() << *BO << '\n'); + Worklist.push_back(std::make_pair(BO, Weight)); + MadeChange = true; + continue; + } + + // Failed to morph into an expression of the right type. This really is + // a leaf. + DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n"); + assert(!isReassociableOp(Op, Opcode) && "Value was morphed?"); + LeafOrder.push_back(Op); + Leaves[Op] = Weight; + } + } + + // The leaves, repeated according to their weights, represent the linearized + // form of the expression. + Constant *Cst = 0; // Accumulate constants here. + for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) { + Value *V = LeafOrder[i]; + LeafMap::iterator It = Leaves.find(V); + if (It == Leaves.end()) + // Node initially thought to be a leaf wasn't. + continue; + assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!"); + APInt Weight = It->second; + if (Weight.isMinValue()) + // Leaf already output or weight reduction eliminated it. + continue; + // Ensure the leaf is only output once. + It->second = 0; + // Glob all constants together into Cst. + if (Constant *C = dyn_cast<Constant>(V)) { + C = EvaluateRepeatedConstant(Opcode, C, Weight); + Cst = Cst ? ConstantExpr::get(Opcode, Cst, C) : C; + continue; + } + // Add non-constant + Ops.push_back(std::make_pair(V, Weight)); + } + + // Add any constants back into Ops, all globbed together and reduced to having + // weight 1 for the convenience of users. + Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType()); + if (Cst && Cst != Identity) { + // If combining multiple constants resulted in the absorber then the entire + // expression must evaluate to the absorber. + if (Cst == Absorber) + Ops.clear(); + Ops.push_back(std::make_pair(Cst, APInt(Bitwidth, 1))); + } + + // For nilpotent operations or addition there may be no operands, for example + // because the expression was "X xor X" or consisted of 2^Bitwidth additions: + // in both cases the weight reduces to 0 causing the value to be skipped. + if (Ops.empty()) { + assert(Identity && "Associative operation without identity!"); + Ops.push_back(std::make_pair(Identity, APInt(Bitwidth, 1))); + } + + return MadeChange; +} + +// RewriteExprTree - Now that the operands for this expression tree are +// linearized and optimized, emit them in-order. +void Reassociate::RewriteExprTree(BinaryOperator *I, + SmallVectorImpl<ValueEntry> &Ops) { + assert(Ops.size() > 1 && "Single values should be used directly!"); + + // Since our optimizations never increase the number of operations, the new + // expression can always be written by reusing the existing binary operators + // from the original expression tree, without creating any new instructions, + // though the rewritten expression may have a completely different topology. + // We take care to not change anything if the new expression will be the same + // as the original. If more than trivial changes (like commuting operands) + // were made then we are obliged to clear out any optional subclass data like + // nsw flags. + + /// NodesToRewrite - Nodes from the original expression available for writing + /// the new expression into.
+ SmallVector<BinaryOperator*, 8> NodesToRewrite; + unsigned Opcode = I->getOpcode(); + BinaryOperator *Op = I; + + // ExpressionChanged - Non-null if the rewritten expression differs from the + // original in some non-trivial way, requiring the clearing of optional flags. + // Flags are cleared from the operator in ExpressionChanged up to I inclusive. + BinaryOperator *ExpressionChanged = 0; + for (unsigned i = 0; ; ++i) { + // The last operation (which comes earliest in the IR) is special as both + // operands will come from Ops, rather than just one with the other being + // a subexpression. + if (i+2 == Ops.size()) { + Value *NewLHS = Ops[i].Op; + Value *NewRHS = Ops[i+1].Op; + Value *OldLHS = Op->getOperand(0); + Value *OldRHS = Op->getOperand(1); + + if (NewLHS == OldLHS && NewRHS == OldRHS) + // Nothing changed, leave it alone. + break; + + if (NewLHS == OldRHS && NewRHS == OldLHS) { + // The order of the operands was reversed. Swap them. + DEBUG(dbgs() << "RA: " << *Op << '\n'); + Op->swapOperands(); + DEBUG(dbgs() << "TO: " << *Op << '\n'); + MadeChange = true; + ++NumChanged; + break; + } + + // The new operation differs non-trivially from the original. Overwrite + // the old operands with the new ones. + DEBUG(dbgs() << "RA: " << *Op << '\n'); + if (NewLHS != OldLHS) { + if (BinaryOperator *BO = isReassociableOp(OldLHS, Opcode)) + NodesToRewrite.push_back(BO); + Op->setOperand(0, NewLHS); + } + if (NewRHS != OldRHS) { + if (BinaryOperator *BO = isReassociableOp(OldRHS, Opcode)) + NodesToRewrite.push_back(BO); + Op->setOperand(1, NewRHS); + } + DEBUG(dbgs() << "TO: " << *Op << '\n'); + + ExpressionChanged = Op; + MadeChange = true; + ++NumChanged; + + break; + } + + // Not the last operation. The left-hand side will be a sub-expression + // while the right-hand side will be the current element of Ops. + Value *NewRHS = Ops[i].Op; + if (NewRHS != Op->getOperand(1)) { + DEBUG(dbgs() << "RA: " << *Op << '\n'); + if (NewRHS == Op->getOperand(0)) { + // The new right-hand side was already present as the left operand. If + // we are lucky then swapping the operands will sort out both of them. + Op->swapOperands(); + } else { + // Overwrite with the new right-hand side. + if (BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode)) + NodesToRewrite.push_back(BO); + Op->setOperand(1, NewRHS); + ExpressionChanged = Op; + } + DEBUG(dbgs() << "TO: " << *Op << '\n'); + MadeChange = true; + ++NumChanged; + } + + // Now deal with the left-hand side. If this is already an operation node + // from the original expression then just rewrite the rest of the expression + // into it. + if (BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode)) { + Op = BO; + continue; + } + + // Otherwise, grab a spare node from the original expression and use that as + // the left-hand side. If there are no nodes left then the optimizers made + // an expression with more nodes than the original! This usually means that + // they did something stupid but it might mean that the problem was just too + // hard (finding the minimal number of multiplications needed to realize a + // multiplication expression is NP-complete). Whatever the reason, smart or + // stupid, create a new node if there are none left.
+ BinaryOperator *NewOp; + if (NodesToRewrite.empty()) { + Constant *Undef = UndefValue::get(I->getType()); + NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode), + Undef, Undef, "", I); + } else { + NewOp = NodesToRewrite.pop_back_val(); + } + + DEBUG(dbgs() << "RA: " << *Op << '\n'); + Op->setOperand(0, NewOp); + DEBUG(dbgs() << "TO: " << *Op << '\n'); + ExpressionChanged = Op; + MadeChange = true; + ++NumChanged; + Op = NewOp; + } + + // If the expression changed non-trivially then clear out all subclass data + // starting from the operator specified in ExpressionChanged, and compactify + // the operators to just before the expression root to guarantee that the + // expression tree is dominated by all of Ops. + if (ExpressionChanged) + do { + ExpressionChanged->clearSubclassOptionalData(); + if (ExpressionChanged == I) + break; + ExpressionChanged->moveBefore(I); + ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->use_begin()); + } while (1); + + // Throw away any left over nodes from the original expression. + for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i) + RedoInsts.insert(NodesToRewrite[i]); +} + +/// NegateValue - Insert instructions before the instruction pointed to by BI, +/// that computes the negative version of the value specified. The negative +/// version of the value is returned, and BI is left pointing at the instruction +/// that should be processed next by the reassociation pass. +static Value *NegateValue(Value *V, Instruction *BI) { + if (Constant *C = dyn_cast<Constant>(V)) + return ConstantExpr::getNeg(C); + + // We are trying to expose opportunity for reassociation. One of the things + // that we want to do to achieve this is to push a negation as deep into an + // expression chain as possible, to expose the add instructions. In practice, + // this means that we turn this: + // X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D + // so that later, a: Y = 12+X could get reassociated with the -12 to eliminate + // the constants. We assume that instcombine will clean up the mess later if + // we introduce tons of unnecessary negation instructions. + // + if (BinaryOperator *I = isReassociableOp(V, Instruction::Add)) { + // Push the negates through the add. + I->setOperand(0, NegateValue(I->getOperand(0), BI)); + I->setOperand(1, NegateValue(I->getOperand(1), BI)); + + // We must move the add instruction here, because the neg instructions do + // not dominate the old add instruction in general. By moving it, we are + // assured that the neg instructions we just inserted dominate the + // instruction we are about to insert after them. + // + I->moveBefore(BI); + I->setName(I->getName()+".neg"); + return I; + } + + // Okay, we need to materialize a negated version of V with an instruction. + // Scan the use lists of V to see if we have one already. + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){ + User *U = *UI; + if (!BinaryOperator::isNeg(U)) continue; + + // We found one! Now we have to make sure that the definition dominates + // this use. We do this by moving it to the entry block (if it is a + // non-instruction value) or right after the definition. These negates will + // be zapped by reassociate later, so we don't need much finesse here. + BinaryOperator *TheNeg = cast<BinaryOperator>(U); + + // Verify that the negate is in this function, V might be a constant expr. 
+ if (TheNeg->getParent()->getParent() != BI->getParent()->getParent()) + continue; + + BasicBlock::iterator InsertPt; + if (Instruction *InstInput = dyn_cast<Instruction>(V)) { + if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) { + InsertPt = II->getNormalDest()->begin(); + } else { + InsertPt = InstInput; + ++InsertPt; + } + while (isa<PHINode>(InsertPt)) ++InsertPt; + } else { + InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin(); + } + TheNeg->moveBefore(InsertPt); + return TheNeg; + } + + // Insert a 'neg' instruction that subtracts the value from zero to get the + // negation. + return BinaryOperator::CreateNeg(V, V->getName() + ".neg", BI); +} + +/// ShouldBreakUpSubtract - Return true if we should break up this subtract of +/// X-Y into (X + -Y). +static bool ShouldBreakUpSubtract(Instruction *Sub) { + // If this is a negation, we can't split it up! + if (BinaryOperator::isNeg(Sub)) + return false; + + // Don't bother to break this up unless either the LHS is an associable add or + // subtract or if this is only used by one. + if (isReassociableOp(Sub->getOperand(0), Instruction::Add) || + isReassociableOp(Sub->getOperand(0), Instruction::Sub)) + return true; + if (isReassociableOp(Sub->getOperand(1), Instruction::Add) || + isReassociableOp(Sub->getOperand(1), Instruction::Sub)) + return true; + if (Sub->hasOneUse() && + (isReassociableOp(Sub->use_back(), Instruction::Add) || + isReassociableOp(Sub->use_back(), Instruction::Sub))) + return true; + + return false; +} + +/// BreakUpSubtract - If we have (X-Y), and if either X is an add, or if this is +/// only used by an add, transform this into (X+(0-Y)) to promote better +/// reassociation. +static BinaryOperator *BreakUpSubtract(Instruction *Sub) { + // Convert a subtract into an add and a neg instruction. This allows sub + // instructions to be commuted with other add instructions. + // + // Calculate the negative value of Operand 1 of the sub instruction, + // and set it as the RHS of the add instruction we just made. + // + Value *NegVal = NegateValue(Sub->getOperand(1), Sub); + BinaryOperator *New = + BinaryOperator::CreateAdd(Sub->getOperand(0), NegVal, "", Sub); + Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op. + Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op. + New->takeName(Sub); + + // Everyone now refers to the add instruction. + Sub->replaceAllUsesWith(New); + New->setDebugLoc(Sub->getDebugLoc()); + + DEBUG(dbgs() << "Negated: " << *New << '\n'); + return New; +} + +/// ConvertShiftToMul - If this is a shift of a reassociable multiply or is used +/// by one, change this into a multiply by a constant to assist with further +/// reassociation. +static BinaryOperator *ConvertShiftToMul(Instruction *Shl) { + Constant *MulCst = ConstantInt::get(Shl->getType(), 1); + MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1))); + + BinaryOperator *Mul = + BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl); + Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op. + Mul->takeName(Shl); + Shl->replaceAllUsesWith(Mul); + Mul->setDebugLoc(Shl->getDebugLoc()); + return Mul; +} + +/// FindInOperandList - Scan backwards and forwards among values with the same +/// rank as element i to see if X exists. If X does not exist, return i. This +/// is useful when scanning for 'x' when we see '-x' because they both get the +/// same rank. 
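ShouldBreakUpSubtract/BreakUpSubtract and ConvertShiftToMul above rest on two small identities: X - Y == X + (-Y) and X << C == X * (1 << C). A self-contained check of both on ordinary unsigned integers (the function names here are made up for the example):

#include <cassert>
#include <cstdint>

// x << c is the same as multiplying by the constant (1 << c), which is the
// multiply that ConvertShiftToMul materializes.
uint32_t shlAsMul(uint32_t X, unsigned C) { return X * (uint32_t(1) << C); }

// X - Y rewritten as X + (-Y); with two's-complement wrap-around this is exact.
uint32_t subAsAddNeg(uint32_t X, uint32_t Y) { return X + (0u - Y); }

int main() {
  for (uint32_t X : {0u, 1u, 7u, 12345u})
    for (unsigned C = 0; C < 32; ++C)
      assert(shlAsMul(X, C) == (X << C));
  assert(subAsAddNeg(10, 3) == 7u);
  assert(subAsAddNeg(3, 10) == uint32_t(-7));
  return 0;
}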
+static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, + Value *X) { + unsigned XRank = Ops[i].Rank; + unsigned e = Ops.size(); + for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) + if (Ops[j].Op == X) + return j; + // Scan backwards. + for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) + if (Ops[j].Op == X) + return j; + return i; +} + +/// EmitAddTreeOfValues - Emit a tree of add instructions, summing Ops together +/// and returning the result. Insert the tree before I. +static Value *EmitAddTreeOfValues(Instruction *I, + SmallVectorImpl<WeakVH> &Ops){ + if (Ops.size() == 1) return Ops.back(); + + Value *V1 = Ops.back(); + Ops.pop_back(); + Value *V2 = EmitAddTreeOfValues(I, Ops); + return BinaryOperator::CreateAdd(V2, V1, "tmp", I); +} + +/// RemoveFactorFromExpression - If V is an expression tree that is a +/// multiplication sequence, and if this sequence contains a multiply by Factor, +/// remove Factor from the tree and return the new tree. +Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { + BinaryOperator *BO = isReassociableOp(V, Instruction::Mul); + if (!BO) return 0; + + SmallVector<RepeatedValue, 8> Tree; + MadeChange |= LinearizeExprTree(BO, Tree); + SmallVector<ValueEntry, 8> Factors; + Factors.reserve(Tree.size()); + for (unsigned i = 0, e = Tree.size(); i != e; ++i) { + RepeatedValue E = Tree[i]; + Factors.append(E.second.getZExtValue(), + ValueEntry(getRank(E.first), E.first)); + } + + bool FoundFactor = false; + bool NeedsNegate = false; + for (unsigned i = 0, e = Factors.size(); i != e; ++i) { + if (Factors[i].Op == Factor) { + FoundFactor = true; + Factors.erase(Factors.begin()+i); + break; + } + + // If this is a negative version of this factor, remove it. + if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) + if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op)) + if (FC1->getValue() == -FC2->getValue()) { + FoundFactor = NeedsNegate = true; + Factors.erase(Factors.begin()+i); + break; + } + } + + if (!FoundFactor) { + // Make sure to restore the operands to the expression tree. + RewriteExprTree(BO, Factors); + return 0; + } + + BasicBlock::iterator InsertPt = BO; ++InsertPt; + + // If this was just a single multiply, remove the multiply and return the only + // remaining operand. + if (Factors.size() == 1) { + RedoInsts.insert(BO); + V = Factors[0].Op; + } else { + RewriteExprTree(BO, Factors); + V = BO; + } + + if (NeedsNegate) + V = BinaryOperator::CreateNeg(V, "neg", InsertPt); + + return V; +} + +/// FindSingleUseMultiplyFactors - If V is a single-use multiply, recursively +/// add its operands as factors, otherwise add V to the list of factors. +/// +/// Ops is the top-level list of add operands we're trying to factor. +static void FindSingleUseMultiplyFactors(Value *V, + SmallVectorImpl<Value*> &Factors, + const SmallVectorImpl<ValueEntry> &Ops) { + BinaryOperator *BO = isReassociableOp(V, Instruction::Mul); + if (!BO) { + Factors.push_back(V); + return; + } + + // Otherwise, add the LHS and RHS to the list of factors. + FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops); + FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops); +} + +/// OptimizeAndOrXor - Optimize a series of operands to an 'and', 'or', or 'xor' +/// instruction. This optimizes based on identities. If it can be reduced to +/// a single Value, it is returned, otherwise the Ops list is mutated as +/// necessary. 
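RemoveFactorFromExpression, defined just above, operates on the linearized factor list of a multiply tree: it deletes one occurrence of the requested factor, or of its negation (in which case the remaining product must be negated), and rebuilds the rest. A rough model of that bookkeeping on a plain vector of integers (all names are invented for the sketch):

#include <cassert>
#include <vector>

// Remove one occurrence of Factor (or -Factor) from a product's factor list.
// Returns false if neither is present; NeedsNegate reports whether the caller
// must negate the remaining product because -Factor was the one removed.
bool removeFactor(std::vector<int> &Factors, int Factor, bool &NeedsNegate) {
  NeedsNegate = false;
  for (size_t i = 0, e = Factors.size(); i != e; ++i) {
    if (Factors[i] == Factor) {
      Factors.erase(Factors.begin() + i);
      return true;
    }
    if (Factors[i] == -Factor) {               // negative version of the factor
      Factors.erase(Factors.begin() + i);
      NeedsNegate = true;
      return true;
    }
  }
  return false;
}

int main() {
  std::vector<int> F = {3, -4, 5};             // models the product 3 * -4 * 5 = -60
  bool Neg = false;
  assert(removeFactor(F, 4, Neg) && Neg);      // -4 removed; result needs a negate
  int Rest = 1;
  for (int V : F) Rest *= V;                   // 3 * 5 = 15
  assert((Neg ? -Rest : Rest) == -60 / 4);     // -15 == original product / 4
  return 0;
}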
+static Value *OptimizeAndOrXor(unsigned Opcode, + SmallVectorImpl<ValueEntry> &Ops) { + // Scan the operand lists looking for X and ~X pairs, along with X,X pairs. + // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1. + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + // First, check for X and ~X in the operand list. + assert(i < Ops.size()); + if (BinaryOperator::isNot(Ops[i].Op)) { // Cannot occur for ^. + Value *X = BinaryOperator::getNotArgument(Ops[i].Op); + unsigned FoundX = FindInOperandList(Ops, i, X); + if (FoundX != i) { + if (Opcode == Instruction::And) // ...&X&~X = 0 + return Constant::getNullValue(X->getType()); + + if (Opcode == Instruction::Or) // ...|X|~X = -1 + return Constant::getAllOnesValue(X->getType()); + } + } + + // Next, check for duplicate pairs of values, which we assume are next to + // each other, due to our sorting criteria. + assert(i < Ops.size()); + if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) { + if (Opcode == Instruction::And || Opcode == Instruction::Or) { + // Drop duplicate values for And and Or. + Ops.erase(Ops.begin()+i); + --i; --e; + ++NumAnnihil; + continue; + } + + // Drop pairs of values for Xor. + assert(Opcode == Instruction::Xor); + if (e == 2) + return Constant::getNullValue(Ops[0].Op->getType()); + + // Y ^ X^X -> Y + Ops.erase(Ops.begin()+i, Ops.begin()+i+2); + i -= 1; e -= 2; + ++NumAnnihil; + } + } + return 0; +} + +/// OptimizeAdd - Optimize a series of operands to an 'add' instruction. This +/// optimizes based on identities. If it can be reduced to a single Value, it +/// is returned, otherwise the Ops list is mutated as necessary. +Value *Reassociate::OptimizeAdd(Instruction *I, + SmallVectorImpl<ValueEntry> &Ops) { + // Scan the operand lists looking for X and -X pairs. If we find any, we + // can simplify the expression. X+-X == 0. While we're at it, scan for any + // duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z. + // + // TODO: We could handle "X + ~X" -> "-1" if we wanted, since "-X = ~X+1". + // + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + Value *TheOp = Ops[i].Op; + // Check to see if we've seen this operand before. If so, we factor all + // instances of the operand together. Due to our sorting criteria, we know + // that these need to be next to each other in the vector. + if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) { + // Rescan the list, remove all instances of this operand from the expr. + unsigned NumFound = 0; + do { + Ops.erase(Ops.begin()+i); + ++NumFound; + } while (i != Ops.size() && Ops[i].Op == TheOp); + + DEBUG(errs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n'); + ++NumFactor; + + // Insert a new multiply. + Value *Mul = ConstantInt::get(cast<IntegerType>(I->getType()), NumFound); + Mul = BinaryOperator::CreateMul(TheOp, Mul, "factor", I); + + // Now that we have inserted a multiply, optimize it. This allows us to + // handle cases that require multiple factoring steps, such as this: + // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6 + RedoInsts.insert(cast<Instruction>(Mul)); + + // If every add operand was a duplicate, return the multiply. + if (Ops.empty()) + return Mul; + + // Otherwise, we had some input that didn't have the dupe, such as + // "A + A + B" -> "A*2 + B". Add the new multiply to the list of + // things being added by this operation. + Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul)); + + --i; + e = Ops.size(); + continue; + } + + // Check for X and -X in the operand list. 
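The annihilation rules used by OptimizeAndOrXor are ordinary bitwise identities: X & ~X == 0, X | ~X is all ones, X ^ X == 0, and AND/OR are idempotent so duplicate operands can be dropped. A tiny standalone check:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 0xDEADBEEFu, 0xFFFFFFFFu}) {
    assert((X & ~X) == 0u);                  // ...&X&~X annihilates to 0
    assert((X | ~X) == 0xFFFFFFFFu);         // ...|X|~X saturates to -1 (all ones)
    assert((X ^ X) == 0u);                   // X^X pairs cancel
    assert((X & X) == X && (X | X) == X);    // duplicates are dropped for and/or
  }
  return 0;
}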
+ if (!BinaryOperator::isNeg(TheOp)) + continue; + + Value *X = BinaryOperator::getNegArgument(TheOp); + unsigned FoundX = FindInOperandList(Ops, i, X); + if (FoundX == i) + continue; + + // Remove X and -X from the operand list. + if (Ops.size() == 2) + return Constant::getNullValue(X->getType()); + + Ops.erase(Ops.begin()+i); + if (i < FoundX) + --FoundX; + else + --i; // Need to back up an extra one. + Ops.erase(Ops.begin()+FoundX); + ++NumAnnihil; + --i; // Revisit element. + e -= 2; // Removed two elements. + } + + // Scan the operand list, checking to see if there are any common factors + // between operands. Consider something like A*A+A*B*C+D. We would like to + // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies. + // To efficiently find this, we count the number of times a factor occurs + // for any ADD operands that are MULs. + DenseMap<Value*, unsigned> FactorOccurrences; + + // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4) + // where they are actually the same multiply. + unsigned MaxOcc = 0; + Value *MaxOccVal = 0; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul); + if (!BOp) + continue; + + // Compute all of the factors of this added value. + SmallVector<Value*, 8> Factors; + FindSingleUseMultiplyFactors(BOp, Factors, Ops); + assert(Factors.size() > 1 && "Bad linearize!"); + + // Add one to FactorOccurrences for each unique factor in this op. + SmallPtrSet<Value*, 8> Duplicates; + for (unsigned i = 0, e = Factors.size(); i != e; ++i) { + Value *Factor = Factors[i]; + if (!Duplicates.insert(Factor)) continue; + + unsigned Occ = ++FactorOccurrences[Factor]; + if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } + + // If Factor is a negative constant, add the negated value as a factor + // because we can percolate the negate out. Watch for minint, which + // cannot be positivified. + if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) + if (CI->isNegative() && !CI->isMinValue(true)) { + Factor = ConstantInt::get(CI->getContext(), -CI->getValue()); + assert(!Duplicates.count(Factor) && + "Shouldn't have two constant factors, missed a canonicalize"); + + unsigned Occ = ++FactorOccurrences[Factor]; + if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } + } + } + } + + // If any factor occurred more than one time, we can pull it out. + if (MaxOcc > 1) { + DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n'); + ++NumFactor; + + // Create a new instruction that uses the MaxOccVal twice. If we don't do + // this, we could otherwise run into situations where removing a factor + // from an expression will drop a use of maxocc, and this can cause + // RemoveFactorFromExpression on successive values to behave differently. + Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal); + SmallVector<WeakVH, 4> NewMulOps; + for (unsigned i = 0; i != Ops.size(); ++i) { + // Only try to remove factors from expressions we're allowed to. + BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul); + if (!BOp) + continue; + + if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) { + // The factorized operand may occur several times. Convert them all in + // one fell swoop. + for (unsigned j = Ops.size(); j != i;) { + --j; + if (Ops[j].Op == Ops[i].Op) { + NewMulOps.push_back(V); + Ops.erase(Ops.begin()+j); + } + } + --i; + } + } + + // No need for extra uses anymore. 
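The common-factor search above is essentially a frequency count: for every add operand that is itself a product, each distinct factor is counted once, and the factor with the highest count is pulled out if it appears more than once. A compact model of the counting, using vectors of ints in place of expression trees (names are illustrative only):

#include <cassert>
#include <map>
#include <set>
#include <vector>

// Given add operands that are products (lists of factors), find the factor
// occurring in the largest number of operands, counting each factor at most
// once per operand (the per-operand Duplicates set in the pass plays the same
// role).
int mostCommonFactor(const std::vector<std::vector<int>> &Products,
                     unsigned &MaxOcc) {
  std::map<int, unsigned> Occurrences;
  MaxOcc = 0;
  int Best = 0;
  for (const auto &P : Products) {
    std::set<int> Seen;
    for (int F : P) {
      if (!Seen.insert(F).second) continue;   // skip duplicates within one product
      unsigned Occ = ++Occurrences[F];
      if (Occ > MaxOcc) { MaxOcc = Occ; Best = F; }
    }
  }
  return Best;
}

int main() {
  // A*A + A*B*C + D with A=2, B=3, C=5, and D encoded as the product {7}.
  std::vector<std::vector<int>> Ops = {{2, 2}, {2, 3, 5}, {7}};
  unsigned MaxOcc = 0;
  assert(mostCommonFactor(Ops, MaxOcc) == 2 && MaxOcc == 2); // A occurs in two operands
  return 0;
}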
+ delete DummyInst; + + unsigned NumAddedValues = NewMulOps.size(); + Value *V = EmitAddTreeOfValues(I, NewMulOps); + + // Now that we have inserted the add tree, optimize it. This allows us to + // handle cases that require multiple factoring steps, such as this: + // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C)) + assert(NumAddedValues > 1 && "Each occurrence should contribute a value"); + (void)NumAddedValues; + if (Instruction *VI = dyn_cast<Instruction>(V)) + RedoInsts.insert(VI); + + // Create the multiply. + Instruction *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I); + + // Rerun associate on the multiply in case the inner expression turned into + // a multiply. We want to make sure that we keep things in canonical form. + RedoInsts.insert(V2); + + // If every add operand included the factor (e.g. "A*B + A*C"), then the + // entire result expression is just the multiply "A*(B+C)". + if (Ops.empty()) + return V2; + + // Otherwise, we had some input that didn't have the factor, such as + // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of + // things being added by this operation. + Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2)); + } + + return 0; +} + +namespace { + /// \brief Predicate tests whether a ValueEntry's op is in a map. + struct IsValueInMap { + const DenseMap<Value *, unsigned> ⤅ + + IsValueInMap(const DenseMap<Value *, unsigned> &Map) : Map(Map) {} + + bool operator()(const ValueEntry &Entry) { + return Map.find(Entry.Op) != Map.end(); + } + }; +} + +/// \brief Build up a vector of value/power pairs factoring a product. +/// +/// Given a series of multiplication operands, build a vector of factors and +/// the powers each is raised to when forming the final product. Sort them in +/// the order of descending power. +/// +/// (x*x) -> [(x, 2)] +/// ((x*x)*x) -> [(x, 3)] +/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)] +/// +/// \returns Whether any factors have a power greater than one. +bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, + SmallVectorImpl<Factor> &Factors) { + // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this. + // Compute the sum of powers of simplifiable factors. + unsigned FactorPowerSum = 0; + for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) { + Value *Op = Ops[Idx-1].Op; + + // Count the number of occurrences of this value. + unsigned Count = 1; + for (; Idx < Size && Ops[Idx].Op == Op; ++Idx) + ++Count; + // Track for simplification all factors which occur 2 or more times. + if (Count > 1) + FactorPowerSum += Count; + } + + // We can only simplify factors if the sum of the powers of our simplifiable + // factors is 4 or higher. When that is the case, we will *always* have + // a simplification. This is an important invariant to prevent cyclicly + // trying to simplify already minimal formations. + if (FactorPowerSum < 4) + return false; + + // Now gather the simplifiable factors, removing them from Ops. + FactorPowerSum = 0; + for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) { + Value *Op = Ops[Idx-1].Op; + + // Count the number of occurrences of this value. + unsigned Count = 1; + for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx) + ++Count; + if (Count == 1) + continue; + // Move an even number of occurrences to Factors. 
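collectMultiplyFactors amounts to run-length encoding of the rank-sorted operand list: repeated operands become (base, power) pairs, only an even number of occurrences is taken (an odd leftover stays behind), and nothing is attempted unless the collected powers sum to at least 4, below which a balanced rewrite cannot save a multiply. A small model over a sorted vector of ints (the function below is a sketch, not the pass's code):

#include <cassert>
#include <utility>
#include <vector>

// Run-length encode a sorted factor list into (base, even power) pairs and
// report whether the collected powers reach the minimum of 4 that makes a
// repeated-squaring rewrite profitable.
bool collectFactors(std::vector<int> Ops,
                    std::vector<std::pair<int, unsigned>> &Factors) {
  unsigned PowerSum = 0;
  for (size_t i = 0; i < Ops.size();) {
    size_t j = i + 1;
    while (j < Ops.size() && Ops[j] == Ops[i]) ++j;
    unsigned Count = unsigned(j - i);
    if (Count > 1) {
      Count &= ~1u;                            // keep an even number of occurrences
      Factors.push_back({Ops[i], Count});
      PowerSum += Count;
    }
    i = j;
  }
  return PowerSum >= 4;
}

int main() {
  std::vector<std::pair<int, unsigned>> F;
  assert(!collectFactors({2, 2, 3}, F));       // x^2 * y: nothing to gain
  F.clear();
  assert(collectFactors({2, 2, 2, 5, 5}, F));  // x^3 * y^2: even parts x^2, y^2
  assert(F.size() == 2 && F[0].second == 2 && F[1].second == 2);
  return 0;
}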
+ Count &= ~1U; + Idx -= Count; + FactorPowerSum += Count; + Factors.push_back(Factor(Op, Count)); + Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count); + } + + // None of the adjustments above should have reduced the sum of factor powers + // below our mininum of '4'. + assert(FactorPowerSum >= 4); + + std::sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter()); + return true; +} + +/// \brief Build a tree of multiplies, computing the product of Ops. +static Value *buildMultiplyTree(IRBuilder<> &Builder, + SmallVectorImpl<Value*> &Ops) { + if (Ops.size() == 1) + return Ops.back(); + + Value *LHS = Ops.pop_back_val(); + do { + LHS = Builder.CreateMul(LHS, Ops.pop_back_val()); + } while (!Ops.empty()); + + return LHS; +} + +/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*... +/// +/// Given a vector of values raised to various powers, where no two values are +/// equal and the powers are sorted in decreasing order, compute the minimal +/// DAG of multiplies to compute the final product, and return that product +/// value. +Value *Reassociate::buildMinimalMultiplyDAG(IRBuilder<> &Builder, + SmallVectorImpl<Factor> &Factors) { + assert(Factors[0].Power); + SmallVector<Value *, 4> OuterProduct; + for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size(); + Idx < Size && Factors[Idx].Power > 0; ++Idx) { + if (Factors[Idx].Power != Factors[LastIdx].Power) { + LastIdx = Idx; + continue; + } + + // We want to multiply across all the factors with the same power so that + // we can raise them to that power as a single entity. Build a mini tree + // for that. + SmallVector<Value *, 4> InnerProduct; + InnerProduct.push_back(Factors[LastIdx].Base); + do { + InnerProduct.push_back(Factors[Idx].Base); + ++Idx; + } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power); + + // Reset the base value of the first factor to the new expression tree. + // We'll remove all the factors with the same power in a second pass. + Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct); + if (Instruction *MI = dyn_cast<Instruction>(M)) + RedoInsts.insert(MI); + + LastIdx = Idx; + } + // Unique factors with equal powers -- we've folded them into the first one's + // base. + Factors.erase(std::unique(Factors.begin(), Factors.end(), + Factor::PowerEqual()), + Factors.end()); + + // Iteratively collect the base of each factor with an add power into the + // outer product, and halve each power in preparation for squaring the + // expression. + for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) { + if (Factors[Idx].Power & 1) + OuterProduct.push_back(Factors[Idx].Base); + Factors[Idx].Power >>= 1; + } + if (Factors[0].Power) { + Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors); + OuterProduct.push_back(SquareRoot); + OuterProduct.push_back(SquareRoot); + } + if (OuterProduct.size() == 1) + return OuterProduct.front(); + + Value *V = buildMultiplyTree(Builder, OuterProduct); + return V; +} + +Value *Reassociate::OptimizeMul(BinaryOperator *I, + SmallVectorImpl<ValueEntry> &Ops) { + // We can only optimize the multiplies when there is a chain of more than + // three, such that a balanced tree might require fewer total multiplies. + if (Ops.size() < 4) + return 0; + + // Try to turn linear trees of multiplies without other uses of the + // intermediate stages into minimal multiply DAGs with perfect sub-expression + // re-use. 
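buildMinimalMultiplyDAG is the classic square-and-multiply idea generalized to several bases: bases sharing a power are merged into one, odd powers contribute one copy to an outer product, every power is halved, and the recursive sub-product is squared. The same recursion can be checked numerically (illustrative names, plain integers standing in for IR values):

#include <cassert>
#include <cstdint>
#include <vector>

struct Factor { uint64_t Base; unsigned Power; };

// Compute the product of Base^Power terms with the halve-and-square recursion;
// Factors must be sorted by descending Power, and no two bases may be equal.
uint64_t minimalPowerProduct(std::vector<Factor> Factors) {
  // Merge adjacent bases that share the same power into a single base.
  std::vector<Factor> Merged;
  for (const Factor &F : Factors) {
    if (!Merged.empty() && Merged.back().Power == F.Power)
      Merged.back().Base *= F.Base;
    else
      Merged.push_back(F);
  }

  uint64_t Outer = 1;
  for (Factor &F : Merged) {
    if (F.Power & 1)
      Outer *= F.Base;              // odd powers contribute one copy directly
    F.Power >>= 1;                  // halve in preparation for squaring
  }
  if (!Merged.empty() && Merged.front().Power) {
    uint64_t Root = minimalPowerProduct(Merged);
    Outer *= Root * Root;           // square the recursive sub-product
  }
  return Outer;
}

int main() {
  // a^5 * b^3 * c^2 with a=2, b=3, c=5.
  uint64_t Naive = 32ull * 27ull * 25ull;
  assert(minimalPowerProduct({{2, 5}, {3, 3}, {5, 2}}) == Naive);
  return 0;
}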
+ SmallVector<Factor, 4> Factors; + if (!collectMultiplyFactors(Ops, Factors)) + return 0; // All distinct factors, so nothing left for us to do. + + IRBuilder<> Builder(I); + Value *V = buildMinimalMultiplyDAG(Builder, Factors); + if (Ops.empty()) + return V; + + ValueEntry NewEntry = ValueEntry(getRank(V), V); + Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry); + return 0; +} + +Value *Reassociate::OptimizeExpression(BinaryOperator *I, + SmallVectorImpl<ValueEntry> &Ops) { + // Now that we have the linearized expression tree, try to optimize it. + // Start by folding any constants that we found. + if (Ops.size() == 1) return Ops[0].Op; + + unsigned Opcode = I->getOpcode(); + + // Handle destructive annihilation due to identities between elements in the + // argument list here. + unsigned NumOps = Ops.size(); + switch (Opcode) { + default: break; + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + if (Value *Result = OptimizeAndOrXor(Opcode, Ops)) + return Result; + break; + + case Instruction::Add: + if (Value *Result = OptimizeAdd(I, Ops)) + return Result; + break; + + case Instruction::Mul: + if (Value *Result = OptimizeMul(I, Ops)) + return Result; + break; + } + + if (Ops.size() != NumOps) + return OptimizeExpression(I, Ops); + return 0; +} + +/// EraseInst - Zap the given instruction, adding interesting operands to the +/// work list. +void Reassociate::EraseInst(Instruction *I) { + assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); + SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end()); + // Erase the dead instruction. + ValueRankMap.erase(I); + RedoInsts.remove(I); + I->eraseFromParent(); + // Optimize its operands. + SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes. + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) { + // If this is a node in an expression tree, climb to the expression root + // and add that since that's where optimization actually happens. + unsigned Opcode = Op->getOpcode(); + while (Op->hasOneUse() && Op->use_back()->getOpcode() == Opcode && + Visited.insert(Op)) + Op = Op->use_back(); + RedoInsts.insert(Op); + } +} + +/// OptimizeInst - Inspect and optimize the given instruction. Note that erasing +/// instructions is not allowed. +void Reassociate::OptimizeInst(Instruction *I) { + // Only consider operations that we understand. + if (!isa<BinaryOperator>(I)) + return; + + if (I->getOpcode() == Instruction::Shl && + isa<ConstantInt>(I->getOperand(1))) + // If an operand of this shift is a reassociable multiply, or if the shift + // is used by a reassociable multiply or add, turn into a multiply. + if (isReassociableOp(I->getOperand(0), Instruction::Mul) || + (I->hasOneUse() && + (isReassociableOp(I->use_back(), Instruction::Mul) || + isReassociableOp(I->use_back(), Instruction::Add)))) { + Instruction *NI = ConvertShiftToMul(I); + RedoInsts.insert(I); + MadeChange = true; + I = NI; + } + + // Floating point binary operators are not associative, but we can still + // commute (some) of them, to canonicalize the order of their operands. + // This can potentially expose more CSE opportunities, and makes writing + // other transformations simpler. + if ((I->getType()->isFloatingPointTy() || I->getType()->isVectorTy())) { + // FAdd and FMul can be commuted. 
+ if (I->getOpcode() != Instruction::FMul && + I->getOpcode() != Instruction::FAdd) + return; + + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + unsigned LHSRank = getRank(LHS); + unsigned RHSRank = getRank(RHS); + + // Sort the operands by rank. + if (RHSRank < LHSRank) { + I->setOperand(0, RHS); + I->setOperand(1, LHS); + } + + return; + } + + // Do not reassociate boolean (i1) expressions. We want to preserve the + // original order of evaluation for short-circuited comparisons that + // SimplifyCFG has folded to AND/OR expressions. If the expression + // is not further optimized, it is likely to be transformed back to a + // short-circuited form for code gen, and the source order may have been + // optimized for the most likely conditions. + if (I->getType()->isIntegerTy(1)) + return; + + // If this is a subtract instruction which is not already in negate form, + // see if we can convert it to X+-Y. + if (I->getOpcode() == Instruction::Sub) { + if (ShouldBreakUpSubtract(I)) { + Instruction *NI = BreakUpSubtract(I); + RedoInsts.insert(I); + MadeChange = true; + I = NI; + } else if (BinaryOperator::isNeg(I)) { + // Otherwise, this is a negation. See if the operand is a multiply tree + // and if this is not an inner node of a multiply tree. + if (isReassociableOp(I->getOperand(1), Instruction::Mul) && + (!I->hasOneUse() || + !isReassociableOp(I->use_back(), Instruction::Mul))) { + Instruction *NI = LowerNegateToMultiply(I); + RedoInsts.insert(I); + MadeChange = true; + I = NI; + } + } + } + + // If this instruction is an associative binary operator, process it. + if (!I->isAssociative()) return; + BinaryOperator *BO = cast<BinaryOperator>(I); + + // If this is an interior node of a reassociable tree, ignore it until we + // get to the root of the tree, to avoid N^2 analysis. + unsigned Opcode = BO->getOpcode(); + if (BO->hasOneUse() && BO->use_back()->getOpcode() == Opcode) + return; + + // If this is an add tree that is used by a sub instruction, ignore it + // until we process the subtract. + if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add && + cast<Instruction>(BO->use_back())->getOpcode() == Instruction::Sub) + return; + + ReassociateExpression(BO); +} + +void Reassociate::ReassociateExpression(BinaryOperator *I) { + + // First, walk the expression tree, linearizing the tree, collecting the + // operand information. + SmallVector<RepeatedValue, 8> Tree; + MadeChange |= LinearizeExprTree(I, Tree); + SmallVector<ValueEntry, 8> Ops; + Ops.reserve(Tree.size()); + for (unsigned i = 0, e = Tree.size(); i != e; ++i) { + RepeatedValue E = Tree[i]; + Ops.append(E.second.getZExtValue(), + ValueEntry(getRank(E.first), E.first)); + } + + DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n'); + + // Now that we have linearized the tree to a list and have gathered all of + // the operands and their ranks, sort the operands by their rank. Use a + // stable_sort so that values with equal ranks will have their relative + // positions maintained (and so the compiler is deterministic). Note that + // this sorts so that the highest ranking values end up at the beginning of + // the vector. + std::stable_sort(Ops.begin(), Ops.end()); + + // OptimizeExpression - Now that we have the expression tree in a convenient + // sorted form, optimize it globally if possible. + if (Value *V = OptimizeExpression(I, Ops)) { + if (V == I) + // Self-referential expression in unreachable code. 
+ return; + // This expression tree simplified to something that isn't a tree, + // eliminate it. + DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n'); + I->replaceAllUsesWith(V); + if (Instruction *VI = dyn_cast<Instruction>(V)) + VI->setDebugLoc(I->getDebugLoc()); + RedoInsts.insert(I); + ++NumAnnihil; + return; + } + + // We want to sink immediates as deeply as possible except in the case where + // this is a multiply tree used only by an add, and the immediate is a -1. + // In this case we reassociate to put the negation on the outside so that we + // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y + if (I->getOpcode() == Instruction::Mul && I->hasOneUse() && + cast<Instruction>(I->use_back())->getOpcode() == Instruction::Add && + isa<ConstantInt>(Ops.back().Op) && + cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) { + ValueEntry Tmp = Ops.pop_back_val(); + Ops.insert(Ops.begin(), Tmp); + } + + DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n'); + + if (Ops.size() == 1) { + if (Ops[0].Op == I) + // Self-referential expression in unreachable code. + return; + + // This expression tree simplified to something that isn't a tree, + // eliminate it. + I->replaceAllUsesWith(Ops[0].Op); + if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op)) + OI->setDebugLoc(I->getDebugLoc()); + RedoInsts.insert(I); + return; + } + + // Now that we ordered and optimized the expressions, splat them back into + // the expression tree, removing any unneeded nodes. + RewriteExprTree(I, Ops); +} + +bool Reassociate::runOnFunction(Function &F) { + // Calculate the rank map for F + BuildRankMap(F); + + MadeChange = false; + for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { + // Optimize every instruction in the basic block. + for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ) + if (isInstructionTriviallyDead(II)) { + EraseInst(II++); + } else { + OptimizeInst(II); + assert(II->getParent() == BI && "Moved to a different block!"); + ++II; + } + + // If this produced extra instructions to optimize, handle them now. + while (!RedoInsts.empty()) { + Instruction *I = RedoInsts.pop_back_val(); + if (isInstructionTriviallyDead(I)) + EraseInst(I); + else + OptimizeInst(I); + } + } + + // We are done with the rank map. + RankMap.clear(); + ValueRankMap.clear(); + + return MadeChange; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp new file mode 100644 index 0000000..ea1de63 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -0,0 +1,133 @@ +//===- Reg2Mem.cpp - Convert registers to allocas -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file demotes all registers to memory references. It is intended to be +// the inverse of PromoteMemoryToRegister. By converting to loads, the only +// values live across basic blocks are allocas and loads before phi nodes. +// It is intended that this should make CFG hacking much easier. +// To make later hacking easier, the entry block is split into two, such that +// all introduced allocas and nothing else are in the entry block. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "reg2mem" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Pass.h" +#include "llvm/Function.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/BasicBlock.h" +#include "llvm/Instructions.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CFG.h" +#include <list> +using namespace llvm; + +STATISTIC(NumRegsDemoted, "Number of registers demoted"); +STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted"); + +namespace { + struct RegToMem : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + RegToMem() : FunctionPass(ID) { + initializeRegToMemPass(*PassRegistry::getPassRegistry()); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequiredID(BreakCriticalEdgesID); + AU.addPreservedID(BreakCriticalEdgesID); + } + + bool valueEscapes(const Instruction *Inst) const { + const BasicBlock *BB = Inst->getParent(); + for (Value::const_use_iterator UI = Inst->use_begin(),E = Inst->use_end(); + UI != E; ++UI) { + const Instruction *I = cast<Instruction>(*UI); + if (I->getParent() != BB || isa<PHINode>(I)) + return true; + } + return false; + } + + virtual bool runOnFunction(Function &F); + }; +} + +char RegToMem::ID = 0; +INITIALIZE_PASS_BEGIN(RegToMem, "reg2mem", "Demote all values to stack slots", + false, false) +INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges) +INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots", + false, false) + +bool RegToMem::runOnFunction(Function &F) { + if (F.isDeclaration()) + return false; + + // Insert all new allocas into entry block. + BasicBlock *BBEntry = &F.getEntryBlock(); + assert(pred_begin(BBEntry) == pred_end(BBEntry) && + "Entry block to function must not have predecessors!"); + + // Find first non-alloca instruction and create insertion point. This is + // safe if block is well-formed: it always have terminator, otherwise + // we'll get and assertion. + BasicBlock::iterator I = BBEntry->begin(); + while (isa<AllocaInst>(I)) ++I; + + CastInst *AllocaInsertionPoint = + new BitCastInst(Constant::getNullValue(Type::getInt32Ty(F.getContext())), + Type::getInt32Ty(F.getContext()), + "reg2mem alloca point", I); + + // Find the escaped instructions. But don't create stack slots for + // allocas in entry block. 
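valueEscapes above is the trigger for demotion: an SSA value needs a stack slot exactly when some user sits in a different basic block than the definition, or the user is a phi node. A toy model of that predicate, with made-up structs standing in for Instruction and BasicBlock:

#include <cassert>
#include <vector>

struct ToyBlock { int ID; };

struct ToyInst {
  ToyBlock *Parent;
  bool IsPHI;
  std::vector<ToyInst *> Users;
};

// An instruction "escapes" its block if any user is in another block or is a
// phi node; such values are the ones reg2mem demotes to stack slots.
bool valueEscapes(const ToyInst &Inst) {
  for (const ToyInst *U : Inst.Users)
    if (U->Parent != Inst.Parent || U->IsPHI)
      return true;
  return false;
}

int main() {
  ToyBlock BB0{0}, BB1{1};
  ToyInst Use0{&BB0, false, {}};          // user in the same block
  ToyInst Use1{&BB1, false, {}};          // user in a different block
  ToyInst Phi0{&BB0, true, {}};           // phi user in the same block
  ToyInst Def{&BB0, false, {&Use0}};
  assert(!valueEscapes(Def));
  Def.Users.push_back(&Use1);
  assert(valueEscapes(Def));              // crosses a block boundary
  ToyInst Def2{&BB0, false, {&Phi0}};
  assert(valueEscapes(Def2));             // used by a phi
  return 0;
}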
+ std::list<Instruction*> WorkList; + for (Function::iterator ibb = F.begin(), ibe = F.end(); + ibb != ibe; ++ibb) + for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end(); + iib != iie; ++iib) { + if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) && + valueEscapes(iib)) { + WorkList.push_front(&*iib); + } + } + + // Demote escaped instructions + NumRegsDemoted += WorkList.size(); + for (std::list<Instruction*>::iterator ilb = WorkList.begin(), + ile = WorkList.end(); ilb != ile; ++ilb) + DemoteRegToStack(**ilb, false, AllocaInsertionPoint); + + WorkList.clear(); + + // Find all phi's + for (Function::iterator ibb = F.begin(), ibe = F.end(); + ibb != ibe; ++ibb) + for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end(); + iib != iie; ++iib) + if (isa<PHINode>(iib)) + WorkList.push_front(&*iib); + + // Demote phi nodes + NumPhisDemoted += WorkList.size(); + for (std::list<Instruction*>::iterator ilb = WorkList.begin(), + ile = WorkList.end(); ilb != ile; ++ilb) + DemotePHIToStack(cast<PHINode>(*ilb), AllocaInsertionPoint); + + return true; +} + + +// createDemoteRegisterToMemory - Provide an entry point to create this pass. +char &llvm::DemoteRegisterToMemoryID = RegToMem::ID; +FunctionPass *llvm::createDemoteRegisterToMemoryPass() { + return new RegToMem(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp new file mode 100644 index 0000000..2c39aab --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -0,0 +1,1946 @@ +//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements sparse conditional constant propagation and merging: +// +// Specifically, this: +// * Assumes values are constant unless proven otherwise +// * Assumes BasicBlocks are dead unless proven otherwise +// * Proves values to be constant, and replaces them with constants +// * Proves conditional branches to be unconditional +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sccp" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/InstVisitor.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include <algorithm> +using namespace llvm; + +STATISTIC(NumInstRemoved, "Number of instructions removed"); +STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable"); + +STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP"); +STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP"); +STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP"); + +namespace { +/// LatticeVal class - This class represents 
the different lattice values that +/// an LLVM value may occupy. It is a simple class with value semantics. +/// +class LatticeVal { + enum LatticeValueTy { + /// undefined - This LLVM Value has no known value yet. + undefined, + + /// constant - This LLVM Value has a specific constant value. + constant, + + /// forcedconstant - This LLVM Value was thought to be undef until + /// ResolvedUndefsIn. This is treated just like 'constant', but if merged + /// with another (different) constant, it goes to overdefined, instead of + /// asserting. + forcedconstant, + + /// overdefined - This instruction is not known to be constant, and we know + /// it has a value. + overdefined + }; + + /// Val: This stores the current lattice value along with the Constant* for + /// the constant if this is a 'constant' or 'forcedconstant' value. + PointerIntPair<Constant *, 2, LatticeValueTy> Val; + + LatticeValueTy getLatticeValue() const { + return Val.getInt(); + } + +public: + LatticeVal() : Val(0, undefined) {} + + bool isUndefined() const { return getLatticeValue() == undefined; } + bool isConstant() const { + return getLatticeValue() == constant || getLatticeValue() == forcedconstant; + } + bool isOverdefined() const { return getLatticeValue() == overdefined; } + + Constant *getConstant() const { + assert(isConstant() && "Cannot get the constant of a non-constant!"); + return Val.getPointer(); + } + + /// markOverdefined - Return true if this is a change in status. + bool markOverdefined() { + if (isOverdefined()) + return false; + + Val.setInt(overdefined); + return true; + } + + /// markConstant - Return true if this is a change in status. + bool markConstant(Constant *V) { + if (getLatticeValue() == constant) { // Constant but not forcedconstant. + assert(getConstant() == V && "Marking constant with different value"); + return false; + } + + if (isUndefined()) { + Val.setInt(constant); + assert(V && "Marking constant with NULL"); + Val.setPointer(V); + } else { + assert(getLatticeValue() == forcedconstant && + "Cannot move from overdefined to constant!"); + // Stay at forcedconstant if the constant is the same. + if (V == getConstant()) return false; + + // Otherwise, we go to overdefined. Assumptions made based on the + // forced value are possibly wrong. Assuming this is another constant + // could expose a contradiction. + Val.setInt(overdefined); + } + return true; + } + + /// getConstantInt - If this is a constant with a ConstantInt value, return it + /// otherwise return null. + ConstantInt *getConstantInt() const { + if (isConstant()) + return dyn_cast<ConstantInt>(getConstant()); + return 0; + } + + void markForcedConstant(Constant *V) { + assert(isUndefined() && "Can't force a defined value!"); + Val.setInt(forcedconstant); + Val.setPointer(V); + } +}; +} // end anonymous namespace. + + +namespace { + +//===----------------------------------------------------------------------===// +// +/// SCCPSolver - This class is a general purpose solver for Sparse Conditional +/// Constant Propagation. +/// +class SCCPSolver : public InstVisitor<SCCPSolver> { + const TargetData *TD; + const TargetLibraryInfo *TLI; + SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable. + DenseMap<Value*, LatticeVal> ValueState; // The state each value is in. + + /// StructValueState - This maintains ValueState for values that have + /// StructType, for example for formal arguments, calls, insertelement, etc. 
+ /// + DenseMap<std::pair<Value*, unsigned>, LatticeVal> StructValueState; + + /// GlobalValue - If we are tracking any values for the contents of a global + /// variable, we keep a mapping from the constant accessor to the element of + /// the global, to the currently known value. If the value becomes + /// overdefined, it's entry is simply removed from this map. + DenseMap<GlobalVariable*, LatticeVal> TrackedGlobals; + + /// TrackedRetVals - If we are tracking arguments into and the return + /// value out of a function, it will have an entry in this map, indicating + /// what the known return value for the function is. + DenseMap<Function*, LatticeVal> TrackedRetVals; + + /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions + /// that return multiple values. + DenseMap<std::pair<Function*, unsigned>, LatticeVal> TrackedMultipleRetVals; + + /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is + /// represented here for efficient lookup. + SmallPtrSet<Function*, 16> MRVFunctionsTracked; + + /// TrackingIncomingArguments - This is the set of functions for whose + /// arguments we make optimistic assumptions about and try to prove as + /// constants. + SmallPtrSet<Function*, 16> TrackingIncomingArguments; + + /// The reason for two worklists is that overdefined is the lowest state + /// on the lattice, and moving things to overdefined as fast as possible + /// makes SCCP converge much faster. + /// + /// By having a separate worklist, we accomplish this because everything + /// possibly overdefined will become overdefined at the soonest possible + /// point. + SmallVector<Value*, 64> OverdefinedInstWorkList; + SmallVector<Value*, 64> InstWorkList; + + + SmallVector<BasicBlock*, 64> BBWorkList; // The BasicBlock work list + + /// KnownFeasibleEdges - Entries in this set are edges which have already had + /// PHI nodes retriggered. + typedef std::pair<BasicBlock*, BasicBlock*> Edge; + DenseSet<Edge> KnownFeasibleEdges; +public: + SCCPSolver(const TargetData *td, const TargetLibraryInfo *tli) + : TD(td), TLI(tli) {} + + /// MarkBlockExecutable - This method can be used by clients to mark all of + /// the blocks that are known to be intrinsically live in the processed unit. + /// + /// This returns true if the block was not considered live before. + bool MarkBlockExecutable(BasicBlock *BB) { + if (!BBExecutable.insert(BB)) return false; + DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << "\n"); + BBWorkList.push_back(BB); // Add the block to the work list! + return true; + } + + /// TrackValueOfGlobalVariable - Clients can use this method to + /// inform the SCCPSolver that it should track loads and stores to the + /// specified global variable if it can. This is only legal to call if + /// performing Interprocedural SCCP. + void TrackValueOfGlobalVariable(GlobalVariable *GV) { + // We only track the contents of scalar globals. + if (GV->getType()->getElementType()->isSingleValueType()) { + LatticeVal &IV = TrackedGlobals[GV]; + if (!isa<UndefValue>(GV->getInitializer())) + IV.markConstant(GV->getInitializer()); + } + } + + /// AddTrackedFunction - If the SCCP solver is supposed to track calls into + /// and out of the specified function (which cannot have its address taken), + /// this method must be called. + void AddTrackedFunction(Function *F) { + // Add an entry, F -> undef. 
+ if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) { + MRVFunctionsTracked.insert(F); + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) + TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i), + LatticeVal())); + } else + TrackedRetVals.insert(std::make_pair(F, LatticeVal())); + } + + void AddArgumentTrackedFunction(Function *F) { + TrackingIncomingArguments.insert(F); + } + + /// Solve - Solve for constants and executable blocks. + /// + void Solve(); + + /// ResolvedUndefsIn - While solving the dataflow for a function, we assume + /// that branches on undef values cannot reach any of their successors. + /// However, this is not a safe assumption. After we solve dataflow, this + /// method should be use to handle this. If this returns true, the solver + /// should be rerun. + bool ResolvedUndefsIn(Function &F); + + bool isBlockExecutable(BasicBlock *BB) const { + return BBExecutable.count(BB); + } + + LatticeVal getLatticeValueFor(Value *V) const { + DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V); + assert(I != ValueState.end() && "V is not in valuemap!"); + return I->second; + } + + /*LatticeVal getStructLatticeValueFor(Value *V, unsigned i) const { + DenseMap<std::pair<Value*, unsigned>, LatticeVal>::const_iterator I = + StructValueState.find(std::make_pair(V, i)); + assert(I != StructValueState.end() && "V is not in valuemap!"); + return I->second; + }*/ + + /// getTrackedRetVals - Get the inferred return value map. + /// + const DenseMap<Function*, LatticeVal> &getTrackedRetVals() { + return TrackedRetVals; + } + + /// getTrackedGlobals - Get and return the set of inferred initializers for + /// global variables. + const DenseMap<GlobalVariable*, LatticeVal> &getTrackedGlobals() { + return TrackedGlobals; + } + + void markOverdefined(Value *V) { + assert(!V->getType()->isStructTy() && "Should use other method"); + markOverdefined(ValueState[V], V); + } + + /// markAnythingOverdefined - Mark the specified value overdefined. This + /// works with both scalars and structs. + void markAnythingOverdefined(Value *V) { + if (StructType *STy = dyn_cast<StructType>(V->getType())) + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) + markOverdefined(getStructValueState(V, i), V); + else + markOverdefined(V); + } + +private: + // markConstant - Make a value be marked as "constant". If the value + // is not already a constant, add it to the instruction work list so that + // the users of the instruction are updated later. + // + void markConstant(LatticeVal &IV, Value *V, Constant *C) { + if (!IV.markConstant(C)) return; + DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n'); + if (IV.isOverdefined()) + OverdefinedInstWorkList.push_back(V); + else + InstWorkList.push_back(V); + } + + void markConstant(Value *V, Constant *C) { + assert(!V->getType()->isStructTy() && "Should use other method"); + markConstant(ValueState[V], V, C); + } + + void markForcedConstant(Value *V, Constant *C) { + assert(!V->getType()->isStructTy() && "Should use other method"); + LatticeVal &IV = ValueState[V]; + IV.markForcedConstant(C); + DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n'); + if (IV.isOverdefined()) + OverdefinedInstWorkList.push_back(V); + else + InstWorkList.push_back(V); + } + + + // markOverdefined - Make a value be marked as "overdefined". If the + // value is not already overdefined, add it to the overdefined instruction + // work list so that the users of the instruction are updated later. 
+ void markOverdefined(LatticeVal &IV, Value *V) { + if (!IV.markOverdefined()) return; + + DEBUG(dbgs() << "markOverdefined: "; + if (Function *F = dyn_cast<Function>(V)) + dbgs() << "Function '" << F->getName() << "'\n"; + else + dbgs() << *V << '\n'); + // Only instructions go on the work list + OverdefinedInstWorkList.push_back(V); + } + + void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) { + if (IV.isOverdefined() || MergeWithV.isUndefined()) + return; // Noop. + if (MergeWithV.isOverdefined()) + markOverdefined(IV, V); + else if (IV.isUndefined()) + markConstant(IV, V, MergeWithV.getConstant()); + else if (IV.getConstant() != MergeWithV.getConstant()) + markOverdefined(IV, V); + } + + void mergeInValue(Value *V, LatticeVal MergeWithV) { + assert(!V->getType()->isStructTy() && "Should use other method"); + mergeInValue(ValueState[V], V, MergeWithV); + } + + + /// getValueState - Return the LatticeVal object that corresponds to the + /// value. This function handles the case when the value hasn't been seen yet + /// by properly seeding constants etc. + LatticeVal &getValueState(Value *V) { + assert(!V->getType()->isStructTy() && "Should use getStructValueState"); + + std::pair<DenseMap<Value*, LatticeVal>::iterator, bool> I = + ValueState.insert(std::make_pair(V, LatticeVal())); + LatticeVal &LV = I.first->second; + + if (!I.second) + return LV; // Common case, already in the map. + + if (Constant *C = dyn_cast<Constant>(V)) { + // Undef values remain undefined. + if (!isa<UndefValue>(V)) + LV.markConstant(C); // Constants are constant + } + + // All others are underdefined by default. + return LV; + } + + /// getStructValueState - Return the LatticeVal object that corresponds to the + /// value/field pair. This function handles the case when the value hasn't + /// been seen yet by properly seeding constants etc. + LatticeVal &getStructValueState(Value *V, unsigned i) { + assert(V->getType()->isStructTy() && "Should use getValueState"); + assert(i < cast<StructType>(V->getType())->getNumElements() && + "Invalid element #"); + + std::pair<DenseMap<std::pair<Value*, unsigned>, LatticeVal>::iterator, + bool> I = StructValueState.insert( + std::make_pair(std::make_pair(V, i), LatticeVal())); + LatticeVal &LV = I.first->second; + + if (!I.second) + return LV; // Common case, already in the map. + + if (Constant *C = dyn_cast<Constant>(V)) { + Constant *Elt = C->getAggregateElement(i); + + if (Elt == 0) + LV.markOverdefined(); // Unknown sort of constant. + else if (isa<UndefValue>(Elt)) + ; // Undef values remain undefined. + else + LV.markConstant(Elt); // Constants are constant. + } + + // All others are underdefined by default. + return LV; + } + + + /// markEdgeExecutable - Mark a basic block as executable, adding it to the BB + /// work list if it is not already executable. + void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) { + if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second) + return; // This edge is already known to be executable! + + if (!MarkBlockExecutable(Dest)) { + // If the destination is already executable, we just made an *edge* + // feasible that wasn't before. Revisit the PHI nodes in the block + // because they have potentially new operands. 
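The merge in mergeInValue only ever moves a value down the lattice: undefined may be raised to a constant, two different constants collapse to overdefined, and overdefined absorbs everything. A stripped-down model of just that rule, ignoring forcedconstant and the worklists (types and names invented for the sketch):

#include <cassert>

// Minimal three-state value lattice: Undefined < Constant(k) < Overdefined.
struct Lattice {
  enum Kind { Undefined, Constant, Overdefined };
  Kind K;
  int Val;                                   // meaningful only when K == Constant

  // Merge another value into this one; state only ever moves down the lattice.
  void mergeIn(const Lattice &Other) {
    if (K == Overdefined || Other.K == Undefined)
      return;                                // no-op
    if (Other.K == Overdefined) { K = Overdefined; return; }
    if (K == Undefined) { K = Constant; Val = Other.Val; return; }
    if (Val != Other.Val) K = Overdefined;   // two different constants conflict
  }
};

int main() {
  Lattice A = {Lattice::Undefined, 0};
  Lattice C7 = {Lattice::Constant, 7};
  Lattice C9 = {Lattice::Constant, 9};
  A.mergeIn(C7);
  assert(A.K == Lattice::Constant && A.Val == 7);
  A.mergeIn(C7);                             // agreeing constant: no change
  assert(A.K == Lattice::Constant && A.Val == 7);
  A.mergeIn(C9);                             // conflicting constant: overdefined
  assert(A.K == Lattice::Overdefined);
  return 0;
}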
+ DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName() + << " -> " << Dest->getName() << "\n"); + + PHINode *PN; + for (BasicBlock::iterator I = Dest->begin(); + (PN = dyn_cast<PHINode>(I)); ++I) + visitPHINode(*PN); + } + } + + // getFeasibleSuccessors - Return a vector of booleans to indicate which + // successors are reachable from a given terminator instruction. + // + void getFeasibleSuccessors(TerminatorInst &TI, SmallVector<bool, 16> &Succs); + + // isEdgeFeasible - Return true if the control flow edge from the 'From' basic + // block to the 'To' basic block is currently feasible. + // + bool isEdgeFeasible(BasicBlock *From, BasicBlock *To); + + // OperandChangedState - This method is invoked on all of the users of an + // instruction that was just changed state somehow. Based on this + // information, we need to update the specified user of this instruction. + // + void OperandChangedState(Instruction *I) { + if (BBExecutable.count(I->getParent())) // Inst is executable? + visit(*I); + } + +private: + friend class InstVisitor<SCCPSolver>; + + // visit implementations - Something changed in this instruction. Either an + // operand made a transition, or the instruction is newly executable. Change + // the value type of I to reflect these changes if appropriate. + void visitPHINode(PHINode &I); + + // Terminators + void visitReturnInst(ReturnInst &I); + void visitTerminatorInst(TerminatorInst &TI); + + void visitCastInst(CastInst &I); + void visitSelectInst(SelectInst &I); + void visitBinaryOperator(Instruction &I); + void visitCmpInst(CmpInst &I); + void visitExtractElementInst(ExtractElementInst &I); + void visitInsertElementInst(InsertElementInst &I); + void visitShuffleVectorInst(ShuffleVectorInst &I); + void visitExtractValueInst(ExtractValueInst &EVI); + void visitInsertValueInst(InsertValueInst &IVI); + void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); } + + // Instructions that cannot be folded away. + void visitStoreInst (StoreInst &I); + void visitLoadInst (LoadInst &I); + void visitGetElementPtrInst(GetElementPtrInst &I); + void visitCallInst (CallInst &I) { + visitCallSite(&I); + } + void visitInvokeInst (InvokeInst &II) { + visitCallSite(&II); + visitTerminatorInst(II); + } + void visitCallSite (CallSite CS); + void visitResumeInst (TerminatorInst &I) { /*returns void*/ } + void visitUnwindInst (TerminatorInst &I) { /*returns void*/ } + void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } + void visitFenceInst (FenceInst &I) { /*returns void*/ } + void visitAtomicCmpXchgInst (AtomicCmpXchgInst &I) { markOverdefined(&I); } + void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); } + void visitAllocaInst (Instruction &I) { markOverdefined(&I); } + void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); } + + void visitInstruction(Instruction &I) { + // If a new instruction is added to LLVM that we don't handle. + dbgs() << "SCCP: Don't know how to handle: " << I; + markAnythingOverdefined(&I); // Just in case + } +}; + +} // end anonymous namespace + + +// getFeasibleSuccessors - Return a vector of booleans to indicate which +// successors are reachable from a given terminator instruction. 
+// +void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, + SmallVector<bool, 16> &Succs) { + Succs.resize(TI.getNumSuccessors()); + if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) { + if (BI->isUnconditional()) { + Succs[0] = true; + return; + } + + LatticeVal BCValue = getValueState(BI->getCondition()); + ConstantInt *CI = BCValue.getConstantInt(); + if (CI == 0) { + // Overdefined condition variables, and branches on unfoldable constant + // conditions, mean the branch could go either way. + if (!BCValue.isUndefined()) + Succs[0] = Succs[1] = true; + return; + } + + // Constant condition variables mean the branch can only go a single way. + Succs[CI->isZero()] = true; + return; + } + + if (isa<InvokeInst>(TI)) { + // Invoke instructions successors are always executable. + Succs[0] = Succs[1] = true; + return; + } + + if (SwitchInst *SI = dyn_cast<SwitchInst>(&TI)) { + if (!SI->getNumCases()) { + Succs[0] = true; + return; + } + LatticeVal SCValue = getValueState(SI->getCondition()); + ConstantInt *CI = SCValue.getConstantInt(); + + if (CI == 0) { // Overdefined or undefined condition? + // All destinations are executable! + if (!SCValue.isUndefined()) + Succs.assign(TI.getNumSuccessors(), true); + return; + } + + Succs[SI->findCaseValue(CI).getSuccessorIndex()] = true; + return; + } + + // TODO: This could be improved if the operand is a [cast of a] BlockAddress. + if (isa<IndirectBrInst>(&TI)) { + // Just mark all destinations executable! + Succs.assign(TI.getNumSuccessors(), true); + return; + } + +#ifndef NDEBUG + dbgs() << "Unknown terminator instruction: " << TI << '\n'; +#endif + llvm_unreachable("SCCP: Don't know how to handle this terminator!"); +} + + +// isEdgeFeasible - Return true if the control flow edge from the 'From' basic +// block to the 'To' basic block is currently feasible. +// +bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { + assert(BBExecutable.count(To) && "Dest should always be alive!"); + + // Make sure the source basic block is executable!! + if (!BBExecutable.count(From)) return false; + + // Check to make sure this edge itself is actually feasible now. + TerminatorInst *TI = From->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (BI->isUnconditional()) + return true; + + LatticeVal BCValue = getValueState(BI->getCondition()); + + // Overdefined condition variables mean the branch could go either way, + // undef conditions mean that neither edge is feasible yet. + ConstantInt *CI = BCValue.getConstantInt(); + if (CI == 0) + return !BCValue.isUndefined(); + + // Constant condition variables mean the branch can only go a single way. + return BI->getSuccessor(CI->isZero()) == To; + } + + // Invoke instructions successors are always executable. + if (isa<InvokeInst>(TI)) + return true; + + if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + if (SI->getNumCases() < 1) + return true; + + LatticeVal SCValue = getValueState(SI->getCondition()); + ConstantInt *CI = SCValue.getConstantInt(); + + if (CI == 0) + return !SCValue.isUndefined(); + + return SI->findCaseValue(CI).getCaseSuccessor() == To; + } + + // Just mark all destinations executable! + // TODO: This could be improved if the operand is a [cast of a] BlockAddress. 
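For a conditional branch, the feasibility logic above reduces to: an undefined condition makes neither edge feasible yet, an overdefined or unfoldable condition makes both feasible, and a known i1 constant selects exactly one; successor 0 is the 'true' target, which is why the code indexes with CI->isZero(). A compact restatement with hypothetical names:

#include <cassert>

enum CondState { Undefined, ConstTrue, ConstFalse, Overdefined };

// Feasible successors of "br i1 %c, label %TrueBB, label %FalseBB".
// Succs[0] corresponds to the true target, Succs[1] to the false target.
void feasibleSuccessors(CondState C, bool Succs[2]) {
  Succs[0] = Succs[1] = false;
  switch (C) {
  case Undefined:                    // neither edge is feasible yet
    break;
  case Overdefined:                  // could go either way
    Succs[0] = Succs[1] = true;
    break;
  case ConstTrue:
    Succs[0] = true;                 // index 0 <=> condition is not zero
    break;
  case ConstFalse:
    Succs[1] = true;                 // index 1 <=> condition is zero
    break;
  }
}

int main() {
  bool S[2];
  feasibleSuccessors(Undefined, S);   assert(!S[0] && !S[1]);
  feasibleSuccessors(Overdefined, S); assert(S[0] && S[1]);
  feasibleSuccessors(ConstTrue, S);   assert(S[0] && !S[1]);
  feasibleSuccessors(ConstFalse, S);  assert(!S[0] && S[1]);
  return 0;
}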
+ if (isa<IndirectBrInst>(TI)) + return true; + +#ifndef NDEBUG + dbgs() << "Unknown terminator instruction: " << *TI << '\n'; +#endif + llvm_unreachable(0); +} + +// visit Implementations - Something changed in this instruction, either an +// operand made a transition, or the instruction is newly executable. Change +// the value type of I to reflect these changes if appropriate. This method +// makes sure to do the following actions: +// +// 1. If a phi node merges two constants in, and has conflicting value coming +// from different branches, or if the PHI node merges in an overdefined +// value, then the PHI node becomes overdefined. +// 2. If a phi node merges only constants in, and they all agree on value, the +// PHI node becomes a constant value equal to that. +// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant +// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined +// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined +// 6. If a conditional branch has a value that is constant, make the selected +// destination executable +// 7. If a conditional branch has a value that is overdefined, make all +// successors executable. +// +void SCCPSolver::visitPHINode(PHINode &PN) { + // If this PN returns a struct, just mark the result overdefined. + // TODO: We could do a lot better than this if code actually uses this. + if (PN.getType()->isStructTy()) + return markAnythingOverdefined(&PN); + + if (getValueState(&PN).isOverdefined()) + return; // Quick exit + + // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant, + // and slow us down a lot. Just mark them overdefined. + if (PN.getNumIncomingValues() > 64) + return markOverdefined(&PN); + + // Look at all of the executable operands of the PHI node. If any of them + // are overdefined, the PHI becomes overdefined as well. If they are all + // constant, and they agree with each other, the PHI becomes the identical + // constant. If they are constant and don't agree, the PHI is overdefined. + // If there are no executable operands, the PHI remains undefined. + // + Constant *OperandVal = 0; + for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { + LatticeVal IV = getValueState(PN.getIncomingValue(i)); + if (IV.isUndefined()) continue; // Doesn't influence PHI node. + + if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) + continue; + + if (IV.isOverdefined()) // PHI node becomes overdefined! + return markOverdefined(&PN); + + if (OperandVal == 0) { // Grab the first value. + OperandVal = IV.getConstant(); + continue; + } + + // There is already a reachable operand. If we conflict with it, + // then the PHI node becomes overdefined. If we agree with it, we + // can continue on. + + // Check to see if there are two different constants merging, if so, the PHI + // node is overdefined. + if (IV.getConstant() != OperandVal) + return markOverdefined(&PN); + } + + // If we exited the loop, this means that the PHI node only has constant + // arguments that agree with each other(and OperandVal is the constant) or + // OperandVal is null because there are no defined incoming arguments. If + // this is the case, the PHI remains undefined. 
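+  // For example, %p = phi i32 [ 1, %bb1 ], [ 1, %bb2 ] is marked constant 1
+  // once a feasible edge brings in a defined value; if the second incoming
+  // value were 2 instead, the loop above would have marked %p overdefined.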
+ // + if (OperandVal) + markConstant(&PN, OperandVal); // Acquire operand value +} + + + + +void SCCPSolver::visitReturnInst(ReturnInst &I) { + if (I.getNumOperands() == 0) return; // ret void + + Function *F = I.getParent()->getParent(); + Value *ResultOp = I.getOperand(0); + + // If we are tracking the return value of this function, merge it in. + if (!TrackedRetVals.empty() && !ResultOp->getType()->isStructTy()) { + DenseMap<Function*, LatticeVal>::iterator TFRVI = + TrackedRetVals.find(F); + if (TFRVI != TrackedRetVals.end()) { + mergeInValue(TFRVI->second, F, getValueState(ResultOp)); + return; + } + } + + // Handle functions that return multiple values. + if (!TrackedMultipleRetVals.empty()) { + if (StructType *STy = dyn_cast<StructType>(ResultOp->getType())) + if (MRVFunctionsTracked.count(F)) + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) + mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F, + getStructValueState(ResultOp, i)); + + } +} + +void SCCPSolver::visitTerminatorInst(TerminatorInst &TI) { + SmallVector<bool, 16> SuccFeasible; + getFeasibleSuccessors(TI, SuccFeasible); + + BasicBlock *BB = TI.getParent(); + + // Mark all feasible successors executable. + for (unsigned i = 0, e = SuccFeasible.size(); i != e; ++i) + if (SuccFeasible[i]) + markEdgeExecutable(BB, TI.getSuccessor(i)); +} + +void SCCPSolver::visitCastInst(CastInst &I) { + LatticeVal OpSt = getValueState(I.getOperand(0)); + if (OpSt.isOverdefined()) // Inherit overdefinedness of operand + markOverdefined(&I); + else if (OpSt.isConstant()) // Propagate constant value + markConstant(&I, ConstantExpr::getCast(I.getOpcode(), + OpSt.getConstant(), I.getType())); +} + + +void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) { + // If this returns a struct, mark all elements over defined, we don't track + // structs in structs. + if (EVI.getType()->isStructTy()) + return markAnythingOverdefined(&EVI); + + // If this is extracting from more than one level of struct, we don't know. + if (EVI.getNumIndices() != 1) + return markOverdefined(&EVI); + + Value *AggVal = EVI.getAggregateOperand(); + if (AggVal->getType()->isStructTy()) { + unsigned i = *EVI.idx_begin(); + LatticeVal EltVal = getStructValueState(AggVal, i); + mergeInValue(getValueState(&EVI), &EVI, EltVal); + } else { + // Otherwise, must be extracting from an array. + return markOverdefined(&EVI); + } +} + +void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) { + StructType *STy = dyn_cast<StructType>(IVI.getType()); + if (STy == 0) + return markOverdefined(&IVI); + + // If this has more than one index, we can't handle it, drive all results to + // undef. + if (IVI.getNumIndices() != 1) + return markAnythingOverdefined(&IVI); + + Value *Aggr = IVI.getAggregateOperand(); + unsigned Idx = *IVI.idx_begin(); + + // Compute the result based on what we're inserting. + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + // This passes through all values that aren't the inserted element. + if (i != Idx) { + LatticeVal EltVal = getStructValueState(Aggr, i); + mergeInValue(getStructValueState(&IVI, i), &IVI, EltVal); + continue; + } + + Value *Val = IVI.getInsertedValueOperand(); + if (Val->getType()->isStructTy()) + // We don't track structs in structs. 
+ markOverdefined(getStructValueState(&IVI, i), &IVI); + else { + LatticeVal InVal = getValueState(Val); + mergeInValue(getStructValueState(&IVI, i), &IVI, InVal); + } + } +} + +void SCCPSolver::visitSelectInst(SelectInst &I) { + // If this select returns a struct, just mark the result overdefined. + // TODO: We could do a lot better than this if code actually uses this. + if (I.getType()->isStructTy()) + return markAnythingOverdefined(&I); + + LatticeVal CondValue = getValueState(I.getCondition()); + if (CondValue.isUndefined()) + return; + + if (ConstantInt *CondCB = CondValue.getConstantInt()) { + Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue(); + mergeInValue(&I, getValueState(OpVal)); + return; + } + + // Otherwise, the condition is overdefined or a constant we can't evaluate. + // See if we can produce something better than overdefined based on the T/F + // value. + LatticeVal TVal = getValueState(I.getTrueValue()); + LatticeVal FVal = getValueState(I.getFalseValue()); + + // select ?, C, C -> C. + if (TVal.isConstant() && FVal.isConstant() && + TVal.getConstant() == FVal.getConstant()) + return markConstant(&I, FVal.getConstant()); + + if (TVal.isUndefined()) // select ?, undef, X -> X. + return mergeInValue(&I, FVal); + if (FVal.isUndefined()) // select ?, X, undef -> X. + return mergeInValue(&I, TVal); + markOverdefined(&I); +} + +// Handle Binary Operators. +void SCCPSolver::visitBinaryOperator(Instruction &I) { + LatticeVal V1State = getValueState(I.getOperand(0)); + LatticeVal V2State = getValueState(I.getOperand(1)); + + LatticeVal &IV = ValueState[&I]; + if (IV.isOverdefined()) return; + + if (V1State.isConstant() && V2State.isConstant()) + return markConstant(IV, &I, + ConstantExpr::get(I.getOpcode(), V1State.getConstant(), + V2State.getConstant())); + + // If something is undef, wait for it to resolve. + if (!V1State.isOverdefined() && !V2State.isOverdefined()) + return; + + // Otherwise, one of our operands is overdefined. Try to produce something + // better than overdefined with some tricks. + + // If this is an AND or OR with 0 or -1, it doesn't matter that the other + // operand is overdefined. + if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Or) { + LatticeVal *NonOverdefVal = 0; + if (!V1State.isOverdefined()) + NonOverdefVal = &V1State; + else if (!V2State.isOverdefined()) + NonOverdefVal = &V2State; + + if (NonOverdefVal) { + if (NonOverdefVal->isUndefined()) { + // Could annihilate value. + if (I.getOpcode() == Instruction::And) + markConstant(IV, &I, Constant::getNullValue(I.getType())); + else if (VectorType *PT = dyn_cast<VectorType>(I.getType())) + markConstant(IV, &I, Constant::getAllOnesValue(PT)); + else + markConstant(IV, &I, + Constant::getAllOnesValue(I.getType())); + return; + } + + if (I.getOpcode() == Instruction::And) { + // X and 0 = 0 + if (NonOverdefVal->getConstant()->isNullValue()) + return markConstant(IV, &I, NonOverdefVal->getConstant()); + } else { + if (ConstantInt *CI = NonOverdefVal->getConstantInt()) + if (CI->isAllOnesValue()) // X or -1 = -1 + return markConstant(IV, &I, NonOverdefVal->getConstant()); + } + } + } + + + markOverdefined(&I); +} + +// Handle ICmpInst instruction. 
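+// (FCmp instructions take the same path: both integer and floating-point
+// compares are folded below via ConstantExpr::getCompare.)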
+void SCCPSolver::visitCmpInst(CmpInst &I) { + LatticeVal V1State = getValueState(I.getOperand(0)); + LatticeVal V2State = getValueState(I.getOperand(1)); + + LatticeVal &IV = ValueState[&I]; + if (IV.isOverdefined()) return; + + if (V1State.isConstant() && V2State.isConstant()) + return markConstant(IV, &I, ConstantExpr::getCompare(I.getPredicate(), + V1State.getConstant(), + V2State.getConstant())); + + // If operands are still undefined, wait for it to resolve. + if (!V1State.isOverdefined() && !V2State.isOverdefined()) + return; + + markOverdefined(&I); +} + +void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) { + // TODO : SCCP does not handle vectors properly. + return markOverdefined(&I); + +#if 0 + LatticeVal &ValState = getValueState(I.getOperand(0)); + LatticeVal &IdxState = getValueState(I.getOperand(1)); + + if (ValState.isOverdefined() || IdxState.isOverdefined()) + markOverdefined(&I); + else if(ValState.isConstant() && IdxState.isConstant()) + markConstant(&I, ConstantExpr::getExtractElement(ValState.getConstant(), + IdxState.getConstant())); +#endif +} + +void SCCPSolver::visitInsertElementInst(InsertElementInst &I) { + // TODO : SCCP does not handle vectors properly. + return markOverdefined(&I); +#if 0 + LatticeVal &ValState = getValueState(I.getOperand(0)); + LatticeVal &EltState = getValueState(I.getOperand(1)); + LatticeVal &IdxState = getValueState(I.getOperand(2)); + + if (ValState.isOverdefined() || EltState.isOverdefined() || + IdxState.isOverdefined()) + markOverdefined(&I); + else if(ValState.isConstant() && EltState.isConstant() && + IdxState.isConstant()) + markConstant(&I, ConstantExpr::getInsertElement(ValState.getConstant(), + EltState.getConstant(), + IdxState.getConstant())); + else if (ValState.isUndefined() && EltState.isConstant() && + IdxState.isConstant()) + markConstant(&I,ConstantExpr::getInsertElement(UndefValue::get(I.getType()), + EltState.getConstant(), + IdxState.getConstant())); +#endif +} + +void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) { + // TODO : SCCP does not handle vectors properly. + return markOverdefined(&I); +#if 0 + LatticeVal &V1State = getValueState(I.getOperand(0)); + LatticeVal &V2State = getValueState(I.getOperand(1)); + LatticeVal &MaskState = getValueState(I.getOperand(2)); + + if (MaskState.isUndefined() || + (V1State.isUndefined() && V2State.isUndefined())) + return; // Undefined output if mask or both inputs undefined. + + if (V1State.isOverdefined() || V2State.isOverdefined() || + MaskState.isOverdefined()) { + markOverdefined(&I); + } else { + // A mix of constant/undef inputs. + Constant *V1 = V1State.isConstant() ? + V1State.getConstant() : UndefValue::get(I.getType()); + Constant *V2 = V2State.isConstant() ? + V2State.getConstant() : UndefValue::get(I.getType()); + Constant *Mask = MaskState.isConstant() ? + MaskState.getConstant() : UndefValue::get(I.getOperand(2)->getType()); + markConstant(&I, ConstantExpr::getShuffleVector(V1, V2, Mask)); + } +#endif +} + +// Handle getelementptr instructions. If all operands are constants then we +// can turn this into a getelementptr ConstantExpr. +// +void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) { + if (ValueState[&I].isOverdefined()) return; + + SmallVector<Constant*, 8> Operands; + Operands.reserve(I.getNumOperands()); + + for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) { + LatticeVal State = getValueState(I.getOperand(i)); + if (State.isUndefined()) + return; // Operands are not resolved yet. 
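+    // (We wait optimistically: if every operand eventually becomes constant,
+    // for example a gep of a global with all-constant indices, the whole
+    // instruction is folded to a ConstantExpr below.)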
+ + if (State.isOverdefined()) + return markOverdefined(&I); + + assert(State.isConstant() && "Unknown state!"); + Operands.push_back(State.getConstant()); + } + + Constant *Ptr = Operands[0]; + ArrayRef<Constant *> Indices(Operands.begin() + 1, Operands.end()); + markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, Indices)); +} + +void SCCPSolver::visitStoreInst(StoreInst &SI) { + // If this store is of a struct, ignore it. + if (SI.getOperand(0)->getType()->isStructTy()) + return; + + if (TrackedGlobals.empty() || !isa<GlobalVariable>(SI.getOperand(1))) + return; + + GlobalVariable *GV = cast<GlobalVariable>(SI.getOperand(1)); + DenseMap<GlobalVariable*, LatticeVal>::iterator I = TrackedGlobals.find(GV); + if (I == TrackedGlobals.end() || I->second.isOverdefined()) return; + + // Get the value we are storing into the global, then merge it. + mergeInValue(I->second, GV, getValueState(SI.getOperand(0))); + if (I->second.isOverdefined()) + TrackedGlobals.erase(I); // No need to keep tracking this! +} + + +// Handle load instructions. If the operand is a constant pointer to a constant +// global, we can replace the load with the loaded constant value! +void SCCPSolver::visitLoadInst(LoadInst &I) { + // If this load is of a struct, just mark the result overdefined. + if (I.getType()->isStructTy()) + return markAnythingOverdefined(&I); + + LatticeVal PtrVal = getValueState(I.getOperand(0)); + if (PtrVal.isUndefined()) return; // The pointer is not resolved yet! + + LatticeVal &IV = ValueState[&I]; + if (IV.isOverdefined()) return; + + if (!PtrVal.isConstant() || I.isVolatile()) + return markOverdefined(IV, &I); + + Constant *Ptr = PtrVal.getConstant(); + + // load null -> null + if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0) + return markConstant(IV, &I, Constant::getNullValue(I.getType())); + + // Transform load (constant global) into the value loaded. + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) { + if (!TrackedGlobals.empty()) { + // If we are tracking this global, merge in the known value for it. + DenseMap<GlobalVariable*, LatticeVal>::iterator It = + TrackedGlobals.find(GV); + if (It != TrackedGlobals.end()) { + mergeInValue(IV, &I, It->second); + return; + } + } + } + + // Transform load from a constant into a constant if possible. + if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, TD)) + return markConstant(IV, &I, C); + + // Otherwise we cannot say for certain what value this load will produce. + // Bail out. + markOverdefined(IV, &I); +} + +void SCCPSolver::visitCallSite(CallSite CS) { + Function *F = CS.getCalledFunction(); + Instruction *I = CS.getInstruction(); + + // The common case is that we aren't tracking the callee, either because we + // are not doing interprocedural analysis or the callee is indirect, or is + // external. Handle these cases first. + if (F == 0 || F->isDeclaration()) { +CallOverdefined: + // Void return and not tracking callee, just bail. + if (I->getType()->isVoidTy()) return; + + // Otherwise, if we have a single return value case, and if the function is + // a declaration, maybe we can constant fold it. + if (F && F->isDeclaration() && !I->getType()->isStructTy() && + canConstantFoldCallTo(F)) { + + SmallVector<Constant*, 8> Operands; + for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end(); + AI != E; ++AI) { + LatticeVal State = getValueState(*AI); + + if (State.isUndefined()) + return; // Operands are not resolved yet. 
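+        // (We wait optimistically here as well: if every argument becomes a
+        // constant, ConstantFoldCall below may still fold the call, e.g. a
+        // known libm call such as fabs on a constant argument.)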
+ if (State.isOverdefined()) + return markOverdefined(I); + assert(State.isConstant() && "Unknown state!"); + Operands.push_back(State.getConstant()); + } + + // If we can constant fold this, mark the result of the call as a + // constant. + if (Constant *C = ConstantFoldCall(F, Operands, TLI)) + return markConstant(I, C); + } + + // Otherwise, we don't know anything about this call, mark it overdefined. + return markAnythingOverdefined(I); + } + + // If this is a local function that doesn't have its address taken, mark its + // entry block executable and merge in the actual arguments to the call into + // the formal arguments of the function. + if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){ + MarkBlockExecutable(F->begin()); + + // Propagate information from this call site into the callee. + CallSite::arg_iterator CAI = CS.arg_begin(); + for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); + AI != E; ++AI, ++CAI) { + // If this argument is byval, and if the function is not readonly, there + // will be an implicit copy formed of the input aggregate. + if (AI->hasByValAttr() && !F->onlyReadsMemory()) { + markOverdefined(AI); + continue; + } + + if (StructType *STy = dyn_cast<StructType>(AI->getType())) { + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + LatticeVal CallArg = getStructValueState(*CAI, i); + mergeInValue(getStructValueState(AI, i), AI, CallArg); + } + } else { + mergeInValue(AI, getValueState(*CAI)); + } + } + } + + // If this is a single/zero retval case, see if we're tracking the function. + if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) { + if (!MRVFunctionsTracked.count(F)) + goto CallOverdefined; // Not tracking this callee. + + // If we are tracking this callee, propagate the result of the function + // into this call site. + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) + mergeInValue(getStructValueState(I, i), I, + TrackedMultipleRetVals[std::make_pair(F, i)]); + } else { + DenseMap<Function*, LatticeVal>::iterator TFRVI = TrackedRetVals.find(F); + if (TFRVI == TrackedRetVals.end()) + goto CallOverdefined; // Not tracking this callee. + + // If so, propagate the return value of the callee into this call result. + mergeInValue(I, TFRVI->second); + } +} + +void SCCPSolver::Solve() { + // Process the work lists until they are empty! + while (!BBWorkList.empty() || !InstWorkList.empty() || + !OverdefinedInstWorkList.empty()) { + // Process the overdefined instruction's work list first, which drives other + // things to overdefined more quickly. + while (!OverdefinedInstWorkList.empty()) { + Value *I = OverdefinedInstWorkList.pop_back_val(); + + DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n'); + + // "I" got into the work list because it either made the transition from + // bottom to constant + // + // Anything on this worklist that is overdefined need not be visited + // since all of its users will have already been marked as overdefined + // Update all of the users of this instruction's value. + // + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); + UI != E; ++UI) + if (Instruction *I = dyn_cast<Instruction>(*UI)) + OperandChangedState(I); + } + + // Process the instruction work list. + while (!InstWorkList.empty()) { + Value *I = InstWorkList.pop_back_val(); + + DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n'); + + // "I" got into the work list because it made the transition from undef to + // constant. 
+ // + // Anything on this worklist that is overdefined need not be visited + // since all of its users will have already been marked as overdefined. + // Update all of the users of this instruction's value. + // + if (I->getType()->isStructTy() || !getValueState(I).isOverdefined()) + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); + UI != E; ++UI) + if (Instruction *I = dyn_cast<Instruction>(*UI)) + OperandChangedState(I); + } + + // Process the basic block work list. + while (!BBWorkList.empty()) { + BasicBlock *BB = BBWorkList.back(); + BBWorkList.pop_back(); + + DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n'); + + // Notify all instructions in this basic block that they are newly + // executable. + visit(BB); + } + } +} + +/// ResolvedUndefsIn - While solving the dataflow for a function, we assume +/// that branches on undef values cannot reach any of their successors. +/// However, this is not a safe assumption. After we solve dataflow, this +/// method should be use to handle this. If this returns true, the solver +/// should be rerun. +/// +/// This method handles this by finding an unresolved branch and marking it one +/// of the edges from the block as being feasible, even though the condition +/// doesn't say it would otherwise be. This allows SCCP to find the rest of the +/// CFG and only slightly pessimizes the analysis results (by marking one, +/// potentially infeasible, edge feasible). This cannot usefully modify the +/// constraints on the condition of the branch, as that would impact other users +/// of the value. +/// +/// This scan also checks for values that use undefs, whose results are actually +/// defined. For example, 'zext i8 undef to i32' should produce all zeros +/// conservatively, as "(zext i8 X -> i32) & 0xFF00" must always return zero, +/// even if X isn't defined. +bool SCCPSolver::ResolvedUndefsIn(Function &F) { + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (!BBExecutable.count(BB)) + continue; + + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + // Look for instructions which produce undef values. + if (I->getType()->isVoidTy()) continue; + + if (StructType *STy = dyn_cast<StructType>(I->getType())) { + // Only a few things that can be structs matter for undef. + + // Tracked calls must never be marked overdefined in ResolvedUndefsIn. + if (CallSite CS = CallSite(I)) + if (Function *F = CS.getCalledFunction()) + if (MRVFunctionsTracked.count(F)) + continue; + + // extractvalue and insertvalue don't need to be marked; they are + // tracked as precisely as their operands. + if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I)) + continue; + + // Send the results of everything else to overdefined. We could be + // more precise than this but it isn't worth bothering. + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + LatticeVal &LV = getStructValueState(I, i); + if (LV.isUndefined()) + markOverdefined(LV, I); + } + continue; + } + + LatticeVal &LV = getValueState(I); + if (!LV.isUndefined()) continue; + + // extractvalue is safe; check here because the argument is a struct. + if (isa<ExtractValueInst>(I)) + continue; + + // Compute the operand LatticeVals, for convenience below. + // Anything taking a struct is conservatively assumed to require + // overdefined markings. 
+ if (I->getOperand(0)->getType()->isStructTy()) { + markOverdefined(I); + return true; + } + LatticeVal Op0LV = getValueState(I->getOperand(0)); + LatticeVal Op1LV; + if (I->getNumOperands() == 2) { + if (I->getOperand(1)->getType()->isStructTy()) { + markOverdefined(I); + return true; + } + + Op1LV = getValueState(I->getOperand(1)); + } + // If this is an instructions whose result is defined even if the input is + // not fully defined, propagate the information. + Type *ITy = I->getType(); + switch (I->getOpcode()) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: + break; // Any undef -> undef + case Instruction::FSub: + case Instruction::FAdd: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + // Floating-point binary operation: be conservative. + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + markForcedConstant(I, Constant::getNullValue(ITy)); + else + markOverdefined(I); + return true; + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + // undef -> 0; some outputs are impossible + markForcedConstant(I, Constant::getNullValue(ITy)); + return true; + case Instruction::Mul: + case Instruction::And: + // Both operands undef -> undef + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + break; + // undef * X -> 0. X could be zero. + // undef & X -> 0. X could be zero. + markForcedConstant(I, Constant::getNullValue(ITy)); + return true; + + case Instruction::Or: + // Both operands undef -> undef + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + break; + // undef | X -> -1. X could be -1. + markForcedConstant(I, Constant::getAllOnesValue(ITy)); + return true; + + case Instruction::Xor: + // undef ^ undef -> 0; strictly speaking, this is not strictly + // necessary, but we try to be nice to people who expect this + // behavior in simple cases + if (Op0LV.isUndefined() && Op1LV.isUndefined()) { + markForcedConstant(I, Constant::getNullValue(ITy)); + return true; + } + // undef ^ X -> undef + break; + + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::SRem: + case Instruction::URem: + // X / undef -> undef. No change. + // X % undef -> undef. No change. + if (Op1LV.isUndefined()) break; + + // undef / X -> 0. X could be maxint. + // undef % X -> 0. X could be 1. + markForcedConstant(I, Constant::getNullValue(ITy)); + return true; + + case Instruction::AShr: + // X >>a undef -> undef. + if (Op1LV.isUndefined()) break; + + // undef >>a X -> all ones + markForcedConstant(I, Constant::getAllOnesValue(ITy)); + return true; + case Instruction::LShr: + case Instruction::Shl: + // X << undef -> undef. + // X >> undef -> undef. + if (Op1LV.isUndefined()) break; + + // undef << X -> 0 + // undef >> X -> 0 + markForcedConstant(I, Constant::getNullValue(ITy)); + return true; + case Instruction::Select: + Op1LV = getValueState(I->getOperand(1)); + // undef ? X : Y -> X or Y. There could be commonality between X/Y. + if (Op0LV.isUndefined()) { + if (!Op1LV.isConstant()) // Pick the constant one if there is any. + Op1LV = getValueState(I->getOperand(2)); + } else if (Op1LV.isUndefined()) { + // c ? undef : undef -> undef. No change. + Op1LV = getValueState(I->getOperand(2)); + if (Op1LV.isUndefined()) + break; + // Otherwise, c ? undef : x -> x. 
+ } else { + // Leave Op1LV as Operand(1)'s LatticeValue. + } + + if (Op1LV.isConstant()) + markForcedConstant(I, Op1LV.getConstant()); + else + markOverdefined(I); + return true; + case Instruction::Load: + // A load here means one of two things: a load of undef from a global, + // a load from an unknown pointer. Either way, having it return undef + // is okay. + break; + case Instruction::ICmp: + // X == undef -> undef. Other comparisons get more complicated. + if (cast<ICmpInst>(I)->isEquality()) + break; + markOverdefined(I); + return true; + case Instruction::Call: + case Instruction::Invoke: { + // There are two reasons a call can have an undef result + // 1. It could be tracked. + // 2. It could be constant-foldable. + // Because of the way we solve return values, tracked calls must + // never be marked overdefined in ResolvedUndefsIn. + if (Function *F = CallSite(I).getCalledFunction()) + if (TrackedRetVals.count(F)) + break; + + // If the call is constant-foldable, we mark it overdefined because + // we do not know what return values are valid. + markOverdefined(I); + return true; + } + default: + // If we don't know what should happen here, conservatively mark it + // overdefined. + markOverdefined(I); + return true; + } + } + + // Check to see if we have a branch or switch on an undefined value. If so + // we force the branch to go one way or the other to make the successor + // values live. It doesn't really matter which way we force it. + TerminatorInst *TI = BB->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (!BI->isConditional()) continue; + if (!getValueState(BI->getCondition()).isUndefined()) + continue; + + // If the input to SCCP is actually branch on undef, fix the undef to + // false. + if (isa<UndefValue>(BI->getCondition())) { + BI->setCondition(ConstantInt::getFalse(BI->getContext())); + markEdgeExecutable(BB, TI->getSuccessor(1)); + return true; + } + + // Otherwise, it is a branch on a symbolic value which is currently + // considered to be undef. Handle this by forcing the input value to the + // branch to false. + markForcedConstant(BI->getCondition(), + ConstantInt::getFalse(TI->getContext())); + return true; + } + + if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + if (!SI->getNumCases()) + continue; + if (!getValueState(SI->getCondition()).isUndefined()) + continue; + + // If the input to SCCP is actually switch on undef, fix the undef to + // the first constant. + if (isa<UndefValue>(SI->getCondition())) { + SI->setCondition(SI->case_begin().getCaseValue()); + markEdgeExecutable(BB, SI->case_begin().getCaseSuccessor()); + return true; + } + + markForcedConstant(SI->getCondition(), SI->case_begin().getCaseValue()); + return true; + } + } + + return false; +} + + +namespace { + //===--------------------------------------------------------------------===// + // + /// SCCP Class - This class uses the SCCPSolver to implement a per-function + /// Sparse Conditional Constant Propagator. + /// + struct SCCP : public FunctionPass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetLibraryInfo>(); + } + static char ID; // Pass identification, replacement for typeid + SCCP() : FunctionPass(ID) { + initializeSCCPPass(*PassRegistry::getPassRegistry()); + } + + // runOnFunction - Run the Sparse Conditional Constant Propagation + // algorithm, and return true if the function was modified. 
+ // + bool runOnFunction(Function &F); + }; +} // end anonymous namespace + +char SCCP::ID = 0; +INITIALIZE_PASS(SCCP, "sccp", + "Sparse Conditional Constant Propagation", false, false) + +// createSCCPPass - This is the public interface to this file. +FunctionPass *llvm::createSCCPPass() { + return new SCCP(); +} + +static void DeleteInstructionInBlock(BasicBlock *BB) { + DEBUG(dbgs() << " BasicBlock Dead:" << *BB); + ++NumDeadBlocks; + + // Check to see if there are non-terminating instructions to delete. + if (isa<TerminatorInst>(BB->begin())) + return; + + // Delete the instructions backwards, as it has a reduced likelihood of having + // to update as many def-use and use-def chains. + Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. + while (EndInst != BB->begin()) { + // Delete the next to last instruction. + BasicBlock::iterator I = EndInst; + Instruction *Inst = --I; + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); + if (isa<LandingPadInst>(Inst)) { + EndInst = Inst; + continue; + } + BB->getInstList().erase(Inst); + ++NumInstRemoved; + } +} + +// runOnFunction() - Run the Sparse Conditional Constant Propagation algorithm, +// and return true if the function was modified. +// +bool SCCP::runOnFunction(Function &F) { + DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + SCCPSolver Solver(TD, TLI); + + // Mark the first block of the function as being executable. + Solver.MarkBlockExecutable(F.begin()); + + // Mark all arguments to the function as being overdefined. + for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;++AI) + Solver.markAnythingOverdefined(AI); + + // Solve for constants. + bool ResolvedUndefs = true; + while (ResolvedUndefs) { + Solver.Solve(); + DEBUG(dbgs() << "RESOLVING UNDEFs\n"); + ResolvedUndefs = Solver.ResolvedUndefsIn(F); + } + + bool MadeChanges = false; + + // If we decided that there are basic blocks that are dead in this function, + // delete their contents now. Note that we cannot actually delete the blocks, + // as we cannot modify the CFG of the function. + + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (!Solver.isBlockExecutable(BB)) { + DeleteInstructionInBlock(BB); + MadeChanges = true; + continue; + } + + // Iterate over all of the instructions in a function, replacing them with + // constants if we have found them to be of constant values. + // + for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { + Instruction *Inst = BI++; + if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst)) + continue; + + // TODO: Reconstruct structs from their elements. + if (Inst->getType()->isStructTy()) + continue; + + LatticeVal IV = Solver.getLatticeValueFor(Inst); + if (IV.isOverdefined()) + continue; + + Constant *Const = IV.isConstant() + ? IV.getConstant() : UndefValue::get(Inst->getType()); + DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst); + + // Replaces all of the uses of a variable with uses of the constant. + Inst->replaceAllUsesWith(Const); + + // Delete the instruction. + Inst->eraseFromParent(); + + // Hey, we just changed something! 
+ MadeChanges = true; + ++NumInstRemoved; + } + } + + return MadeChanges; +} + +namespace { + //===--------------------------------------------------------------------===// + // + /// IPSCCP Class - This class implements interprocedural Sparse Conditional + /// Constant Propagation. + /// + struct IPSCCP : public ModulePass { + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetLibraryInfo>(); + } + static char ID; + IPSCCP() : ModulePass(ID) { + initializeIPSCCPPass(*PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M); + }; +} // end anonymous namespace + +char IPSCCP::ID = 0; +INITIALIZE_PASS_BEGIN(IPSCCP, "ipsccp", + "Interprocedural Sparse Conditional Constant Propagation", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(IPSCCP, "ipsccp", + "Interprocedural Sparse Conditional Constant Propagation", + false, false) + +// createIPSCCPPass - This is the public interface to this file. +ModulePass *llvm::createIPSCCPPass() { + return new IPSCCP(); +} + + +static bool AddressIsTaken(const GlobalValue *GV) { + // Delete any dead constantexpr klingons. + GV->removeDeadConstantUsers(); + + for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end(); + UI != E; ++UI) { + const User *U = *UI; + if (const StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (SI->getOperand(0) == GV || SI->isVolatile()) + return true; // Storing addr of GV. + } else if (isa<InvokeInst>(U) || isa<CallInst>(U)) { + // Make sure we are calling the function, not passing the address. + ImmutableCallSite CS(cast<Instruction>(U)); + if (!CS.isCallee(UI)) + return true; + } else if (const LoadInst *LI = dyn_cast<LoadInst>(U)) { + if (LI->isVolatile()) + return true; + } else if (isa<BlockAddress>(U)) { + // blockaddress doesn't take the address of the function, it takes addr + // of label. + } else { + return true; + } + } + return false; +} + +bool IPSCCP::runOnModule(Module &M) { + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + SCCPSolver Solver(TD, TLI); + + // AddressTakenFunctions - This set keeps track of the address-taken functions + // that are in the input. As IPSCCP runs through and simplifies code, + // functions that were address taken can end up losing their + // address-taken-ness. Because of this, we keep track of their addresses from + // the first pass so we can use them for the later simplification pass. + SmallPtrSet<Function*, 32> AddressTakenFunctions; + + // Loop over all functions, marking arguments to those with their addresses + // taken or that are external as overdefined. + // + for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { + if (F->isDeclaration()) + continue; + + // If this is a strong or ODR definition of this function, then we can + // propagate information about its result into callsites of it. + if (!F->mayBeOverridden()) + Solver.AddTrackedFunction(F); + + // If this function only has direct calls that we can see, we can track its + // arguments and return value aggressively, and can assume it is not called + // unless we see evidence to the contrary. + if (F->hasLocalLinkage()) { + if (AddressIsTaken(F)) + AddressTakenFunctions.insert(F); + else { + Solver.AddArgumentTrackedFunction(F); + continue; + } + } + + // Assume the function is called. + Solver.MarkBlockExecutable(F->begin()); + + // Assume nothing about the incoming arguments. 
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); + AI != E; ++AI) + Solver.markAnythingOverdefined(AI); + } + + // Loop over global variables. We inform the solver about any internal global + // variables that do not have their 'addresses taken'. If they don't have + // their addresses taken, we can propagate constants through them. + for (Module::global_iterator G = M.global_begin(), E = M.global_end(); + G != E; ++G) + if (!G->isConstant() && G->hasLocalLinkage() && !AddressIsTaken(G)) + Solver.TrackValueOfGlobalVariable(G); + + // Solve for constants. + bool ResolvedUndefs = true; + while (ResolvedUndefs) { + Solver.Solve(); + + DEBUG(dbgs() << "RESOLVING UNDEFS\n"); + ResolvedUndefs = false; + for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) + ResolvedUndefs |= Solver.ResolvedUndefsIn(*F); + } + + bool MadeChanges = false; + + // Iterate over all of the instructions in the module, replacing them with + // constants if we have found them to be of constant values. + // + SmallVector<BasicBlock*, 512> BlocksToErase; + + for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { + if (Solver.isBlockExecutable(F->begin())) { + for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); + AI != E; ++AI) { + if (AI->use_empty() || AI->getType()->isStructTy()) continue; + + // TODO: Could use getStructLatticeValueFor to find out if the entire + // result is a constant and replace it entirely if so. + + LatticeVal IV = Solver.getLatticeValueFor(AI); + if (IV.isOverdefined()) continue; + + Constant *CST = IV.isConstant() ? + IV.getConstant() : UndefValue::get(AI->getType()); + DEBUG(dbgs() << "*** Arg " << *AI << " = " << *CST <<"\n"); + + // Replaces all of the uses of a variable with uses of the + // constant. + AI->replaceAllUsesWith(CST); + ++IPNumArgsElimed; + } + } + + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + if (!Solver.isBlockExecutable(BB)) { + DeleteInstructionInBlock(BB); + MadeChanges = true; + + TerminatorInst *TI = BB->getTerminator(); + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = TI->getSuccessor(i); + if (!Succ->empty() && isa<PHINode>(Succ->begin())) + TI->getSuccessor(i)->removePredecessor(BB); + } + if (!TI->use_empty()) + TI->replaceAllUsesWith(UndefValue::get(TI->getType())); + TI->eraseFromParent(); + + if (&*BB != &F->front()) + BlocksToErase.push_back(BB); + else + new UnreachableInst(M.getContext(), BB); + continue; + } + + for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { + Instruction *Inst = BI++; + if (Inst->getType()->isVoidTy() || Inst->getType()->isStructTy()) + continue; + + // TODO: Could use getStructLatticeValueFor to find out if the entire + // result is a constant and replace it entirely if so. + + LatticeVal IV = Solver.getLatticeValueFor(Inst); + if (IV.isOverdefined()) + continue; + + Constant *Const = IV.isConstant() + ? IV.getConstant() : UndefValue::get(Inst->getType()); + DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst); + + // Replaces all of the uses of a variable with uses of the + // constant. + Inst->replaceAllUsesWith(Const); + + // Delete the instruction. + if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst)) + Inst->eraseFromParent(); + + // Hey, we just changed something! + MadeChanges = true; + ++IPNumInstRemoved; + } + } + + // Now that all instructions in the function are constant folded, erase dead + // blocks, because we can now use ConstantFoldTerminator to get rid of + // in-edges. 
+ for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) { + // If there are any PHI nodes in this successor, drop entries for BB now. + BasicBlock *DeadBB = BlocksToErase[i]; + for (Value::use_iterator UI = DeadBB->use_begin(), UE = DeadBB->use_end(); + UI != UE; ) { + // Grab the user and then increment the iterator early, as the user + // will be deleted. Step past all adjacent uses from the same user. + Instruction *I = dyn_cast<Instruction>(*UI); + do { ++UI; } while (UI != UE && *UI == I); + + // Ignore blockaddress users; BasicBlock's dtor will handle them. + if (!I) continue; + + bool Folded = ConstantFoldTerminator(I->getParent()); + if (!Folded) { + // The constant folder may not have been able to fold the terminator + // if this is a branch or switch on undef. Fold it manually as a + // branch to the first successor. +#ifndef NDEBUG + if (BranchInst *BI = dyn_cast<BranchInst>(I)) { + assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) && + "Branch should be foldable!"); + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { + assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold"); + } else { + llvm_unreachable("Didn't fold away reference to block!"); + } +#endif + + // Make this an uncond branch to the first successor. + TerminatorInst *TI = I->getParent()->getTerminator(); + BranchInst::Create(TI->getSuccessor(0), TI); + + // Remove entries in successor phi nodes to remove edges. + for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i) + TI->getSuccessor(i)->removePredecessor(TI->getParent()); + + // Remove the old terminator. + TI->eraseFromParent(); + } + } + + // Finally, delete the basic block. + F->getBasicBlockList().erase(DeadBB); + } + BlocksToErase.clear(); + } + + // If we inferred constant or undef return values for a function, we replaced + // all call uses with the inferred value. This means we don't need to bother + // actually returning anything from the function. Replace all return + // instructions with return undef. + // + // Do this in two stages: first identify the functions we should process, then + // actually zap their returns. This is important because we can only do this + // if the address of the function isn't taken. In cases where a return is the + // last use of a function, the order of processing functions would affect + // whether other functions are optimizable. + SmallVector<ReturnInst*, 8> ReturnsToZap; + + // TODO: Process multiple value ret instructions also. + const DenseMap<Function*, LatticeVal> &RV = Solver.getTrackedRetVals(); + for (DenseMap<Function*, LatticeVal>::const_iterator I = RV.begin(), + E = RV.end(); I != E; ++I) { + Function *F = I->first; + if (I->second.isOverdefined() || F->getReturnType()->isVoidTy()) + continue; + + // We can only do this if we know that nothing else can call the function. + if (!F->hasLocalLinkage() || AddressTakenFunctions.count(F)) + continue; + + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) + if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) + if (!isa<UndefValue>(RI->getOperand(0))) + ReturnsToZap.push_back(RI); + } + + // Zap all returns which we've identified as zap to change. + for (unsigned i = 0, e = ReturnsToZap.size(); i != e; ++i) { + Function *F = ReturnsToZap[i]->getParent()->getParent(); + ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType())); + } + + // If we inferred constant or undef values for globals variables, we can + // delete the global and any stores that remain to it. 
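+  // For example, an internal global that is only ever stored one constant
+  // value has had that constant propagated into its loads by the solver, so
+  // the remaining stores and the global itself can be erased here.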
+ const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals(); + for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(), + E = TG.end(); I != E; ++I) { + GlobalVariable *GV = I->first; + assert(!I->second.isOverdefined() && + "Overdefined values should have been taken out of the map!"); + DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n"); + while (!GV->use_empty()) { + StoreInst *SI = cast<StoreInst>(GV->use_back()); + SI->eraseFromParent(); + } + M.getGlobalList().erase(GV); + ++IPNumGlobalConst; + } + + return MadeChanges; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp new file mode 100644 index 0000000..48318c8 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -0,0 +1,193 @@ +//===-- Scalar.cpp --------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements common infrastructure for libLLVMScalarOpts.a, which +// implements several scalar transformations over the LLVM intermediate +// representation, including the C bindings for that library. +// +//===----------------------------------------------------------------------===// + +#include "llvm-c/Transforms/Scalar.h" +#include "llvm-c/Initialization.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassManager.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; + +/// initializeScalarOptsPasses - Initialize all passes linked into the +/// ScalarOpts library. 
+void llvm::initializeScalarOpts(PassRegistry &Registry) { + initializeADCEPass(Registry); + initializeBlockPlacementPass(Registry); + initializeCodeGenPreparePass(Registry); + initializeConstantPropagationPass(Registry); + initializeCorrelatedValuePropagationPass(Registry); + initializeDCEPass(Registry); + initializeDeadInstEliminationPass(Registry); + initializeDSEPass(Registry); + initializeGVNPass(Registry); + initializeEarlyCSEPass(Registry); + initializeIndVarSimplifyPass(Registry); + initializeJumpThreadingPass(Registry); + initializeLICMPass(Registry); + initializeLoopDeletionPass(Registry); + initializeLoopInstSimplifyPass(Registry); + initializeLoopRotatePass(Registry); + initializeLoopStrengthReducePass(Registry); + initializeLoopUnrollPass(Registry); + initializeLoopUnswitchPass(Registry); + initializeLoopIdiomRecognizePass(Registry); + initializeLowerAtomicPass(Registry); + initializeLowerExpectIntrinsicPass(Registry); + initializeMemCpyOptPass(Registry); + initializeObjCARCAliasAnalysisPass(Registry); + initializeObjCARCAPElimPass(Registry); + initializeObjCARCExpandPass(Registry); + initializeObjCARCContractPass(Registry); + initializeObjCARCOptPass(Registry); + initializeReassociatePass(Registry); + initializeRegToMemPass(Registry); + initializeSCCPPass(Registry); + initializeIPSCCPPass(Registry); + initializeSROA_DTPass(Registry); + initializeSROA_SSAUpPass(Registry); + initializeCFGSimplifyPassPass(Registry); + initializeSimplifyLibCallsPass(Registry); + initializeSinkingPass(Registry); + initializeTailCallElimPass(Registry); +} + +void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { + initializeScalarOpts(*unwrap(R)); +} + +void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createAggressiveDCEPass()); +} + +void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createCFGSimplificationPass()); +} + +void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createDeadStoreEliminationPass()); +} + +void LLVMAddGVNPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createGVNPass()); +} + +void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createIndVarSimplifyPass()); +} + +void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createInstructionCombiningPass()); +} + +void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createJumpThreadingPass()); +} + +void LLVMAddLICMPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLICMPass()); +} + +void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopDeletionPass()); +} + +void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopIdiomPass()); +} + +void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopRotatePass()); +} + +void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopUnrollPass()); +} + +void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopUnswitchPass()); +} + +void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createMemCpyOptPass()); +} + +void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createPromoteMemoryToRegisterPass()); +} + +void LLVMAddReassociatePass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createReassociatePass()); +} + +void LLVMAddSCCPPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createSCCPPass()); +} + +void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) { + 
unwrap(PM)->add(createScalarReplAggregatesPass()); +} + +void LLVMAddScalarReplAggregatesPassSSA(LLVMPassManagerRef PM) { + unwrap(PM)->add(createScalarReplAggregatesPass(-1, false)); +} + +void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM, + int Threshold) { + unwrap(PM)->add(createScalarReplAggregatesPass(Threshold)); +} + +void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createSimplifyLibCallsPass()); +} + +void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createTailCallEliminationPass()); +} + +void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createConstantPropagationPass()); +} + +void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createDemoteRegisterToMemoryPass()); +} + +void LLVMAddVerifierPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createVerifierPass()); +} + +void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createCorrelatedValuePropagationPass()); +} + +void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createEarlyCSEPass()); +} + +void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createTypeBasedAliasAnalysisPass()); +} + +void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createBasicAliasAnalysisPass()); +} + +void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLowerExpectIntrinsicPass()); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp new file mode 100644 index 0000000..6637126 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -0,0 +1,2789 @@ +//===- ScalarReplAggregates.cpp - Scalar Replacement of Aggregates --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This transformation implements the well known scalar replacement of +// aggregates transformation. This xform breaks up alloca instructions of +// aggregate type (structure or array) into individual alloca instructions for +// each member (if possible). Then, if possible, it transforms the individual +// alloca instructions into nice clean scalar SSA form. +// +// This combines a simple SRoA algorithm with the Mem2Reg algorithm because they +// often interact, especially for C++ programs. As such, iterating between +// SRoA, then Mem2Reg until we run out of things to promote works well. 
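+//
+// For example, an alloca of a two-field struct whose fields are only ever
+// loaded and stored individually is split into two scalar allocas, which
+// mem2reg can then promote to SSA values.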
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "scalarrepl" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Constants.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Operator.h" +#include "llvm/Pass.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +using namespace llvm; + +STATISTIC(NumReplaced, "Number of allocas broken up"); +STATISTIC(NumPromoted, "Number of allocas promoted"); +STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion"); +STATISTIC(NumConverted, "Number of aggregates converted to scalar"); +STATISTIC(NumGlobals, "Number of allocas copied from constant global"); + +namespace { + struct SROA : public FunctionPass { + SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT) + : FunctionPass(ID), HasDomTree(hasDT) { + if (T == -1) + SRThreshold = 128; + else + SRThreshold = T; + if (ST == -1) + StructMemberThreshold = 32; + else + StructMemberThreshold = ST; + if (AT == -1) + ArrayElementThreshold = 8; + else + ArrayElementThreshold = AT; + if (SLT == -1) + // Do not limit the scalar integer load size if no threshold is given. + ScalarLoadThreshold = -1; + else + ScalarLoadThreshold = SLT; + } + + bool runOnFunction(Function &F); + + bool performScalarRepl(Function &F); + bool performPromotion(Function &F); + + private: + bool HasDomTree; + TargetData *TD; + + /// DeadInsts - Keep track of instructions we have made dead, so that + /// we can remove them after we are done working. + SmallVector<Value*, 32> DeadInsts; + + /// AllocaInfo - When analyzing uses of an alloca instruction, this captures + /// information about the uses. All these fields are initialized to false + /// and set to true when something is learned. + struct AllocaInfo { + /// The alloca to promote. + AllocaInst *AI; + + /// CheckedPHIs - This is a set of verified PHI nodes, to prevent infinite + /// looping and avoid redundant work. + SmallPtrSet<PHINode*, 8> CheckedPHIs; + + /// isUnsafe - This is set to true if the alloca cannot be SROA'd. + bool isUnsafe : 1; + + /// isMemCpySrc - This is true if this aggregate is memcpy'd from. + bool isMemCpySrc : 1; + + /// isMemCpyDst - This is true if this aggregate is memcpy'd into. + bool isMemCpyDst : 1; + + /// hasSubelementAccess - This is true if a subelement of the alloca is + /// ever accessed, or false if the alloca is only accessed with mem + /// intrinsics or load/store that only access the entire alloca at once. + bool hasSubelementAccess : 1; + + /// hasALoadOrStore - This is true if there are any loads or stores to it. + /// The alloca may just be accessed with memcpy, for example, which would + /// not set this. 
+ bool hasALoadOrStore : 1; + + explicit AllocaInfo(AllocaInst *ai) + : AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false), + hasSubelementAccess(false), hasALoadOrStore(false) {} + }; + + /// SRThreshold - The maximum alloca size to considered for SROA. + unsigned SRThreshold; + + /// StructMemberThreshold - The maximum number of members a struct can + /// contain to be considered for SROA. + unsigned StructMemberThreshold; + + /// ArrayElementThreshold - The maximum number of elements an array can + /// have to be considered for SROA. + unsigned ArrayElementThreshold; + + /// ScalarLoadThreshold - The maximum size in bits of scalars to load when + /// converting to scalar + unsigned ScalarLoadThreshold; + + void MarkUnsafe(AllocaInfo &I, Instruction *User) { + I.isUnsafe = true; + DEBUG(dbgs() << " Transformation preventing inst: " << *User << '\n'); + } + + bool isSafeAllocaToScalarRepl(AllocaInst *AI); + + void isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info); + void isSafePHISelectUseForScalarRepl(Instruction *User, uint64_t Offset, + AllocaInfo &Info); + void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info); + void isSafeMemAccess(uint64_t Offset, uint64_t MemSize, + Type *MemOpType, bool isStore, AllocaInfo &Info, + Instruction *TheAccess, bool AllowWholeAccess); + bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size); + uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset, + Type *&IdxTy); + + void DoScalarReplacement(AllocaInst *AI, + std::vector<AllocaInst*> &WorkList); + void DeleteDeadInstructions(); + + void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts); + void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts); + void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts); + void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, + uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts); + void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, + AllocaInst *AI, + SmallVector<AllocaInst*, 32> &NewElts); + void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, + SmallVector<AllocaInst*, 32> &NewElts); + void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, + SmallVector<AllocaInst*, 32> &NewElts); + bool ShouldAttemptScalarRepl(AllocaInst *AI); + + static MemTransferInst *isOnlyCopiedFromConstantGlobal( + AllocaInst *AI, SmallVector<Instruction*, 4> &ToDelete); + }; + + // SROA_DT - SROA that uses DominatorTree. + struct SROA_DT : public SROA { + static char ID; + public: + SROA_DT(int T = -1, int ST = -1, int AT = -1, int SLT = -1) : + SROA(T, true, ID, ST, AT, SLT) { + initializeSROA_DTPass(*PassRegistry::getPassRegistry()); + } + + // getAnalysisUsage - This pass does not require any passes, but we know it + // will not alter the CFG, so say so. + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTree>(); + AU.setPreservesCFG(); + } + }; + + // SROA_SSAUp - SROA that uses SSAUpdater. + struct SROA_SSAUp : public SROA { + static char ID; + public: + SROA_SSAUp(int T = -1, int ST = -1, int AT = -1, int SLT = -1) : + SROA(T, false, ID, ST, AT, SLT) { + initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry()); + } + + // getAnalysisUsage - This pass does not require any passes, but we know it + // will not alter the CFG, so say so. 
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + } + }; + +} + +char SROA_DT::ID = 0; +char SROA_SSAUp::ID = 0; + +INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl", + "Scalar Replacement of Aggregates (DT)", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_END(SROA_DT, "scalarrepl", + "Scalar Replacement of Aggregates (DT)", false, false) + +INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa", + "Scalar Replacement of Aggregates (SSAUp)", false, false) +INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa", + "Scalar Replacement of Aggregates (SSAUp)", false, false) + +// Public interface to the ScalarReplAggregates pass +FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold, + bool UseDomTree, + int StructMemberThreshold, + int ArrayElementThreshold, + int ScalarLoadThreshold) { + if (UseDomTree) + return new SROA_DT(Threshold, StructMemberThreshold, ArrayElementThreshold, + ScalarLoadThreshold); + return new SROA_SSAUp(Threshold, StructMemberThreshold, + ArrayElementThreshold, ScalarLoadThreshold); +} + + +//===----------------------------------------------------------------------===// +// Convert To Scalar Optimization. +//===----------------------------------------------------------------------===// + +namespace { +/// ConvertToScalarInfo - This class implements the "Convert To Scalar" +/// optimization, which scans the uses of an alloca and determines if it can +/// rewrite it in terms of a single new alloca that can be mem2reg'd. +class ConvertToScalarInfo { + /// AllocaSize - The size of the alloca being considered in bytes. + unsigned AllocaSize; + const TargetData &TD; + unsigned ScalarLoadThreshold; + + /// IsNotTrivial - This is set to true if there is some access to the object + /// which means that mem2reg can't promote it. + bool IsNotTrivial; + + /// ScalarKind - Tracks the kind of alloca being considered for promotion, + /// computed based on the uses of the alloca rather than the LLVM type system. + enum { + Unknown, + + // Accesses via GEPs that are consistent with element access of a vector + // type. This will not be converted into a vector unless there is a later + // access using an actual vector type. + ImplicitVector, + + // Accesses via vector operations and GEPs that are consistent with the + // layout of a vector type. + Vector, + + // An integer bag-of-bits with bitwise operations for insertion and + // extraction. Any combination of types can be converted into this kind + // of scalar. + Integer + } ScalarKind; + + /// VectorTy - This tracks the type that we should promote the vector to if + /// it is possible to turn it into a vector. This starts out null, and if it + /// isn't possible to turn into a vector type, it gets set to VoidTy. + VectorType *VectorTy; + + /// HadNonMemTransferAccess - True if there is at least one access to the + /// alloca that is not a MemTransferInst. We don't want to turn structs into + /// large integers unless there is some potential for optimization. + bool HadNonMemTransferAccess; + + /// HadDynamicAccess - True if some element of this alloca was dynamic. + /// We don't yet have support for turning a dynamic access into a large + /// integer. 
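+ /// (For example, a GEP such as "getelementptr <4 x float>* %p, i32 0, i32 %i"
+ /// with a non-constant %i is a dynamic access.)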
+ bool HadDynamicAccess; + +public: + explicit ConvertToScalarInfo(unsigned Size, const TargetData &td, + unsigned SLT) + : AllocaSize(Size), TD(td), ScalarLoadThreshold(SLT), IsNotTrivial(false), + ScalarKind(Unknown), VectorTy(0), HadNonMemTransferAccess(false), + HadDynamicAccess(false) { } + + AllocaInst *TryConvert(AllocaInst *AI); + +private: + bool CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx); + void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset); + bool MergeInVectorType(VectorType *VInTy, uint64_t Offset); + void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset, + Value *NonConstantIdx); + + Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType, + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder); + Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal, + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder); +}; +} // end anonymous namespace. + + +/// TryConvert - Analyze the specified alloca, and if it is safe to do so, +/// rewrite it to be a new alloca which is mem2reg'able. This returns the new +/// alloca if possible or null if not. +AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { + // If we can't convert this scalar, or if mem2reg can trivially do it, bail + // out. + if (!CanConvertToScalar(AI, 0, 0) || !IsNotTrivial) + return 0; + + // If an alloca has only memset / memcpy uses, it may still have an Unknown + // ScalarKind. Treat it as an Integer below. + if (ScalarKind == Unknown) + ScalarKind = Integer; + + if (ScalarKind == Vector && VectorTy->getBitWidth() != AllocaSize * 8) + ScalarKind = Integer; + + // If we were able to find a vector type that can handle this with + // insert/extract elements, and if there was at least one use that had + // a vector type, promote this to a vector. We don't want to promote + // random stuff that doesn't use vectors (e.g. <9 x double>) because then + // we just get a lot of insert/extracts. If at least one vector is + // involved, then we probably really do have a union of vector/array. + Type *NewTy; + if (ScalarKind == Vector) { + assert(VectorTy && "Missing type for vector scalar."); + DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = " + << *VectorTy << '\n'); + NewTy = VectorTy; // Use the vector type. + } else { + unsigned BitWidth = AllocaSize * 8; + + // Do not convert to scalar integer if the alloca size exceeds the + // scalar load threshold. + if (BitWidth > ScalarLoadThreshold) + return 0; + + if ((ScalarKind == ImplicitVector || ScalarKind == Integer) && + !HadNonMemTransferAccess && !TD.fitsInLegalInteger(BitWidth)) + return 0; + // Dynamic accesses on integers aren't yet supported. They need us to shift + // by a dynamic amount which could be difficult to work out as we might not + // know whether to use a left or right shift. + if (ScalarKind == Integer && HadDynamicAccess) + return 0; + + DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n"); + // Create and insert the integer alloca. + NewTy = IntegerType::get(AI->getContext(), BitWidth); + } + AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin()); + ConvertUsesToScalar(AI, NewAI, 0, 0); + return NewAI; +} + +/// MergeInTypeForLoadOrStore - Add the 'In' type to the accumulated vector type +/// (VectorTy) so far at the offset specified by Offset (which is specified in +/// bytes). +/// +/// There are two cases we handle here: +/// 1) A union of vector types of the same size and potentially its elements. 
+/// Here we turn element accesses into insert/extract element operations.
+/// This promotes a <4 x float> with a store of float to the third element
+/// into a <4 x float> that uses insert element.
+/// 2) A fully general blob of memory, which we turn into some (potentially
+/// large) integer type with extract and insert operations where the loads
+/// and stores would mutate the memory. We mark this by setting ScalarKind
+/// to Integer.
+void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In,
+ uint64_t Offset) {
+ // If we already decided to turn this into a blob of integer memory, there is
+ // nothing to be done.
+ if (ScalarKind == Integer)
+ return;
+
+ // If this could be contributing to a vector, analyze it.
+
+ // If the In type is a vector that is the same size as the alloca, see if it
+ // matches the existing VectorTy.
+ if (VectorType *VInTy = dyn_cast<VectorType>(In)) {
+ if (MergeInVectorType(VInTy, Offset))
+ return;
+ } else if (In->isFloatTy() || In->isDoubleTy() ||
+ (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 &&
+ isPowerOf2_32(In->getPrimitiveSizeInBits()))) {
+ // Full width accesses can be ignored, because they can always be turned
+ // into bitcasts.
+ unsigned EltSize = In->getPrimitiveSizeInBits()/8;
+ if (EltSize == AllocaSize)
+ return;
+
+ // If we're accessing something that could be an element of a vector, see
+ // if the implied vector agrees with what we already have and if Offset is
+ // compatible with it.
+ if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 &&
+ (!VectorTy || EltSize == VectorTy->getElementType()
+ ->getPrimitiveSizeInBits()/8)) {
+ if (!VectorTy) {
+ ScalarKind = ImplicitVector;
+ VectorTy = VectorType::get(In, AllocaSize/EltSize);
+ }
+ return;
+ }
+ }
+
+ // Otherwise, we have a case that we can't handle with an optimized vector
+ // form. We can still turn this into a large integer.
+ ScalarKind = Integer;
+}
+
+/// MergeInVectorType - Handles the vector case of MergeInTypeForLoadOrStore,
+/// returning true if the type was successfully merged and false otherwise.
+bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy,
+ uint64_t Offset) {
+ if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) {
+ // If we're storing/loading a vector of the right size, allow it as a
+ // vector. If this is the first vector we see, remember the type so that
+ // we know the element size. If this is a subsequent access, ignore it
+ // even if it is a differing type but the same size. Worst case we can
+ // bitcast the resultant vectors.
+ if (!VectorTy)
+ VectorTy = VInTy;
+ ScalarKind = Vector;
+ return true;
+ }
+
+ return false;
+}
+
+/// CanConvertToScalar - V is a pointer. If we can convert the pointee and all
+/// its accesses to a single vector type, return true and set VectorTy to
+/// the new type. If we could convert the alloca into a single promotable
+/// integer, return true and set ScalarKind to Integer. Further, if the use is
+/// not a completely trivial use that mem2reg could promote, set IsNotTrivial.
+/// Offset is the current offset from the base of the alloca being analyzed.
+///
+/// If we see at least one access to the value with a vector type covering the
+/// whole alloca, ScalarKind is set to Vector.
+bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
+ Value* NonConstantIdx) {
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+ // Don't break volatile loads.
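+ // (isSimple() is false for volatile and atomic loads, neither of which we
+ // handle here.)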
+ if (!LI->isSimple()) + return false; + // Don't touch MMX operations. + if (LI->getType()->isX86_MMXTy()) + return false; + HadNonMemTransferAccess = true; + MergeInTypeForLoadOrStore(LI->getType(), Offset); + continue; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + // Storing the pointer, not into the value? + if (SI->getOperand(0) == V || !SI->isSimple()) return false; + // Don't touch MMX operations. + if (SI->getOperand(0)->getType()->isX86_MMXTy()) + return false; + HadNonMemTransferAccess = true; + MergeInTypeForLoadOrStore(SI->getOperand(0)->getType(), Offset); + continue; + } + + if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { + if (!onlyUsedByLifetimeMarkers(BCI)) + IsNotTrivial = true; // Can't be mem2reg'd. + if (!CanConvertToScalar(BCI, Offset, NonConstantIdx)) + return false; + continue; + } + + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { + // If this is a GEP with a variable indices, we can't handle it. + PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType()); + if (!PtrTy) + return false; + + // Compute the offset that this GEP adds to the pointer. + SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); + Value *GEPNonConstantIdx = 0; + if (!GEP->hasAllConstantIndices()) { + if (!isa<VectorType>(PtrTy->getElementType())) + return false; + if (NonConstantIdx) + return false; + GEPNonConstantIdx = Indices.pop_back_val(); + if (!GEPNonConstantIdx->getType()->isIntegerTy(32)) + return false; + HadDynamicAccess = true; + } else + GEPNonConstantIdx = NonConstantIdx; + uint64_t GEPOffset = TD.getIndexedOffset(PtrTy, + Indices); + // See if all uses can be converted. + if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx)) + return false; + IsNotTrivial = true; // Can't be mem2reg'd. + HadNonMemTransferAccess = true; + continue; + } + + // If this is a constant sized memset of a constant value (e.g. 0) we can + // handle it. + if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { + // Store to dynamic index. + if (NonConstantIdx) + return false; + // Store of constant value. + if (!isa<ConstantInt>(MSI->getValue())) + return false; + + // Store of constant size. + ConstantInt *Len = dyn_cast<ConstantInt>(MSI->getLength()); + if (!Len) + return false; + + // If the size differs from the alloca, we can only convert the alloca to + // an integer bag-of-bits. + // FIXME: This should handle all of the cases that are currently accepted + // as vector element insertions. + if (Len->getZExtValue() != AllocaSize || Offset != 0) + ScalarKind = Integer; + + IsNotTrivial = true; // Can't be mem2reg'd. + HadNonMemTransferAccess = true; + continue; + } + + // If this is a memcpy or memmove into or out of the whole allocation, we + // can handle it like a load or store of the scalar type. + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { + // Store to dynamic index. + if (NonConstantIdx) + return false; + ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength()); + if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0) + return false; + + IsNotTrivial = true; // Can't be mem2reg'd. + continue; + } + + // If this is a lifetime intrinsic, we can handle it. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + continue; + } + } + + // Otherwise, we cannot handle this! 
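+ // (For example, the pointer escaping into a call or being compared.)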
+ return false; + } + + return true; +} + +/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca +/// directly. This happens when we are converting an "integer union" to a +/// single integer scalar, or when we are converting a "vector union" to a +/// vector with insert/extractelement instructions. +/// +/// Offset is an offset from the original alloca, in bits that need to be +/// shifted to the right. By the end of this, there should be no uses of Ptr. +void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, + uint64_t Offset, + Value* NonConstantIdx) { + while (!Ptr->use_empty()) { + Instruction *User = cast<Instruction>(Ptr->use_back()); + + if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) { + ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx); + CI->eraseFromParent(); + continue; + } + + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { + // Compute the offset that this GEP adds to the pointer. + SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); + Value* GEPNonConstantIdx = 0; + if (!GEP->hasAllConstantIndices()) { + assert(!NonConstantIdx && + "Dynamic GEP reading from dynamic GEP unsupported"); + GEPNonConstantIdx = Indices.pop_back_val(); + } else + GEPNonConstantIdx = NonConstantIdx; + uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), + Indices); + ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, GEPNonConstantIdx); + GEP->eraseFromParent(); + continue; + } + + IRBuilder<> Builder(User); + + if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + // The load is a bit extract from NewAI shifted right by Offset bits. + Value *LoadedVal = Builder.CreateLoad(NewAI); + Value *NewLoadVal + = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, + NonConstantIdx, Builder); + LI->replaceAllUsesWith(NewLoadVal); + LI->eraseFromParent(); + continue; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + assert(SI->getOperand(0) != Ptr && "Consistency error!"); + Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); + Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset, + NonConstantIdx, Builder); + Builder.CreateStore(New, NewAI); + SI->eraseFromParent(); + + // If the load we just inserted is now dead, then the inserted store + // overwrote the entire thing. + if (Old->use_empty()) + Old->eraseFromParent(); + continue; + } + + // If this is a constant sized memset of a constant value (e.g. 0) we can + // transform it into a store of the expanded constant value. + if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { + assert(MSI->getRawDest() == Ptr && "Consistency error!"); + assert(!NonConstantIdx && "Cannot replace dynamic memset with insert"); + int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue(); + if (SNumBytes > 0 && (SNumBytes >> 32) == 0) { + unsigned NumBytes = static_cast<unsigned>(SNumBytes); + unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue(); + + // Compute the value replicated the right number of times. + APInt APVal(NumBytes*8, Val); + + // Splat the value if non-zero. + if (Val) + for (unsigned i = 1; i != NumBytes; ++i) + APVal |= APVal << 8; + + Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); + Value *New = ConvertScalar_InsertValue( + ConstantInt::get(User->getContext(), APVal), + Old, Offset, 0, Builder); + Builder.CreateStore(New, NewAI); + + // If the load we just inserted is now dead, then the memset overwrote + // the entire thing. 
+ if (Old->use_empty()) + Old->eraseFromParent(); + } + MSI->eraseFromParent(); + continue; + } + + // If this is a memcpy or memmove into or out of the whole allocation, we + // can handle it like a load or store of the scalar type. + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { + assert(Offset == 0 && "must be store to start of alloca"); + assert(!NonConstantIdx && "Cannot replace dynamic transfer with insert"); + + // If the source and destination are both to the same alloca, then this is + // a noop copy-to-self, just delete it. Otherwise, emit a load and store + // as appropriate. + AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, &TD, 0)); + + if (GetUnderlyingObject(MTI->getSource(), &TD, 0) != OrigAI) { + // Dest must be OrigAI, change this to be a load from the original + // pointer (bitcasted), then a store to our new alloca. + assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?"); + Value *SrcPtr = MTI->getSource(); + PointerType* SPTy = cast<PointerType>(SrcPtr->getType()); + PointerType* AIPTy = cast<PointerType>(NewAI->getType()); + if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) { + AIPTy = PointerType::get(AIPTy->getElementType(), + SPTy->getAddressSpace()); + } + SrcPtr = Builder.CreateBitCast(SrcPtr, AIPTy); + + LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval"); + SrcVal->setAlignment(MTI->getAlignment()); + Builder.CreateStore(SrcVal, NewAI); + } else if (GetUnderlyingObject(MTI->getDest(), &TD, 0) != OrigAI) { + // Src must be OrigAI, change this to be a load from NewAI then a store + // through the original dest pointer (bitcasted). + assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?"); + LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval"); + + PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType()); + PointerType* AIPTy = cast<PointerType>(NewAI->getType()); + if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) { + AIPTy = PointerType::get(AIPTy->getElementType(), + DPTy->getAddressSpace()); + } + Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), AIPTy); + + StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr); + NewStore->setAlignment(MTI->getAlignment()); + } else { + // Noop transfer. Src == Dst + } + + MTI->eraseFromParent(); + continue; + } + + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + // There's no need to preserve these, as the resulting alloca will be + // converted to a register anyways. + II->eraseFromParent(); + continue; + } + } + + llvm_unreachable("Unsupported operation!"); + } +} + +/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer +/// or vector value FromVal, extracting the bits from the offset specified by +/// Offset. This returns the value, which is of type ToType. +/// +/// This happens when we are converting an "integer union" to a single +/// integer scalar, or when we are converting a "vector union" to a vector with +/// insert/extractelement instructions. +/// +/// Offset is an offset from the original alloca, in bits that need to be +/// shifted to the right. +Value *ConvertToScalarInfo:: +ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder) { + // If the load is of the whole new alloca, no conversion is needed. 
+ Type *FromType = FromVal->getType(); + if (FromType == ToType && Offset == 0) + return FromVal; + + // If the result alloca is a vector type, this is either an element + // access or a bitcast to another vector type of the same size. + if (VectorType *VTy = dyn_cast<VectorType>(FromType)) { + unsigned FromTypeSize = TD.getTypeAllocSize(FromType); + unsigned ToTypeSize = TD.getTypeAllocSize(ToType); + if (FromTypeSize == ToTypeSize) + return Builder.CreateBitCast(FromVal, ToType); + + // Otherwise it must be an element access. + unsigned Elt = 0; + if (Offset) { + unsigned EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType()); + Elt = Offset/EltSize; + assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); + } + // Return the element extracted out of it. + Value *Idx; + if (NonConstantIdx) { + if (Elt) + Idx = Builder.CreateAdd(NonConstantIdx, + Builder.getInt32(Elt), + "dyn.offset"); + else + Idx = NonConstantIdx; + } else + Idx = Builder.getInt32(Elt); + Value *V = Builder.CreateExtractElement(FromVal, Idx); + if (V->getType() != ToType) + V = Builder.CreateBitCast(V, ToType); + return V; + } + + // If ToType is a first class aggregate, extract out each of the pieces and + // use insertvalue's to form the FCA. + if (StructType *ST = dyn_cast<StructType>(ToType)) { + assert(!NonConstantIdx && + "Dynamic indexing into struct types not supported"); + const StructLayout &Layout = *TD.getStructLayout(ST); + Value *Res = UndefValue::get(ST); + for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { + Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), + Offset+Layout.getElementOffsetInBits(i), + 0, Builder); + Res = Builder.CreateInsertValue(Res, Elt, i); + } + return Res; + } + + if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) { + assert(!NonConstantIdx && + "Dynamic indexing into array types not supported"); + uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); + Value *Res = UndefValue::get(AT); + for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { + Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), + Offset+i*EltSize, 0, Builder); + Res = Builder.CreateInsertValue(Res, Elt, i); + } + return Res; + } + + // Otherwise, this must be a union that was converted to an integer value. + IntegerType *NTy = cast<IntegerType>(FromVal->getType()); + + // If this is a big-endian system and the load is narrower than the + // full alloca type, we need to do a shift to get the right bits. + int ShAmt = 0; + if (TD.isBigEndian()) { + // On big-endian machines, the lowest bit is stored at the bit offset + // from the pointer given by getTypeStoreSizeInBits. This matters for + // integers with a bitwidth that is not a multiple of 8. + ShAmt = TD.getTypeStoreSizeInBits(NTy) - + TD.getTypeStoreSizeInBits(ToType) - Offset; + } else { + ShAmt = Offset; + } + + // Note: we support negative bitwidths (with shl) which are not defined. + // We do this to support (f.e.) loads off the end of a structure where + // only some bits are used. + if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth()) + FromVal = Builder.CreateLShr(FromVal, + ConstantInt::get(FromVal->getType(), ShAmt)); + else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth()) + FromVal = Builder.CreateShl(FromVal, + ConstantInt::get(FromVal->getType(), -ShAmt)); + + // Finally, unconditionally truncate the integer to the right width. 
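+ // (Or zero-extend it, in the unusual case that ToType is wider than the
+ // alloca's integer type, e.g. a load off the end of the alloca.)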
+ unsigned LIBitWidth = TD.getTypeSizeInBits(ToType); + if (LIBitWidth < NTy->getBitWidth()) + FromVal = + Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), + LIBitWidth)); + else if (LIBitWidth > NTy->getBitWidth()) + FromVal = + Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(), + LIBitWidth)); + + // If the result is an integer, this is a trunc or bitcast. + if (ToType->isIntegerTy()) { + // Should be done. + } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) { + // Just do a bitcast, we know the sizes match up. + FromVal = Builder.CreateBitCast(FromVal, ToType); + } else { + // Otherwise must be a pointer. + FromVal = Builder.CreateIntToPtr(FromVal, ToType); + } + assert(FromVal->getType() == ToType && "Didn't convert right?"); + return FromVal; +} + +/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer +/// or vector value "Old" at the offset specified by Offset. +/// +/// This happens when we are converting an "integer union" to a +/// single integer scalar, or when we are converting a "vector union" to a +/// vector with insert/extractelement instructions. +/// +/// Offset is an offset from the original alloca, in bits that need to be +/// shifted to the right. +/// +/// NonConstantIdx is an index value if there was a GEP with a non-constant +/// index value. If this is 0 then all GEPs used to find this insert address +/// are constant. +Value *ConvertToScalarInfo:: +ConvertScalar_InsertValue(Value *SV, Value *Old, + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder) { + // Convert the stored type to the actual type, shift it left to insert + // then 'or' into place. + Type *AllocaType = Old->getType(); + LLVMContext &Context = Old->getContext(); + + if (VectorType *VTy = dyn_cast<VectorType>(AllocaType)) { + uint64_t VecSize = TD.getTypeAllocSizeInBits(VTy); + uint64_t ValSize = TD.getTypeAllocSizeInBits(SV->getType()); + + // Changing the whole vector with memset or with an access of a different + // vector type? + if (ValSize == VecSize) + return Builder.CreateBitCast(SV, AllocaType); + + // Must be an element insertion. + Type *EltTy = VTy->getElementType(); + if (SV->getType() != EltTy) + SV = Builder.CreateBitCast(SV, EltTy); + uint64_t EltSize = TD.getTypeAllocSizeInBits(EltTy); + unsigned Elt = Offset/EltSize; + Value *Idx; + if (NonConstantIdx) { + if (Elt) + Idx = Builder.CreateAdd(NonConstantIdx, + Builder.getInt32(Elt), + "dyn.offset"); + else + Idx = NonConstantIdx; + } else + Idx = Builder.getInt32(Elt); + return Builder.CreateInsertElement(Old, SV, Idx); + } + + // If SV is a first-class aggregate value, insert each value recursively. 
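+ // (For example, storing a { i32, float } struct becomes two recursive
+ // insertions at the corresponding element bit offsets.)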
+ if (StructType *ST = dyn_cast<StructType>(SV->getType())) { + assert(!NonConstantIdx && + "Dynamic indexing into struct types not supported"); + const StructLayout &Layout = *TD.getStructLayout(ST); + for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { + Value *Elt = Builder.CreateExtractValue(SV, i); + Old = ConvertScalar_InsertValue(Elt, Old, + Offset+Layout.getElementOffsetInBits(i), + 0, Builder); + } + return Old; + } + + if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { + assert(!NonConstantIdx && + "Dynamic indexing into array types not supported"); + uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); + for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { + Value *Elt = Builder.CreateExtractValue(SV, i); + Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, 0, Builder); + } + return Old; + } + + // If SV is a float, convert it to the appropriate integer type. + // If it is a pointer, do the same. + unsigned SrcWidth = TD.getTypeSizeInBits(SV->getType()); + unsigned DestWidth = TD.getTypeSizeInBits(AllocaType); + unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType()); + unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType); + if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) + SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth)); + else if (SV->getType()->isPointerTy()) + SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext())); + + // Zero extend or truncate the value if needed. + if (SV->getType() != AllocaType) { + if (SV->getType()->getPrimitiveSizeInBits() < + AllocaType->getPrimitiveSizeInBits()) + SV = Builder.CreateZExt(SV, AllocaType); + else { + // Truncation may be needed if storing more than the alloca can hold + // (undefined behavior). + SV = Builder.CreateTrunc(SV, AllocaType); + SrcWidth = DestWidth; + SrcStoreWidth = DestStoreWidth; + } + } + + // If this is a big-endian system and the store is narrower than the + // full alloca type, we need to do a shift to get the right bits. + int ShAmt = 0; + if (TD.isBigEndian()) { + // On big-endian machines, the lowest bit is stored at the bit offset + // from the pointer given by getTypeStoreSizeInBits. This matters for + // integers with a bitwidth that is not a multiple of 8. + ShAmt = DestStoreWidth - SrcStoreWidth - Offset; + } else { + ShAmt = Offset; + } + + // Note: we support negative bitwidths (with shr) which are not defined. + // We do this to support (f.e.) stores off the end of a structure where + // only some bits in the structure are set. + APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth)); + if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) { + SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt)); + Mask <<= ShAmt; + } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) { + SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt)); + Mask = Mask.lshr(-ShAmt); + } + + // Mask out the bits we are about to insert from the old value, and or + // in the new bits. 
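+ // (For example, inserting an i8 at bit offset 8 of an i32 alloca on a
+ // little-endian target computes (Old & ~0x0000FF00) | (SV << 8).)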
+ if (SrcWidth != DestWidth) { + assert(DestWidth > SrcWidth); + Old = Builder.CreateAnd(Old, ConstantInt::get(Context, ~Mask), "mask"); + SV = Builder.CreateOr(Old, SV, "ins"); + } + return SV; +} + + +//===----------------------------------------------------------------------===// +// SRoA Driver +//===----------------------------------------------------------------------===// + + +bool SROA::runOnFunction(Function &F) { + TD = getAnalysisIfAvailable<TargetData>(); + + bool Changed = performPromotion(F); + + // FIXME: ScalarRepl currently depends on TargetData more than it + // theoretically needs to. It should be refactored in order to support + // target-independent IR. Until this is done, just skip the actual + // scalar-replacement portion of this pass. + if (!TD) return Changed; + + while (1) { + bool LocalChange = performScalarRepl(F); + if (!LocalChange) break; // No need to repromote if no scalarrepl + Changed = true; + LocalChange = performPromotion(F); + if (!LocalChange) break; // No need to re-scalarrepl if no promotion + } + + return Changed; +} + +namespace { +class AllocaPromoter : public LoadAndStorePromoter { + AllocaInst *AI; + DIBuilder *DIB; + SmallVector<DbgDeclareInst *, 4> DDIs; + SmallVector<DbgValueInst *, 4> DVIs; +public: + AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, + DIBuilder *DB) + : LoadAndStorePromoter(Insts, S), AI(0), DIB(DB) {} + + void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) { + // Remember which alloca we're promoting (for isInstInList). + this->AI = AI; + if (MDNode *DebugNode = MDNode::getIfExists(AI->getContext(), AI)) { + for (Value::use_iterator UI = DebugNode->use_begin(), + E = DebugNode->use_end(); UI != E; ++UI) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) + DDIs.push_back(DDI); + else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) + DVIs.push_back(DVI); + } + + LoadAndStorePromoter::run(Insts); + AI->eraseFromParent(); + for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(), + E = DDIs.end(); I != E; ++I) { + DbgDeclareInst *DDI = *I; + DDI->eraseFromParent(); + } + for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(), + E = DVIs.end(); I != E; ++I) { + DbgValueInst *DVI = *I; + DVI->eraseFromParent(); + } + } + + virtual bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &Insts) const { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->getOperand(0) == AI; + return cast<StoreInst>(I)->getPointerOperand() == AI; + } + + virtual void updateDebugInfo(Instruction *Inst) const { + for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), + E = DDIs.end(); I != E; ++I) { + DbgDeclareInst *DDI = *I; + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + ConvertDebugDeclareToDebugValue(DDI, SI, *DIB); + else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + ConvertDebugDeclareToDebugValue(DDI, LI, *DIB); + } + for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), + E = DVIs.end(); I != E; ++I) { + DbgValueInst *DVI = *I; + Value *Arg = NULL; + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + // If an argument is zero extended then use argument directly. The ZExt + // may be zapped by an optimization pass in future. 
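+ // (The same applies to the sign-extended case handled just below.)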
+ if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
+ Arg = dyn_cast<Argument>(ZExt->getOperand(0));
+ if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
+ Arg = dyn_cast<Argument>(SExt->getOperand(0));
+ if (!Arg)
+ Arg = SI->getOperand(0);
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ Arg = LI->getOperand(0);
+ } else {
+ continue;
+ }
+ Instruction *DbgVal =
+ DIB->insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
+ Inst);
+ DbgVal->setDebugLoc(DVI->getDebugLoc());
+ }
+ }
+};
+} // end anon namespace
+
+/// isSafeSelectToSpeculate - Select instructions that use an alloca and are
+/// subsequently loaded can be rewritten to load both input pointers and then
+/// select between the results, allowing the load of the alloca to be promoted.
+/// From this:
+/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+/// %V = load i32* %P2
+/// to:
+/// %V1 = load i32* %Alloca -> will be mem2reg'd
+/// %V2 = load i32* %Other
+/// %V = select i1 %cond, i32 %V1, i32 %V2
+///
+/// We can do this to a select if its only uses are loads and if the operand to
+/// the select can be loaded unconditionally.
+static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) {
+ bool TDerefable = SI->getTrueValue()->isDereferenceablePointer();
+ bool FDerefable = SI->getFalseValue()->isDereferenceablePointer();
+
+ for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end();
+ UI != UE; ++UI) {
+ LoadInst *LI = dyn_cast<LoadInst>(*UI);
+ if (LI == 0 || !LI->isSimple()) return false;
+
+ // Both operands to the select need to be dereferenceable, either absolutely
+ // (e.g. allocas) or at this point because we can see other accesses to it.
+ if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI,
+ LI->getAlignment(), TD))
+ return false;
+ if (!FDerefable && !isSafeToLoadUnconditionally(SI->getFalseValue(), LI,
+ LI->getAlignment(), TD))
+ return false;
+ }
+
+ return true;
+}
+
+/// isSafePHIToSpeculate - PHI instructions that use an alloca and are
+/// subsequently loaded can be rewritten to load both input pointers in the pred
+/// blocks and then PHI the results, allowing the load of the alloca to be
+/// promoted.
+/// From this:
+/// %P2 = phi [i32* %Alloca, i32* %Other]
+/// %V = load i32* %P2
+/// to:
+/// %V1 = load i32* %Alloca -> will be mem2reg'd
+/// ...
+/// %V2 = load i32* %Other
+/// ...
+/// %V = phi [i32 %V1, i32 %V2]
+///
+/// We can do this to a PHI if its only uses are loads and if the operands to
+/// the PHI can be loaded unconditionally.
+static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) {
+ // For now, we can only do this promotion if the load is in the same block as
+ // the PHI, and if there are no stores between the phi and load.
+ // TODO: Allow recursive phi users.
+ // TODO: Allow stores.
+ BasicBlock *BB = PN->getParent();
+ unsigned MaxAlign = 0;
+ for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end();
+ UI != UE; ++UI) {
+ LoadInst *LI = dyn_cast<LoadInst>(*UI);
+ if (LI == 0 || !LI->isSimple()) return false;
+
+ // For now we only allow loads in the same block as the PHI. This is a
+ // common case that happens when instcombine merges two loads through a PHI.
+ if (LI->getParent() != BB) return false;
+
+ // Ensure that there are no instructions between the PHI and the load that
+ // could store.
+ for (BasicBlock::iterator BBI = PN; &*BBI != LI; ++BBI) + if (BBI->mayWriteToMemory()) + return false; + + MaxAlign = std::max(MaxAlign, LI->getAlignment()); + } + + // Okay, we know that we have one or more loads in the same block as the PHI. + // We can transform this if it is safe to push the loads into the predecessor + // blocks. The only thing to watch out for is that we can't put a possibly + // trapping load in the predecessor if it is a critical edge. + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + BasicBlock *Pred = PN->getIncomingBlock(i); + Value *InVal = PN->getIncomingValue(i); + + // If the terminator of the predecessor has side-effects (an invoke), + // there is no safe place to put a load in the predecessor. + if (Pred->getTerminator()->mayHaveSideEffects()) + return false; + + // If the value is produced by the terminator of the predecessor + // (an invoke), there is no valid place to put a load in the predecessor. + if (Pred->getTerminator() == InVal) + return false; + + // If the predecessor has a single successor, then the edge isn't critical. + if (Pred->getTerminator()->getNumSuccessors() == 1) + continue; + + // If this pointer is always safe to load, or if we can prove that there is + // already a load in the block, then we can move the load to the pred block. + if (InVal->isDereferenceablePointer() || + isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, TD)) + continue; + + return false; + } + + return true; +} + + +/// tryToMakeAllocaBePromotable - This returns true if the alloca only has +/// direct (non-volatile) loads and stores to it. If the alloca is close but +/// not quite there, this will transform the code to allow promotion. As such, +/// it is a non-pure predicate. +static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { + SetVector<Instruction*, SmallVector<Instruction*, 4>, + SmallPtrSet<Instruction*, 4> > InstsToRewrite; + + for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); + UI != UE; ++UI) { + User *U = *UI; + if (LoadInst *LI = dyn_cast<LoadInst>(U)) { + if (!LI->isSimple()) + return false; + continue; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (SI->getOperand(0) == AI || !SI->isSimple()) + return false; // Don't allow a store OF the AI, only INTO the AI. + continue; + } + + if (SelectInst *SI = dyn_cast<SelectInst>(U)) { + // If the condition being selected on is a constant, fold the select, yes + // this does (rarely) happen early on. + if (ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition())) { + Value *Result = SI->getOperand(1+CI->isZero()); + SI->replaceAllUsesWith(Result); + SI->eraseFromParent(); + + // This is very rare and we just scrambled the use list of AI, start + // over completely. + return tryToMakeAllocaBePromotable(AI, TD); + } + + // If it is safe to turn "load (select c, AI, ptr)" into a select of two + // loads, then we can transform this by rewriting the select. + if (!isSafeSelectToSpeculate(SI, TD)) + return false; + + InstsToRewrite.insert(SI); + continue; + } + + if (PHINode *PN = dyn_cast<PHINode>(U)) { + if (PN->use_empty()) { // Dead PHIs can be stripped. + InstsToRewrite.insert(PN); + continue; + } + + // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads + // in the pred blocks, then we can transform this by rewriting the PHI. 
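+ // (The rewrite itself happens in the loop over InstsToRewrite below.)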
+ if (!isSafePHIToSpeculate(PN, TD)) + return false; + + InstsToRewrite.insert(PN); + continue; + } + + if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { + if (onlyUsedByLifetimeMarkers(BCI)) { + InstsToRewrite.insert(BCI); + continue; + } + } + + return false; + } + + // If there are no instructions to rewrite, then all uses are load/stores and + // we're done! + if (InstsToRewrite.empty()) + return true; + + // If we have instructions that need to be rewritten for this to be promotable + // take care of it now. + for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) { + if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) { + // This could only be a bitcast used by nothing but lifetime intrinsics. + for (BitCastInst::use_iterator I = BCI->use_begin(), E = BCI->use_end(); + I != E;) { + Use &U = I.getUse(); + ++I; + cast<Instruction>(U.getUser())->eraseFromParent(); + } + BCI->eraseFromParent(); + continue; + } + + if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) { + // Selects in InstsToRewrite only have load uses. Rewrite each as two + // loads with a new select. + while (!SI->use_empty()) { + LoadInst *LI = cast<LoadInst>(SI->use_back()); + + IRBuilder<> Builder(LI); + LoadInst *TrueLoad = + Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t"); + LoadInst *FalseLoad = + Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f"); + + // Transfer alignment and TBAA info if present. + TrueLoad->setAlignment(LI->getAlignment()); + FalseLoad->setAlignment(LI->getAlignment()); + if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { + TrueLoad->setMetadata(LLVMContext::MD_tbaa, Tag); + FalseLoad->setMetadata(LLVMContext::MD_tbaa, Tag); + } + + Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad); + V->takeName(LI); + LI->replaceAllUsesWith(V); + LI->eraseFromParent(); + } + + // Now that all the loads are gone, the select is gone too. + SI->eraseFromParent(); + continue; + } + + // Otherwise, we have a PHI node which allows us to push the loads into the + // predecessors. + PHINode *PN = cast<PHINode>(InstsToRewrite[i]); + if (PN->use_empty()) { + PN->eraseFromParent(); + continue; + } + + Type *LoadTy = cast<PointerType>(PN->getType())->getElementType(); + PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(), + PN->getName()+".ld", PN); + + // Get the TBAA tag and alignment to use from one of the loads. It doesn't + // matter which one we get and if any differ, it doesn't matter. + LoadInst *SomeLoad = cast<LoadInst>(PN->use_back()); + MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); + unsigned Align = SomeLoad->getAlignment(); + + // Rewrite all loads of the PN to use the new PHI. + while (!PN->use_empty()) { + LoadInst *LI = cast<LoadInst>(PN->use_back()); + LI->replaceAllUsesWith(NewPN); + LI->eraseFromParent(); + } + + // Inject loads into all of the pred blocks. Keep track of which blocks we + // insert them into in case we have multiple edges from the same block. + DenseMap<BasicBlock*, LoadInst*> InsertedLoads; + + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + BasicBlock *Pred = PN->getIncomingBlock(i); + LoadInst *&Load = InsertedLoads[Pred]; + if (Load == 0) { + Load = new LoadInst(PN->getIncomingValue(i), + PN->getName() + "." 
+ Pred->getName(), + Pred->getTerminator()); + Load->setAlignment(Align); + if (TBAATag) Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); + } + + NewPN->addIncoming(Load, Pred); + } + + PN->eraseFromParent(); + } + + ++NumAdjusted; + return true; +} + +bool SROA::performPromotion(Function &F) { + std::vector<AllocaInst*> Allocas; + DominatorTree *DT = 0; + if (HasDomTree) + DT = &getAnalysis<DominatorTree>(); + + BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function + DIBuilder DIB(*F.getParent()); + bool Changed = false; + SmallVector<Instruction*, 64> Insts; + while (1) { + Allocas.clear(); + + // Find allocas that are safe to promote, by looking at all instructions in + // the entry node + for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca? + if (tryToMakeAllocaBePromotable(AI, TD)) + Allocas.push_back(AI); + + if (Allocas.empty()) break; + + if (HasDomTree) + PromoteMemToReg(Allocas, *DT); + else { + SSAUpdater SSA; + for (unsigned i = 0, e = Allocas.size(); i != e; ++i) { + AllocaInst *AI = Allocas[i]; + + // Build list of instructions to promote. + for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); + UI != E; ++UI) + Insts.push_back(cast<Instruction>(*UI)); + AllocaPromoter(Insts, SSA, &DIB).run(AI, Insts); + Insts.clear(); + } + } + NumPromoted += Allocas.size(); + Changed = true; + } + + return Changed; +} + + +/// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for +/// SROA. It must be a struct or array type with a small number of elements. +bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) { + Type *T = AI->getAllocatedType(); + // Do not promote any struct that has too many members. + if (StructType *ST = dyn_cast<StructType>(T)) + return ST->getNumElements() <= StructMemberThreshold; + // Do not promote any array that has too many elements. + if (ArrayType *AT = dyn_cast<ArrayType>(T)) + return AT->getNumElements() <= ArrayElementThreshold; + return false; +} + +/// getPointeeAlignment - Compute the minimum alignment of the value pointed +/// to by the given pointer. +static unsigned getPointeeAlignment(Value *V, const TargetData &TD) { + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) + if (CE->getOpcode() == Instruction::BitCast || + (CE->getOpcode() == Instruction::GetElementPtr && + cast<GEPOperator>(CE)->hasAllZeroIndices())) + return getPointeeAlignment(CE->getOperand(0), TD); + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) + if (!GV->isDeclaration()) + return TD.getPreferredAlignment(GV); + + if (PointerType *PT = dyn_cast<PointerType>(V->getType())) + return TD.getABITypeAlignment(PT->getElementType()); + + return 0; +} + + +// performScalarRepl - This algorithm is a simple worklist driven algorithm, +// which runs on all of the alloca instructions in the function, removing them +// if they are only used by getelementptr instructions. +// +bool SROA::performScalarRepl(Function &F) { + std::vector<AllocaInst*> WorkList; + + // Scan the entry basic block, adding allocas to the worklist. + BasicBlock &BB = F.getEntryBlock(); + for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) + if (AllocaInst *A = dyn_cast<AllocaInst>(I)) + WorkList.push_back(A); + + // Process the worklist + bool Changed = false; + while (!WorkList.empty()) { + AllocaInst *AI = WorkList.back(); + WorkList.pop_back(); + + // Handle dead allocas trivially. These can be formed by SROA'ing arrays + // with unused elements. 
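+ // (DoScalarReplacement puts every new element alloca back on the worklist,
+ // including ones that end up with no uses.)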
+ if (AI->use_empty()) { + AI->eraseFromParent(); + Changed = true; + continue; + } + + // If this alloca is impossible for us to promote, reject it early. + if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized()) + continue; + + // Check to see if this allocation is only modified by a memcpy/memmove from + // a constant global whose alignment is equal to or exceeds that of the + // allocation. If this is the case, we can change all users to use + // the constant global instead. This is commonly produced by the CFE by + // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' + // is only subsequently read. + SmallVector<Instruction *, 4> ToDelete; + if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(AI, ToDelete)) { + if (AI->getAlignment() <= getPointeeAlignment(Copy->getSource(), *TD)) { + DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n'); + DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); + for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) + ToDelete[i]->eraseFromParent(); + Constant *TheSrc = cast<Constant>(Copy->getSource()); + AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType())); + Copy->eraseFromParent(); // Don't mutate the global. + AI->eraseFromParent(); + ++NumGlobals; + Changed = true; + continue; + } + } + + // Check to see if we can perform the core SROA transformation. We cannot + // transform the allocation instruction if it is an array allocation + // (allocations OF arrays are ok though), and an allocation of a scalar + // value cannot be decomposed at all. + uint64_t AllocaSize = TD->getTypeAllocSize(AI->getAllocatedType()); + + // Do not promote [0 x %struct]. + if (AllocaSize == 0) continue; + + // Do not promote any struct whose size is too big. + if (AllocaSize > SRThreshold) continue; + + // If the alloca looks like a good candidate for scalar replacement, and if + // all its users can be transformed, then split up the aggregate into its + // separate elements. + if (ShouldAttemptScalarRepl(AI) && isSafeAllocaToScalarRepl(AI)) { + DoScalarReplacement(AI, WorkList); + Changed = true; + continue; + } + + // If we can turn this aggregate value (potentially with casts) into a + // simple scalar value that can be mem2reg'd into a register value. + // IsNotTrivial tracks whether this is something that mem2reg could have + // promoted itself. If so, we don't want to transform it needlessly. Note + // that we can't just check based on the type: the alloca may be of an i32 + // but that has pointer arithmetic to set byte 3 of it or something. + if (AllocaInst *NewAI = ConvertToScalarInfo( + (unsigned)AllocaSize, *TD, ScalarLoadThreshold).TryConvert(AI)) { + NewAI->takeName(AI); + AI->eraseFromParent(); + ++NumConverted; + Changed = true; + continue; + } + + // Otherwise, couldn't process this alloca. + } + + return Changed; +} + +/// DoScalarReplacement - This alloca satisfied the isSafeAllocaToScalarRepl +/// predicate, do SROA now. +void SROA::DoScalarReplacement(AllocaInst *AI, + std::vector<AllocaInst*> &WorkList) { + DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n'); + SmallVector<AllocaInst*, 32> ElementAllocas; + if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { + ElementAllocas.reserve(ST->getNumContainedTypes()); + for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) { + AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0, + AI->getAlignment(), + AI->getName() + "." 
+ Twine(i), AI); + ElementAllocas.push_back(NA); + WorkList.push_back(NA); // Add to worklist for recursive processing + } + } else { + ArrayType *AT = cast<ArrayType>(AI->getAllocatedType()); + ElementAllocas.reserve(AT->getNumElements()); + Type *ElTy = AT->getElementType(); + for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { + AllocaInst *NA = new AllocaInst(ElTy, 0, AI->getAlignment(), + AI->getName() + "." + Twine(i), AI); + ElementAllocas.push_back(NA); + WorkList.push_back(NA); // Add to worklist for recursive processing + } + } + + // Now that we have created the new alloca instructions, rewrite all the + // uses of the old alloca. + RewriteForScalarRepl(AI, AI, 0, ElementAllocas); + + // Now erase any instructions that were made dead while rewriting the alloca. + DeleteDeadInstructions(); + AI->eraseFromParent(); + + ++NumReplaced; +} + +/// DeleteDeadInstructions - Erase instructions on the DeadInstrs list, +/// recursively including all their operands that become trivially dead. +void SROA::DeleteDeadInstructions() { + while (!DeadInsts.empty()) { + Instruction *I = cast<Instruction>(DeadInsts.pop_back_val()); + + for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) + if (Instruction *U = dyn_cast<Instruction>(*OI)) { + // Zero out the operand and see if it becomes trivially dead. + // (But, don't add allocas to the dead instruction list -- they are + // already on the worklist and will be deleted separately.) + *OI = 0; + if (isInstructionTriviallyDead(U) && !isa<AllocaInst>(U)) + DeadInsts.push_back(U); + } + + I->eraseFromParent(); + } +} + +/// isSafeForScalarRepl - Check if instruction I is a safe use with regard to +/// performing scalar replacement of alloca AI. The results are flagged in +/// the Info parameter. Offset indicates the position within AI that is +/// referenced by this instruction. 
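+/// For example, bitcasts and constant GEPs are analyzed recursively, while a
+/// volatile or atomic access, or any unrecognized user, marks the alloca
+/// unsafe.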
+void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
+ AllocaInfo &Info) {
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+
+ if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
+ isSafeForScalarRepl(BC, Offset, Info);
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+ uint64_t GEPOffset = Offset;
+ isSafeGEP(GEPI, GEPOffset, Info);
+ if (!Info.isUnsafe)
+ isSafeForScalarRepl(GEPI, GEPOffset, Info);
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
+ ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
+ if (Length == 0)
+ return MarkUnsafe(Info, User);
+ if (Length->isNegative())
+ return MarkUnsafe(Info, User);
+
+ isSafeMemAccess(Offset, Length->getZExtValue(), 0,
+ UI.getOperandNo() == 0, Info, MI,
+ true /*AllowWholeAccess*/);
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+ if (!LI->isSimple())
+ return MarkUnsafe(Info, User);
+ Type *LIType = LI->getType();
+ isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
+ LIType, false, Info, LI, true /*AllowWholeAccess*/);
+ Info.hasALoadOrStore = true;
+
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+ // Store is ok if storing INTO the pointer, not storing the pointer
+ if (!SI->isSimple() || SI->getOperand(0) == I)
+ return MarkUnsafe(Info, User);
+
+ Type *SIType = SI->getOperand(0)->getType();
+ isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
+ SIType, true, Info, SI, true /*AllowWholeAccess*/);
+ Info.hasALoadOrStore = true;
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+ if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end)
+ return MarkUnsafe(Info, User);
+ } else if (isa<PHINode>(User) || isa<SelectInst>(User)) {
+ isSafePHISelectUseForScalarRepl(User, Offset, Info);
+ } else {
+ return MarkUnsafe(Info, User);
+ }
+ if (Info.isUnsafe) return;
+ }
+}
+
+
+/// isSafePHISelectUseForScalarRepl - If we see a PHI node or select using a
+/// pointer derived from the alloca, we can often still split the alloca into
+/// elements. This is useful if we have a large alloca where one element is
+/// phi'd together somewhere: we can SRoA and promote all the other elements
+/// even if we end up not being able to promote this one.
+///
+/// All we require is that the uses of the PHI do not index into other parts of
+/// the alloca. The most important use case for this is single loads and stores
+/// that are PHI'd together, which can happen due to code sinking.
+void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
+ AllocaInfo &Info) {
+ // If we've already checked this PHI, don't do it again.
+ if (PHINode *PN = dyn_cast<PHINode>(I))
+ if (!Info.CheckedPHIs.insert(PN))
+ return;
+
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+
+ if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
+ isSafePHISelectUseForScalarRepl(BC, Offset, Info);
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+ // Only allow "bitcast" GEPs for simplicity. We could generalize this,
+ // but would have to prove that we're staying inside of an element being
+ // promoted.
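+ // ("Bitcast" GEPs are GEPs whose indices are all zero and which therefore
+ // do not change the address.)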
+ if (!GEPI->hasAllZeroIndices()) + return MarkUnsafe(Info, User); + isSafePHISelectUseForScalarRepl(GEPI, Offset, Info); + } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + if (!LI->isSimple()) + return MarkUnsafe(Info, User); + Type *LIType = LI->getType(); + isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), + LIType, false, Info, LI, false /*AllowWholeAccess*/); + Info.hasALoadOrStore = true; + + } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + // Store is ok if storing INTO the pointer, not storing the pointer + if (!SI->isSimple() || SI->getOperand(0) == I) + return MarkUnsafe(Info, User); + + Type *SIType = SI->getOperand(0)->getType(); + isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), + SIType, true, Info, SI, false /*AllowWholeAccess*/); + Info.hasALoadOrStore = true; + } else if (isa<PHINode>(User) || isa<SelectInst>(User)) { + isSafePHISelectUseForScalarRepl(User, Offset, Info); + } else { + return MarkUnsafe(Info, User); + } + if (Info.isUnsafe) return; + } +} + +/// isSafeGEP - Check if a GEP instruction can be handled for scalar +/// replacement. It is safe when all the indices are constant, in-bounds +/// references, and when the resulting offset corresponds to an element within +/// the alloca type. The results are flagged in the Info parameter. Upon +/// return, Offset is adjusted as specified by the GEP indices. +void SROA::isSafeGEP(GetElementPtrInst *GEPI, + uint64_t &Offset, AllocaInfo &Info) { + gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI); + if (GEPIt == E) + return; + bool NonConstant = false; + unsigned NonConstantIdxSize = 0; + + // Walk through the GEP type indices, checking the types that this indexes + // into. + for (; GEPIt != E; ++GEPIt) { + // Ignore struct elements, no extra checking needed for these. + if ((*GEPIt)->isStructTy()) + continue; + + ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand()); + if (!IdxVal) { + // Non constant GEPs are only a problem on arrays, structs, and pointers + // Vectors can be dynamically indexed. + // FIXME: Add support for dynamic indexing on arrays. This should be + // ok on any subarrays of the alloca array, eg, a[0][i] is ok, but a[i][0] + // isn't. + if (!(*GEPIt)->isVectorTy()) + return MarkUnsafe(Info, GEPI); + NonConstant = true; + NonConstantIdxSize = TD->getTypeAllocSize(*GEPIt); + } + } + + // Compute the offset due to this GEP and check if the alloca has a + // component element at that offset. + SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); + // If this GEP is non constant then the last operand must have been a + // dynamic index into a vector. Pop this now as it has no impact on the + // constant part of the offset. + if (NonConstant) + Indices.pop_back(); + Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); + if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, + NonConstantIdxSize)) + MarkUnsafe(Info, GEPI); +} + +/// isHomogeneousAggregate - Check if type T is a struct or array containing +/// elements of the same type (which is always true for arrays). If so, +/// return true with NumElts and EltTy set to the number of elements and the +/// element type, respectively. +static bool isHomogeneousAggregate(Type *T, unsigned &NumElts, + Type *&EltTy) { + if (ArrayType *AT = dyn_cast<ArrayType>(T)) { + NumElts = AT->getNumElements(); + EltTy = (NumElts == 0 ? 
0 : AT->getElementType()); + return true; + } + if (StructType *ST = dyn_cast<StructType>(T)) { + NumElts = ST->getNumContainedTypes(); + EltTy = (NumElts == 0 ? 0 : ST->getContainedType(0)); + for (unsigned n = 1; n < NumElts; ++n) { + if (ST->getContainedType(n) != EltTy) + return false; + } + return true; + } + return false; +} + +/// isCompatibleAggregate - Check if T1 and T2 are either the same type or are +/// "homogeneous" aggregates with the same element type and number of elements. +static bool isCompatibleAggregate(Type *T1, Type *T2) { + if (T1 == T2) + return true; + + unsigned NumElts1, NumElts2; + Type *EltTy1, *EltTy2; + if (isHomogeneousAggregate(T1, NumElts1, EltTy1) && + isHomogeneousAggregate(T2, NumElts2, EltTy2) && + NumElts1 == NumElts2 && + EltTy1 == EltTy2) + return true; + + return false; +} + +/// isSafeMemAccess - Check if a load/store/memcpy operates on the entire AI +/// alloca or has an offset and size that corresponds to a component element +/// within it. The offset checked here may have been formed from a GEP with a +/// pointer bitcasted to a different type. +/// +/// If AllowWholeAccess is true, then this allows uses of the entire alloca as a +/// unit. If false, it only allows accesses known to be in a single element. +void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, + Type *MemOpType, bool isStore, + AllocaInfo &Info, Instruction *TheAccess, + bool AllowWholeAccess) { + // Check if this is a load/store of the entire alloca. + if (Offset == 0 && AllowWholeAccess && + MemSize == TD->getTypeAllocSize(Info.AI->getAllocatedType())) { + // This can be safe for MemIntrinsics (where MemOpType is 0) and integer + // loads/stores (which are essentially the same as the MemIntrinsics with + // regard to copying padding between elements). But, if an alloca is + // flagged as both a source and destination of such operations, we'll need + // to check later for padding between elements. + if (!MemOpType || MemOpType->isIntegerTy()) { + if (isStore) + Info.isMemCpyDst = true; + else + Info.isMemCpySrc = true; + return; + } + // This is also safe for references using a type that is compatible with + // the type of the alloca, so that loads/stores can be rewritten using + // insertvalue/extractvalue. + if (isCompatibleAggregate(MemOpType, Info.AI->getAllocatedType())) { + Info.hasSubelementAccess = true; + return; + } + } + // Check if the offset/size correspond to a component within the alloca type. + Type *T = Info.AI->getAllocatedType(); + if (TypeHasComponent(T, Offset, MemSize)) { + Info.hasSubelementAccess = true; + return; + } + + return MarkUnsafe(Info, TheAccess); +} + +/// TypeHasComponent - Return true if T has a component type with the +/// specified offset and size. If Size is zero, do not check the size. 
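+///
+/// For example, assuming i32 and float are both 4 bytes, an alloca of type
+/// {i32, [4 x float]} has a component at Offset 4 with Size 4 (the first
+/// float of the array), but not at Offset 2 with Size 4, since that range
+/// would straddle the i32 field and the array.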
+bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { + Type *EltTy; + uint64_t EltSize; + if (StructType *ST = dyn_cast<StructType>(T)) { + const StructLayout *Layout = TD->getStructLayout(ST); + unsigned EltIdx = Layout->getElementContainingOffset(Offset); + EltTy = ST->getContainedType(EltIdx); + EltSize = TD->getTypeAllocSize(EltTy); + Offset -= Layout->getElementOffset(EltIdx); + } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { + EltTy = AT->getElementType(); + EltSize = TD->getTypeAllocSize(EltTy); + if (Offset >= AT->getNumElements() * EltSize) + return false; + Offset %= EltSize; + } else if (VectorType *VT = dyn_cast<VectorType>(T)) { + EltTy = VT->getElementType(); + EltSize = TD->getTypeAllocSize(EltTy); + if (Offset >= VT->getNumElements() * EltSize) + return false; + Offset %= EltSize; + } else { + return false; + } + if (Offset == 0 && (Size == 0 || EltSize == Size)) + return true; + // Check if the component spans multiple elements. + if (Offset + Size > EltSize) + return false; + return TypeHasComponent(EltTy, Offset, Size); +} + +/// RewriteForScalarRepl - Alloca AI is being split into NewElts, so rewrite +/// the instruction I, which references it, to use the separate elements. +/// Offset indicates the position within AI that is referenced by this +/// instruction. +void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts) { + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) { + Use &TheUse = UI.getUse(); + Instruction *User = cast<Instruction>(*UI++); + + if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { + RewriteBitCast(BC, AI, Offset, NewElts); + continue; + } + + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { + RewriteGEP(GEPI, AI, Offset, NewElts); + continue; + } + + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { + ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); + uint64_t MemSize = Length->getZExtValue(); + if (Offset == 0 && + MemSize == TD->getTypeAllocSize(AI->getAllocatedType())) + RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts); + // Otherwise the intrinsic can only touch a single element and the + // address operand will be updated, so nothing else needs to be done. + continue; + } + + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + RewriteLifetimeIntrinsic(II, AI, Offset, NewElts); + } + continue; + } + + if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + Type *LIType = LI->getType(); + + if (isCompatibleAggregate(LIType, AI->getAllocatedType())) { + // Replace: + // %res = load { i32, i32 }* %alloc + // with: + // %load.0 = load i32* %alloc.0 + // %insert.0 insertvalue { i32, i32 } zeroinitializer, i32 %load.0, 0 + // %load.1 = load i32* %alloc.1 + // %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1 + // (Also works for arrays instead of structs) + Value *Insert = UndefValue::get(LIType); + IRBuilder<> Builder(LI); + for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { + Value *Load = Builder.CreateLoad(NewElts[i], "load"); + Insert = Builder.CreateInsertValue(Insert, Load, i, "insert"); + } + LI->replaceAllUsesWith(Insert); + DeadInsts.push_back(LI); + } else if (LIType->isIntegerTy() && + TD->getTypeAllocSize(LIType) == + TD->getTypeAllocSize(AI->getAllocatedType())) { + // If this is a load of the entire alloca to an integer, rewrite it. 
+ RewriteLoadUserOfWholeAlloca(LI, AI, NewElts); + } + continue; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + Value *Val = SI->getOperand(0); + Type *SIType = Val->getType(); + if (isCompatibleAggregate(SIType, AI->getAllocatedType())) { + // Replace: + // store { i32, i32 } %val, { i32, i32 }* %alloc + // with: + // %val.0 = extractvalue { i32, i32 } %val, 0 + // store i32 %val.0, i32* %alloc.0 + // %val.1 = extractvalue { i32, i32 } %val, 1 + // store i32 %val.1, i32* %alloc.1 + // (Also works for arrays instead of structs) + IRBuilder<> Builder(SI); + for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { + Value *Extract = Builder.CreateExtractValue(Val, i, Val->getName()); + Builder.CreateStore(Extract, NewElts[i]); + } + DeadInsts.push_back(SI); + } else if (SIType->isIntegerTy() && + TD->getTypeAllocSize(SIType) == + TD->getTypeAllocSize(AI->getAllocatedType())) { + // If this is a store of the entire alloca from an integer, rewrite it. + RewriteStoreUserOfWholeAlloca(SI, AI, NewElts); + } + continue; + } + + if (isa<SelectInst>(User) || isa<PHINode>(User)) { + // If we have a PHI user of the alloca itself (as opposed to a GEP or + // bitcast) we have to rewrite it. GEP and bitcast uses will be RAUW'd to + // the new pointer. + if (!isa<AllocaInst>(I)) continue; + + assert(Offset == 0 && NewElts[0] && + "Direct alloca use should have a zero offset"); + + // If we have a use of the alloca, we know the derived uses will be + // utilizing just the first element of the scalarized result. Insert a + // bitcast of the first alloca before the user as required. + AllocaInst *NewAI = NewElts[0]; + BitCastInst *BCI = new BitCastInst(NewAI, AI->getType(), "", NewAI); + NewAI->moveBefore(BCI); + TheUse = BCI; + continue; + } + } +} + +/// RewriteBitCast - Update a bitcast reference to the alloca being replaced +/// and recursively continue updating all of its uses. +void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts) { + RewriteForScalarRepl(BC, AI, Offset, NewElts); + if (BC->getOperand(0) != AI) + return; + + // The bitcast references the original alloca. Replace its uses with + // references to the alloca containing offset zero (which is normally at + // index zero, but might not be in cases involving structs with elements + // of size zero). + Type *T = AI->getAllocatedType(); + uint64_t EltOffset = 0; + Type *IdxTy; + uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy); + Instruction *Val = NewElts[Idx]; + if (Val->getType() != BC->getDestTy()) { + Val = new BitCastInst(Val, BC->getDestTy(), "", BC); + Val->takeName(BC); + } + BC->replaceAllUsesWith(Val); + DeadInsts.push_back(BC); +} + +/// FindElementAndOffset - Return the index of the element containing Offset +/// within the specified type, which must be either a struct or an array. +/// Sets T to the type of the element and Offset to the offset within that +/// element. IdxTy is set to the type of the index result to be used in a +/// GEP instruction. 
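+///
+/// For example, assuming 4-byte i32 and float, calling this with
+/// T = {i32, [4 x float]} and Offset = 8 returns index 1 (the array member)
+/// and leaves T = [4 x float], Offset = 4, and IdxTy = i32.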
+uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset,
+                                    Type *&IdxTy) {
+  uint64_t Idx = 0;
+  if (StructType *ST = dyn_cast<StructType>(T)) {
+    const StructLayout *Layout = TD->getStructLayout(ST);
+    Idx = Layout->getElementContainingOffset(Offset);
+    T = ST->getContainedType(Idx);
+    Offset -= Layout->getElementOffset(Idx);
+    IdxTy = Type::getInt32Ty(T->getContext());
+    return Idx;
+  } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
+    T = AT->getElementType();
+    uint64_t EltSize = TD->getTypeAllocSize(T);
+    Idx = Offset / EltSize;
+    Offset -= Idx * EltSize;
+    IdxTy = Type::getInt64Ty(T->getContext());
+    return Idx;
+  }
+  VectorType *VT = cast<VectorType>(T);
+  T = VT->getElementType();
+  uint64_t EltSize = TD->getTypeAllocSize(T);
+  Idx = Offset / EltSize;
+  Offset -= Idx * EltSize;
+  IdxTy = Type::getInt64Ty(T->getContext());
+  return Idx;
+}
+
+/// RewriteGEP - Check if this GEP instruction moves the pointer across
+/// elements of the alloca that are being split apart, and if so, rewrite
+/// the GEP to be relative to the new element.
+void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
+                      SmallVector<AllocaInst*, 32> &NewElts) {
+  uint64_t OldOffset = Offset;
+  SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
+  // If the GEP was dynamic then it must have been a dynamic vector lookup.
+  // In this case, it must be the last GEP operand which is dynamic, so keep it
+  // aside until we've found the constant GEP offset, then add it back in at
+  // the end.
+  Value* NonConstantIdx = 0;
+  if (!GEPI->hasAllConstantIndices())
+    NonConstantIdx = Indices.pop_back_val();
+  Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
+
+  RewriteForScalarRepl(GEPI, AI, Offset, NewElts);
+
+  Type *T = AI->getAllocatedType();
+  Type *IdxTy;
+  uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy);
+  if (GEPI->getOperand(0) == AI)
+    OldIdx = ~0ULL; // Force the GEP to be rewritten.
+
+  T = AI->getAllocatedType();
+  uint64_t EltOffset = Offset;
+  uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy);
+
+  // If this GEP does not move the pointer across elements of the alloca
+  // being split, then it does not need to be rewritten.
+  if (Idx == OldIdx)
+    return;
+
+  Type *i32Ty = Type::getInt32Ty(AI->getContext());
+  SmallVector<Value*, 8> NewArgs;
+  NewArgs.push_back(Constant::getNullValue(i32Ty));
+  while (EltOffset != 0) {
+    uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy);
+    NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx));
+  }
+  if (NonConstantIdx) {
+    Type* GepTy = T;
+    // This GEP has a dynamic index.  We need to add "i32 0" to index through
+    // any structs or arrays in the original type until we get to the vector
+    // to index.
+    while (!isa<VectorType>(GepTy)) {
+      NewArgs.push_back(Constant::getNullValue(i32Ty));
+      GepTy = cast<CompositeType>(GepTy)->getTypeAtIndex(0U);
+    }
+    NewArgs.push_back(NonConstantIdx);
+  }
+  Instruction *Val = NewElts[Idx];
+  if (NewArgs.size() > 1) {
+    Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI);
+    Val->takeName(GEPI);
+  }
+  if (Val->getType() != GEPI->getType())
+    Val = new BitCastInst(Val, GEPI->getType(), Val->getName(), GEPI);
+  GEPI->replaceAllUsesWith(Val);
+  DeadInsts.push_back(GEPI);
+}
+
+/// RewriteLifetimeIntrinsic - II is a lifetime.start/lifetime.end. Rewrite it
+/// to mark the lifetime of the scalarized memory.
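+///
+/// For example (assuming 4-byte i32), a lifetime.start covering a whole
+/// {i32, i32} alloca that has been split into two i32 allocas is rewritten
+/// into two 4-byte lifetime.start markers, one on each new alloca.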
+void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, + uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts) { + ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0)); + // Put matching lifetime markers on everything from Offset up to + // Offset+OldSize. + Type *AIType = AI->getAllocatedType(); + uint64_t NewOffset = Offset; + Type *IdxTy; + uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy); + + IRBuilder<> Builder(II); + uint64_t Size = OldSize->getLimitedValue(); + + if (NewOffset) { + // Splice the first element and index 'NewOffset' bytes in. SROA will + // split the alloca again later. + Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy()); + V = Builder.CreateGEP(V, Builder.getInt64(NewOffset)); + + IdxTy = NewElts[Idx]->getAllocatedType(); + uint64_t EltSize = TD->getTypeAllocSize(IdxTy) - NewOffset; + if (EltSize > Size) { + EltSize = Size; + Size = 0; + } else { + Size -= EltSize; + } + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + Builder.CreateLifetimeStart(V, Builder.getInt64(EltSize)); + else + Builder.CreateLifetimeEnd(V, Builder.getInt64(EltSize)); + ++Idx; + } + + for (; Idx != NewElts.size() && Size; ++Idx) { + IdxTy = NewElts[Idx]->getAllocatedType(); + uint64_t EltSize = TD->getTypeAllocSize(IdxTy); + if (EltSize > Size) { + EltSize = Size; + Size = 0; + } else { + Size -= EltSize; + } + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + Builder.CreateLifetimeStart(NewElts[Idx], + Builder.getInt64(EltSize)); + else + Builder.CreateLifetimeEnd(NewElts[Idx], + Builder.getInt64(EltSize)); + } + DeadInsts.push_back(II); +} + +/// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI. +/// Rewrite it to copy or set the elements of the scalarized memory. +void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, + AllocaInst *AI, + SmallVector<AllocaInst*, 32> &NewElts) { + // If this is a memcpy/memmove, construct the other pointer as the + // appropriate type. The "Other" pointer is the pointer that goes to memory + // that doesn't have anything to do with the alloca that we are promoting. For + // memset, this Value* stays null. + Value *OtherPtr = 0; + unsigned MemAlignment = MI->getAlignment(); + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcopy + if (Inst == MTI->getRawDest()) + OtherPtr = MTI->getRawSource(); + else { + assert(Inst == MTI->getRawSource()); + OtherPtr = MTI->getRawDest(); + } + } + + // If there is an other pointer, we want to convert it to the same pointer + // type as AI has, so we can GEP through it safely. + if (OtherPtr) { + unsigned AddrSpace = + cast<PointerType>(OtherPtr->getType())->getAddressSpace(); + + // Remove bitcasts and all-zero GEPs from OtherPtr. This is an + // optimization, but it's also required to detect the corner case where + // both pointer operands are referencing the same memory, and where + // OtherPtr may be a bitcast or GEP that currently being rewritten. (This + // function is only called for mem intrinsics that access the whole + // aggregate, so non-zero GEPs are not an issue here.) + OtherPtr = OtherPtr->stripPointerCasts(); + + // Copying the alloca to itself is a no-op: just delete it. + if (OtherPtr == AI || OtherPtr == NewElts[0]) { + // This code will run twice for a no-op memcpy -- once for each operand. + // Put only one reference to MI on the DeadInsts list. 
+ for (SmallVector<Value*, 32>::const_iterator I = DeadInsts.begin(), + E = DeadInsts.end(); I != E; ++I) + if (*I == MI) return; + DeadInsts.push_back(MI); + return; + } + + // If the pointer is not the right type, insert a bitcast to the right + // type. + Type *NewTy = + PointerType::get(AI->getType()->getElementType(), AddrSpace); + + if (OtherPtr->getType() != NewTy) + OtherPtr = new BitCastInst(OtherPtr, NewTy, OtherPtr->getName(), MI); + } + + // Process each element of the aggregate. + bool SROADest = MI->getRawDest() == Inst; + + Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext())); + + for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { + // If this is a memcpy/memmove, emit a GEP of the other element address. + Value *OtherElt = 0; + unsigned OtherEltAlign = MemAlignment; + + if (OtherPtr) { + Value *Idx[2] = { Zero, + ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) }; + OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, + OtherPtr->getName()+"."+Twine(i), + MI); + uint64_t EltOffset; + PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType()); + Type *OtherTy = OtherPtrTy->getElementType(); + if (StructType *ST = dyn_cast<StructType>(OtherTy)) { + EltOffset = TD->getStructLayout(ST)->getElementOffset(i); + } else { + Type *EltTy = cast<SequentialType>(OtherTy)->getElementType(); + EltOffset = TD->getTypeAllocSize(EltTy)*i; + } + + // The alignment of the other pointer is the guaranteed alignment of the + // element, which is affected by both the known alignment of the whole + // mem intrinsic and the alignment of the element. If the alignment of + // the memcpy (f.e.) is 32 but the element is at a 4-byte offset, then the + // known alignment is just 4 bytes. + OtherEltAlign = (unsigned)MinAlign(OtherEltAlign, EltOffset); + } + + Value *EltPtr = NewElts[i]; + Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType(); + + // If we got down to a scalar, insert a load or store as appropriate. + if (EltTy->isSingleValueType()) { + if (isa<MemTransferInst>(MI)) { + if (SROADest) { + // From Other to Alloca. + Value *Elt = new LoadInst(OtherElt, "tmp", false, OtherEltAlign, MI); + new StoreInst(Elt, EltPtr, MI); + } else { + // From Alloca to Other. + Value *Elt = new LoadInst(EltPtr, "tmp", MI); + new StoreInst(Elt, OtherElt, false, OtherEltAlign, MI); + } + continue; + } + assert(isa<MemSetInst>(MI)); + + // If the stored element is zero (common case), just store a null + // constant. + Constant *StoreVal; + if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getArgOperand(1))) { + if (CI->isZero()) { + StoreVal = Constant::getNullValue(EltTy); // 0.0, null, 0, <0,0> + } else { + // If EltTy is a vector type, get the element type. + Type *ValTy = EltTy->getScalarType(); + + // Construct an integer with the right value. + unsigned EltSize = TD->getTypeSizeInBits(ValTy); + APInt OneVal(EltSize, CI->getZExtValue()); + APInt TotalVal(OneVal); + // Set each byte. + for (unsigned i = 0; 8*i < EltSize; ++i) { + TotalVal = TotalVal.shl(8); + TotalVal |= OneVal; + } + + // Convert the integer value to the appropriate type. + StoreVal = ConstantInt::get(CI->getContext(), TotalVal); + if (ValTy->isPointerTy()) + StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy); + else if (ValTy->isFloatingPointTy()) + StoreVal = ConstantExpr::getBitCast(StoreVal, ValTy); + assert(StoreVal->getType() == ValTy && "Type mismatch!"); + + // If the requested value was a vector constant, create it. 
+ if (EltTy->isVectorTy()) { + unsigned NumElts = cast<VectorType>(EltTy)->getNumElements(); + StoreVal = ConstantVector::getSplat(NumElts, StoreVal); + } + } + new StoreInst(StoreVal, EltPtr, MI); + continue; + } + // Otherwise, if we're storing a byte variable, use a memset call for + // this element. + } + + unsigned EltSize = TD->getTypeAllocSize(EltTy); + if (!EltSize) + continue; + + IRBuilder<> Builder(MI); + + // Finally, insert the meminst for this element. + if (isa<MemSetInst>(MI)) { + Builder.CreateMemSet(EltPtr, MI->getArgOperand(1), EltSize, + MI->isVolatile()); + } else { + assert(isa<MemTransferInst>(MI)); + Value *Dst = SROADest ? EltPtr : OtherElt; // Dest ptr + Value *Src = SROADest ? OtherElt : EltPtr; // Src ptr + + if (isa<MemCpyInst>(MI)) + Builder.CreateMemCpy(Dst, Src, EltSize, OtherEltAlign,MI->isVolatile()); + else + Builder.CreateMemMove(Dst, Src, EltSize,OtherEltAlign,MI->isVolatile()); + } + } + DeadInsts.push_back(MI); +} + +/// RewriteStoreUserOfWholeAlloca - We found a store of an integer that +/// overwrites the entire allocation. Extract out the pieces of the stored +/// integer and store them individually. +void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, + SmallVector<AllocaInst*, 32> &NewElts){ + // Extract each element out of the integer according to its structure offset + // and store the element value to the individual alloca. + Value *SrcVal = SI->getOperand(0); + Type *AllocaEltTy = AI->getAllocatedType(); + uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); + + IRBuilder<> Builder(SI); + + // Handle tail padding by extending the operand + if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) + SrcVal = Builder.CreateZExt(SrcVal, + IntegerType::get(SI->getContext(), AllocaSizeBits)); + + DEBUG(dbgs() << "PROMOTING STORE TO WHOLE ALLOCA: " << *AI << '\n' << *SI + << '\n'); + + // There are two forms here: AI could be an array or struct. Both cases + // have different ways to compute the element offset. + if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { + const StructLayout *Layout = TD->getStructLayout(EltSTy); + + for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { + // Get the number of bits to shift SrcVal to get the value. + Type *FieldTy = EltSTy->getElementType(i); + uint64_t Shift = Layout->getElementOffsetInBits(i); + + if (TD->isBigEndian()) + Shift = AllocaSizeBits-Shift-TD->getTypeAllocSizeInBits(FieldTy); + + Value *EltVal = SrcVal; + if (Shift) { + Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift); + EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt"); + } + + // Truncate down to an integer of the right size. + uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy); + + // Ignore zero sized fields like {}, they obviously contain no data. + if (FieldSizeBits == 0) continue; + + if (FieldSizeBits != AllocaSizeBits) + EltVal = Builder.CreateTrunc(EltVal, + IntegerType::get(SI->getContext(), FieldSizeBits)); + Value *DestField = NewElts[i]; + if (EltVal->getType() == FieldTy) { + // Storing to an integer field of this size, just do it. + } else if (FieldTy->isFloatingPointTy() || FieldTy->isVectorTy()) { + // Bitcast to the right element type (for fp/vector values). + EltVal = Builder.CreateBitCast(EltVal, FieldTy); + } else { + // Otherwise, bitcast the dest pointer (for aggregates). 
+ DestField = Builder.CreateBitCast(DestField, + PointerType::getUnqual(EltVal->getType())); + } + new StoreInst(EltVal, DestField, SI); + } + + } else { + ArrayType *ATy = cast<ArrayType>(AllocaEltTy); + Type *ArrayEltTy = ATy->getElementType(); + uint64_t ElementOffset = TD->getTypeAllocSizeInBits(ArrayEltTy); + uint64_t ElementSizeBits = TD->getTypeSizeInBits(ArrayEltTy); + + uint64_t Shift; + + if (TD->isBigEndian()) + Shift = AllocaSizeBits-ElementOffset; + else + Shift = 0; + + for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { + // Ignore zero sized fields like {}, they obviously contain no data. + if (ElementSizeBits == 0) continue; + + Value *EltVal = SrcVal; + if (Shift) { + Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift); + EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt"); + } + + // Truncate down to an integer of the right size. + if (ElementSizeBits != AllocaSizeBits) + EltVal = Builder.CreateTrunc(EltVal, + IntegerType::get(SI->getContext(), + ElementSizeBits)); + Value *DestField = NewElts[i]; + if (EltVal->getType() == ArrayEltTy) { + // Storing to an integer field of this size, just do it. + } else if (ArrayEltTy->isFloatingPointTy() || + ArrayEltTy->isVectorTy()) { + // Bitcast to the right element type (for fp/vector values). + EltVal = Builder.CreateBitCast(EltVal, ArrayEltTy); + } else { + // Otherwise, bitcast the dest pointer (for aggregates). + DestField = Builder.CreateBitCast(DestField, + PointerType::getUnqual(EltVal->getType())); + } + new StoreInst(EltVal, DestField, SI); + + if (TD->isBigEndian()) + Shift -= ElementOffset; + else + Shift += ElementOffset; + } + } + + DeadInsts.push_back(SI); +} + +/// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to +/// an integer. Load the individual pieces to form the aggregate value. +void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, + SmallVector<AllocaInst*, 32> &NewElts) { + // Extract each element out of the NewElts according to its structure offset + // and form the result value. + Type *AllocaEltTy = AI->getAllocatedType(); + uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); + + DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI + << '\n'); + + // There are two forms here: AI could be an array or struct. Both cases + // have different ways to compute the element offset. + const StructLayout *Layout = 0; + uint64_t ArrayEltBitOffset = 0; + if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { + Layout = TD->getStructLayout(EltSTy); + } else { + Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType(); + ArrayEltBitOffset = TD->getTypeAllocSizeInBits(ArrayEltTy); + } + + Value *ResultVal = + Constant::getNullValue(IntegerType::get(LI->getContext(), AllocaSizeBits)); + + for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { + // Load the value from the alloca. If the NewElt is an aggregate, cast + // the pointer to an integer of the same size before doing the load. + Value *SrcField = NewElts[i]; + Type *FieldTy = + cast<PointerType>(SrcField->getType())->getElementType(); + uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy); + + // Ignore zero sized fields like {}, they obviously contain no data. 
+    if (FieldSizeBits == 0) continue;
+
+    IntegerType *FieldIntTy = IntegerType::get(LI->getContext(),
+                                               FieldSizeBits);
+    if (!FieldTy->isIntegerTy() && !FieldTy->isFloatingPointTy() &&
+        !FieldTy->isVectorTy())
+      SrcField = new BitCastInst(SrcField,
+                                 PointerType::getUnqual(FieldIntTy),
+                                 "", LI);
+    SrcField = new LoadInst(SrcField, "sroa.load.elt", LI);
+
+    // If SrcField is an fp or vector of the right size but that isn't an
+    // integer type, bitcast to an integer so we can shift it.
+    if (SrcField->getType() != FieldIntTy)
+      SrcField = new BitCastInst(SrcField, FieldIntTy, "", LI);
+
+    // Zero extend the field to be the same size as the final alloca so that
+    // we can shift and insert it.
+    if (SrcField->getType() != ResultVal->getType())
+      SrcField = new ZExtInst(SrcField, ResultVal->getType(), "", LI);
+
+    // Determine the number of bits to shift SrcField.
+    uint64_t Shift;
+    if (Layout) // Struct case.
+      Shift = Layout->getElementOffsetInBits(i);
+    else // Array case.
+      Shift = i*ArrayEltBitOffset;
+
+    if (TD->isBigEndian())
+      Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth();
+
+    if (Shift) {
+      Value *ShiftVal = ConstantInt::get(SrcField->getType(), Shift);
+      SrcField = BinaryOperator::CreateShl(SrcField, ShiftVal, "", LI);
+    }
+
+    // Don't create an 'or x, 0' on the first iteration.
+    if (!isa<Constant>(ResultVal) ||
+        !cast<Constant>(ResultVal)->isNullValue())
+      ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI);
+    else
+      ResultVal = SrcField;
+  }
+
+  // Handle tail padding by truncating the result.
+  if (TD->getTypeSizeInBits(LI->getType()) != AllocaSizeBits)
+    ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI);
+
+  LI->replaceAllUsesWith(ResultVal);
+  DeadInsts.push_back(LI);
+}
+
+/// HasPadding - Return true if the specified type has any structure or
+/// alignment padding in between the elements that would be split apart
+/// by SROA; return false otherwise.
+static bool HasPadding(Type *Ty, const TargetData &TD) {
+  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    Ty = ATy->getElementType();
+    return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty);
+  }
+
+  // SROA currently handles only Arrays and Structs.
+  StructType *STy = cast<StructType>(Ty);
+  const StructLayout *SL = TD.getStructLayout(STy);
+  unsigned PrevFieldBitOffset = 0;
+  for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+    unsigned FieldBitOffset = SL->getElementOffsetInBits(i);
+
+    // Check to see if there is any padding between this element and the
+    // previous one.
+    if (i) {
+      unsigned PrevFieldEnd =
+        PrevFieldBitOffset+TD.getTypeSizeInBits(STy->getElementType(i-1));
+      if (PrevFieldEnd < FieldBitOffset)
+        return true;
+    }
+    PrevFieldBitOffset = FieldBitOffset;
+  }
+  // Check for tail padding.
+  if (unsigned EltCount = STy->getNumElements()) {
+    unsigned PrevFieldEnd = PrevFieldBitOffset +
+      TD.getTypeSizeInBits(STy->getElementType(EltCount-1));
+    if (PrevFieldEnd < SL->getSizeInBits())
+      return true;
+  }
+  return false;
+}
+
+/// isSafeAllocaToScalarRepl - Check to see if the specified allocation of an
+/// aggregate can be broken down into elements.  Return true if it is safe to
+/// split the alloca apart, false otherwise.
+bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
+  // Loop over the use list of the alloca.  We can only transform it if all of
+  // the users are safe to transform.
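+  // The checks below proceed in three steps: scan every transitive use of the
+  // alloca for safety, reject allocas whose padded type is both a memcpy
+  // source and destination, and return false in favor of ConvertToScalarInfo
+  // when a multi-element alloca is only ever accessed as a whole.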
+  AllocaInfo Info(AI);
+
+  isSafeForScalarRepl(AI, 0, Info);
+  if (Info.isUnsafe) {
+    DEBUG(dbgs() << "Cannot transform: " << *AI << '\n');
+    return false;
+  }
+
+  // Okay, we know all the users are promotable.  If the aggregate is a memcpy
+  // source and destination, we have to be careful.  In particular, the memcpy
+  // could be moving around elements that live in structure padding of the LLVM
+  // types, but may actually be used.  In these cases, we refuse to promote the
+  // struct.
+  if (Info.isMemCpySrc && Info.isMemCpyDst &&
+      HasPadding(AI->getAllocatedType(), *TD))
+    return false;
+
+  // If the alloca never has an access to just *part* of it, but is accessed
+  // via loads and stores, then we should use ConvertToScalarInfo to promote
+  // the alloca instead of promoting each piece at a time and inserting fission
+  // and fusion code.
+  if (!Info.hasSubelementAccess && Info.hasALoadOrStore) {
+    // If the struct/array just has one element, use basic SRoA.
+    if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
+      if (ST->getNumElements() > 1) return false;
+    } else {
+      if (cast<ArrayType>(AI->getAllocatedType())->getNumElements() > 1)
+        return false;
+    }
+  }
+
+  return true;
+}
+
+
+
+/// PointsToConstantGlobal - Return true if V (possibly indirectly) points to
+/// some part of a constant global variable.  This intentionally only accepts
+/// constant expressions because we can't rewrite arbitrary instructions.
+static bool PointsToConstantGlobal(Value *V) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    return GV->isConstant();
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    if (CE->getOpcode() == Instruction::BitCast ||
+        CE->getOpcode() == Instruction::GetElementPtr)
+      return PointsToConstantGlobal(CE->getOperand(0));
+  return false;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
+/// pointer to an alloca.  Ignore any reads of the pointer, and return false if
+/// we see any stores or other unknown uses.  If we see pointer arithmetic, keep
+/// track of whether it moves the pointer (with isOffset) but otherwise traverse
+/// the uses.  If we see a memcpy/memmove whose destination is the start of the
+/// alloca (no offset), and whose source is a pointer to a constant global, we
+/// can optimize this.
+static bool
+isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
+                               bool isOffset,
+                               SmallVector<Instruction *, 4> &LifetimeMarkers) {
+  // We track lifetime intrinsics as we encounter them.  If we decide to go
+  // ahead and replace the value with the global, this lets the caller quickly
+  // eliminate the markers.
+
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
+    User *U = cast<Instruction>(*UI);
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      // Simple (non-volatile, non-atomic) loads are just reads; they are
+      // always ok.
+      if (!LI->isSimple()) return false;
+      continue;
+    }
+
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+      // If uses of the bitcast are ok, we are ok.
+      if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, isOffset,
+                                          LifetimeMarkers))
+        return false;
+      continue;
+    }
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+      // If the GEP has all zero indices, it doesn't offset the pointer.  If it
+      // doesn't, it does.
+      if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy,
+                                          isOffset || !GEP->hasAllZeroIndices(),
+                                          LifetimeMarkers))
+        return false;
+      continue;
+    }
+
+    if (CallSite CS = U) {
+      // If this is the function being called then we treat it like a load and
+      // ignore it.
+      if (CS.isCallee(UI))
+        continue;
+
+      // If this is a readonly/readnone call site, then we know it is just a
+      // load (but one that potentially returns the value itself), so we can
+      // ignore it if we know that the value isn't captured.
+      unsigned ArgNo = CS.getArgumentNo(UI);
+      if (CS.onlyReadsMemory() &&
+          (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo)))
+        continue;
+
+      // If this is being passed as a byval argument, the caller is making a
+      // copy, so it is only a read of the alloca.
+      if (CS.isByValArgument(ArgNo))
+        continue;
+    }
+
+    // Lifetime intrinsics can be handled by the caller.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+          II->getIntrinsicID() == Intrinsic::lifetime_end) {
+        assert(II->use_empty() && "Lifetime markers have no result to use!");
+        LifetimeMarkers.push_back(II);
+        continue;
+      }
+    }
+
+    // If this isn't our memcpy/memmove, reject it as something we can't
+    // handle.
+    MemTransferInst *MI = dyn_cast<MemTransferInst>(U);
+    if (MI == 0)
+      return false;
+
+    // If the transfer is using the alloca as a source of the transfer, then
+    // ignore it since it is a load (unless the transfer is volatile).
+    if (UI.getOperandNo() == 1) {
+      if (MI->isVolatile()) return false;
+      continue;
+    }
+
+    // If we have already seen a copy, reject the second one.
+    if (TheCopy) return false;
+
+    // If the pointer has been offset from the start of the alloca, we can't
+    // safely handle this.
+    if (isOffset) return false;
+
+    // If the memintrinsic isn't using the alloca as the dest, reject it.
+    if (UI.getOperandNo() != 0) return false;
+
+    // If the source of the memcpy/move is not a constant global, reject it.
+    if (!PointsToConstantGlobal(MI->getSource()))
+      return false;
+
+    // Otherwise, the transform is safe.  Remember the copy instruction.
+    TheCopy = MI;
+  }
+  return true;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Return the single memcpy/memmove that
+/// copies a constant global into the specified alloca if that copy is the only
+/// modification of the alloca, or null otherwise.  If we can prove this, we
+/// can replace any uses of the alloca with uses of the global directly.
+MemTransferInst *
+SROA::isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
+                                     SmallVector<Instruction*, 4> &ToDelete) {
+  MemTransferInst *TheCopy = 0;
+  if (::isOnlyCopiedFromConstantGlobal(AI, TheCopy, false, ToDelete))
+    return TheCopy;
+  return 0;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
new file mode 100644
index 0000000..d13e4ab
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -0,0 +1,341 @@
+//===- SimplifyCFGPass.cpp - CFG Simplification Pass ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead code elimination and basic block merging, along
+// with a collection of other peephole control flow optimizations.  For example:
+//
+//   * Removes basic blocks with no predecessors.
+// * Merges a basic block into its predecessor if there is only one and the +// predecessor only has one successor. +// * Eliminates PHI nodes for basic blocks with a single predecessor. +// * Eliminates a basic block that only contains an unconditional branch. +// * Changes invoke instructions to nounwind functions to be calls. +// * Change things like "if (x) if (y)" into "if (x&y)". +// * etc.. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "simplifycfg" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Constants.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Module.h" +#include "llvm/Attributes.h" +#include "llvm/Support/CFG.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetData.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumSimpl, "Number of blocks simplified"); + +namespace { + struct CFGSimplifyPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + CFGSimplifyPass() : FunctionPass(ID) { + initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function &F); + }; +} + +char CFGSimplifyPass::ID = 0; +INITIALIZE_PASS(CFGSimplifyPass, "simplifycfg", + "Simplify the CFG", false, false) + +// Public interface to the CFGSimplification pass +FunctionPass *llvm::createCFGSimplificationPass() { + return new CFGSimplifyPass(); +} + +/// ChangeToUnreachable - Insert an unreachable instruction before the specified +/// instruction, making it and the rest of the code in the block dead. +static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { + BasicBlock *BB = I->getParent(); + // Loop over all of the successors, removing BB's entry from any PHI + // nodes. + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) + (*SI)->removePredecessor(BB); + + // Insert a call to llvm.trap right before this. This turns the undefined + // behavior into a hard fail instead of falling through into random code. + if (UseLLVMTrap) { + Function *TrapFn = + Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap); + CallInst *CallTrap = CallInst::Create(TrapFn, "", I); + CallTrap->setDebugLoc(I->getDebugLoc()); + } + new UnreachableInst(I->getContext(), I); + + // All instructions after this are dead. + BasicBlock::iterator BBI = I, BBE = BB->end(); + while (BBI != BBE) { + if (!BBI->use_empty()) + BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); + BB->getInstList().erase(BBI++); + } +} + +/// ChangeToCall - Convert the specified invoke into a normal call. +static void ChangeToCall(InvokeInst *II) { + SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); + CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); + NewCall->takeName(II); + NewCall->setCallingConv(II->getCallingConv()); + NewCall->setAttributes(II->getAttributes()); + NewCall->setDebugLoc(II->getDebugLoc()); + II->replaceAllUsesWith(NewCall); + + // Follow the call by a branch to the normal destination. 
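+  // For example (hypothetical IR), "%r = invoke i32 @f() to label %ok
+  // unwind label %lpad" has just been replaced by "%r = call i32 @f()";
+  // the branch created here continues to %ok, and %lpad loses this block
+  // as a predecessor below.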
+ BranchInst::Create(II->getNormalDest(), II); + + // Update PHI nodes in the unwind destination + II->getUnwindDest()->removePredecessor(II->getParent()); + II->eraseFromParent(); +} + +static bool MarkAliveBlocks(BasicBlock *BB, + SmallPtrSet<BasicBlock*, 128> &Reachable) { + + SmallVector<BasicBlock*, 128> Worklist; + Worklist.push_back(BB); + bool Changed = false; + do { + BB = Worklist.pop_back_val(); + + if (!Reachable.insert(BB)) + continue; + + // Do a quick scan of the basic block, turning any obviously unreachable + // instructions into LLVM unreachable insts. The instruction combining pass + // canonicalizes unreachable insts into stores to null or undef. + for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){ + if (CallInst *CI = dyn_cast<CallInst>(BBI)) { + if (CI->doesNotReturn()) { + // If we found a call to a no-return function, insert an unreachable + // instruction after it. Make sure there isn't *already* one there + // though. + ++BBI; + if (!isa<UnreachableInst>(BBI)) { + // Don't insert a call to llvm.trap right before the unreachable. + ChangeToUnreachable(BBI, false); + Changed = true; + } + break; + } + } + + // Store to undef and store to null are undefined and used to signal that + // they should be changed to unreachable by passes that can't modify the + // CFG. + if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) { + // Don't touch volatile stores. + if (SI->isVolatile()) continue; + + Value *Ptr = SI->getOperand(1); + + if (isa<UndefValue>(Ptr) || + (isa<ConstantPointerNull>(Ptr) && + SI->getPointerAddressSpace() == 0)) { + ChangeToUnreachable(SI, true); + Changed = true; + break; + } + } + } + + // Turn invokes that call 'nounwind' functions into ordinary calls. + if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { + Value *Callee = II->getCalledValue(); + if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { + ChangeToUnreachable(II, true); + Changed = true; + } else if (II->doesNotThrow()) { + if (II->use_empty() && II->onlyReadsMemory()) { + // jump to the normal destination branch. + BranchInst::Create(II->getNormalDest(), II); + II->getUnwindDest()->removePredecessor(II->getParent()); + II->eraseFromParent(); + } else + ChangeToCall(II); + Changed = true; + } + } + + Changed |= ConstantFoldTerminator(BB, true); + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) + Worklist.push_back(*SI); + } while (!Worklist.empty()); + return Changed; +} + +/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even +/// if they are in a dead cycle. Return true if a change was made, false +/// otherwise. +static bool RemoveUnreachableBlocksFromFn(Function &F) { + SmallPtrSet<BasicBlock*, 128> Reachable; + bool Changed = MarkAliveBlocks(F.begin(), Reachable); + + // If there are unreachable blocks in the CFG... + if (Reachable.size() == F.size()) + return Changed; + + assert(Reachable.size() < F.size()); + NumSimpl += F.size()-Reachable.size(); + + // Loop over all of the basic blocks that are not reachable, dropping all of + // their internal references... 
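+  // (The references must be dropped before the blocks are erased because
+  // unreachable blocks in a dead cycle can still refer to one another.)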
+  for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
+    if (Reachable.count(BB))
+      continue;
+
+    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+      if (Reachable.count(*SI))
+        (*SI)->removePredecessor(BB);
+    BB->dropAllReferences();
+  }
+
+  for (Function::iterator I = ++F.begin(); I != F.end();)
+    if (!Reachable.count(I))
+      I = F.getBasicBlockList().erase(I);
+    else
+      ++I;
+
+  return true;
+}
+
+/// MergeEmptyReturnBlocks - If we have more than one return block that is
+/// empty (apart from a phi node), merge them together to promote recursive
+/// block merging.
+static bool MergeEmptyReturnBlocks(Function &F) {
+  bool Changed = false;
+
+  BasicBlock *RetBlock = 0;
+
+  // Scan all the blocks in the function, looking for empty return blocks.
+  for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) {
+    BasicBlock &BB = *BBI++;
+
+    // Only look at return blocks.
+    ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
+    if (Ret == 0) continue;
+
+    // Only look at the block if it is empty or the only other thing in it is a
+    // single PHI node that is the operand to the return.
+    if (Ret != &BB.front()) {
+      // Check for something else in the block.
+      BasicBlock::iterator I = Ret;
+      --I;
+      // Skip over debug info.
+      while (isa<DbgInfoIntrinsic>(I) && I != BB.begin())
+        --I;
+      if (!isa<DbgInfoIntrinsic>(I) &&
+          (!isa<PHINode>(I) || I != BB.begin() ||
+           Ret->getNumOperands() == 0 ||
+           Ret->getOperand(0) != I))
+        continue;
+    }
+
+    // If this is the first returning block, remember it and keep going.
+    if (RetBlock == 0) {
+      RetBlock = &BB;
+      continue;
+    }
+
+    // Otherwise, we found a duplicate return block.  Merge the two.
+    Changed = true;
+
+    // The case when there is no input to the return, or when the returned
+    // values agree, is trivial.  Note that they can't agree if there are phis
+    // in the blocks.
+    if (Ret->getNumOperands() == 0 ||
+        Ret->getOperand(0) ==
+          cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) {
+      BB.replaceAllUsesWith(RetBlock);
+      BB.eraseFromParent();
+      continue;
+    }
+
+    // If the canonical return block has no PHI node, create one now.
+    PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin());
+    if (RetBlockPHI == 0) {
+      Value *InVal = cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0);
+      pred_iterator PB = pred_begin(RetBlock), PE = pred_end(RetBlock);
+      RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(),
+                                    std::distance(PB, PE), "merge",
+                                    &RetBlock->front());
+
+      for (pred_iterator PI = PB; PI != PE; ++PI)
+        RetBlockPHI->addIncoming(InVal, *PI);
+      RetBlock->getTerminator()->setOperand(0, RetBlockPHI);
+    }
+
+    // Turn BB into a block that just unconditionally branches to the return
+    // block.  This handles the case when the two return blocks have a common
+    // predecessor but return different things.
+    RetBlockPHI->addIncoming(Ret->getOperand(0), &BB);
+    BB.getTerminator()->eraseFromParent();
+    BranchInst::Create(RetBlock, &BB);
+  }
+
+  return Changed;
+}
+
+/// IterativeSimplifyCFG - Call SimplifyCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) {
+  bool Changed = false;
+  bool LocalChange = true;
+  while (LocalChange) {
+    LocalChange = false;
+
+    // Loop over all of the basic blocks and remove them if they are unneeded...
+ // + for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { + if (SimplifyCFG(BBIt++, TD)) { + LocalChange = true; + ++NumSimpl; + } + } + Changed |= LocalChange; + } + return Changed; +} + +// It is possible that we may require multiple passes over the code to fully +// simplify the CFG. +// +bool CFGSimplifyPass::runOnFunction(Function &F) { + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + bool EverChanged = RemoveUnreachableBlocksFromFn(F); + EverChanged |= MergeEmptyReturnBlocks(F); + EverChanged |= IterativeSimplifyCFG(F, TD); + + // If neither pass changed anything, we're done. + if (!EverChanged) return false; + + // IterativeSimplifyCFG can (rarely) make some loops dead. If this happens, + // RemoveUnreachableBlocksFromFn is needed to nuke them, which means we should + // iterate between the two optimizations. We structure the code like this to + // avoid reruning IterativeSimplifyCFG if the second pass of + // RemoveUnreachableBlocksFromFn doesn't do anything. + if (!RemoveUnreachableBlocksFromFn(F)) + return true; + + do { + EverChanged = IterativeSimplifyCFG(F, TD); + EverChanged |= RemoveUnreachableBlocksFromFn(F); + } while (EverChanged); + + return true; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp new file mode 100644 index 0000000..f110320 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -0,0 +1,2476 @@ +//===- SimplifyLibCalls.cpp - Optimize specific well-known library calls --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a simple pass that applies a variety of small +// optimizations for calls to specific well-known function calls (e.g. runtime +// library functions). Any optimization that takes the very simple form +// "replace call to library function with simpler code that provides the same +// result" belongs in this file. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "simplify-libcalls" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/IRBuilder.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Config/config.h" // FIXME: Shouldn't depend on host! +using namespace llvm; + +STATISTIC(NumSimplified, "Number of library calls simplified"); +STATISTIC(NumAnnotated, "Number of attributes added to library functions"); + +//===----------------------------------------------------------------------===// +// Optimizer Base Class +//===----------------------------------------------------------------------===// + +/// This class is the abstract base class for the set of optimizations that +/// corresponds to one library call. 
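+/// For example, StrCatOpt below folds strcat(x, "") to x and StrCmpOpt folds
+/// strcmp(x, x) to 0; each subclass implements CallOptimizer for exactly one
+/// library function.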
+namespace { +class LibCallOptimization { +protected: + Function *Caller; + const TargetData *TD; + const TargetLibraryInfo *TLI; + LLVMContext* Context; +public: + LibCallOptimization() { } + virtual ~LibCallOptimization() {} + + /// CallOptimizer - This pure virtual method is implemented by base classes to + /// do various optimizations. If this returns null then no transformation was + /// performed. If it returns CI, then it transformed the call and CI is to be + /// deleted. If it returns something else, replace CI with the new value and + /// delete CI. + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) + =0; + + Value *OptimizeCall(CallInst *CI, const TargetData *TD, + const TargetLibraryInfo *TLI, IRBuilder<> &B) { + Caller = CI->getParent()->getParent(); + this->TD = TD; + this->TLI = TLI; + if (CI->getCalledFunction()) + Context = &CI->getCalledFunction()->getContext(); + + // We never change the calling convention. + if (CI->getCallingConv() != llvm::CallingConv::C) + return NULL; + + return CallOptimizer(CI->getCalledFunction(), CI, B); + } +}; +} // End anonymous namespace. + + +//===----------------------------------------------------------------------===// +// Helper Functions +//===----------------------------------------------------------------------===// + +/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the +/// value is equal or not-equal to zero. +static bool IsOnlyUsedInZeroEqualityComparison(Value *V) { + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); + UI != E; ++UI) { + if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) + if (IC->isEquality()) + if (Constant *C = dyn_cast<Constant>(IC->getOperand(1))) + if (C->isNullValue()) + continue; + // Unknown instruction. + return false; + } + return true; +} + +static bool CallHasFloatingPointArgument(const CallInst *CI) { + for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end(); + it != e; ++it) { + if ((*it)->getType()->isFloatingPointTy()) + return true; + } + return false; +} + +/// IsOnlyUsedInEqualityComparison - Return true if it is only used in equality +/// comparisons with With. +static bool IsOnlyUsedInEqualityComparison(Value *V, Value *With) { + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); + UI != E; ++UI) { + if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) + if (IC->isEquality() && IC->getOperand(1) == With) + continue; + // Unknown instruction. + return false; + } + return true; +} + +//===----------------------------------------------------------------------===// +// String and Memory LibCall Optimizations +//===----------------------------------------------------------------------===// + +//===---------------------------------------===// +// 'strcat' Optimizations +namespace { +struct StrCatOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strcat" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getReturnType() != B.getInt8PtrTy() || + FT->getParamType(0) != FT->getReturnType() || + FT->getParamType(1) != FT->getReturnType()) + return 0; + + // Extract some information from the instruction + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + + // See if we can get the length of the input string. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + --Len; // Unbias length. 
+ + // Handle the simple, do-nothing case: strcat(x, "") -> x + if (Len == 0) + return Dst; + + // These optimizations require TargetData. + if (!TD) return 0; + + return EmitStrLenMemCpy(Src, Dst, Len, B); + } + + Value *EmitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B) { + // We need to find the end of the destination string. That's where the + // memory is to be moved to. We just generate a call to strlen. + Value *DstLen = EmitStrLen(Dst, B, TD, TLI); + if (!DstLen) + return 0; + + // Now that we have the destination's length, we must index into the + // destination's pointer to get the actual memcpy destination (end of + // the string .. we're concatenating). + Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr"); + + // We have enough information to now generate the memcpy call to do the + // concatenation for us. Make a memcpy to copy the nul byte with align = 1. + B.CreateMemCpy(CpyDst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1); + return Dst; + } +}; + +//===---------------------------------------===// +// 'strncat' Optimizations + +struct StrNCatOpt : public StrCatOpt { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strncat" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || + FT->getReturnType() != B.getInt8PtrTy() || + FT->getParamType(0) != FT->getReturnType() || + FT->getParamType(1) != FT->getReturnType() || + !FT->getParamType(2)->isIntegerTy()) + return 0; + + // Extract some information from the instruction + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + uint64_t Len; + + // We don't do anything if length is not constant + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) + Len = LengthArg->getZExtValue(); + else + return 0; + + // See if we can get the length of the input string. + uint64_t SrcLen = GetStringLength(Src); + if (SrcLen == 0) return 0; + --SrcLen; // Unbias length. + + // Handle the simple, do-nothing cases: + // strncat(x, "", c) -> x + // strncat(x, c, 0) -> x + if (SrcLen == 0 || Len == 0) return Dst; + + // These optimizations require TargetData. + if (!TD) return 0; + + // We don't optimize this case + if (Len < SrcLen) return 0; + + // strncat(x, s, c) -> strcat(x, s) + // s is constant so the strcat can be optimized further + return EmitStrLenMemCpy(Src, Dst, SrcLen, B); + } +}; + +//===---------------------------------------===// +// 'strchr' Optimizations + +struct StrChrOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strchr" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getReturnType() != B.getInt8PtrTy() || + FT->getParamType(0) != FT->getReturnType() || + !FT->getParamType(1)->isIntegerTy(32)) + return 0; + + Value *SrcStr = CI->getArgOperand(0); + + // If the second operand is non-constant, see if we can compute the length + // of the input string and turn this into memchr. + ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + if (CharC == 0) { + // These optimizations require TargetData. + if (!TD) return 0; + + uint64_t Len = GetStringLength(SrcStr); + if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32. + return 0; + + return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. 
+ ConstantInt::get(TD->getIntPtrType(*Context), Len), + B, TD, TLI); + } + + // Otherwise, the character is a constant, see if the first argument is + // a string literal. If so, we can constant fold. + StringRef Str; + if (!getConstantStringInfo(SrcStr, Str)) + return 0; + + // Compute the offset, make sure to handle the case when we're searching for + // zero (a weird way to spell strlen). + size_t I = CharC->getSExtValue() == 0 ? + Str.size() : Str.find(CharC->getSExtValue()); + if (I == StringRef::npos) // Didn't find the char. strchr returns null. + return Constant::getNullValue(CI->getType()); + + // strchr(s+n,c) -> gep(s+n+i,c) + return B.CreateGEP(SrcStr, B.getInt64(I), "strchr"); + } +}; + +//===---------------------------------------===// +// 'strrchr' Optimizations + +struct StrRChrOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strrchr" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getReturnType() != B.getInt8PtrTy() || + FT->getParamType(0) != FT->getReturnType() || + !FT->getParamType(1)->isIntegerTy(32)) + return 0; + + Value *SrcStr = CI->getArgOperand(0); + ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + + // Cannot fold anything if we're not looking for a constant. + if (!CharC) + return 0; + + StringRef Str; + if (!getConstantStringInfo(SrcStr, Str)) { + // strrchr(s, 0) -> strchr(s, 0) + if (TD && CharC->isZero()) + return EmitStrChr(SrcStr, '\0', B, TD, TLI); + return 0; + } + + // Compute the offset. + size_t I = CharC->getSExtValue() == 0 ? + Str.size() : Str.rfind(CharC->getSExtValue()); + if (I == StringRef::npos) // Didn't find the char. Return null. + return Constant::getNullValue(CI->getType()); + + // strrchr(s+n,c) -> gep(s+n+i,c) + return B.CreateGEP(SrcStr, B.getInt64(I), "strrchr"); + } +}; + +//===---------------------------------------===// +// 'strcmp' Optimizations + +struct StrCmpOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strcmp" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + !FT->getReturnType()->isIntegerTy(32) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy()) + return 0; + + Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); + if (Str1P == Str2P) // strcmp(x,x) -> 0 + return ConstantInt::get(CI->getType(), 0); + + StringRef Str1, Str2; + bool HasStr1 = getConstantStringInfo(Str1P, Str1); + bool HasStr2 = getConstantStringInfo(Str2P, Str2); + + // strcmp(x, y) -> cnst (if both x and y are constant strings) + if (HasStr1 && HasStr2) + return ConstantInt::get(CI->getType(), Str1.compare(Str2)); + + if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x + return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), + CI->getType())); + + if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x + return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); + + // strcmp(P, "x") -> memcmp(P, "x", 2) + uint64_t Len1 = GetStringLength(Str1P); + uint64_t Len2 = GetStringLength(Str2P); + if (Len1 && Len2) { + // These optimizations require TargetData. 
+ if (!TD) return 0; + + return EmitMemCmp(Str1P, Str2P, + ConstantInt::get(TD->getIntPtrType(*Context), + std::min(Len1, Len2)), B, TD, TLI); + } + + return 0; + } +}; + +//===---------------------------------------===// +// 'strncmp' Optimizations + +struct StrNCmpOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strncmp" function prototype. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || + !FT->getReturnType()->isIntegerTy(32) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy() || + !FT->getParamType(2)->isIntegerTy()) + return 0; + + Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); + if (Str1P == Str2P) // strncmp(x,x,n) -> 0 + return ConstantInt::get(CI->getType(), 0); + + // Get the length argument if it is constant. + uint64_t Length; + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) + Length = LengthArg->getZExtValue(); + else + return 0; + + if (Length == 0) // strncmp(x,y,0) -> 0 + return ConstantInt::get(CI->getType(), 0); + + if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) + return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD, TLI); + + StringRef Str1, Str2; + bool HasStr1 = getConstantStringInfo(Str1P, Str1); + bool HasStr2 = getConstantStringInfo(Str2P, Str2); + + // strncmp(x, y) -> cnst (if both x and y are constant strings) + if (HasStr1 && HasStr2) { + StringRef SubStr1 = Str1.substr(0, Length); + StringRef SubStr2 = Str2.substr(0, Length); + return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2)); + } + + if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x + return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), + CI->getType())); + + if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x + return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); + + return 0; + } +}; + + +//===---------------------------------------===// +// 'strcpy' Optimizations + +struct StrCpyOpt : public LibCallOptimization { + bool OptChkCall; // True if it's optimizing a __strcpy_chk libcall. + + StrCpyOpt(bool c) : OptChkCall(c) {} + + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strcpy" function prototype. + unsigned NumParams = OptChkCall ? 3 : 2; + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != NumParams || + FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy()) + return 0; + + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); + if (Dst == Src) // strcpy(x,x) -> x + return Src; + + // These optimizations require TargetData. + if (!TD) return 0; + + // See if we can get the length of the input string. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + + // We have enough information to now generate the memcpy call to do the + // concatenation for us. Make a memcpy to copy the nul byte with align = 1. + if (!OptChkCall || + !EmitMemCpyChk(Dst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len), + CI->getArgOperand(2), B, TD, TLI)) + B.CreateMemCpy(Dst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); + return Dst; + } +}; + +//===---------------------------------------===// +// 'stpcpy' Optimizations + +struct StpCpyOpt: public LibCallOptimization { + bool OptChkCall; // True if it's optimizing a __stpcpy_chk libcall. 
+ + StpCpyOpt(bool c) : OptChkCall(c) {} + + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "stpcpy" function prototype. + unsigned NumParams = OptChkCall ? 3 : 2; + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != NumParams || + FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy()) + return 0; + + // These optimizations require TargetData. + if (!TD) return 0; + + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); + if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) + Value *StrLen = EmitStrLen(Src, B, TD, TLI); + return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0; + } + + // See if we can get the length of the input string. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + + Value *LenV = ConstantInt::get(TD->getIntPtrType(*Context), Len); + Value *DstEnd = B.CreateGEP(Dst, + ConstantInt::get(TD->getIntPtrType(*Context), + Len - 1)); + + // We have enough information to now generate the memcpy call to do the + // copy for us. Make a memcpy to copy the nul byte with align = 1. + if (!OptChkCall || !EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, + TD, TLI)) + B.CreateMemCpy(Dst, Src, LenV, 1); + return DstEnd; + } +}; + +//===---------------------------------------===// +// 'strncpy' Optimizations + +struct StrNCpyOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy() || + !FT->getParamType(2)->isIntegerTy()) + return 0; + + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + Value *LenOp = CI->getArgOperand(2); + + // See if we can get the length of the input string. + uint64_t SrcLen = GetStringLength(Src); + if (SrcLen == 0) return 0; + --SrcLen; + + if (SrcLen == 0) { + // strncpy(x, "", y) -> memset(x, '\0', y, 1) + B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1); + return Dst; + } + + uint64_t Len; + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp)) + Len = LengthArg->getZExtValue(); + else + return 0; + + if (Len == 0) return Dst; // strncpy(x, y, 0) -> x + + // These optimizations require TargetData. 
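+ // Note: when Len <= SrcLen+1 the source string supplies every byte strncpy
+ // would store (no zero padding past the terminator is needed), which is why
+ // a plain memcpy of Len bytes is equivalent below.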
+ if (!TD) return 0; + + // Let strncpy handle the zero padding + if (Len > SrcLen+1) return 0; + + // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] + B.CreateMemCpy(Dst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); + + return Dst; + } +}; + +//===---------------------------------------===// +// 'strlen' Optimizations + +struct StrLenOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 1 || + FT->getParamType(0) != B.getInt8PtrTy() || + !FT->getReturnType()->isIntegerTy()) + return 0; + + Value *Src = CI->getArgOperand(0); + + // Constant folding: strlen("xyz") -> 3 + if (uint64_t Len = GetStringLength(Src)) + return ConstantInt::get(CI->getType(), Len-1); + + // strlen(x) != 0 --> *x != 0 + // strlen(x) == 0 --> *x == 0 + if (IsOnlyUsedInZeroEqualityComparison(CI)) + return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType()); + return 0; + } +}; + + +//===---------------------------------------===// +// 'strpbrk' Optimizations + +struct StrPBrkOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getParamType(0) != B.getInt8PtrTy() || + FT->getParamType(1) != FT->getParamType(0) || + FT->getReturnType() != FT->getParamType(0)) + return 0; + + StringRef S1, S2; + bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); + + // strpbrk(s, "") -> NULL + // strpbrk("", s) -> NULL + if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) + return Constant::getNullValue(CI->getType()); + + // Constant folding. + if (HasS1 && HasS2) { + size_t I = S1.find_first_of(S2); + if (I == std::string::npos) // No match. + return Constant::getNullValue(CI->getType()); + + return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk"); + } + + // strpbrk(s, "a") -> strchr(s, 'a') + if (TD && HasS2 && S2.size() == 1) + return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD, TLI); + + return 0; + } +}; + +//===---------------------------------------===// +// 'strto*' Optimizations. This handles strtol, strtod, strtof, strtoul, etc. + +struct StrToOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy()) + return 0; + + Value *EndPtr = CI->getArgOperand(1); + if (isa<ConstantPointerNull>(EndPtr)) { + // With a null EndPtr, this function won't capture the main argument. + // It would be readonly too, except that it still may write to errno. 
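+ // Call-site argument attribute indices are 1-based, so index 1 marks the
+ // first (string) argument as nocapture.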
+ CI->addAttribute(1, Attribute::NoCapture); + } + + return 0; + } +}; + +//===---------------------------------------===// +// 'strspn' Optimizations + +struct StrSpnOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getParamType(0) != B.getInt8PtrTy() || + FT->getParamType(1) != FT->getParamType(0) || + !FT->getReturnType()->isIntegerTy()) + return 0; + + StringRef S1, S2; + bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); + + // strspn(s, "") -> 0 + // strspn("", s) -> 0 + if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) + return Constant::getNullValue(CI->getType()); + + // Constant folding. + if (HasS1 && HasS2) { + size_t Pos = S1.find_first_not_of(S2); + if (Pos == StringRef::npos) Pos = S1.size(); + return ConstantInt::get(CI->getType(), Pos); + } + + return 0; + } +}; + +//===---------------------------------------===// +// 'strcspn' Optimizations + +struct StrCSpnOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getParamType(0) != B.getInt8PtrTy() || + FT->getParamType(1) != FT->getParamType(0) || + !FT->getReturnType()->isIntegerTy()) + return 0; + + StringRef S1, S2; + bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); + + // strcspn("", s) -> 0 + if (HasS1 && S1.empty()) + return Constant::getNullValue(CI->getType()); + + // Constant folding. + if (HasS1 && HasS2) { + size_t Pos = S1.find_first_of(S2); + if (Pos == StringRef::npos) Pos = S1.size(); + return ConstantInt::get(CI->getType(), Pos); + } + + // strcspn(s, "") -> strlen(s) + if (TD && HasS2 && S2.empty()) + return EmitStrLen(CI->getArgOperand(0), B, TD, TLI); + + return 0; + } +}; + +//===---------------------------------------===// +// 'strstr' Optimizations + +struct StrStrOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + !FT->getReturnType()->isPointerTy()) + return 0; + + // fold strstr(x, x) -> x. + if (CI->getArgOperand(0) == CI->getArgOperand(1)) + return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); + + // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0 + if (TD && IsOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { + Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI); + if (!StrLen) + return 0; + Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1), + StrLen, B, TD, TLI); + if (!StrNCmp) + return 0; + for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end(); + UI != UE; ) { + ICmpInst *Old = cast<ICmpInst>(*UI++); + Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp, + ConstantInt::getNullValue(StrNCmp->getType()), + "cmp"); + Old->replaceAllUsesWith(Cmp); + Old->eraseFromParent(); + } + return CI; + } + + // See if either input string is a constant string. + StringRef SearchStr, ToFindStr; + bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr); + bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr); + + // fold strstr(x, "") -> x. 
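+ // (the bitcast below covers the case where the argument's pointer type
+ // differs from strstr's declared return type)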
+ if (HasStr2 && ToFindStr.empty()) + return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); + + // If both strings are known, constant fold it. + if (HasStr1 && HasStr2) { + std::string::size_type Offset = SearchStr.find(ToFindStr); + + if (Offset == StringRef::npos) // strstr("foo", "bar") -> null + return Constant::getNullValue(CI->getType()); + + // strstr("abcd", "bc") -> gep((char*)"abcd", 1) + Value *Result = CastToCStr(CI->getArgOperand(0), B); + Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr"); + return B.CreateBitCast(Result, CI->getType()); + } + + // fold strstr(x, "y") -> strchr(x, 'y'). + if (HasStr2 && ToFindStr.size() == 1) { + Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI); + return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0; + } + return 0; + } +}; + + +//===---------------------------------------===// +// 'memcmp' Optimizations + +struct MemCmpOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + !FT->getReturnType()->isIntegerTy(32)) + return 0; + + Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); + + if (LHS == RHS) // memcmp(s,s,x) -> 0 + return Constant::getNullValue(CI->getType()); + + // Make sure we have a constant length. + ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + if (!LenC) return 0; + uint64_t Len = LenC->getZExtValue(); + + if (Len == 0) // memcmp(s1,s2,0) -> 0 + return Constant::getNullValue(CI->getType()); + + // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS + if (Len == 1) { + Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"), + CI->getType(), "lhsv"); + Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"), + CI->getType(), "rhsv"); + return B.CreateSub(LHSV, RHSV, "chardiff"); + } + + // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant) + StringRef LHSStr, RHSStr; + if (getConstantStringInfo(LHS, LHSStr) && + getConstantStringInfo(RHS, RHSStr)) { + // Make sure we're not reading out-of-bounds memory. + if (Len > LHSStr.size() || Len > RHSStr.size()) + return 0; + uint64_t Ret = memcmp(LHSStr.data(), RHSStr.data(), Len); + return ConstantInt::get(CI->getType(), Ret); + } + + return 0; + } +}; + +//===---------------------------------------===// +// 'memcpy' Optimizations + +struct MemCpyOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // These optimizations require TargetData. + if (!TD) return 0; + + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + FT->getParamType(2) != TD->getIntPtrType(*Context)) + return 0; + + // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } +}; + +//===---------------------------------------===// +// 'memmove' Optimizations + +struct MemMoveOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // These optimizations require TargetData. 
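+ // (TargetData provides the pointer-sized integer type that the length
+ // parameter is checked against below)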
+ if (!TD) return 0; + + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + FT->getParamType(2) != TD->getIntPtrType(*Context)) + return 0; + + // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) + B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } +}; + +//===---------------------------------------===// +// 'memset' Optimizations + +struct MemSetOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // These optimizations require TargetData. + if (!TD) return 0; + + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isIntegerTy() || + FT->getParamType(2) != TD->getIntPtrType(*Context)) + return 0; + + // memset(p, v, n) -> llvm.memset(p, v, n, 1) + Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); + B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); + return CI->getArgOperand(0); + } +}; + +//===----------------------------------------------------------------------===// +// Math Library Optimizations +//===----------------------------------------------------------------------===// + +//===---------------------------------------===// +// 'cos*' Optimizations + +struct CosOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + // Just make sure this has 1 argument of FP type, which matches the + // result type. + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isFloatingPointTy()) + return 0; + + // cos(-x) -> cos(x) + Value *Op1 = CI->getArgOperand(0); + if (BinaryOperator::isFNeg(Op1)) { + BinaryOperator *BinExpr = cast<BinaryOperator>(Op1); + return B.CreateCall(Callee, BinExpr->getOperand(1), "cos"); + } + return 0; + } +}; + +//===---------------------------------------===// +// 'pow*' Optimizations + +struct PowOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + // Just make sure this has 2 arguments of the same FP type, which match the + // result type. + if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + !FT->getParamType(0)->isFloatingPointTy()) + return 0; + + Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1); + if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) { + if (Op1C->isExactlyValue(1.0)) // pow(1.0, x) -> 1.0 + return Op1C; + if (Op1C->isExactlyValue(2.0)) // pow(2.0, x) -> exp2(x) + return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes()); + } + + ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2); + if (Op2C == 0) return 0; + + if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0 + return ConstantFP::get(CI->getType(), 1.0); + + if (Op2C->isExactlyValue(0.5)) { + // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))). + // This is faster than calling pow, and still handles negative zero + // and negative infinity correctly. + // TODO: In fast-math mode, this could be just sqrt(x). 
+ // TODO: In finite-only mode, this could be just fabs(sqrt(x)). + Value *Inf = ConstantFP::getInfinity(CI->getType()); + Value *NegInf = ConstantFP::getInfinity(CI->getType(), true); + Value *Sqrt = EmitUnaryFloatFnCall(Op1, "sqrt", B, + Callee->getAttributes()); + Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B, + Callee->getAttributes()); + Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf); + Value *Sel = B.CreateSelect(FCmp, Inf, FAbs); + return Sel; + } + + if (Op2C->isExactlyValue(1.0)) // pow(x, 1.0) -> x + return Op1; + if (Op2C->isExactlyValue(2.0)) // pow(x, 2.0) -> x*x + return B.CreateFMul(Op1, Op1, "pow2"); + if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x + return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), + Op1, "powrecip"); + return 0; + } +}; + +//===---------------------------------------===// +// 'exp2' Optimizations + +struct Exp2Opt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + // Just make sure this has 1 argument of FP type, which matches the + // result type. + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isFloatingPointTy()) + return 0; + + Value *Op = CI->getArgOperand(0); + // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32 + // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32 + Value *LdExpArg = 0; + if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { + if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) + LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); + } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) { + if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) + LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty()); + } + + if (LdExpArg) { + const char *Name; + if (Op->getType()->isFloatTy()) + Name = "ldexpf"; + else if (Op->getType()->isDoubleTy()) + Name = "ldexp"; + else + Name = "ldexpl"; + + Constant *One = ConstantFP::get(*Context, APFloat(1.0f)); + if (!Op->getType()->isFloatTy()) + One = ConstantExpr::getFPExtend(One, Op->getType()); + + Module *M = Caller->getParent(); + Value *Callee = M->getOrInsertFunction(Name, Op->getType(), + Op->getType(), + B.getInt32Ty(), NULL); + CallInst *CI = B.CreateCall2(Callee, One, LdExpArg); + if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + + return CI; + } + return 0; + } +}; + +//===---------------------------------------===// +// Double -> Float Shrinking Optimizations for Unary Functions like 'floor' + +struct UnaryDoubleFPOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || + !FT->getParamType(0)->isDoubleTy()) + return 0; + + // If this is something like 'floor((double)floatval)', convert to floorf. 
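+ // Roughly, at the IR level:
+ //   %d = fpext float %f to double
+ //   %c = call double @floor(double %d)
+ // becomes
+ //   %cf = call float @floorf(float %f)
+ //   %c = fpext float %cf to double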
+ FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
+ if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy())
+ return 0;
+
+ // floor((double)floatval) -> (double)floorf(floatval)
+ Value *V = Cast->getOperand(0);
+ V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
+ return B.CreateFPExt(V, B.getDoubleTy());
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Integer Optimizations
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------===//
+// 'ffs*' Optimizations
+
+struct FFSOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ // Just make sure this has one integer argument and an i32 result.
+ if (FT->getNumParams() != 1 ||
+ !FT->getReturnType()->isIntegerTy(32) ||
+ !FT->getParamType(0)->isIntegerTy())
+ return 0;
+
+ Value *Op = CI->getArgOperand(0);
+
+ // Constant fold.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ if (CI->getValue() == 0) // ffs(0) -> 0.
+ return Constant::getNullValue(CI->getType());
+ // ffs(c) -> cttz(c)+1
+ return B.getInt32(CI->getValue().countTrailingZeros() + 1);
+ }
+
+ // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
+ Type *ArgType = Op->getType();
+ Value *F = Intrinsic::getDeclaration(Callee->getParent(),
+ Intrinsic::cttz, ArgType);
+ Value *V = B.CreateCall2(F, Op, B.getFalse(), "cttz");
+ V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));
+ V = B.CreateIntCast(V, B.getInt32Ty(), false);
+
+ Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType));
+ return B.CreateSelect(Cond, V, B.getInt32(0));
+ }
+};
+
+//===---------------------------------------===//
+// 'isdigit' Optimizations
+
+struct IsDigitOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ // We require integer(i32)
+ if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+ !FT->getParamType(0)->isIntegerTy(32))
+ return 0;
+
+ // isdigit(c) -> (c-'0') <u 10
+ Value *Op = CI->getArgOperand(0);
+ Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp");
+ Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit");
+ return B.CreateZExt(Op, CI->getType());
+ }
+};
+
+//===---------------------------------------===//
+// 'isascii' Optimizations
+
+struct IsAsciiOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ // We require integer(i32)
+ if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+ !FT->getParamType(0)->isIntegerTy(32))
+ return 0;
+
+ // isascii(c) -> c <u 128
+ Value *Op = CI->getArgOperand(0);
+ Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii");
+ return B.CreateZExt(Op, CI->getType());
+ }
+};
+
+//===---------------------------------------===//
+// 'abs', 'labs', 'llabs' Optimizations
+
+struct AbsOpt : public LibCallOptimization {
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ // We require integer(integer) where the types agree.
+ if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+ FT->getParamType(0) != FT->getReturnType())
+ return 0;
+
+ // abs(x) -> x >s -1 ?
x : -x + Value *Op = CI->getArgOperand(0); + Value *Pos = B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()), + "ispos"); + Value *Neg = B.CreateNeg(Op, "neg"); + return B.CreateSelect(Pos, Op, Neg); + } +}; + + +//===---------------------------------------===// +// 'toascii' Optimizations + +struct ToAsciiOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + FunctionType *FT = Callee->getFunctionType(); + // We require i32(i32) + if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || + !FT->getParamType(0)->isIntegerTy(32)) + return 0; + + // isascii(c) -> c & 0x7f + return B.CreateAnd(CI->getArgOperand(0), + ConstantInt::get(CI->getType(),0x7F)); + } +}; + +//===----------------------------------------------------------------------===// +// Formatting and IO Optimizations +//===----------------------------------------------------------------------===// + +//===---------------------------------------===// +// 'printf' Optimizations + +struct PrintFOpt : public LibCallOptimization { + Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI, + IRBuilder<> &B) { + // Check for a fixed format string. + StringRef FormatStr; + if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr)) + return 0; + + // Empty format string -> noop. + if (FormatStr.empty()) // Tolerate printf's declared void. + return CI->use_empty() ? (Value*)CI : + ConstantInt::get(CI->getType(), 0); + + // Do not do any of the following transformations if the printf return value + // is used, in general the printf return value is not compatible with either + // putchar() or puts(). + if (!CI->use_empty()) + return 0; + + // printf("x") -> putchar('x'), even for '%'. + if (FormatStr.size() == 1) { + Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD, TLI); + if (CI->use_empty() || !Res) return Res; + return B.CreateIntCast(Res, CI->getType(), true); + } + + // printf("foo\n") --> puts("foo") + if (FormatStr[FormatStr.size()-1] == '\n' && + FormatStr.find('%') == std::string::npos) { // no format characters. + // Create a string literal with no \n on it. We expect the constant merge + // pass to be run after this pass, to merge duplicate strings. + FormatStr = FormatStr.drop_back(); + Value *GV = B.CreateGlobalString(FormatStr, "str"); + Value *NewCI = EmitPutS(GV, B, TD, TLI); + return (CI->use_empty() || !NewCI) ? + NewCI : + ConstantInt::get(CI->getType(), FormatStr.size()+1); + } + + // Optimize specific format strings. + // printf("%c", chr) --> putchar(chr) + if (FormatStr == "%c" && CI->getNumArgOperands() > 1 && + CI->getArgOperand(1)->getType()->isIntegerTy()) { + Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD, TLI); + + if (CI->use_empty() || !Res) return Res; + return B.CreateIntCast(Res, CI->getType(), true); + } + + // printf("%s\n", str) --> puts(str) + if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 && + CI->getArgOperand(1)->getType()->isPointerTy()) { + return EmitPutS(CI->getArgOperand(1), B, TD, TLI); + } + return 0; + } + + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Require one fixed pointer argument and an integer/void result. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || + !(FT->getReturnType()->isIntegerTy() || + FT->getReturnType()->isVoidTy())) + return 0; + + if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) { + return V; + } + + // printf(format, ...) 
-> iprintf(format, ...) if no floating point + // arguments. + if (TLI->has(LibFunc::iprintf) && !CallHasFloatingPointArgument(CI)) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + Constant *IPrintFFn = + M->getOrInsertFunction("iprintf", FT, Callee->getAttributes()); + CallInst *New = cast<CallInst>(CI->clone()); + New->setCalledFunction(IPrintFFn); + B.Insert(New); + return New; + } + return 0; + } +}; + +//===---------------------------------------===// +// 'sprintf' Optimizations + +struct SPrintFOpt : public LibCallOptimization { + Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI, + IRBuilder<> &B) { + // Check for a fixed format string. + StringRef FormatStr; + if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) + return 0; + + // If we just have a format string (nothing else crazy) transform it. + if (CI->getNumArgOperands() == 2) { + // Make sure there's no % in the constant array. We could try to handle + // %% -> % in the future if we cared. + for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) + if (FormatStr[i] == '%') + return 0; // we found a format specifier, bail out. + + // These optimizations require TargetData. + if (!TD) return 0; + + // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + ConstantInt::get(TD->getIntPtrType(*Context), // Copy the + FormatStr.size() + 1), 1); // nul byte. + return ConstantInt::get(CI->getType(), FormatStr.size()); + } + + // The remaining optimizations require the format string to be "%s" or "%c" + // and have an extra operand. + if (FormatStr.size() != 2 || FormatStr[0] != '%' || + CI->getNumArgOperands() < 3) + return 0; + + // Decode the second character of the format string. + if (FormatStr[1] == 'c') { + // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 + if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; + Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char"); + Value *Ptr = CastToCStr(CI->getArgOperand(0), B); + B.CreateStore(V, Ptr); + Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul"); + B.CreateStore(B.getInt8(0), Ptr); + + return ConstantInt::get(CI->getType(), 1); + } + + if (FormatStr[1] == 's') { + // These optimizations require TargetData. + if (!TD) return 0; + + // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1) + if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0; + + Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD, TLI); + if (!Len) + return 0; + Value *IncLen = B.CreateAdd(Len, + ConstantInt::get(Len->getType(), 1), + "leninc"); + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1); + + // The sprintf result is the unincremented number of bytes in the string. + return B.CreateIntCast(Len, CI->getType(), false); + } + return 0; + } + + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Require two fixed pointer arguments and an integer result. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + !FT->getReturnType()->isIntegerTy()) + return 0; + + if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) { + return V; + } + + // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating + // point arguments. 
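+ // (siprintf is an integer-only sprintf provided by some embedded C
+ // libraries, e.g. newlib; TargetLibraryInfo says whether the target has it)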
+ if (TLI->has(LibFunc::siprintf) && !CallHasFloatingPointArgument(CI)) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + Constant *SIPrintFFn = + M->getOrInsertFunction("siprintf", FT, Callee->getAttributes()); + CallInst *New = cast<CallInst>(CI->clone()); + New->setCalledFunction(SIPrintFFn); + B.Insert(New); + return New; + } + return 0; + } +}; + +//===---------------------------------------===// +// 'fwrite' Optimizations + +struct FWriteOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Require a pointer, an integer, an integer, a pointer, returning integer. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isIntegerTy() || + !FT->getParamType(2)->isIntegerTy() || + !FT->getParamType(3)->isPointerTy() || + !FT->getReturnType()->isIntegerTy()) + return 0; + + // Get the element size and count. + ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + if (!SizeC || !CountC) return 0; + uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue(); + + // If this is writing zero records, remove the call (it's a noop). + if (Bytes == 0) + return ConstantInt::get(CI->getType(), 0); + + // If this is writing one byte, turn it into fputc. + // This optimisation is only valid, if the return value is unused. + if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F) + Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char"); + Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TD, TLI); + return NewCI ? ConstantInt::get(CI->getType(), 1) : 0; + } + + return 0; + } +}; + +//===---------------------------------------===// +// 'fputs' Optimizations + +struct FPutsOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // These optimizations require TargetData. + if (!TD) return 0; + + // Require two pointers. Also, we can't optimize if return value is used. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + !CI->use_empty()) + return 0; + + // fputs(s,F) --> fwrite(s,1,strlen(s),F) + uint64_t Len = GetStringLength(CI->getArgOperand(0)); + if (!Len) return 0; + // Known to have no uses (see above). + return EmitFWrite(CI->getArgOperand(0), + ConstantInt::get(TD->getIntPtrType(*Context), Len-1), + CI->getArgOperand(1), B, TD, TLI); + } +}; + +//===---------------------------------------===// +// 'fprintf' Optimizations + +struct FPrintFOpt : public LibCallOptimization { + Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI, + IRBuilder<> &B) { + // All the optimizations depend on the format string. + StringRef FormatStr; + if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) + return 0; + + // fprintf(F, "foo") --> fwrite("foo", 3, 1, F) + if (CI->getNumArgOperands() == 2) { + for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) + if (FormatStr[i] == '%') // Could handle %% -> % if we cared. + return 0; // We found a format specifier. + + // These optimizations require TargetData. + if (!TD) return 0; + + Value *NewCI = EmitFWrite(CI->getArgOperand(1), + ConstantInt::get(TD->getIntPtrType(*Context), + FormatStr.size()), + CI->getArgOperand(0), B, TD, TLI); + return NewCI ? 
ConstantInt::get(CI->getType(), FormatStr.size()) : 0; + } + + // The remaining optimizations require the format string to be "%s" or "%c" + // and have an extra operand. + if (FormatStr.size() != 2 || FormatStr[0] != '%' || + CI->getNumArgOperands() < 3) + return 0; + + // Decode the second character of the format string. + if (FormatStr[1] == 'c') { + // fprintf(F, "%c", chr) --> fputc(chr, F) + if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; + Value *NewCI = EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, + TD, TLI); + return NewCI ? ConstantInt::get(CI->getType(), 1) : 0; + } + + if (FormatStr[1] == 's') { + // fprintf(F, "%s", str) --> fputs(str, F) + if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty()) + return 0; + return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI); + } + return 0; + } + + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Require two fixed paramters as pointers and integer result. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || + !FT->getParamType(1)->isPointerTy() || + !FT->getReturnType()->isIntegerTy()) + return 0; + + if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) { + return V; + } + + // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no + // floating point arguments. + if (TLI->has(LibFunc::fiprintf) && !CallHasFloatingPointArgument(CI)) { + Module *M = B.GetInsertBlock()->getParent()->getParent(); + Constant *FIPrintFFn = + M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes()); + CallInst *New = cast<CallInst>(CI->clone()); + New->setCalledFunction(FIPrintFFn); + B.Insert(New); + return New; + } + return 0; + } +}; + +//===---------------------------------------===// +// 'puts' Optimizations + +struct PutsOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Require one fixed pointer argument and an integer/void result. + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || + !(FT->getReturnType()->isIntegerTy() || + FT->getReturnType()->isVoidTy())) + return 0; + + // Check for a constant string. + StringRef Str; + if (!getConstantStringInfo(CI->getArgOperand(0), Str)) + return 0; + + if (Str.empty() && CI->use_empty()) { + // puts("") -> putchar('\n') + Value *Res = EmitPutChar(B.getInt32('\n'), B, TD, TLI); + if (CI->use_empty() || !Res) return Res; + return B.CreateIntCast(Res, CI->getType(), true); + } + + return 0; + } +}; + +} // end anonymous namespace. + +//===----------------------------------------------------------------------===// +// SimplifyLibCalls Pass Implementation +//===----------------------------------------------------------------------===// + +namespace { + /// This pass optimizes well known library functions from libc and libm. 
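+ /// Calls are folded to constants, rewritten in terms of cheaper library
+ /// routines, or annotated with attributes (nounwind, nocapture, readonly)
+ /// that later passes can exploit.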
+ /// + class SimplifyLibCalls : public FunctionPass { + TargetLibraryInfo *TLI; + + StringMap<LibCallOptimization*> Optimizations; + // String and Memory LibCall Optimizations + StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrRChrOpt StrRChr; + StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; + StrCpyOpt StrCpy; StrCpyOpt StrCpyChk; + StpCpyOpt StpCpy; StpCpyOpt StpCpyChk; + StrNCpyOpt StrNCpy; + StrLenOpt StrLen; StrPBrkOpt StrPBrk; + StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr; + MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet; + // Math Library Optimizations + CosOpt Cos; PowOpt Pow; Exp2Opt Exp2; UnaryDoubleFPOpt UnaryDoubleFP; + // Integer Optimizations + FFSOpt FFS; AbsOpt Abs; IsDigitOpt IsDigit; IsAsciiOpt IsAscii; + ToAsciiOpt ToAscii; + // Formatting and IO Optimizations + SPrintFOpt SPrintF; PrintFOpt PrintF; + FWriteOpt FWrite; FPutsOpt FPuts; FPrintFOpt FPrintF; + PutsOpt Puts; + + bool Modified; // This is only used by doInitialization. + public: + static char ID; // Pass identification + SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true), + StpCpy(false), StpCpyChk(true) { + initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); + } + void AddOpt(LibFunc::Func F, LibCallOptimization* Opt); + void InitOptimizations(); + bool runOnFunction(Function &F); + + void setDoesNotAccessMemory(Function &F); + void setOnlyReadsMemory(Function &F); + void setDoesNotThrow(Function &F); + void setDoesNotCapture(Function &F, unsigned n); + void setDoesNotAlias(Function &F, unsigned n); + bool doInitialization(Module &M); + + void inferPrototypeAttributes(Function &F); + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetLibraryInfo>(); + } + }; +} // end anonymous namespace. + +char SimplifyLibCalls::ID = 0; + +INITIALIZE_PASS_BEGIN(SimplifyLibCalls, "simplify-libcalls", + "Simplify well-known library calls", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(SimplifyLibCalls, "simplify-libcalls", + "Simplify well-known library calls", false, false) + +// Public interface to the Simplify LibCalls pass. +FunctionPass *llvm::createSimplifyLibCallsPass() { + return new SimplifyLibCalls(); +} + +void SimplifyLibCalls::AddOpt(LibFunc::Func F, LibCallOptimization* Opt) { + if (TLI->has(F)) + Optimizations[TLI->getName(F)] = Opt; +} + +/// Optimizations - Populate the Optimizations map with all the optimizations +/// we know. 
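+/// Entries registered through AddOpt are only added when TargetLibraryInfo
+/// reports that the target actually provides the corresponding function.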
+void SimplifyLibCalls::InitOptimizations() { + // String and Memory LibCall Optimizations + Optimizations["strcat"] = &StrCat; + Optimizations["strncat"] = &StrNCat; + Optimizations["strchr"] = &StrChr; + Optimizations["strrchr"] = &StrRChr; + Optimizations["strcmp"] = &StrCmp; + Optimizations["strncmp"] = &StrNCmp; + Optimizations["strcpy"] = &StrCpy; + Optimizations["strncpy"] = &StrNCpy; + Optimizations["stpcpy"] = &StpCpy; + Optimizations["strlen"] = &StrLen; + Optimizations["strpbrk"] = &StrPBrk; + Optimizations["strtol"] = &StrTo; + Optimizations["strtod"] = &StrTo; + Optimizations["strtof"] = &StrTo; + Optimizations["strtoul"] = &StrTo; + Optimizations["strtoll"] = &StrTo; + Optimizations["strtold"] = &StrTo; + Optimizations["strtoull"] = &StrTo; + Optimizations["strspn"] = &StrSpn; + Optimizations["strcspn"] = &StrCSpn; + Optimizations["strstr"] = &StrStr; + Optimizations["memcmp"] = &MemCmp; + AddOpt(LibFunc::memcpy, &MemCpy); + Optimizations["memmove"] = &MemMove; + AddOpt(LibFunc::memset, &MemSet); + + // _chk variants of String and Memory LibCall Optimizations. + Optimizations["__strcpy_chk"] = &StrCpyChk; + Optimizations["__stpcpy_chk"] = &StpCpyChk; + + // Math Library Optimizations + Optimizations["cosf"] = &Cos; + Optimizations["cos"] = &Cos; + Optimizations["cosl"] = &Cos; + Optimizations["powf"] = &Pow; + Optimizations["pow"] = &Pow; + Optimizations["powl"] = &Pow; + Optimizations["llvm.pow.f32"] = &Pow; + Optimizations["llvm.pow.f64"] = &Pow; + Optimizations["llvm.pow.f80"] = &Pow; + Optimizations["llvm.pow.f128"] = &Pow; + Optimizations["llvm.pow.ppcf128"] = &Pow; + Optimizations["exp2l"] = &Exp2; + Optimizations["exp2"] = &Exp2; + Optimizations["exp2f"] = &Exp2; + Optimizations["llvm.exp2.ppcf128"] = &Exp2; + Optimizations["llvm.exp2.f128"] = &Exp2; + Optimizations["llvm.exp2.f80"] = &Exp2; + Optimizations["llvm.exp2.f64"] = &Exp2; + Optimizations["llvm.exp2.f32"] = &Exp2; + + if (TLI->has(LibFunc::floor) && TLI->has(LibFunc::floorf)) + Optimizations["floor"] = &UnaryDoubleFP; + if (TLI->has(LibFunc::ceil) && TLI->has(LibFunc::ceilf)) + Optimizations["ceil"] = &UnaryDoubleFP; + if (TLI->has(LibFunc::round) && TLI->has(LibFunc::roundf)) + Optimizations["round"] = &UnaryDoubleFP; + if (TLI->has(LibFunc::rint) && TLI->has(LibFunc::rintf)) + Optimizations["rint"] = &UnaryDoubleFP; + if (TLI->has(LibFunc::nearbyint) && TLI->has(LibFunc::nearbyintf)) + Optimizations["nearbyint"] = &UnaryDoubleFP; + + // Integer Optimizations + Optimizations["ffs"] = &FFS; + Optimizations["ffsl"] = &FFS; + Optimizations["ffsll"] = &FFS; + Optimizations["abs"] = &Abs; + Optimizations["labs"] = &Abs; + Optimizations["llabs"] = &Abs; + Optimizations["isdigit"] = &IsDigit; + Optimizations["isascii"] = &IsAscii; + Optimizations["toascii"] = &ToAscii; + + // Formatting and IO Optimizations + Optimizations["sprintf"] = &SPrintF; + Optimizations["printf"] = &PrintF; + AddOpt(LibFunc::fwrite, &FWrite); + AddOpt(LibFunc::fputs, &FPuts); + Optimizations["fprintf"] = &FPrintF; + Optimizations["puts"] = &Puts; +} + + +/// runOnFunction - Top level algorithm. +/// +bool SimplifyLibCalls::runOnFunction(Function &F) { + TLI = &getAnalysis<TargetLibraryInfo>(); + + if (Optimizations.empty()) + InitOptimizations(); + + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + + IRBuilder<> Builder(F.getContext()); + + bool Changed = false; + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { + // Ignore non-calls. 
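+ // (the post-increment advances the iterator now, so it stays valid even if
+ // CI is erased below)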
+ CallInst *CI = dyn_cast<CallInst>(I++); + if (!CI) continue; + + // Ignore indirect calls and calls to non-external functions. + Function *Callee = CI->getCalledFunction(); + if (Callee == 0 || !Callee->isDeclaration() || + !(Callee->hasExternalLinkage() || Callee->hasDLLImportLinkage())) + continue; + + // Ignore unknown calls. + LibCallOptimization *LCO = Optimizations.lookup(Callee->getName()); + if (!LCO) continue; + + // Set the builder to the instruction after the call. + Builder.SetInsertPoint(BB, I); + + // Use debug location of CI for all new instructions. + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + // Try to optimize this call. + Value *Result = LCO->OptimizeCall(CI, TD, TLI, Builder); + if (Result == 0) continue; + + DEBUG(dbgs() << "SimplifyLibCalls simplified: " << *CI; + dbgs() << " into: " << *Result << "\n"); + + // Something changed! + Changed = true; + ++NumSimplified; + + // Inspect the instruction after the call (which was potentially just + // added) next. + I = CI; ++I; + + if (CI != Result && !CI->use_empty()) { + CI->replaceAllUsesWith(Result); + if (!Result->hasName()) + Result->takeName(CI); + } + CI->eraseFromParent(); + } + } + return Changed; +} + +// Utility methods for doInitialization. + +void SimplifyLibCalls::setDoesNotAccessMemory(Function &F) { + if (!F.doesNotAccessMemory()) { + F.setDoesNotAccessMemory(); + ++NumAnnotated; + Modified = true; + } +} +void SimplifyLibCalls::setOnlyReadsMemory(Function &F) { + if (!F.onlyReadsMemory()) { + F.setOnlyReadsMemory(); + ++NumAnnotated; + Modified = true; + } +} +void SimplifyLibCalls::setDoesNotThrow(Function &F) { + if (!F.doesNotThrow()) { + F.setDoesNotThrow(); + ++NumAnnotated; + Modified = true; + } +} +void SimplifyLibCalls::setDoesNotCapture(Function &F, unsigned n) { + if (!F.doesNotCapture(n)) { + F.setDoesNotCapture(n); + ++NumAnnotated; + Modified = true; + } +} +void SimplifyLibCalls::setDoesNotAlias(Function &F, unsigned n) { + if (!F.doesNotAlias(n)) { + F.setDoesNotAlias(n); + ++NumAnnotated; + Modified = true; + } +} + + +void SimplifyLibCalls::inferPrototypeAttributes(Function &F) { + FunctionType *FTy = F.getFunctionType(); + + StringRef Name = F.getName(); + switch (Name[0]) { + case 's': + if (Name == "strlen") { + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "strchr" || + Name == "strrchr") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isIntegerTy()) + return; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + } else if (Name == "strcpy" || + Name == "stpcpy" || + Name == "strcat" || + Name == "strtol" || + Name == "strtod" || + Name == "strtof" || + Name == "strtoul" || + Name == "strtoll" || + Name == "strtold" || + Name == "strncat" || + Name == "strncpy" || + Name == "stpncpy" || + Name == "strtoull") { + if (FTy->getNumParams() < 2 || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "strxfrm") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "strcmp" || + Name == "strspn" || + Name == "strncmp" || + Name == "strcspn" || + Name == "strcoll" || + Name == "strcasecmp" || + Name == "strncasecmp") { + if (FTy->getNumParams() < 2 || + 
!FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "strstr" || + Name == "strpbrk") { + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "strtok" || + Name == "strtok_r") { + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "scanf" || + Name == "setbuf" || + Name == "setvbuf") { + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "strdup" || + Name == "strndup") { + if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + } else if (Name == "stat" || + Name == "sscanf" || + Name == "sprintf" || + Name == "statvfs") { + if (FTy->getNumParams() < 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "snprintf") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 3); + } else if (Name == "setitimer") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + setDoesNotCapture(F, 3); + } else if (Name == "system") { + if (FTy->getNumParams() != 1 || + !FTy->getParamType(0)->isPointerTy()) + return; + // May throw; "system" is a valid pthread cancellation point. 
+ setDoesNotCapture(F, 1); + } + break; + case 'm': + if (Name == "malloc") { + if (FTy->getNumParams() != 1 || + !FTy->getReturnType()->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + } else if (Name == "memcmp") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "memchr" || + Name == "memrchr") { + if (FTy->getNumParams() != 3) + return; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + } else if (Name == "modf" || + Name == "modff" || + Name == "modfl" || + Name == "memcpy" || + Name == "memccpy" || + Name == "memmove") { + if (FTy->getNumParams() < 2 || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "memalign") { + if (!FTy->getReturnType()->isPointerTy()) + return; + setDoesNotAlias(F, 0); + } else if (Name == "mkdir" || + Name == "mktime") { + if (FTy->getNumParams() == 0 || + !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'r': + if (Name == "realloc") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getReturnType()->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + } else if (Name == "read") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(1)->isPointerTy()) + return; + // May throw; "read" is a valid pthread cancellation point. + setDoesNotCapture(F, 2); + } else if (Name == "rmdir" || + Name == "rewind" || + Name == "remove" || + Name == "realpath") { + if (FTy->getNumParams() < 1 || + !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "rename" || + Name == "readlink") { + if (FTy->getNumParams() < 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } + break; + case 'w': + if (Name == "write") { + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) + return; + // May throw; "write" is a valid pthread cancellation point. 
+ setDoesNotCapture(F, 2); + } + break; + case 'b': + if (Name == "bcopy") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "bcmp") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setOnlyReadsMemory(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "bzero") { + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'c': + if (Name == "calloc") { + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + } else if (Name == "chmod" || + Name == "chown" || + Name == "ctermid" || + Name == "clearerr" || + Name == "closedir") { + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'a': + if (Name == "atoi" || + Name == "atol" || + Name == "atof" || + Name == "atoll") { + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setOnlyReadsMemory(F); + setDoesNotCapture(F, 1); + } else if (Name == "access") { + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'f': + if (Name == "fopen") { + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "fdopen") { + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 2); + } else if (Name == "feof" || + Name == "free" || + Name == "fseek" || + Name == "ftell" || + Name == "fgetc" || + Name == "fseeko" || + Name == "ftello" || + Name == "fileno" || + Name == "fflush" || + Name == "fclose" || + Name == "fsetpos" || + Name == "flockfile" || + Name == "funlockfile" || + Name == "ftrylockfile") { + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "ferror") { + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setOnlyReadsMemory(F); + } else if (Name == "fputc" || + Name == "fstat" || + Name == "frexp" || + Name == "frexpf" || + Name == "frexpl" || + Name == "fstatvfs") { + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "fgets") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 3); + } else if (Name == "fread" || + Name == "fwrite") { + if (FTy->getNumParams() != 4 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(3)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 4); + } else if (Name == "fputs" || + Name == "fscanf" 
|| + Name == "fprintf" || + Name == "fgetpos") { + if (FTy->getNumParams() < 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } + break; + case 'g': + if (Name == "getc" || + Name == "getlogin_r" || + Name == "getc_unlocked") { + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "getenv") { + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setOnlyReadsMemory(F); + setDoesNotCapture(F, 1); + } else if (Name == "gets" || + Name == "getchar") { + setDoesNotThrow(F); + } else if (Name == "getitimer") { + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "getpwnam") { + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'u': + if (Name == "ungetc") { + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "uname" || + Name == "unlink" || + Name == "unsetenv") { + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "utime" || + Name == "utimes") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } + break; + case 'p': + if (Name == "putc") { + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "puts" || + Name == "printf" || + Name == "perror") { + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "pread" || + Name == "pwrite") { + if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) + return; + // May throw; these are valid pthread cancellation points. 
+ setDoesNotCapture(F, 2); + } else if (Name == "putchar") { + setDoesNotThrow(F); + } else if (Name == "popen") { + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "pclose") { + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'v': + if (Name == "vscanf") { + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "vsscanf" || + Name == "vfscanf") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "valloc") { + if (!FTy->getReturnType()->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + } else if (Name == "vprintf") { + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "vfprintf" || + Name == "vsprintf") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "vsnprintf") { + if (FTy->getNumParams() != 4 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 3); + } + break; + case 'o': + if (Name == "open") { + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) + return; + // May throw; "open" is a valid pthread cancellation point. + setDoesNotCapture(F, 1); + } else if (Name == "opendir") { + if (FTy->getNumParams() != 1 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + } + break; + case 't': + if (Name == "tmpfile") { + if (!FTy->getReturnType()->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + } else if (Name == "times") { + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'h': + if (Name == "htonl" || + Name == "htons") { + setDoesNotThrow(F); + setDoesNotAccessMemory(F); + } + break; + case 'n': + if (Name == "ntohl" || + Name == "ntohs") { + setDoesNotThrow(F); + setDoesNotAccessMemory(F); + } + break; + case 'l': + if (Name == "lstat") { + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "lchown") { + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } + break; + case 'q': + if (Name == "qsort") { + if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy()) + return; + // May throw; places call through function pointer. 
+ setDoesNotCapture(F, 4); + } + break; + case '_': + if (Name == "__strdup" || + Name == "__strndup") { + if (FTy->getNumParams() < 1 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + } else if (Name == "__strtok_r") { + if (FTy->getNumParams() != 3 || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "_IO_getc") { + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "_IO_putc") { + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } + break; + case 1: + if (Name == "\1__isoc99_scanf") { + if (FTy->getNumParams() < 1 || + !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "\1stat64" || + Name == "\1lstat64" || + Name == "\1statvfs64" || + Name == "\1__isoc99_sscanf") { + if (FTy->getNumParams() < 1 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "\1fopen64") { + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + } else if (Name == "\1fseeko64" || + Name == "\1ftello64") { + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + } else if (Name == "\1tmpfile64") { + if (!FTy->getReturnType()->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + } else if (Name == "\1fstat64" || + Name == "\1fstatvfs64") { + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + } else if (Name == "\1open64") { + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) + return; + // May throw; "open" is a valid pthread cancellation point. + setDoesNotCapture(F, 1); + } + break; + } +} + +/// doInitialization - Add attributes to well-known functions. 
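The big switch above relies on small helpers (setDoesNotThrow, setDoesNotCapture, setOnlyReadsMemory, setDoesNotAlias, setDoesNotAccessMemory) that are defined earlier in this file and are not visible in this hunk. A minimal sketch of the pattern, assuming those helpers are thin wrappers over the corresponding llvm::Function attribute setters of this LLVM version; markNoCapture and annotateStrlenLike below are illustrative names, not the file's actual definitions:

#include "llvm/Function.h"
#include "llvm/DerivedTypes.h"
using namespace llvm;

// Illustrative wrapper in the style of the helpers used above (assumed shape,
// not the actual definition from this file).
static void markNoCapture(Function &F, unsigned ArgNo) {
  if (!F.doesNotCapture(ArgNo))
    F.setDoesNotCapture(ArgNo);      // argument numbering is 1-based here
}

// Annotate a strlen-like declaration: size_t f(const char *).
static void annotateStrlenLike(Function &F) {
  FunctionType *FTy = F.getFunctionType();
  if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
    return;                          // prototype mismatch: leave it untouched
  F.setDoesNotThrow();               // nounwind
  F.setOnlyReadsMemory();            // readonly
  markNoCapture(F, 1);               // nocapture on the first parameter
}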
+/// +bool SimplifyLibCalls::doInitialization(Module &M) { + Modified = false; + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function &F = *I; + if (F.isDeclaration() && F.hasName()) + inferPrototypeAttributes(F); + } + return Modified; +} + +// TODO: +// Additional cases that we need to add to this file: +// +// cbrt: +// * cbrt(expN(X)) -> expN(x/3) +// * cbrt(sqrt(x)) -> pow(x,1/6) +// * cbrt(sqrt(x)) -> pow(x,1/9) +// +// exp, expf, expl: +// * exp(log(x)) -> x +// +// log, logf, logl: +// * log(exp(x)) -> x +// * log(x**y) -> y*log(x) +// * log(exp(y)) -> y*log(e) +// * log(exp2(y)) -> y*log(2) +// * log(exp10(y)) -> y*log(10) +// * log(sqrt(x)) -> 0.5*log(x) +// * log(pow(x,y)) -> y*log(x) +// +// lround, lroundf, lroundl: +// * lround(cnst) -> cnst' +// +// pow, powf, powl: +// * pow(exp(x),y) -> exp(x*y) +// * pow(sqrt(x),y) -> pow(x,y*0.5) +// * pow(pow(x,y),z)-> pow(x,y*z) +// +// round, roundf, roundl: +// * round(cnst) -> cnst' +// +// signbit: +// * signbit(cnst) -> cnst' +// * signbit(nncst) -> 0 (if pstv is a non-negative constant) +// +// sqrt, sqrtf, sqrtl: +// * sqrt(expN(x)) -> expN(x*0.5) +// * sqrt(Nroot(x)) -> pow(x,1/(2*N)) +// * sqrt(pow(x,y)) -> pow(|x|,y*0.5) +// +// strchr: +// * strchr(p, 0) -> strlen(p) +// tan, tanf, tanl: +// * tan(atan(x)) -> x +// +// trunc, truncf, truncl: +// * trunc(cnst) -> cnst' +// +// diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp new file mode 100644 index 0000000..34f1d6c --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp @@ -0,0 +1,270 @@ +//===-- Sink.cpp - Code Sinking -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass moves instructions into successor blocks, when possible, so that +// they aren't executed on paths where their results aren't needed. 
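As a rough source-level analogue of what code sinking buys (illustrative C++ only; the pass itself works on LLVM IR): a value computed unconditionally but used on a single path is moved onto that path, so the other paths no longer pay for it.

// Before sinking: 'scaled' is computed even when 'flag' is false.
int beforeSinking(int x, bool flag) {
  int scaled = x * 37;       // executed on every path
  if (flag)
    return scaled;
  return 0;
}

// After sinking: the multiply only runs on the path that needs its result.
int afterSinking(int x, bool flag) {
  if (flag) {
    int scaled = x * 37;     // sunk into the successor that uses it
    return scaled;
  }
  return 0;
}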
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sink" +#include "llvm/Transforms/Scalar.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +STATISTIC(NumSunk, "Number of instructions sunk"); +STATISTIC(NumSinkIter, "Number of sinking iterations"); + +namespace { + class Sinking : public FunctionPass { + DominatorTree *DT; + LoopInfo *LI; + AliasAnalysis *AA; + + public: + static char ID; // Pass identification + Sinking() : FunctionPass(ID) { + initializeSinkingPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + AU.addRequired<AliasAnalysis>(); + AU.addRequired<DominatorTree>(); + AU.addRequired<LoopInfo>(); + AU.addPreserved<DominatorTree>(); + AU.addPreserved<LoopInfo>(); + } + private: + bool ProcessBlock(BasicBlock &BB); + bool SinkInstruction(Instruction *I, SmallPtrSet<Instruction *, 8> &Stores); + bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const; + bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo) const; + }; +} // end anonymous namespace + +char Sinking::ID = 0; +INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false) + +FunctionPass *llvm::createSinkingPass() { return new Sinking(); } + +/// AllUsesDominatedByBlock - Return true if all uses of the specified value +/// occur in blocks dominated by the specified block. +bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, + BasicBlock *BB) const { + // Ignoring debug uses is necessary so debug info doesn't affect the code. + // This may leave a referencing dbg_value in the original block, before + // the definition of the vreg. Dwarf generator handles this although the + // user might not get the right info at runtime. + for (Value::use_iterator I = Inst->use_begin(), + E = Inst->use_end(); I != E; ++I) { + // Determine the block of the use. + Instruction *UseInst = cast<Instruction>(*I); + BasicBlock *UseBlock = UseInst->getParent(); + if (PHINode *PN = dyn_cast<PHINode>(UseInst)) { + // PHI nodes use the operand in the predecessor block, not the block with + // the PHI. + unsigned Num = PHINode::getIncomingValueNumForOperand(I.getOperandNo()); + UseBlock = PN->getIncomingBlock(Num); + } + // Check that it dominates. + if (!DT->dominates(BB, UseBlock)) + return false; + } + return true; +} + +bool Sinking::runOnFunction(Function &F) { + DT = &getAnalysis<DominatorTree>(); + LI = &getAnalysis<LoopInfo>(); + AA = &getAnalysis<AliasAnalysis>(); + + bool MadeChange, EverMadeChange = false; + + do { + MadeChange = false; + DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n"); + // Process all basic blocks. 
+ for (Function::iterator I = F.begin(), E = F.end(); + I != E; ++I) + MadeChange |= ProcessBlock(*I); + EverMadeChange |= MadeChange; + NumSinkIter++; + } while (MadeChange); + + return EverMadeChange; +} + +bool Sinking::ProcessBlock(BasicBlock &BB) { + // Can't sink anything out of a block that has less than two successors. + if (BB.getTerminator()->getNumSuccessors() <= 1 || BB.empty()) return false; + + // Don't bother sinking code out of unreachable blocks. In addition to being + // unprofitable, it can also lead to infinite looping, because in an + // unreachable loop there may be nowhere to stop. + if (!DT->isReachableFromEntry(&BB)) return false; + + bool MadeChange = false; + + // Walk the basic block bottom-up. Remember if we saw a store. + BasicBlock::iterator I = BB.end(); + --I; + bool ProcessedBegin = false; + SmallPtrSet<Instruction *, 8> Stores; + do { + Instruction *Inst = I; // The instruction to sink. + + // Predecrement I (if it's not begin) so that it isn't invalidated by + // sinking. + ProcessedBegin = I == BB.begin(); + if (!ProcessedBegin) + --I; + + if (isa<DbgInfoIntrinsic>(Inst)) + continue; + + if (SinkInstruction(Inst, Stores)) + ++NumSunk, MadeChange = true; + + // If we just processed the first instruction in the block, we're done. + } while (!ProcessedBegin); + + return MadeChange; +} + +static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, + SmallPtrSet<Instruction *, 8> &Stores) { + + if (Inst->mayWriteToMemory()) { + Stores.insert(Inst); + return false; + } + + if (LoadInst *L = dyn_cast<LoadInst>(Inst)) { + AliasAnalysis::Location Loc = AA->getLocation(L); + for (SmallPtrSet<Instruction *, 8>::iterator I = Stores.begin(), + E = Stores.end(); I != E; ++I) + if (AA->getModRefInfo(*I, Loc) & AliasAnalysis::Mod) + return false; + } + + if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst)) + return false; + + return true; +} + +/// IsAcceptableTarget - Return true if it is possible to sink the instruction +/// in the specified basic block. +bool Sinking::IsAcceptableTarget(Instruction *Inst, + BasicBlock *SuccToSinkTo) const { + assert(Inst && "Instruction to be sunk is null"); + assert(SuccToSinkTo && "Candidate sink target is null"); + + // It is not possible to sink an instruction into its own block. This can + // happen with loops. + if (Inst->getParent() == SuccToSinkTo) + return false; + + // If the block has multiple predecessors, this would introduce computation + // on different code paths. We could split the critical edge, but for now we + // just punt. + // FIXME: Split critical edges if not backedges. + if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) { + // We cannot sink a load across a critical edge - there may be stores in + // other code paths. + if (!isSafeToSpeculativelyExecute(Inst)) + return false; + + // We don't want to sink across a critical edge if we don't dominate the + // successor. We could be introducing calculations to new code paths. + if (!DT->dominates(Inst->getParent(), SuccToSinkTo)) + return false; + + // Don't sink instructions into a loop. + Loop *succ = LI->getLoopFor(SuccToSinkTo); + Loop *cur = LI->getLoopFor(Inst->getParent()); + if (succ != 0 && succ != cur) + return false; + } + + // Finally, check that all the uses of the instruction are actually + // dominated by the candidate + return AllUsesDominatedByBlock(Inst, SuccToSinkTo); +} + +/// SinkInstruction - Determine whether it is safe to sink the specified machine +/// instruction out of its current block into a successor. 
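The INITIALIZE_PASS_* registration and the createSinkingPass() factory shown earlier are the public entry points for clients. A minimal sketch of driving the pass over a module, assuming the legacy PassManager API of this LLVM version; runSinkingOnModule is a hypothetical helper, and createBasicAliasAnalysisPass() is added only to provide a concrete implementation for the AU.addRequired<AliasAnalysis>() dependency:

#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Analysis/Passes.h"        // createBasicAliasAnalysisPass
#include "llvm/Transforms/Scalar.h"      // createSinkingPass
using namespace llvm;

// Hypothetical driver: run the Sinking pass (and the analyses it requires,
// which the PassManager schedules automatically) over every function in M.
static bool runSinkingOnModule(Module &M) {
  PassManager PM;
  PM.add(createBasicAliasAnalysisPass()); // satisfy the AliasAnalysis group
  PM.add(createSinkingPass());
  return PM.run(M);                       // true if anything was modified
}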
+bool Sinking::SinkInstruction(Instruction *Inst, + SmallPtrSet<Instruction *, 8> &Stores) { + // Check if it's safe to move the instruction. + if (!isSafeToMove(Inst, AA, Stores)) + return false; + + // FIXME: This should include support for sinking instructions within the + // block they are currently in to shorten the live ranges. We often get + // instructions sunk into the top of a large block, but it would be better to + // also sink them down before their first use in the block. This xform has to + // be careful not to *increase* register pressure though, e.g. sinking + // "x = y + z" down if it kills y and z would increase the live ranges of y + // and z and only shrink the live range of x. + + // SuccToSinkTo - This is the successor to sink this instruction to, once we + // decide. + BasicBlock *SuccToSinkTo = 0; + + // Instructions can only be sunk if all their uses are in blocks + // dominated by one of the successors. + // Look at all the postdominators and see if we can sink it in one. + DomTreeNode *DTN = DT->getNode(Inst->getParent()); + for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end(); + I != E && SuccToSinkTo == 0; ++I) { + BasicBlock *Candidate = (*I)->getBlock(); + if ((*I)->getIDom()->getBlock() == Inst->getParent() && + IsAcceptableTarget(Inst, Candidate)) + SuccToSinkTo = Candidate; + } + + // If no suitable postdominator was found, look at all the successors and + // decide which one we should sink to, if any. + for (succ_iterator I = succ_begin(Inst->getParent()), + E = succ_end(Inst->getParent()); I != E && SuccToSinkTo == 0; ++I) { + if (IsAcceptableTarget(Inst, *I)) + SuccToSinkTo = *I; + } + + // If we couldn't find a block to sink to, ignore this instruction. + if (SuccToSinkTo == 0) + return false; + + DEBUG(dbgs() << "Sink" << *Inst << " ("; + WriteAsOperand(dbgs(), Inst->getParent(), false); + dbgs() << " -> "; + WriteAsOperand(dbgs(), SuccToSinkTo, false); + dbgs() << ")\n"); + + // Move the instruction. + Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt()); + return true; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp new file mode 100644 index 0000000..6557d63 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -0,0 +1,634 @@ +//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file transforms calls of the current function (self recursion) followed +// by a return instruction with a branch to the entry of the function, creating +// a loop. This pass also implements the following extensions to the basic +// algorithm: +// +// 1. Trivial instructions between the call and return do not prevent the +// transformation from taking place, though currently the analysis cannot +// support moving any really useful instructions (only dead ones). +// 2. This pass transforms functions that are prevented from being tail +// recursive by an associative and commutative expression to use an +// accumulator variable, thus compiling the typical naive factorial or +// 'fib' implementation into efficient code. +// 3. 
TRE is performed if the function returns void, if the return +// returns the result returned by the call, or if the function returns a +// run-time constant on all exits from the function. It is possible, though +// unlikely, that the return returns something else (like constant 0), and +// can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in +// the function return the exact same value. +// 4. If it can prove that callees do not access their caller stack frame, +// they are marked as eligible for tail call elimination (by the code +// generator). +// +// There are several improvements that could be made: +// +// 1. If the function has any alloca instructions, these instructions will be +// moved out of the entry block of the function, causing them to be +// evaluated each time through the tail recursion. Safely keeping allocas +// in the entry block requires analysis to proves that the tail-called +// function does not read or write the stack object. +// 2. Tail recursion is only performed if the call immediately precedes the +// return instruction. It's possible that there could be a jump between +// the call and the return. +// 3. There can be intervening operations between the call and the return that +// prevent the TRE from occurring. For example, there could be GEP's and +// stores to memory that will not be read or written by the call. This +// requires some substantial analysis (such as with DSA) to prove safe to +// move ahead of the call, but doing so could allow many more TREs to be +// performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark. +// 4. The algorithm we use to detect if callees access their caller stack +// frames is very primitive. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "tailcallelim" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +using namespace llvm; + +STATISTIC(NumEliminated, "Number of tail calls removed"); +STATISTIC(NumRetDuped, "Number of return duplicated"); +STATISTIC(NumAccumAdded, "Number of accumulators introduced"); + +namespace { + struct TailCallElim : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + TailCallElim() : FunctionPass(ID) { + initializeTailCallElimPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function &F); + + private: + CallInst *FindTRECandidate(Instruction *I, + bool CannotTailCallElimCallsMarkedTail); + bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, + BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail); + bool FoldReturnAndProcessPred(BasicBlock *BB, + ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail); + bool 
ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail); + bool CanMoveAboveCall(Instruction *I, CallInst *CI); + Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI); + }; +} + +char TailCallElim::ID = 0; +INITIALIZE_PASS(TailCallElim, "tailcallelim", + "Tail Call Elimination", false, false) + +// Public interface to the TailCallElimination pass +FunctionPass *llvm::createTailCallEliminationPass() { + return new TailCallElim(); +} + +/// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by +/// callees of this function. We only do very simple analysis right now, this +/// could be expanded in the future to use mod/ref information for particular +/// call sites if desired. +static bool AllocaMightEscapeToCalls(AllocaInst *AI) { + // FIXME: do simple 'address taken' analysis. + return true; +} + +/// CheckForEscapingAllocas - Scan the specified basic block for alloca +/// instructions. If it contains any that might be accessed by calls, return +/// true. +static bool CheckForEscapingAllocas(BasicBlock *BB, + bool &CannotTCETailMarkedCall) { + bool RetVal = false; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { + RetVal |= AllocaMightEscapeToCalls(AI); + + // If this alloca is in the body of the function, or if it is a variable + // sized allocation, we cannot tail call eliminate calls marked 'tail' + // with this mechanism. + if (BB != &BB->getParent()->getEntryBlock() || + !isa<ConstantInt>(AI->getArraySize())) + CannotTCETailMarkedCall = true; + } + return RetVal; +} + +bool TailCallElim::runOnFunction(Function &F) { + // If this function is a varargs function, we won't be able to PHI the args + // right, so don't even try to convert it... + if (F.getFunctionType()->isVarArg()) return false; + + BasicBlock *OldEntry = 0; + bool TailCallsAreMarkedTail = false; + SmallVector<PHINode*, 8> ArgumentPHIs; + bool MadeChange = false; + bool FunctionContainsEscapingAllocas = false; + + // CannotTCETailMarkedCall - If true, we cannot perform TCE on tail calls + // marked with the 'tail' attribute, because doing so would cause the stack + // size to increase (real TCE would deallocate variable sized allocas, TCE + // doesn't). + bool CannotTCETailMarkedCall = false; + + // Loop over the function, looking for any returning blocks, and keeping track + // of whether this function has any non-trivially used allocas. + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (FunctionContainsEscapingAllocas && CannotTCETailMarkedCall) + break; + + FunctionContainsEscapingAllocas |= + CheckForEscapingAllocas(BB, CannotTCETailMarkedCall); + } + + /// FIXME: The code generator produces really bad code when an 'escaping + /// alloca' is changed from being a static alloca to being a dynamic alloca. + /// Until this is resolved, disable this transformation if that would ever + /// happen. This bug is PR962. + if (FunctionContainsEscapingAllocas) + return false; + + // Second pass, change any tail calls to loops. 
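The "second pass" that follows is the heart of the transformation: each self-recursive call in tail position is replaced by a branch back to the (new) entry block, with PHI nodes standing in for the arguments. A source-level analogue, illustrative only:

// Before: a self-recursive tail call.
int sumTo(int n, int acc) {
  if (n == 0)
    return acc;
  return sumTo(n - 1, acc + n);     // call immediately followed by return
}

// After (conceptually): arguments become loop-carried values and the call
// becomes a back edge to the start of the function body.
int sumToAsLoop(int n, int acc) {
  for (;;) {                        // the "tailrecurse" block
    if (n == 0)
      return acc;
    int nextAcc = acc + n;          // incoming values for the back edge
    int nextN = n - 1;
    acc = nextAcc;
    n = nextN;
  }
}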
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { + bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, + ArgumentPHIs,CannotTCETailMarkedCall); + if (!Change && BB->getFirstNonPHIOrDbg() == Ret) + Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, + TailCallsAreMarkedTail, ArgumentPHIs, + CannotTCETailMarkedCall); + MadeChange |= Change; + } + } + + // If we eliminated any tail recursions, it's possible that we inserted some + // silly PHI nodes which just merge an initial value (the incoming operand) + // with themselves. Check to see if we did and clean up our mess if so. This + // occurs when a function passes an argument straight through to its tail + // call. + if (!ArgumentPHIs.empty()) { + for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) { + PHINode *PN = ArgumentPHIs[i]; + + // If the PHI Node is a dynamic constant, replace it with the value it is. + if (Value *PNV = SimplifyInstruction(PN)) { + PN->replaceAllUsesWith(PNV); + PN->eraseFromParent(); + } + } + } + + // Finally, if this function contains no non-escaping allocas, or calls + // setjmp, mark all calls in the function as eligible for tail calls + //(there is no stack memory for them to access). + if (!FunctionContainsEscapingAllocas && !F.callsFunctionThatReturnsTwice()) + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + if (CallInst *CI = dyn_cast<CallInst>(I)) { + CI->setTailCall(); + MadeChange = true; + } + + return MadeChange; +} + + +/// CanMoveAboveCall - Return true if it is safe to move the specified +/// instruction from after the call to before the call, assuming that all +/// instructions between the call and this instruction are movable. +/// +bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) { + // FIXME: We can move load/store/call/free instructions above the call if the + // call does not mod/ref the memory location being processed. + if (I->mayHaveSideEffects()) // This also handles volatile loads. + return false; + + if (LoadInst *L = dyn_cast<LoadInst>(I)) { + // Loads may always be moved above calls without side effects. + if (CI->mayHaveSideEffects()) { + // Non-volatile loads may be moved above a call with side effects if it + // does not write to memory and the load provably won't trap. + // FIXME: Writes to memory only matter if they may alias the pointer + // being loaded from. + if (CI->mayWriteToMemory() || + !isSafeToLoadUnconditionally(L->getPointerOperand(), L, + L->getAlignment())) + return false; + } + } + + // Otherwise, if this is a side-effect free instruction, check to make sure + // that it does not use the return value of the call. If it doesn't use the + // return value of the call, it must only use things that are defined before + // the call, or movable instructions between the call and the instruction + // itself. + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) + if (I->getOperand(i) == CI) + return false; + return true; +} + +// isDynamicConstant - Return true if the specified value is the same when the +// return would exit as it was when the initial iteration of the recursive +// function was executed. +// +// We currently handle static constants and arguments that are not modified as +// part of the recursion. 
+// +static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) { + if (isa<Constant>(V)) return true; // Static constants are always dyn consts + + // Check to see if this is an immutable argument, if so, the value + // will be available to initialize the accumulator. + if (Argument *Arg = dyn_cast<Argument>(V)) { + // Figure out which argument number this is... + unsigned ArgNo = 0; + Function *F = CI->getParent()->getParent(); + for (Function::arg_iterator AI = F->arg_begin(); &*AI != Arg; ++AI) + ++ArgNo; + + // If we are passing this argument into call as the corresponding + // argument operand, then the argument is dynamically constant. + // Otherwise, we cannot transform this function safely. + if (CI->getArgOperand(ArgNo) == Arg) + return true; + } + + // Switch cases are always constant integers. If the value is being switched + // on and the return is only reachable from one of its cases, it's + // effectively constant. + if (BasicBlock *UniquePred = RI->getParent()->getUniquePredecessor()) + if (SwitchInst *SI = dyn_cast<SwitchInst>(UniquePred->getTerminator())) + if (SI->getCondition() == V) + return SI->getDefaultDest() != RI->getParent(); + + // Not a constant or immutable argument, we can't safely transform. + return false; +} + +// getCommonReturnValue - Check to see if the function containing the specified +// tail call consistently returns the same runtime-constant value at all exit +// points except for IgnoreRI. If so, return the returned value. +// +static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) { + Function *F = CI->getParent()->getParent(); + Value *ReturnedValue = 0; + + for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) { + ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()); + if (RI == 0 || RI == IgnoreRI) continue; + + // We can only perform this transformation if the value returned is + // evaluatable at the start of the initial invocation of the function, + // instead of at the end of the evaluation. + // + Value *RetOp = RI->getOperand(0); + if (!isDynamicConstant(RetOp, CI, RI)) + return 0; + + if (ReturnedValue && RetOp != ReturnedValue) + return 0; // Cannot transform if differing values are returned. + ReturnedValue = RetOp; + } + return ReturnedValue; +} + +/// CanTransformAccumulatorRecursion - If the specified instruction can be +/// transformed using accumulator recursion elimination, return the constant +/// which is the start of the accumulator value. Otherwise return null. +/// +Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I, + CallInst *CI) { + if (!I->isAssociative() || !I->isCommutative()) return 0; + assert(I->getNumOperands() == 2 && + "Associative/commutative operations should have 2 args!"); + + // Exactly one operand should be the result of the call instruction. + if ((I->getOperand(0) == CI && I->getOperand(1) == CI) || + (I->getOperand(0) != CI && I->getOperand(1) != CI)) + return 0; + + // The only user of this instruction we allow is a single return instruction. + if (!I->hasOneUse() || !isa<ReturnInst>(I->use_back())) + return 0; + + // Ok, now we have to check all of the other return instructions in this + // function. If they return non-constants or differing values, then we cannot + // transform the function safely. 
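isDynamicConstant and getCommonReturnValue above encode the notion of a value that is constant for the duration of the recursion: either a true constant, or an argument that is forwarded unchanged in its own position on every recursive call. An illustrative source-level case of the shape they accept:

// 'limit' is never modified and is passed straight through, so every exit
// returns exactly the value the initial invocation received: it is a
// "dynamic constant" in the sense used above.
int countUp(int i, int limit) {
  if (i == limit)
    return limit;                   // returns the unmodified argument
  return countUp(i + 1, limit);     // 'limit' forwarded in the same slot
}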
+ return getCommonReturnValue(cast<ReturnInst>(I->use_back()), CI); +} + +static Instruction *FirstNonDbg(BasicBlock::iterator I) { + while (isa<DbgInfoIntrinsic>(I)) + ++I; + return &*I; +} + +CallInst* +TailCallElim::FindTRECandidate(Instruction *TI, + bool CannotTailCallElimCallsMarkedTail) { + BasicBlock *BB = TI->getParent(); + Function *F = BB->getParent(); + + if (&BB->front() == TI) // Make sure there is something before the terminator. + return 0; + + // Scan backwards from the return, checking to see if there is a tail call in + // this block. If so, set CI to it. + CallInst *CI = 0; + BasicBlock::iterator BBI = TI; + while (true) { + CI = dyn_cast<CallInst>(BBI); + if (CI && CI->getCalledFunction() == F) + break; + + if (BBI == BB->begin()) + return 0; // Didn't find a potential tail call. + --BBI; + } + + // If this call is marked as a tail call, and if there are dynamic allocas in + // the function, we cannot perform this optimization. + if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail) + return 0; + + // As a special case, detect code like this: + // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call + // and disable this xform in this case, because the code generator will + // lower the call to fabs into inline code. + if (BB == &F->getEntryBlock() && + FirstNonDbg(BB->front()) == CI && + FirstNonDbg(llvm::next(BB->begin())) == TI && + callIsSmall(CI)) { + // A single-block function with just a call and a return. Check that + // the arguments match. + CallSite::arg_iterator I = CallSite(CI).arg_begin(), + E = CallSite(CI).arg_end(); + Function::arg_iterator FI = F->arg_begin(), + FE = F->arg_end(); + for (; I != E && FI != FE; ++I, ++FI) + if (*I != &*FI) break; + if (I == E && FI == FE) + return 0; + } + + return CI; +} + +bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, + BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail) { + // If we are introducing accumulator recursion to eliminate operations after + // the call instruction that are both associative and commutative, the initial + // value for the accumulator is placed in this variable. If this value is set + // then we actually perform accumulator recursion elimination instead of + // simple tail recursion elimination. If the operation is an LLVM instruction + // (eg: "add") then it is recorded in AccumulatorRecursionInstr. If not, then + // we are handling the case when the return instruction returns a constant C + // which is different to the constant returned by other return instructions + // (which is recorded in AccumulatorRecursionEliminationInitVal). This is a + // special case of accumulator recursion, the operation being "return C". + Value *AccumulatorRecursionEliminationInitVal = 0; + Instruction *AccumulatorRecursionInstr = 0; + + // Ok, we found a potential tail call. We can currently only transform the + // tail call if all of the instructions between the call and the return are + // movable to above the call itself, leaving the call next to the return. + // Check that this is the case now. + BasicBlock::iterator BBI = CI; + for (++BBI; &*BBI != Ret; ++BBI) { + if (CanMoveAboveCall(BBI, CI)) continue; + + // If we can't move the instruction above the call, it might be because it + // is an associative and commutative operation that could be transformed + // using accumulator recursion elimination. 
Check to see if this is the + // case, and if so, remember the initial accumulator value for later. + if ((AccumulatorRecursionEliminationInitVal = + CanTransformAccumulatorRecursion(BBI, CI))) { + // Yes, this is accumulator recursion. Remember which instruction + // accumulates. + AccumulatorRecursionInstr = BBI; + } else { + return false; // Otherwise, we cannot eliminate the tail recursion! + } + } + + // We can only transform call/return pairs that either ignore the return value + // of the call and return void, ignore the value of the call and return a + // constant, return the value returned by the tail call, or that are being + // accumulator recursion variable eliminated. + if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI && + !isa<UndefValue>(Ret->getReturnValue()) && + AccumulatorRecursionEliminationInitVal == 0 && + !getCommonReturnValue(0, CI)) { + // One case remains that we are able to handle: the current return + // instruction returns a constant, and all other return instructions + // return a different constant. + if (!isDynamicConstant(Ret->getReturnValue(), CI, Ret)) + return false; // Current return instruction does not return a constant. + // Check that all other return instructions return a common constant. If + // so, record it in AccumulatorRecursionEliminationInitVal. + AccumulatorRecursionEliminationInitVal = getCommonReturnValue(Ret, CI); + if (!AccumulatorRecursionEliminationInitVal) + return false; + } + + BasicBlock *BB = Ret->getParent(); + Function *F = BB->getParent(); + + // OK! We can transform this tail call. If this is the first one found, + // create the new entry block, allowing us to branch back to the old entry. + if (OldEntry == 0) { + OldEntry = &F->getEntryBlock(); + BasicBlock *NewEntry = BasicBlock::Create(F->getContext(), "", F, OldEntry); + NewEntry->takeName(OldEntry); + OldEntry->setName("tailrecurse"); + BranchInst::Create(OldEntry, NewEntry); + + // If this tail call is marked 'tail' and if there are any allocas in the + // entry block, move them up to the new entry block. + TailCallsAreMarkedTail = CI->isTailCall(); + if (TailCallsAreMarkedTail) + // Move all fixed sized allocas from OldEntry to NewEntry. + for (BasicBlock::iterator OEBI = OldEntry->begin(), E = OldEntry->end(), + NEBI = NewEntry->begin(); OEBI != E; ) + if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++)) + if (isa<ConstantInt>(AI->getArraySize())) + AI->moveBefore(NEBI); + + // Now that we have created a new block, which jumps to the entry + // block, insert a PHI node for each argument of the function. + // For now, we initialize each PHI to only have the real arguments + // which are passed in. + Instruction *InsertPos = OldEntry->begin(); + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I) { + PHINode *PN = PHINode::Create(I->getType(), 2, + I->getName() + ".tr", InsertPos); + I->replaceAllUsesWith(PN); // Everyone use the PHI node now! + PN->addIncoming(I, NewEntry); + ArgumentPHIs.push_back(PN); + } + } + + // If this function has self recursive calls in the tail position where some + // are marked tail and some are not, only transform one flavor or another. We + // have to choose whether we move allocas in the entry block to the new entry + // block or not, so we can't make a good choice for both. NOTE: We could do + // slightly better here in the case that the function has no entry block + // allocas. 
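A source-level analogue of the accumulator case handled in this function, illustrative only: the pending multiply after the recursive call is what CanTransformAccumulatorRecursion detects, and AccumulatorRecursionEliminationInitVal corresponds to the value returned by the non-recursive exits (1 in this example).

// Before: not a tail call, because the multiply is still pending when the
// recursive call returns.
unsigned factorial(unsigned n) {
  if (n == 0)
    return 1;
  return n * factorial(n - 1);
}

// After (conceptually): the pending operation is threaded through an
// accumulator, turning the recursion into a loop.
unsigned factorialAccum(unsigned n) {
  unsigned acc = 1;                 // the accumulator's initial value
  for (; n != 0; --n)
    acc *= n;                       // the associative, commutative operation
  return acc;
}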
+ if (TailCallsAreMarkedTail && !CI->isTailCall()) + return false; + + // Ok, now that we know we have a pseudo-entry block WITH all of the + // required PHI nodes, add entries into the PHI node for the actual + // parameters passed into the tail-recursive call. + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) + ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB); + + // If we are introducing an accumulator variable to eliminate the recursion, + // do so now. Note that we _know_ that no subsequent tail recursion + // eliminations will happen on this function because of the way the + // accumulator recursion predicate is set up. + // + if (AccumulatorRecursionEliminationInitVal) { + Instruction *AccRecInstr = AccumulatorRecursionInstr; + // Start by inserting a new PHI node for the accumulator. + pred_iterator PB = pred_begin(OldEntry), PE = pred_end(OldEntry); + PHINode *AccPN = + PHINode::Create(AccumulatorRecursionEliminationInitVal->getType(), + std::distance(PB, PE) + 1, + "accumulator.tr", OldEntry->begin()); + + // Loop over all of the predecessors of the tail recursion block. For the + // real entry into the function we seed the PHI with the initial value, + // computed earlier. For any other existing branches to this block (due to + // other tail recursions eliminated) the accumulator is not modified. + // Because we haven't added the branch in the current block to OldEntry yet, + // it will not show up as a predecessor. + for (pred_iterator PI = PB; PI != PE; ++PI) { + BasicBlock *P = *PI; + if (P == &F->getEntryBlock()) + AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, P); + else + AccPN->addIncoming(AccPN, P); + } + + if (AccRecInstr) { + // Add an incoming argument for the current block, which is computed by + // our associative and commutative accumulator instruction. + AccPN->addIncoming(AccRecInstr, BB); + + // Next, rewrite the accumulator recursion instruction so that it does not + // use the result of the call anymore, instead, use the PHI node we just + // inserted. + AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN); + } else { + // Add an incoming argument for the current block, which is just the + // constant returned by the current return instruction. + AccPN->addIncoming(Ret->getReturnValue(), BB); + } + + // Finally, rewrite any return instructions in the program to return the PHI + // node instead of the "initval" that they do currently. This loop will + // actually rewrite the return value we are destroying, but that's ok. + for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) + if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator())) + RI->setOperand(0, AccPN); + ++NumAccumAdded; + } + + // Now that all of the PHI nodes are in place, remove the call and + // ret instructions, replacing them with an unconditional branch. + BranchInst *NewBI = BranchInst::Create(OldEntry, Ret); + NewBI->setDebugLoc(CI->getDebugLoc()); + + BB->getInstList().erase(Ret); // Remove return. + BB->getInstList().erase(CI); // Remove call. + ++NumEliminated; + return true; +} + +bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB, + ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail) { + bool Change = false; + + // If the return block contains nothing but the return and PHI's, + // there might be an opportunity to duplicate the return in its + // predecessors and perform TRC there. 
Look for predecessors that end + // in unconditional branch and recursive call(s). + SmallVector<BranchInst*, 8> UncondBranchPreds; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + BasicBlock *Pred = *PI; + TerminatorInst *PTI = Pred->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) + if (BI->isUnconditional()) + UncondBranchPreds.push_back(BI); + } + + while (!UncondBranchPreds.empty()) { + BranchInst *BI = UncondBranchPreds.pop_back_val(); + BasicBlock *Pred = BI->getParent(); + if (CallInst *CI = FindTRECandidate(BI, CannotTailCallElimCallsMarkedTail)){ + DEBUG(dbgs() << "FOLDING: " << *BB + << "INTO UNCOND BRANCH PRED: " << *Pred); + EliminateRecursiveTailCall(CI, FoldReturnIntoUncondBranch(Ret, BB, Pred), + OldEntry, TailCallsAreMarkedTail, ArgumentPHIs, + CannotTailCallElimCallsMarkedTail); + ++NumRetDuped; + Change = true; + } + } + + return Change; +} + +bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail) { + CallInst *CI = FindTRECandidate(Ret, CannotTailCallElimCallsMarkedTail); + if (!CI) + return false; + + return EliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail, + ArgumentPHIs, + CannotTailCallElimCallsMarkedTail); +} |
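FoldReturnAndProcessPred targets a common CFG shape: the recursive call sits in a predecessor that branches unconditionally into a block containing nothing but PHI nodes and the return, so the return is first duplicated into that predecessor (via FoldReturnIntoUncondBranch) and the resulting call/return pair becomes an ordinary TRE candidate. Roughly the source-level shape, illustrative only:

// Both arms fall through to a single shared return; in IR this is a return
// block whose only contents are a PHI and the 'ret'.
int gcd(int a, int b) {
  int r;
  if (b == 0)
    r = a;               // non-recursive predecessor
  else
    r = gcd(b, a % b);   // recursive call in a predecessor ending in an
                         // unconditional branch to the return block
  return r;              // duplicated into each predecessor by the pass
}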