Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar')
27 files changed, 4989 insertions, 2583 deletions
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp index a097308..a3eb07a9 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -83,7 +83,7 @@ bool ADCE::runOnFunction(Function& F) { I->dropAllReferences(); } - for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(), + for (SmallVectorImpl<Instruction *>::iterator I = worklist.begin(), E = worklist.end(); I != E; ++I) { ++NumRemoved; (*I)->eraseFromParent(); diff --git a/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp deleted file mode 100644 index e755008..0000000 --- a/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp +++ /dev/null @@ -1,152 +0,0 @@ -//===-- BasicBlockPlacement.cpp - Basic Block Code Layout optimization ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements a very simple profile guided basic block placement -// algorithm. The idea is to put frequently executed blocks together at the -// start of the function, and hopefully increase the number of fall-through -// conditional branches. If there is no profile information for a particular -// function, this pass basically orders blocks in depth-first order -// -// The algorithm implemented here is basically "Algo1" from "Profile Guided Code -// Positioning" by Pettis and Hansen, except that it uses basic block counts -// instead of edge counts. This should be improved in many ways, but is very -// simple for now. -// -// Basically we "place" the entry block, then loop over all successors in a DFO, -// placing the most frequently executed successor until we run out of blocks. I -// told you this was _extremely_ simplistic. :) This is also much slower than it -// could be. When it becomes important, this pass will be rewritten to use a -// better algorithm, and then we can worry about efficiency. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "block-placement" -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/ProfileInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/Pass.h" -#include "llvm/Support/CFG.h" -#include <set> -using namespace llvm; - -STATISTIC(NumMoved, "Number of basic blocks moved"); - -namespace { - struct BlockPlacement : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - BlockPlacement() : FunctionPass(ID) { - initializeBlockPlacementPass(*PassRegistry::getPassRegistry()); - } - - virtual bool runOnFunction(Function &F); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - AU.addRequired<ProfileInfo>(); - //AU.addPreserved<ProfileInfo>(); // Does this work? - } - private: - /// PI - The profile information that is guiding us. - /// - ProfileInfo *PI; - - /// NumMovedBlocks - Every time we move a block, increment this counter. - /// - unsigned NumMovedBlocks; - - /// PlacedBlocks - Every time we place a block, remember it so we don't get - /// into infinite loops. - std::set<BasicBlock*> PlacedBlocks; - - /// InsertPos - This an iterator to the next place we want to insert a - /// block. 
- Function::iterator InsertPos; - - /// PlaceBlocks - Recursively place the specified blocks and any unplaced - /// successors. - void PlaceBlocks(BasicBlock *BB); - }; -} - -char BlockPlacement::ID = 0; -INITIALIZE_PASS_BEGIN(BlockPlacement, "block-placement", - "Profile Guided Basic Block Placement", false, false) -INITIALIZE_AG_DEPENDENCY(ProfileInfo) -INITIALIZE_PASS_END(BlockPlacement, "block-placement", - "Profile Guided Basic Block Placement", false, false) - -FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); } - -bool BlockPlacement::runOnFunction(Function &F) { - PI = &getAnalysis<ProfileInfo>(); - - NumMovedBlocks = 0; - InsertPos = F.begin(); - - // Recursively place all blocks. - PlaceBlocks(F.begin()); - - PlacedBlocks.clear(); - NumMoved += NumMovedBlocks; - return NumMovedBlocks != 0; -} - - -/// PlaceBlocks - Recursively place the specified blocks and any unplaced -/// successors. -void BlockPlacement::PlaceBlocks(BasicBlock *BB) { - assert(!PlacedBlocks.count(BB) && "Already placed this block!"); - PlacedBlocks.insert(BB); - - // Place the specified block. - if (&*InsertPos != BB) { - // Use splice to move the block into the right place. This avoids having to - // remove the block from the function then readd it, which causes a bunch of - // symbol table traffic that is entirely pointless. - Function::BasicBlockListType &Blocks = BB->getParent()->getBasicBlockList(); - Blocks.splice(InsertPos, Blocks, BB); - - ++NumMovedBlocks; - } else { - // This block is already in the right place, we don't have to do anything. - ++InsertPos; - } - - // Keep placing successors until we run out of ones to place. Note that this - // loop is very inefficient (N^2) for blocks with many successors, like switch - // statements. FIXME! - while (1) { - // Okay, now place any unplaced successors. - succ_iterator SI = succ_begin(BB), E = succ_end(BB); - - // Scan for the first unplaced successor. - for (; SI != E && PlacedBlocks.count(*SI); ++SI) - /*empty*/; - if (SI == E) return; // No more successors to place. - - double MaxExecutionCount = PI->getExecutionCount(*SI); - BasicBlock *MaxSuccessor = *SI; - - // Scan for more frequently executed successors - for (; SI != E; ++SI) - if (!PlacedBlocks.count(*SI)) { - double Count = PI->getExecutionCount(*SI); - if (Count > MaxExecutionCount || - // Prefer to not disturb the code. - (Count == MaxExecutionCount && *SI == &*InsertPos)) { - MaxExecutionCount = Count; - MaxSuccessor = *SI; - } - } - - // Now that we picked the maximally executed successor, place it. - PlaceBlocks(MaxSuccessor); - } -} diff --git a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp index f0d29c8..007e9b7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -22,7 +22,6 @@ #include "llvm/Analysis/DominatorInternals.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/ProfileInfo.h" #include "llvm/Assembly/Writer.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -76,10 +75,10 @@ namespace { class CodeGenPrepare : public FunctionPass { /// TLI - Keep a pointer of a TargetLowering to consult for determining /// transformation profitability. 
+ const TargetMachine *TM; const TargetLowering *TLI; const TargetLibraryInfo *TLInfo; DominatorTree *DT; - ProfileInfo *PFI; /// CurInstIterator - As we scan instructions optimizing them, this is the /// next instruction to optimize. Xforms that can invalidate this should @@ -100,8 +99,8 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - explicit CodeGenPrepare(const TargetLowering *tli = 0) - : FunctionPass(ID), TLI(tli) { + explicit CodeGenPrepare(const TargetMachine *TM = 0) + : FunctionPass(ID), TM(TM), TLI(0) { initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F); @@ -110,7 +109,6 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<DominatorTree>(); - AU.addPreserved<ProfileInfo>(); AU.addRequired<TargetLibraryInfo>(); } @@ -139,17 +137,17 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_PASS_END(CodeGenPrepare, "codegenprepare", "Optimize for code generation", false, false) -FunctionPass *llvm::createCodeGenPreparePass(const TargetLowering *TLI) { - return new CodeGenPrepare(TLI); +FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) { + return new CodeGenPrepare(TM); } bool CodeGenPrepare::runOnFunction(Function &F) { bool EverMadeChange = false; ModifiedDT = false; + if (TM) TLI = TM->getTargetLowering(); TLInfo = &getAnalysis<TargetLibraryInfo>(); DT = getAnalysisIfAvailable<DominatorTree>(); - PFI = getAnalysisIfAvailable<ProfileInfo>(); OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); @@ -205,7 +203,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB)); DeleteDeadBlock(BB); - + for (SmallVectorImpl<BasicBlock*>::iterator II = Successors.begin(), IE = Successors.end(); II != IE; ++II) if (pred_begin(*II) == pred_end(*II)) @@ -440,10 +438,6 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { DT->changeImmediateDominator(DestBB, NewIDom); DT->eraseNode(BB); } - if (PFI) { - PFI->replaceAllUses(BB, DestBB); - PFI->removeEdge(ProfileInfo::getEdge(BB, DestBB)); - } BB->eraseFromParent(); ++NumBlocksElim; @@ -830,7 +824,7 @@ struct ExtAddrMode : public TargetLowering::AddrMode { ExtAddrMode() : BaseReg(0), ScaledReg(0) {} void print(raw_ostream &OS) const; void dump() const; - + bool operator==(const ExtAddrMode& O) const { return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) && (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) && @@ -838,10 +832,12 @@ struct ExtAddrMode : public TargetLowering::AddrMode { } }; +#ifndef NDEBUG static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) { AM.print(OS); return OS; } +#endif void ExtAddrMode::print(raw_ostream &OS) const { bool NeedPlus = false; @@ -866,7 +862,6 @@ void ExtAddrMode::print(raw_ostream &OS) const { OS << (NeedPlus ? " + " : "") << Scale << "*"; WriteAsOperand(OS, ScaledReg, /*PrintType=*/false); - NeedPlus = true; } OS << ']'; @@ -891,16 +886,16 @@ class AddressingModeMatcher { /// the memory instruction that we're computing this address for. Type *AccessTy; Instruction *MemoryInst; - + /// AddrMode - This is the addressing mode that we're building up. This is /// part of the return value of this addressing mode matching stuff. ExtAddrMode &AddrMode; - + /// IgnoreProfitability - This is set to true when we should not do /// profitability checks. When true, IsProfitableToFoldIntoAddressingMode /// always returns true. 
bool IgnoreProfitability; - + AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI, const TargetLowering &T, Type *AT, Instruction *MI, ExtAddrMode &AM) @@ -908,7 +903,7 @@ class AddressingModeMatcher { IgnoreProfitability = false; } public: - + /// Match - Find the maximal addressing mode that a load/store of V can fold, /// give an access type of AccessTy. This returns a list of involved /// instructions in AddrModeInsts. @@ -918,7 +913,7 @@ public: const TargetLowering &TLI) { ExtAddrMode Result; - bool Success = + bool Success = AddressingModeMatcher(AddrModeInsts, TLI, AccessTy, MemoryInst, Result).MatchAddr(V, 0); (void)Success; assert(Success && "Couldn't select *anything*?"); @@ -943,11 +938,11 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, // mode. Just process that directly. if (Scale == 1) return MatchAddr(ScaleReg, Depth); - + // If the scale is 0, it takes nothing to add this. if (Scale == 0) return true; - + // If we already have a scale of this value, we can add to it, otherwise, we // need an available scale field. if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg) @@ -966,7 +961,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, // It was legal, so commit it. AddrMode = TestAddrMode; - + // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now // to see if ScaleReg is actually X+C. If so, we can turn this into adding // X*Scale + C*Scale to addr mode. @@ -975,7 +970,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale, match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) { TestAddrMode.ScaledReg = AddLHS; TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale; - + // If this addressing mode is legal, commit it and remember that we folded // this instruction. if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) { @@ -1026,7 +1021,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth) { // Avoid exponential behavior on extremely deep expression trees. if (Depth >= 5) return false; - + switch (Opcode) { case Instruction::PtrToInt: // PtrToInt is always a noop, as we know that the int type is pointer sized. @@ -1034,7 +1029,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, case Instruction::IntToPtr: // This inttoptr is a no-op if the integer type is pointer sized. if (TLI.getValueType(AddrInst->getOperand(0)->getType()) == - TLI.getPointerTy()) + TLI.getPointerTy(AddrInst->getType()->getPointerAddressSpace())) return MatchAddr(AddrInst->getOperand(0), Depth); return false; case Instruction::BitCast: @@ -1055,16 +1050,16 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, if (MatchAddr(AddrInst->getOperand(1), Depth+1) && MatchAddr(AddrInst->getOperand(0), Depth+1)) return true; - + // Restore the old addr mode info. AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); - + // Otherwise this was over-aggressive. Try merging in the LHS then the RHS. if (MatchAddr(AddrInst->getOperand(0), Depth+1) && MatchAddr(AddrInst->getOperand(1), Depth+1)) return true; - + // Otherwise we definitely can't merge the ADD in. 
AddrMode = BackupAddrMode; AddrModeInsts.resize(OldSize); @@ -1081,7 +1076,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, int64_t Scale = RHS->getSExtValue(); if (Opcode == Instruction::Shl) Scale = 1LL << Scale; - + return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth); } case Instruction::GetElementPtr: { @@ -1089,7 +1084,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, // one variable offset. int VariableOperand = -1; unsigned VariableScale = 0; - + int64_t ConstantOffset = 0; const DataLayout *TD = TLI.getDataLayout(); gep_type_iterator GTI = gep_type_begin(AddrInst); @@ -1107,14 +1102,14 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode, // We only allow one variable index at the moment. if (VariableOperand != -1) return false; - + // Remember the variable index. VariableOperand = i; VariableScale = TypeSize; } } } - + // A common case is for the GEP to only do a constant offset. In this case, // just add it to the disp field and check validity. if (VariableOperand == -1) { @@ -1208,7 +1203,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) { AddrModeInsts.push_back(I); return true; } - + // It isn't profitable to do this, roll back. //cerr << "NOT FOLDING: " << *I; AddrMode = BackupAddrMode; @@ -1254,7 +1249,7 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI)); for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; - + // Compute the constraint code and ConstraintType to use. TLI.ComputeConstraintToUse(OpInfo, SDValue()); @@ -1279,7 +1274,7 @@ static bool FindAllMemoryUses(Instruction *I, // If we already considered this instruction, we're done. if (!ConsideredInsts.insert(I)) return false; - + // If this is an obviously unfoldable instruction, bail out. if (!MightBeFoldableInst(I)) return true; @@ -1293,24 +1288,24 @@ static bool FindAllMemoryUses(Instruction *I, MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo())); continue; } - + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { unsigned opNo = UI.getOperandNo(); if (opNo == 0) return true; // Storing addr, not into addr. MemoryUses.push_back(std::make_pair(SI, opNo)); continue; } - + if (CallInst *CI = dyn_cast<CallInst>(U)) { InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue()); if (!IA) return true; - + // If this is a memory operand, we're cool, otherwise bail out. if (!IsOperandAMemoryOperand(CI, IA, I, TLI)) return true; continue; } - + if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts, TLI)) return true; @@ -1328,17 +1323,17 @@ bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1, // If Val is either of the known-live values, we know it is live! if (Val == 0 || Val == KnownLive1 || Val == KnownLive2) return true; - + // All values other than instructions and arguments (e.g. constants) are live. if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true; - + // If Val is a constant sized alloca in the entry block, it is live, this is // true because it is just a reference to the stack/frame pointer, which is // live for the whole function. if (AllocaInst *AI = dyn_cast<AllocaInst>(Val)) if (AI->isStaticAlloca()) return true; - + // Check to see if this value is already used in the memory instruction's // block. 
If so, it's already live into the block at the very least, so we // can reasonably fold it. @@ -1370,7 +1365,7 @@ bool AddressingModeMatcher:: IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter) { if (IgnoreProfitability) return true; - + // AMBefore is the addressing mode before this instruction was folded into it, // and AMAfter is the addressing mode after the instruction was folded. Get // the set of registers referenced by AMAfter and subtract out those @@ -1381,7 +1376,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, // BaseReg and ScaleReg (global addresses are always available, as are any // folded immediates). Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg; - + // If the BaseReg or ScaledReg was referenced by the previous addrmode, their // lifetime wasn't extended by adding this instruction. if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg)) @@ -1402,7 +1397,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, SmallPtrSet<Instruction*, 16> ConsideredInsts; if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI)) return false; // Has a non-memory, non-foldable use! - + // Now that we know that all uses of this instruction are part of a chain of // computation involving only operations that could theoretically be folded // into a memory use, loop over each of these uses and see if they could @@ -1411,15 +1406,14 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) { Instruction *User = MemoryUses[i].first; unsigned OpNo = MemoryUses[i].second; - + // Get the access type of this use. If the use isn't a pointer, we don't // know what it accesses. Value *Address = User->getOperand(OpNo); if (!Address->getType()->isPointerTy()) return false; - Type *AddressAccessTy = - cast<PointerType>(Address->getType())->getElementType(); - + Type *AddressAccessTy = Address->getType()->getPointerElementType(); + // Do a match against the root of this address, ignoring profitability. This // will tell us if the addressing mode for the memory operation will // *actually* cover the shared instruction. @@ -1434,10 +1428,10 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore, if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(), I) == MatchedAddrModeInsts.end()) return false; - + MatchedAddrModeInsts.clear(); } - + return true; } @@ -1572,9 +1566,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, } else { DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); - Type *IntPtrTy = - TLI->getDataLayout()->getIntPtrType(AccessTy->getContext()); - + Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType()); Value *Result = 0; // Start with the base register. Do this first so that subsequent address @@ -1893,7 +1885,8 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { // It is possible for very late stage optimizations (such as SimplifyCFG) // to introduce PHI nodes too late to be cleaned up. If we detect such a // trivial PHI, go ahead and zap it here. - if (Value *V = SimplifyInstruction(P)) { + if (Value *V = SimplifyInstruction(P, TLI ? 
TLI->getDataLayout() : 0, + TLInfo, DT)) { P->replaceAllUsesWith(V); P->eraseFromParent(); ++NumPHIsElim; diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 3c08634..5266894 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -72,11 +72,6 @@ namespace { } namespace llvm { -// SimpleValue is POD. -template<> struct isPodLike<SimpleValue> { - static const bool value = true; -}; - template<> struct DenseMapInfo<SimpleValue> { static inline SimpleValue getEmptyKey() { return DenseMapInfo<Instruction*>::getEmptyKey(); @@ -220,11 +215,6 @@ namespace { } namespace llvm { - // CallValue is POD. - template<> struct isPodLike<CallValue> { - static const bool value = true; - }; - template<> struct DenseMapInfo<CallValue> { static inline CallValue getEmptyKey() { return DenseMapInfo<Instruction*>::getEmptyKey(); diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp new file mode 100644 index 0000000..e7de07f --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -0,0 +1,79 @@ +//===- FlattenCFGPass.cpp - CFG Flatten Pass ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements flattening of CFG. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "flattencfg" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Pass.h" +#include "llvm/Support/CFG.h" +#include "llvm/Transforms/Utils/Local.h" +using namespace llvm; + +namespace { +struct FlattenCFGPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid +public: + FlattenCFGPass() : FunctionPass(ID) { + initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F); + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AliasAnalysis>(); + } + +private: + AliasAnalysis *AA; +}; +} + +char FlattenCFGPass::ID = 0; +INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, + false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, + false) + +// Public interface to the FlattenCFG pass +FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); } + +/// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function, +/// iterating until no more changes are made. +static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { + bool Changed = false; + bool LocalChange = true; + while (LocalChange) { + LocalChange = false; + + // Loop over all of the basic blocks and remove them if they are unneeded... + // + for (Function::iterator BBIt = F.begin(); BBIt != F.end();) { + if (FlattenCFG(BBIt++, AA)) { + LocalChange = true; + } + } + Changed |= LocalChange; + } + return Changed; +} + +bool FlattenCFGPass::runOnFunction(Function &F) { + AA = &getAnalysis<AliasAnalysis>(); + bool EverChanged = false; + // iterativelyFlattenCFG can make some blocks dead. 
+ while (iterativelyFlattenCFG(F, AA)) { + removeUnreachableBlocks(F); + EverChanged = true; + } + return EverChanged; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index f350b9b..6af269d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -21,8 +21,10 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -45,6 +47,7 @@ #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include <vector> using namespace llvm; using namespace PatternMatch; @@ -505,7 +508,9 @@ namespace { enum ValType { SimpleVal, // A simple offsetted value that is accessed. LoadVal, // A value produced by a load. - MemIntrin // A memory intrinsic which is loaded from. + MemIntrin, // A memory intrinsic which is loaded from. + UndefVal // A UndefValue representing a value from dead block (which + // is not yet physically removed from the CFG). }; /// V - The value that is live out of the block. @@ -543,10 +548,20 @@ namespace { Res.Offset = Offset; return Res; } - + + static AvailableValueInBlock getUndef(BasicBlock *BB) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(0); + Res.Val.setInt(UndefVal); + Res.Offset = 0; + return Res; + } + bool isSimpleValue() const { return Val.getInt() == SimpleVal; } bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } + bool isUndefValue() const { return Val.getInt() == UndefVal; } Value *getSimpleValue() const { assert(isSimpleValue() && "Wrong accessor"); @@ -574,6 +589,7 @@ namespace { DominatorTree *DT; const DataLayout *TD; const TargetLibraryInfo *TLI; + SetVector<BasicBlock *> DeadBlocks; ValueTable VN; @@ -692,9 +708,13 @@ namespace { void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); + BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ); unsigned replaceAllDominatedUsesWith(Value *From, Value *To, const BasicBlockEdge &Root); bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root); + bool processFoldableCondBr(BranchInst *BI); + void addDeadBlock(BasicBlock *BB); + void assignValNumForDeadCode(); }; char GVN::ID = 0; @@ -1068,14 +1088,15 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, if (Offset == -1) return Offset; + unsigned AS = Src->getType()->getPointerAddressSpace(); // Otherwise, see if we can constant fold a load from the constant with the // offset applied as appropriate. 
Src = ConstantExpr::getBitCast(Src, - llvm::Type::getInt8PtrTy(Src->getContext())); + Type::getInt8PtrTy(Src->getContext(), AS)); Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); - Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); + Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); if (ConstantFoldLoadFromConstPtr(Src, &TD)) return Offset; return -1; @@ -1152,7 +1173,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *DestPTy = IntegerType::get(LoadTy->getContext(), NewLoadSize*8); DestPTy = PointerType::get(DestPTy, - cast<PointerType>(PtrVal->getType())->getAddressSpace()); + PtrVal->getType()->getPointerAddressSpace()); Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc()); PtrVal = Builder.CreateBitCast(PtrVal, DestPTy); LoadInst *NewLoad = Builder.CreateLoad(PtrVal); @@ -1227,15 +1248,16 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, // Otherwise, this is a memcpy/memmove from a constant global. MemTransferInst *MTI = cast<MemTransferInst>(SrcInst); Constant *Src = cast<Constant>(MTI->getSource()); + unsigned AS = Src->getType()->getPointerAddressSpace(); // Otherwise, see if we can constant fold a load from the constant with the // offset applied as appropriate. Src = ConstantExpr::getBitCast(Src, - llvm::Type::getInt8PtrTy(Src->getContext())); + Type::getInt8PtrTy(Src->getContext(), AS)); Constant *OffsetCst = - ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); + ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); - Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); + Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); return ConstantFoldLoadFromConstPtr(Src, &TD); } @@ -1250,8 +1272,10 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, // just use the dominating value directly. if (ValuesPerBlock.size() == 1 && gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB, - LI->getParent())) + LI->getParent())) { + assert(!ValuesPerBlock[0].isUndefValue() && "Dead BB dominate this block"); return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn); + } // Otherwise, we have to construct SSA form. SmallVector<PHINode*, 8> NewPHIs; @@ -1321,7 +1345,7 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c << *getCoercedLoadValue() << '\n' << *Res << '\n' << "\n\n\n"); } - } else { + } else if (isMemIntrinValue()) { const DataLayout *TD = gvn.getDataLayout(); assert(TD && "Need target data to handle type mismatch case"); Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, @@ -1329,6 +1353,10 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset << " " << *getMemIntrinValue() << '\n' << *Res << '\n' << "\n\n\n"); + } else { + assert(isUndefValue() && "Should be UndefVal"); + DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";); + return UndefValue::get(LoadTy); } return Res; } @@ -1352,6 +1380,13 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); + if (DeadBlocks.count(DepBB)) { + // Dead dependent mem-op disguise as a load evaluating the same value + // as the load in question. 
+ ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB)); + continue; + } + if (!DepInfo.isDef() && !DepInfo.isClobber()) { UnavailableBlocks.push_back(DepBB); continue; @@ -1513,7 +1548,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) FullyAvailableBlocks[UnavailableBlocks[i]] = false; - SmallVector<std::pair<TerminatorInst*, unsigned>, 4> NeedToSplit; + SmallVector<BasicBlock *, 4> CriticalEdgePred; for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E; ++PI) { BasicBlock *Pred = *PI; @@ -1536,20 +1571,14 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, return false; } - unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB); - NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum)); + CriticalEdgePred.push_back(Pred); } } - if (!NeedToSplit.empty()) { - toSplit.append(NeedToSplit.begin(), NeedToSplit.end()); - return false; - } - // Decide whether PRE is profitable for this load. unsigned NumUnavailablePreds = PredLoads.size(); assert(NumUnavailablePreds != 0 && - "Fully available value should be eliminated above!"); + "Fully available value should already be eliminated!"); // If this load is unavailable in multiple predecessors, reject it. // FIXME: If we could restructure the CFG, we could make a common pred with @@ -1558,6 +1587,17 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (NumUnavailablePreds != 1) return false; + // Split critical edges, and update the unavailable predecessors accordingly. + for (SmallVectorImpl<BasicBlock *>::iterator I = CriticalEdgePred.begin(), + E = CriticalEdgePred.end(); I != E; I++) { + BasicBlock *OrigPred = *I; + BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB); + PredLoads.erase(OrigPred); + PredLoads[NewPred] = 0; + DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->" + << LoadBB->getName() << '\n'); + } + // Check if the load can safely be moved to all the unavailable predecessors. bool CanDoPRE = true; SmallVector<Instruction*, 8> NewInsts; @@ -1594,7 +1634,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (MD) MD->removeInstruction(I); I->eraseFromParent(); } - return false; + // HINT:Don't revert the edge-splitting as following transformation may + // also need to split these critial edges. + return !CriticalEdgePred.empty(); } // Okay, we can eliminate this load by inserting a reload in the predecessor @@ -2181,11 +2223,13 @@ bool GVN::processInstruction(Instruction *I) { // For conditional branches, we can perform simple conditional propagation on // the condition value itself. if (BranchInst *BI = dyn_cast<BranchInst>(I)) { - if (!BI->isConditional() || isa<Constant>(BI->getCondition())) + if (!BI->isConditional()) return false; - Value *BranchCond = BI->getCondition(); + if (isa<Constant>(BI->getCondition())) + return processFoldableCondBr(BI); + Value *BranchCond = BI->getCondition(); BasicBlock *TrueSucc = BI->getSuccessor(0); BasicBlock *FalseSucc = BI->getSuccessor(1); // Avoid multiple edges early. @@ -2297,25 +2341,30 @@ bool GVN::runOnFunction(Function& F) { while (ShouldContinue) { DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n"); ShouldContinue = iterateOnFunction(F); - if (splitCriticalEdges()) - ShouldContinue = true; Changed |= ShouldContinue; ++Iteration; } if (EnablePRE) { + // Fabricate val-num for dead-code in order to suppress assertion in + // performPRE(). 
+ assignValNumForDeadCode(); bool PREChanged = true; while (PREChanged) { PREChanged = performPRE(F); Changed |= PREChanged; } } + // FIXME: Should perform GVN again after PRE does something. PRE can move // computations into blocks where they become fully redundant. Note that // we can't do this until PRE's critical edge splitting updates memdep. // Actually, when this happens, we should just fully integrate PRE into GVN. cleanupGlobalSets(); + // Do not cleanup DeadBlocks in cleanupGlobalSets() as it's called for each + // iteration. + DeadBlocks.clear(); return Changed; } @@ -2326,6 +2375,9 @@ bool GVN::processBlock(BasicBlock *BB) { // (and incrementing BI before processing an instruction). assert(InstrsToErase.empty() && "We expect InstrsToErase to be empty across iterations"); + if (DeadBlocks.count(BB)) + return false; + bool ChangedFunction = false; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); @@ -2344,7 +2396,7 @@ bool GVN::processBlock(BasicBlock *BB) { if (!AtStart) --BI; - for (SmallVector<Instruction*, 4>::iterator I = InstrsToErase.begin(), + for (SmallVectorImpl<Instruction *>::iterator I = InstrsToErase.begin(), E = InstrsToErase.end(); I != E; ++I) { DEBUG(dbgs() << "GVN removed: " << **I << '\n'); if (MD) MD->removeInstruction(*I); @@ -2543,6 +2595,15 @@ bool GVN::performPRE(Function &F) { return Changed; } +/// Split the critical edge connecting the given two blocks, and return +/// the block inserted to the critical edge. +BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { + BasicBlock *BB = SplitCriticalEdge(Pred, Succ, this); + if (MD) + MD->invalidateCachedPredecessors(); + return BB; +} + /// splitCriticalEdges - Split critical edges found during the previous /// iteration that may enable further optimization. bool GVN::splitCriticalEdges() { @@ -2569,9 +2630,18 @@ bool GVN::iterateOnFunction(Function &F) { RE = RPOT.end(); RI != RE; ++RI) Changed |= processBlock(*RI); #else + // Save the blocks this function have before transformation begins. GVN may + // split critical edge, and hence may invalidate the RPO/DT iterator. + // + std::vector<BasicBlock *> BBVect; + BBVect.reserve(256); for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()), DE = df_end(DT->getRootNode()); DI != DE; ++DI) - Changed |= processBlock(DI->getBlock()); + BBVect.push_back(DI->getBlock()); + + for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end(); + I != E; I++) + Changed |= processBlock(*I); #endif return Changed; @@ -2601,3 +2671,133 @@ void GVN::verifyRemoved(const Instruction *Inst) const { } } } + +// BB is declared dead, which implied other blocks become dead as well. This +// function is to add all these blocks to "DeadBlocks". For the dead blocks' +// live successors, update their phi nodes by replacing the operands +// corresponding to dead blocks with UndefVal. +// +void GVN::addDeadBlock(BasicBlock *BB) { + SmallVector<BasicBlock *, 4> NewDead; + SmallSetVector<BasicBlock *, 4> DF; + + NewDead.push_back(BB); + while (!NewDead.empty()) { + BasicBlock *D = NewDead.pop_back_val(); + if (DeadBlocks.count(D)) + continue; + + // All blocks dominated by D are dead. + SmallVector<BasicBlock *, 8> Dom; + DT->getDescendants(D, Dom); + DeadBlocks.insert(Dom.begin(), Dom.end()); + + // Figure out the dominance-frontier(D). 
+ for (SmallVectorImpl<BasicBlock *>::iterator I = Dom.begin(), + E = Dom.end(); I != E; I++) { + BasicBlock *B = *I; + for (succ_iterator SI = succ_begin(B), SE = succ_end(B); SI != SE; SI++) { + BasicBlock *S = *SI; + if (DeadBlocks.count(S)) + continue; + + bool AllPredDead = true; + for (pred_iterator PI = pred_begin(S), PE = pred_end(S); PI != PE; PI++) + if (!DeadBlocks.count(*PI)) { + AllPredDead = false; + break; + } + + if (!AllPredDead) { + // S could be proved dead later on. That is why we don't update phi + // operands at this moment. + DF.insert(S); + } else { + // While S is not dominated by D, it is dead by now. This could take + // place if S already have a dead predecessor before D is declared + // dead. + NewDead.push_back(S); + } + } + } + } + + // For the dead blocks' live successors, update their phi nodes by replacing + // the operands corresponding to dead blocks with UndefVal. + for(SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end(); + I != E; I++) { + BasicBlock *B = *I; + if (DeadBlocks.count(B)) + continue; + + SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B)); + for (SmallVectorImpl<BasicBlock *>::iterator PI = Preds.begin(), + PE = Preds.end(); PI != PE; PI++) { + BasicBlock *P = *PI; + + if (!DeadBlocks.count(P)) + continue; + + if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) { + if (BasicBlock *S = splitCriticalEdges(P, B)) + DeadBlocks.insert(P = S); + } + + for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) { + PHINode &Phi = cast<PHINode>(*II); + Phi.setIncomingValue(Phi.getBasicBlockIndex(P), + UndefValue::get(Phi.getType())); + } + } + } +} + +// If the given branch is recognized as a foldable branch (i.e. conditional +// branch with constant condition), it will perform following analyses and +// transformation. +// 1) If the dead out-coming edge is a critical-edge, split it. Let +// R be the target of the dead out-coming edge. +// 1) Identify the set of dead blocks implied by the branch's dead outcoming +// edge. The result of this step will be {X| X is dominated by R} +// 2) Identify those blocks which haves at least one dead prodecessor. The +// result of this step will be dominance-frontier(R). +// 3) Update the PHIs in DF(R) by replacing the operands corresponding to +// dead blocks with "UndefVal" in an hope these PHIs will optimized away. +// +// Return true iff *NEW* dead code are found. +bool GVN::processFoldableCondBr(BranchInst *BI) { + if (!BI || BI->isUnconditional()) + return false; + + ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition()); + if (!Cond) + return false; + + BasicBlock *DeadRoot = Cond->getZExtValue() ? + BI->getSuccessor(1) : BI->getSuccessor(0); + if (DeadBlocks.count(DeadRoot)) + return false; + + if (!DeadRoot->getSinglePredecessor()) + DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot); + + addDeadBlock(DeadRoot); + return true; +} + +// performPRE() will trigger assert if it come across an instruciton without +// associated val-num. As it normally has far more live instructions than dead +// instructions, it makes more sense just to "fabricate" a val-number for the +// dead code than checking if instruction involved is dead or not. 
+void GVN::assignValNumForDeadCode() { + for (SetVector<BasicBlock *>::iterator I = DeadBlocks.begin(), + E = DeadBlocks.end(); I != E; I++) { + BasicBlock *BB = *I; + for (BasicBlock::iterator II = BB->begin(), EE = BB->end(); + II != EE; II++) { + Instruction *Inst = &*II; + unsigned ValNum = VN.lookup_or_add(Inst); + addToLeaderTable(ValNum, Inst, BB); + } + } +} diff --git a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp index 4796eb2..954e545 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp @@ -72,15 +72,13 @@ using namespace llvm; static cl::opt<bool> EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden, - cl::desc("Enable global merge pass on constants"), - cl::init(false)); + cl::desc("Enable global merge pass on constants"), + cl::init(false)); STATISTIC(NumMerged , "Number of globals merged"); namespace { class GlobalMerge : public FunctionPass { - /// TLI - Keep a pointer of a TargetLowering to consult for determining - /// target type sizes. - const TargetLowering *TLI; + const TargetMachine *TM; bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const; @@ -104,8 +102,8 @@ namespace { public: static char ID; // Pass identification, replacement for typeid. - explicit GlobalMerge(const TargetLowering *tli = 0) - : FunctionPass(ID), TLI(tli) { + explicit GlobalMerge(const TargetMachine *TM = 0) + : FunctionPass(ID), TM(TM) { initializeGlobalMergePass(*PassRegistry::getPassRegistry()); } @@ -144,6 +142,7 @@ INITIALIZE_PASS(GlobalMerge, "global-merge", bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const { + const TargetLowering *TLI = TM->getTargetLowering(); const DataLayout *TD = TLI->getDataLayout(); // FIXME: Infer the maximum possible offset depending on the actual users @@ -234,6 +233,7 @@ void GlobalMerge::setMustKeepGlobalVariables(Module &M) { bool GlobalMerge::doInitialization(Module &M) { DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals, BSSGlobals; + const TargetLowering *TLI = TM->getTargetLowering(); const DataLayout *TD = TLI->getDataLayout(); unsigned MaxOffset = TLI->getMaximalGlobalOffset(); bool Changed = false; @@ -305,6 +305,6 @@ bool GlobalMerge::doFinalization(Module &M) { return false; } -Pass *llvm::createGlobalMergePass(const TargetLowering *tli) { - return new GlobalMerge(tli); +Pass *llvm::createGlobalMergePass(const TargetMachine *TM) { + return new GlobalMerge(TM); } diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 8e76c78..235aaaa 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -532,7 +532,8 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // and varies predictably *inside* the loop. Evaluate the value it // contains when the loop exits, if possible. 
const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop()); - if (!SE->isLoopInvariant(ExitValue, L)) + if (!SE->isLoopInvariant(ExitValue, L) || + !isSafeToExpand(ExitValue, *SE)) continue; // Computing the value outside of the loop brings no benefit if : @@ -1479,8 +1480,14 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, if (IndVar->getType()->isPointerTy() && !IVCount->getType()->isPointerTy()) { + // IVOffset will be the new GEP offset that is interpreted by GEP as a + // signed value. IVCount on the other hand represents the loop trip count, + // which is an unsigned value. FindLoopCounter only allows induction + // variables that have a positive unit stride of one. This means we don't + // have to handle the case of negative offsets (yet) and just need to zero + // extend IVCount. Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType()); - const SCEV *IVOffset = SE->getTruncateOrSignExtend(IVCount, OfsTy); + const SCEV *IVOffset = SE->getTruncateOrZeroExtend(IVCount, OfsTy); // Expand the code for the iteration count. assert(SE->isLoopInvariant(IVOffset, L) && @@ -1492,7 +1499,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, assert(AR->getStart() == SE->getSCEV(GEPBase) && "bad loop counter"); // We could handle pointer IVs other than i8*, but we need to compensate for // gep index scaling. See canExpandBackedgeTakenCount comments. - assert(SE->getSizeOfExpr( + assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()), cast<PointerType>(GEPBase->getType())->getElementType())->isOne() && "unit stride pointer IV must be i8*"); @@ -1506,9 +1513,10 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc). // // Valid Cases: (1) both integers is most common; (2) both may be pointers - // for simple memset-style loops; (3) IVInit is an integer and IVCount is a - // pointer may occur when enable-iv-rewrite generates a canonical IV on top - // of case #2. + // for simple memset-style loops. + // + // IVInit integer and IVCount pointer would only occur if a canonical IV + // were generated on top of case #2, which is not expected. const SCEV *IVLimit = 0; // For unit stride, IVCount = Start + BECount with 2's complement overflow. @@ -1552,44 +1560,23 @@ LinearFunctionTestReplace(Loop *L, SCEVExpander &Rewriter) { assert(canExpandBackedgeTakenCount(L, SE) && "precondition"); - // LFTR can ignore IV overflow and truncate to the width of - // BECount. This avoids materializing the add(zext(add)) expression. - Type *CntTy = BackedgeTakenCount->getType(); - + // Initialize CmpIndVar and IVCount to their preincremented values. + Value *CmpIndVar = IndVar; const SCEV *IVCount = BackedgeTakenCount; // If the exiting block is the same as the backedge block, we prefer to // compare against the post-incremented value, otherwise we must compare // against the preincremented value. - Value *CmpIndVar; if (L->getExitingBlock() == L->getLoopLatch()) { // Add one to the "backedge-taken" count to get the trip count. - // If this addition may overflow, we have to be more pessimistic and - // cast the induction variable before doing the add. 
- const SCEV *N = - SE->getAddExpr(IVCount, SE->getConstant(IVCount->getType(), 1)); - if (CntTy == IVCount->getType()) - IVCount = N; - else { - const SCEV *Zero = SE->getConstant(IVCount->getType(), 0); - if ((isa<SCEVConstant>(N) && !N->isZero()) || - SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) { - // No overflow. Cast the sum. - IVCount = SE->getTruncateOrZeroExtend(N, CntTy); - } else { - // Potential overflow. Cast before doing the add. - IVCount = SE->getTruncateOrZeroExtend(IVCount, CntTy); - IVCount = SE->getAddExpr(IVCount, SE->getConstant(CntTy, 1)); - } - } + // This addition may overflow, which is valid as long as the comparison is + // truncated to BackedgeTakenCount->getType(). + IVCount = SE->getAddExpr(BackedgeTakenCount, + SE->getConstant(BackedgeTakenCount->getType(), 1)); // The BackedgeTaken expression contains the number of times that the // backedge branches to the loop header. This is one less than the // number of times the loop executes, so use the incremented indvar. CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock()); - } else { - // We must use the preincremented value... - IVCount = SE->getTruncateOrZeroExtend(IVCount, CntTy); - CmpIndVar = IndVar; } Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE); @@ -1612,12 +1599,40 @@ LinearFunctionTestReplace(Loop *L, << " IVCount:\t" << *IVCount << "\n"); IRBuilder<> Builder(BI); - if (SE->getTypeSizeInBits(CmpIndVar->getType()) - > SE->getTypeSizeInBits(ExitCnt->getType())) { - CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(), - "lftr.wideiv"); - } + // LFTR can ignore IV overflow and truncate to the width of + // BECount. This avoids materializing the add(zext(add)) expression. + unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType()); + unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType()); + if (CmpIndVarSize > ExitCntSize) { + const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); + const SCEV *ARStart = AR->getStart(); + const SCEV *ARStep = AR->getStepRecurrence(*SE); + // For constant IVCount, avoid truncation. + if (isa<SCEVConstant>(ARStart) && isa<SCEVConstant>(IVCount)) { + const APInt &Start = cast<SCEVConstant>(ARStart)->getValue()->getValue(); + APInt Count = cast<SCEVConstant>(IVCount)->getValue()->getValue(); + // Note that the post-inc value of BackedgeTakenCount may have overflowed + // above such that IVCount is now zero. 
+ if (IVCount != BackedgeTakenCount && Count == 0) { + Count = APInt::getMaxValue(Count.getBitWidth()).zext(CmpIndVarSize); + ++Count; + } + else + Count = Count.zext(CmpIndVarSize); + APInt NewLimit; + if (cast<SCEVConstant>(ARStep)->getValue()->isNegative()) + NewLimit = Start - Count; + else + NewLimit = Start + Count; + ExitCnt = ConstantInt::get(CmpIndVar->getType(), NewLimit); + + DEBUG(dbgs() << " Widen RHS:\t" << *ExitCnt << "\n"); + } else { + CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(), + "lftr.wideiv"); + } + } Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond"); Value *OrigCond = BI->getCondition(); // It's tempting to use replaceAllUsesWith here to fully replace the old diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index b61c5ba..b3ec2fc 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" @@ -129,6 +130,7 @@ namespace { bool ProcessBranchOnXOR(BinaryOperator *BO); bool SimplifyPartiallyRedundantLoad(LoadInst *LI); + bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB); }; } @@ -775,7 +777,11 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { return true; } } + } + + if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB)) + return true; } // Check for some cases that are worth simplifying. Right now we want to look @@ -821,7 +827,6 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { return false; } - /// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant /// load instruction, eliminate it by replacing it with a PHI node. This is an /// important optimization that encourages jump threading, and needs to be run @@ -836,6 +841,12 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (LoadBB->getSinglePredecessor()) return false; + // If the load is defined in a landing pad, it can't be partially redundant, + // because the edges between the invoke and the landing pad cannot have other + // instructions between them. + if (LoadBB->isLandingPad()) + return false; + Value *LoadedPtr = LI->getOperand(0); // If the loaded operand is defined in the LoadBB, it can't be available. @@ -1615,4 +1626,80 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, return true; } +/// TryToUnfoldSelect - Look for blocks of the form +/// bb1: +/// %a = select +/// br bb +/// +/// bb2: +/// %p = phi [%a, %bb] ... +/// %c = icmp %p +/// br i1 %c +/// +/// And expand the select into a branch structure if one of its arms allows %c +/// to be folded. This later enables threading from bb1 over bb2. 
+bool JumpThreading::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { + BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); + PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0)); + Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1)); + + if (!CondBr || !CondBr->isConditional() || !CondLHS || + CondLHS->getParent() != BB) + return false; + + for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) { + BasicBlock *Pred = CondLHS->getIncomingBlock(I); + SelectInst *SI = dyn_cast<SelectInst>(CondLHS->getIncomingValue(I)); + // Look if one of the incoming values is a select in the corresponding + // predecessor. + if (!SI || SI->getParent() != Pred || !SI->hasOneUse()) + continue; + + BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator()); + if (!PredTerm || !PredTerm->isUnconditional()) + continue; + + // Now check if one of the select values would allow us to constant fold the + // terminator in BB. We don't do the transform if both sides fold, those + // cases will be threaded in any case. + LazyValueInfo::Tristate LHSFolds = + LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1), + CondRHS, Pred, BB); + LazyValueInfo::Tristate RHSFolds = + LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2), + CondRHS, Pred, BB); + if ((LHSFolds != LazyValueInfo::Unknown || + RHSFolds != LazyValueInfo::Unknown) && + LHSFolds != RHSFolds) { + // Expand the select. + // + // Pred -- + // | v + // | NewBB + // | | + // |----- + // v + // BB + BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold", + BB->getParent(), BB); + // Move the unconditional branch to NewBB. + PredTerm->removeFromParent(); + NewBB->getInstList().insert(NewBB->end(), PredTerm); + // Create a conditional branch and update PHI nodes. + BranchInst::Create(NewBB, BB, SI->getCondition(), Pred); + CondLHS->setIncomingValue(I, SI->getFalseValue()); + CondLHS->addIncoming(SI->getTrueValue(), NewBB); + // The select is now dead. + SI->eraseFromParent(); + + // Update any other PHI nodes in BB. + for (BasicBlock::iterator BI = BB->begin(); + PHINode *Phi = dyn_cast<PHINode>(BI); ++BI) + if (Phi != CondLHS) + Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB); + return true; + } + } + return false; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 0b62050..9e39d2e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -51,8 +51,8 @@ namespace { } private: - bool isLoopDead(Loop *L, SmallVector<BasicBlock*, 4> &exitingBlocks, - SmallVector<BasicBlock*, 4> &exitBlocks, + bool isLoopDead(Loop *L, SmallVectorImpl<BasicBlock *> &exitingBlocks, + SmallVectorImpl<BasicBlock *> &exitBlocks, bool &Changed, BasicBlock *Preheader); }; @@ -77,8 +77,8 @@ Pass *llvm::createLoopDeletionPass() { /// checked for unique exit and exiting blocks, and that the code is in LCSSA /// form. bool LoopDeletion::isLoopDead(Loop *L, - SmallVector<BasicBlock*, 4> &exitingBlocks, - SmallVector<BasicBlock*, 4> &exitBlocks, + SmallVectorImpl<BasicBlock *> &exitingBlocks, + SmallVectorImpl<BasicBlock *> &exitBlocks, bool &Changed, BasicBlock *Preheader) { BasicBlock *exitBlock = exitBlocks[0]; @@ -209,7 +209,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { // Move all of the block's children to be children of the preheader, which // allows us to remove the domtree entry for the block. 
ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end()); - for (SmallVector<DomTreeNode*, 8>::iterator DI = ChildNodes.begin(), + for (SmallVectorImpl<DomTreeNode *>::iterator DI = ChildNodes.begin(), DE = ChildNodes.end(); DI != DE; ++DI) { DT.changeImmediateDominator(*DI, DT[preheader]); } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 8258719..952b76b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -81,7 +81,7 @@ namespace { /// Return the condition of the branch terminating the given basic block. static Value *getBrCondtion(BasicBlock *); - /// Derive the precondition block (i.e the block that guards the loop + /// Derive the precondition block (i.e the block that guards the loop /// preheader) from the given preheader. static BasicBlock *getPrecondBb(BasicBlock *PreHead); }; @@ -111,7 +111,7 @@ namespace { /// beween a variable and zero, and if the variable is non-zero, the /// control yeilds to the loop entry. If the branch matches the behavior, /// the variable involved in the comparion is returned. This function will - /// be called to see if the precondition and postcondition of the loop + /// be called to see if the precondition and postcondition of the loop /// are in desirable form. Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const; @@ -274,11 +274,11 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE, // //===----------------------------------------------------------------------===// -// This fucntion will return true iff the given block contains nothing but goto. -// A typical usage of this function is to check if the preheader fucntion is -// "almost" empty such that generated intrinsic function can be moved across -// preheader and to be placed at the end of the preconditiona block without -// concerning of breaking data dependence. +// This function will return true iff the given block contains nothing but goto. +// A typical usage of this function is to check if the preheader function is +// "almost" empty such that generated intrinsic functions can be moved across +// the preheader and be placed at the end of the precondition block without +// the concern of breaking data dependence. bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { if (BranchInst *Br = getBranch(BB)) { return Br->isUnconditional() && BB->size() == 1; @@ -314,7 +314,7 @@ bool NclPopcountRecognize::preliminaryScreen() { if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) return false; - // Counting population are usually conducted by few arithmetic instrutions. + // Counting population are usually conducted by few arithmetic instructions. // Such instructions can be easilly "absorbed" by vacant slots in a // non-compact loop. Therefore, recognizing popcount idiom only makes sense // in a compact loop. 
@@ -339,7 +339,7 @@ bool NclPopcountRecognize::preliminaryScreen() { PreCondBB = LIRUtil::getPrecondBb(PreHead); if (!PreCondBB) return false; - + return true; } @@ -504,7 +504,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst, // Assuming before transformation, the loop is following: // if (x) // the precondition // do { cnt++; x &= x - 1; } while(x); - + // Step 1: Insert the ctpop instruction at the end of the precondition block IRBuilderTy Builder(PreCondBr); Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; @@ -611,7 +611,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst, SE->forgetLoop(CurLoop); } -CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, +CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, Value *Val, DebugLoc DL) { Value *Ops[] = { Val }; Type *Tys[] = { Val->getType() }; @@ -667,13 +667,13 @@ bool LoopIdiomRecognize::runOnCountableLoop() { if (!getDataLayout()) return false; - // set DT + // set DT (void)getDominatorTree(); LoopInfo &LI = getAnalysis<LoopInfo>(); TLI = &getAnalysis<TargetLibraryInfo>(); - // set TLI + // set TLI (void)getTargetLibraryInfo(); SmallVector<BasicBlock*, 8> ExitBlocks; @@ -953,6 +953,8 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = 0; + unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); + // If we're allowed to form a memset, and the stored value would be acceptable // for memset, use it. if (SplatValue && TLI->has(LibFunc::memset) && @@ -961,8 +963,10 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, CurLoop->isLoopInvariant(SplatValue)) { // Keep and use SplatValue. PatternValue = 0; - } else if (TLI->has(LibFunc::memset_pattern16) && + } else if (DestAS == 0 && + TLI->has(LibFunc::memset_pattern16) && (PatternValue = getMemSetPatternValue(StoredVal, *TD))) { + // Don't create memset_pattern16s with address spaces. // It looks like we can use PatternValue! SplatValue = 0; } else { @@ -978,20 +982,20 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, IRBuilder<> Builder(Preheader->getTerminator()); SCEVExpander Expander(*SE, "loop-idiom"); + Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); + // Okay, we have a strided store "p[i]" of a splattable value. We can turn // this into a memset in the loop preheader now if we want. However, this // would be unsafe to do if there is anything else in the loop that may read // or write to the aliased location. Check for any overlap by generating the // base pointer and checking the region. - unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace(); Value *BasePtr = - Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace), + Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy, Preheader->getTerminator()); - if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef, CurLoop, BECount, - StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){ + StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) { Expander.clear(); // If we generated new code for the base pointer, clean up. deleteIfDeadInstruction(BasePtr, *SE, TLI); @@ -1002,27 +1006,35 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. 
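As a concrete illustration of the memset path (not part of the patch; function names invented), here is a unit-stride loop storing a bytewise-splattable value, followed by the rough shape of what processLoopStridedStore emits for it using the (BECount + 1) * StoreSize byte count expanded in the code that follows:

#include <string.h>

// A strided store of a splattable value with unit stride.
void zero_words(int *p, unsigned long n) {
  for (unsigned long i = 0; i != n; ++i)
    p[i] = 0;
}

// Rough result: for n > 0 the backedge-taken count is n - 1 and StoreSize is
// sizeof(int), so the call covers (BECount + 1) * StoreSize = n * sizeof(int) bytes.
void zero_words_after(int *p, unsigned long n) {
  if (n)
    memset(p, 0, n * sizeof(int));
}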
- Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); + Type *IntPtr = Builder.getIntPtrTy(TD, DestAS); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), SCEV::FlagNUW); - if (StoreSize != 1) + if (StoreSize != 1) { NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), SCEV::FlagNUW); + } Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); CallInst *NewCall; - if (SplatValue) - NewCall = Builder.CreateMemSet(BasePtr, SplatValue,NumBytes,StoreAlignment); - else { + if (SplatValue) { + NewCall = Builder.CreateMemSet(BasePtr, + SplatValue, + NumBytes, + StoreAlignment); + } else { + // Everything is emitted in default address space + Type *Int8PtrTy = DestInt8PtrTy; + Module *M = TheStore->getParent()->getParent()->getParent(); Value *MSP = M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), - Builder.getInt8PtrTy(), - Builder.getInt8PtrTy(), IntPtr, + Int8PtrTy, + Int8PtrTy, + IntPtr, (void*)0); // Otherwise we should form a memset_pattern16. PatternValue is known to be @@ -1032,7 +1044,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, PatternValue, ".memset_pattern"); GV->setUnnamedAddr(true); // Ok to merge these. GV->setAlignment(16); - Value *PatternPtr = ConstantExpr::getBitCast(GV, Builder.getInt8PtrTy()); + Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes); } @@ -1108,17 +1120,17 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. - Type *IntPtr = TD->getIntPtrType(SI->getContext()); - BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); + Type *IntPtrTy = Builder.getIntPtrTy(TD, SI->getPointerAddressSpace()); + BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); - const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), + const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1), SCEV::FlagNUW); if (StoreSize != 1) - NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), + NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), SCEV::FlagNUW); Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); CallInst *NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp new file mode 100644 index 0000000..335af81 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -0,0 +1,1184 @@ +//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements a simple loop reroller. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-reroll" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +using namespace llvm; + +STATISTIC(NumRerolledLoops, "Number of rerolled loops"); + +static cl::opt<unsigned> +MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden, + cl::desc("The maximum increment for loop rerolling")); + +// This loop re-rolling transformation aims to transform loops like this: +// +// int foo(int a); +// void bar(int *x) { +// for (int i = 0; i < 500; i += 3) { +// foo(i); +// foo(i+1); +// foo(i+2); +// } +// } +// +// into a loop like this: +// +// void bar(int *x) { +// for (int i = 0; i < 500; ++i) +// foo(i); +// } +// +// It does this by looking for loops that, besides the latch code, are composed +// of isomorphic DAGs of instructions, with each DAG rooted at some increment +// to the induction variable, and where each DAG is isomorphic to the DAG +// rooted at the induction variable (excepting the sub-DAGs which root the +// other induction-variable increments). In other words, we're looking for loop +// bodies of the form: +// +// %iv = phi [ (preheader, ...), (body, %iv.next) ] +// f(%iv) +// %iv.1 = add %iv, 1 <-- a root increment +// f(%iv.1) +// %iv.2 = add %iv, 2 <-- a root increment +// f(%iv.2) +// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment +// f(%iv.scale_m_1) +// ... +// %iv.next = add %iv, scale +// %cmp = icmp(%iv, ...) +// br %cmp, header, exit +// +// where each f(i) is a set of instructions that, collectively, are a function +// only of i (and other loop-invariant values). +// +// As a special case, we can also reroll loops like this: +// +// int foo(int); +// void bar(int *x) { +// for (int i = 0; i < 500; ++i) { +// x[3*i] = foo(0); +// x[3*i+1] = foo(0); +// x[3*i+2] = foo(0); +// } +// } +// +// into this: +// +// void bar(int *x) { +// for (int i = 0; i < 1500; ++i) +// x[i] = foo(0); +// } +// +// in which case, we're looking for inputs like this: +// +// %iv = phi [ (preheader, ...), (body, %iv.next) ] +// %scaled.iv = mul %iv, scale +// f(%scaled.iv) +// %scaled.iv.1 = add %scaled.iv, 1 +// f(%scaled.iv.1) +// %scaled.iv.2 = add %scaled.iv, 2 +// f(%scaled.iv.2) +// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 +// f(%scaled.iv.scale_m_1) +// ... +// %iv.next = add %iv, 1 +// %cmp = icmp(%iv, ...) 
+// br %cmp, header, exit + +namespace { + class LoopReroll : public LoopPass { + public: + static char ID; // Pass ID, replacement for typeid + LoopReroll() : LoopPass(ID) { + initializeLoopRerollPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<AliasAnalysis>(); + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + AU.addRequired<ScalarEvolution>(); + AU.addRequired<TargetLibraryInfo>(); + } + +protected: + AliasAnalysis *AA; + LoopInfo *LI; + ScalarEvolution *SE; + DataLayout *DL; + TargetLibraryInfo *TLI; + DominatorTree *DT; + + typedef SmallVector<Instruction *, 16> SmallInstructionVector; + typedef SmallSet<Instruction *, 16> SmallInstructionSet; + + // A chain of isomorphic instructions, indentified by a single-use PHI, + // representing a reduction. Only the last value may be used outside the + // loop. + struct SimpleLoopReduction { + SimpleLoopReduction(Instruction *P, Loop *L) + : Valid(false), Instructions(1, P) { + assert(isa<PHINode>(P) && "First reduction instruction must be a PHI"); + add(L); + } + + bool valid() const { + return Valid; + } + + Instruction *getPHI() const { + assert(Valid && "Using invalid reduction"); + return Instructions.front(); + } + + Instruction *getReducedValue() const { + assert(Valid && "Using invalid reduction"); + return Instructions.back(); + } + + Instruction *get(size_t i) const { + assert(Valid && "Using invalid reduction"); + return Instructions[i+1]; + } + + Instruction *operator [] (size_t i) const { return get(i); } + + // The size, ignoring the initial PHI. + size_t size() const { + assert(Valid && "Using invalid reduction"); + return Instructions.size()-1; + } + + typedef SmallInstructionVector::iterator iterator; + typedef SmallInstructionVector::const_iterator const_iterator; + + iterator begin() { + assert(Valid && "Using invalid reduction"); + return llvm::next(Instructions.begin()); + } + + const_iterator begin() const { + assert(Valid && "Using invalid reduction"); + return llvm::next(Instructions.begin()); + } + + iterator end() { return Instructions.end(); } + const_iterator end() const { return Instructions.end(); } + + protected: + bool Valid; + SmallInstructionVector Instructions; + + void add(Loop *L); + }; + + // The set of all reductions, and state tracking of possible reductions + // during loop instruction processing. + struct ReductionTracker { + typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector; + + // Add a new possible reduction. + void addSLR(SimpleLoopReduction &SLR) { + PossibleReds.push_back(SLR); + } + + // Setup to track possible reductions corresponding to the provided + // rerolling scale. Only reductions with a number of non-PHI instructions + // that is divisible by the scale are considered. Three instructions sets + // are filled in: + // - A set of all possible instructions in eligible reductions. + // - A set of all PHIs in eligible reductions + // - A set of all reduced values (last instructions) in eligible reductions. 
+ void restrictToScale(uint64_t Scale, + SmallInstructionSet &PossibleRedSet, + SmallInstructionSet &PossibleRedPHISet, + SmallInstructionSet &PossibleRedLastSet) { + PossibleRedIdx.clear(); + PossibleRedIter.clear(); + Reds.clear(); + + for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i) + if (PossibleReds[i].size() % Scale == 0) { + PossibleRedLastSet.insert(PossibleReds[i].getReducedValue()); + PossibleRedPHISet.insert(PossibleReds[i].getPHI()); + + PossibleRedSet.insert(PossibleReds[i].getPHI()); + PossibleRedIdx[PossibleReds[i].getPHI()] = i; + for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(), + JE = PossibleReds[i].end(); J != JE; ++J) { + PossibleRedSet.insert(*J); + PossibleRedIdx[*J] = i; + } + } + } + + // The functions below are used while processing the loop instructions. + + // Are the two instructions both from reductions, and furthermore, from + // the same reduction? + bool isPairInSame(Instruction *J1, Instruction *J2) { + DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1); + if (J1I != PossibleRedIdx.end()) { + DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2); + if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second) + return true; + } + + return false; + } + + // The two provided instructions, the first from the base iteration, and + // the second from iteration i, form a matched pair. If these are part of + // a reduction, record that fact. + void recordPair(Instruction *J1, Instruction *J2, unsigned i) { + if (PossibleRedIdx.count(J1)) { + assert(PossibleRedIdx.count(J2) && + "Recording reduction vs. non-reduction instruction?"); + + PossibleRedIter[J1] = 0; + PossibleRedIter[J2] = i; + + int Idx = PossibleRedIdx[J1]; + assert(Idx == PossibleRedIdx[J2] && + "Recording pair from different reductions?"); + Reds.insert(Idx); + } + } + + // The functions below can be called after we've finished processing all + // instructions in the loop, and we know which reductions were selected. + + // Is the provided instruction the PHI of a reduction selected for + // rerolling? + bool isSelectedPHI(Instruction *J) { + if (!isa<PHINode>(J)) + return false; + + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + if (cast<Instruction>(J) == PossibleReds[i].getPHI()) + return true; + } + + return false; + } + + bool validateSelected(); + void replaceSelected(); + + protected: + // The vector of all possible reductions (for any scale). 
+ SmallReductionVector PossibleReds; + + DenseMap<Instruction *, int> PossibleRedIdx; + DenseMap<Instruction *, int> PossibleRedIter; + DenseSet<int> Reds; + }; + + void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); + void collectPossibleReductions(Loop *L, + ReductionTracker &Reductions); + void collectInLoopUserSet(Loop *L, + const SmallInstructionVector &Roots, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users); + void collectInLoopUserSet(Loop *L, + Instruction * Root, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users); + bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale, + Instruction *&IV, + SmallInstructionVector &LoopIncs); + bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV, + SmallVector<SmallInstructionVector, 32> &Roots, + SmallInstructionSet &AllRoots, + SmallInstructionVector &LoopIncs); + bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, + ReductionTracker &Reductions); + }; +} + +char LoopReroll::ID = 0; +INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) + +Pass *llvm::createLoopRerollPass() { + return new LoopReroll; +} + +// Returns true if the provided instruction is used outside the given loop. +// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in +// non-loop blocks to be outside the loop. +static bool hasUsesOutsideLoop(Instruction *I, Loop *L) { + for (Value::use_iterator UI = I->use_begin(), + UIE = I->use_end(); UI != UIE; ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (!L->contains(User)) + return true; + } + + return false; +} + +// Collect the list of loop induction variables with respect to which it might +// be possible to reroll the loop. +void LoopReroll::collectPossibleIVs(Loop *L, + SmallInstructionVector &PossibleIVs) { + BasicBlock *Header = L->getHeader(); + for (BasicBlock::iterator I = Header->begin(), + IE = Header->getFirstInsertionPt(); I != IE; ++I) { + if (!isa<PHINode>(I)) + continue; + if (!I->getType()->isIntegerTy()) + continue; + + if (const SCEVAddRecExpr *PHISCEV = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) { + if (PHISCEV->getLoop() != L) + continue; + if (!PHISCEV->isAffine()) + continue; + if (const SCEVConstant *IncSCEV = + dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) { + if (!IncSCEV->getValue()->getValue().isStrictlyPositive()) + continue; + if (IncSCEV->getValue()->uge(MaxInc)) + continue; + + DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << + *PHISCEV << "\n"); + PossibleIVs.push_back(I); + } + } + } +} + +// Add the remainder of the reduction-variable chain to the instruction vector +// (the initial PHINode has already been added). If successful, the object is +// marked as valid. +void LoopReroll::SimpleLoopReduction::add(Loop *L) { + assert(!Valid && "Cannot add to an already-valid chain"); + + // The reduction variable must be a chain of single-use instructions + // (including the PHI), except for the last value (which is used by the PHI + // and also outside the loop). 
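A minimal sketch (not part of the patch; names invented) of the chain shape SimpleLoopReduction::add accepts: the reduction PHI feeds a chain of single-use, same-operation instructions, and only the last one is used both by the PHI around the backedge and outside the loop.

// In SSA form: %s.phi -> %add1 (single use) -> %add2, where %add2 feeds the
// PHI and the value returned after the loop.
int sum_pairs(const int *a, int n) {   // assumes n is a multiple of 2
  int s = 0;
  for (int i = 0; i < n; i += 2) {
    s += a[i];
    s += a[i + 1];
  }
  return s;
}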
+ Instruction *C = Instructions.front(); + + do { + C = cast<Instruction>(*C->use_begin()); + if (C->hasOneUse()) { + if (!C->isBinaryOp()) + return; + + if (!(isa<PHINode>(Instructions.back()) || + C->isSameOperationAs(Instructions.back()))) + return; + + Instructions.push_back(C); + } + } while (C->hasOneUse()); + + if (Instructions.size() < 2 || + !C->isSameOperationAs(Instructions.back()) || + C->use_begin() == C->use_end()) + return; + + // C is now the (potential) last instruction in the reduction chain. + for (Value::use_iterator UI = C->use_begin(), UIE = C->use_end(); + UI != UIE; ++UI) { + // The only in-loop user can be the initial PHI. + if (L->contains(cast<Instruction>(*UI))) + if (cast<Instruction>(*UI ) != Instructions.front()) + return; + } + + Instructions.push_back(C); + Valid = true; +} + +// Collect the vector of possible reduction variables. +void LoopReroll::collectPossibleReductions(Loop *L, + ReductionTracker &Reductions) { + BasicBlock *Header = L->getHeader(); + for (BasicBlock::iterator I = Header->begin(), + IE = Header->getFirstInsertionPt(); I != IE; ++I) { + if (!isa<PHINode>(I)) + continue; + if (!I->getType()->isSingleValueType()) + continue; + + SimpleLoopReduction SLR(I, L); + if (!SLR.valid()) + continue; + + DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " << + SLR.size() << " chained instructions)\n"); + Reductions.addSLR(SLR); + } +} + +// Collect the set of all users of the provided root instruction. This set of +// users contains not only the direct users of the root instruction, but also +// all users of those users, and so on. There are two exceptions: +// +// 1. Instructions in the set of excluded instructions are never added to the +// use set (even if they are users). This is used, for example, to exclude +// including root increments in the use set of the primary IV. +// +// 2. Instructions in the set of final instructions are added to the use set +// if they are users, but their users are not added. This is used, for +// example, to prevent a reduction update from forcing all later reduction +// updates into the use set. +void LoopReroll::collectInLoopUserSet(Loop *L, + Instruction *Root, const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users) { + SmallInstructionVector Queue(1, Root); + while (!Queue.empty()) { + Instruction *I = Queue.pop_back_val(); + if (!Users.insert(I).second) + continue; + + if (!Final.count(I)) + for (Value::use_iterator UI = I->use_begin(), + UIE = I->use_end(); UI != UIE; ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (PHINode *PN = dyn_cast<PHINode>(User)) { + // Ignore "wrap-around" uses to PHIs of this loop's header. + if (PN->getIncomingBlock(UI) == L->getHeader()) + continue; + } + + if (L->contains(User) && !Exclude.count(User)) { + Queue.push_back(User); + } + } + + // We also want to collect single-user "feeder" values. + for (User::op_iterator OI = I->op_begin(), + OIE = I->op_end(); OI != OIE; ++OI) { + if (Instruction *Op = dyn_cast<Instruction>(*OI)) + if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) && + !Final.count(Op)) + Queue.push_back(Op); + } + } +} + +// Collect all of the users of all of the provided root instructions (combined +// into a single set). 
+void LoopReroll::collectInLoopUserSet(Loop *L, + const SmallInstructionVector &Roots, + const SmallInstructionSet &Exclude, + const SmallInstructionSet &Final, + DenseSet<Instruction *> &Users) { + for (SmallInstructionVector::const_iterator I = Roots.begin(), + IE = Roots.end(); I != IE; ++I) + collectInLoopUserSet(L, *I, Exclude, Final, Users); +} + +static bool isSimpleLoadStore(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->isSimple(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isSimple(); + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) + return !MI->isVolatile(); + return false; +} + +// Recognize loops that are setup like this: +// +// %iv = phi [ (preheader, ...), (body, %iv.next) ] +// %scaled.iv = mul %iv, scale +// f(%scaled.iv) +// %scaled.iv.1 = add %scaled.iv, 1 +// f(%scaled.iv.1) +// %scaled.iv.2 = add %scaled.iv, 2 +// f(%scaled.iv.2) +// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1 +// f(%scaled.iv.scale_m_1) +// ... +// %iv.next = add %iv, 1 +// %cmp = icmp(%iv, ...) +// br %cmp, header, exit +// +// and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs. +bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale, + Instruction *&IV, + SmallInstructionVector &LoopIncs) { + // This is a special case: here we're looking for all uses (except for + // the increment) to be multiplied by a common factor. The increment must + // be by one. This is to capture loops like: + // for (int i = 0; i < 500; ++i) { + // foo(3*i); foo(3*i+1); foo(3*i+2); + // } + if (RealIV->getNumUses() != 2) + return false; + const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV)); + Instruction *User1 = cast<Instruction>(*RealIV->use_begin()), + *User2 = cast<Instruction>(*llvm::next(RealIV->use_begin())); + if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType())) + return false; + const SCEVAddRecExpr *User1SCEV = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User1)), + *User2SCEV = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User2)); + if (!User1SCEV || !User1SCEV->isAffine() || + !User2SCEV || !User2SCEV->isAffine()) + return false; + + // We assume below that User1 is the scale multiply and User2 is the + // increment. If this can't be true, then swap them. + if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) { + std::swap(User1, User2); + std::swap(User1SCEV, User2SCEV); + } + + if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE)) + return false; + assert(User2SCEV->getStepRecurrence(*SE)->isOne() && + "Invalid non-unit step for multiplicative scaling"); + LoopIncs.push_back(User2); + + if (const SCEVConstant *MulScale = + dyn_cast<SCEVConstant>(User1SCEV->getStepRecurrence(*SE))) { + // Make sure that both the start and step have the same multiplier. + if (RealIVSCEV->getStart()->getType() != MulScale->getType()) + return false; + if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) != + User1SCEV->getStart()) + return false; + + ConstantInt *MulScaleCI = MulScale->getValue(); + if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc)) + return false; + Scale = MulScaleCI->getZExtValue(); + IV = User1; + } else + return false; + + DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n"); + return true; +} + +// Collect all root increments with respect to the provided induction variable +// (normally the PHI, but sometimes a multiply). A root increment is an +// instruction, normally an add, with a positive constant less than Scale. 
In a +// rerollable loop, each of these increments is the root of an instruction +// graph isomorphic to the others. Also, we collect the final induction +// increment (the increment equal to the Scale), and its users in LoopIncs. +bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, + Instruction *IV, + SmallVector<SmallInstructionVector, 32> &Roots, + SmallInstructionSet &AllRoots, + SmallInstructionVector &LoopIncs) { + for (Value::use_iterator UI = IV->use_begin(), + UIE = IV->use_end(); UI != UIE; ++UI) { + Instruction *User = cast<Instruction>(*UI); + if (!SE->isSCEVable(User->getType())) + continue; + if (User->getType() != IV->getType()) + continue; + if (!L->contains(User)) + continue; + if (hasUsesOutsideLoop(User, L)) + continue; + + if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV( + SE->getSCEV(User), SE->getSCEV(IV)))) { + uint64_t Idx = Diff->getValue()->getValue().getZExtValue(); + if (Idx > 0 && Idx < Scale) { + Roots[Idx-1].push_back(User); + AllRoots.insert(User); + } else if (Idx == Scale && Inc > 1) { + LoopIncs.push_back(User); + } + } + } + + if (Roots[0].empty()) + return false; + bool AllSame = true; + for (unsigned i = 1; i < Scale-1; ++i) + if (Roots[i].size() != Roots[0].size()) { + AllSame = false; + break; + } + + if (!AllSame) + return false; + + return true; +} + +// Validate the selected reductions. All iterations must have an isomorphic +// part of the reduction chain and, for non-associative reductions, the chain +// entries must appear in order. +bool LoopReroll::ReductionTracker::validateSelected() { + // For a non-associative reduction, the chain entries must appear in order. + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + int PrevIter = 0, BaseCount = 0, Count = 0; + for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(), + JE = PossibleReds[i].end(); J != JE; ++J) { + // Note that all instructions in the chain must have been found because + // all instructions in the function must have been assigned to some + // iteration. + int Iter = PossibleRedIter[*J]; + if (Iter != PrevIter && Iter != PrevIter + 1 && + !PossibleReds[i].getReducedValue()->isAssociative()) { + DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " << + *J << "\n"); + return false; + } + + if (Iter != PrevIter) { + if (Count != BaseCount) { + DEBUG(dbgs() << "LRR: Iteration " << PrevIter << + " reduction use count " << Count << + " is not equal to the base use count " << + BaseCount << "\n"); + return false; + } + + Count = 0; + } + + ++Count; + if (Iter == 0) + ++BaseCount; + + PrevIter = Iter; + } + } + + return true; +} + +// For all selected reductions, remove all parts except those in the first +// iteration (and the PHI). Replace outside uses of the reduced value with uses +// of the first-iteration reduced value (in other words, reroll the selected +// reductions). +void LoopReroll::ReductionTracker::replaceSelected() { + // Fixup reductions to refer to the last instruction associated with the + // first iteration (not the last). + for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); + RI != RIE; ++RI) { + int i = *RI; + int j = 0; + for (int e = PossibleReds[i].size(); j != e; ++j) + if (PossibleRedIter[PossibleReds[i][j]] != 0) { + --j; + break; + } + + // Replace users with the new end-of-chain value. 
+ SmallInstructionVector Users; + for (Value::use_iterator UI = + PossibleReds[i].getReducedValue()->use_begin(), + UIE = PossibleReds[i].getReducedValue()->use_end(); UI != UIE; ++UI) + Users.push_back(cast<Instruction>(*UI)); + + for (SmallInstructionVector::iterator J = Users.begin(), + JE = Users.end(); J != JE; ++J) + (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(), + PossibleReds[i][j]); + } +} + +// Reroll the provided loop with respect to the provided induction variable. +// Generally, we're looking for a loop like this: +// +// %iv = phi [ (preheader, ...), (body, %iv.next) ] +// f(%iv) +// %iv.1 = add %iv, 1 <-- a root increment +// f(%iv.1) +// %iv.2 = add %iv, 2 <-- a root increment +// f(%iv.2) +// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment +// f(%iv.scale_m_1) +// ... +// %iv.next = add %iv, scale +// %cmp = icmp(%iv, ...) +// br %cmp, header, exit +// +// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of +// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can +// be intermixed with eachother. The restriction imposed by this algorithm is +// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1), +// etc. be the same. +// +// First, we collect the use set of %iv, excluding the other increment roots. +// This gives us f(%iv). Then we iterate over the loop instructions (scale-1) +// times, having collected the use set of f(%iv.(i+1)), during which we: +// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to +// the next unmatched instruction in f(%iv.(i+1)). +// - Ensure that both matched instructions don't have any external users +// (with the exception of last-in-chain reduction instructions). +// - Track the (aliasing) write set, and other side effects, of all +// instructions that belong to future iterations that come before the matched +// instructions. If the matched instructions read from that write set, then +// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in +// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly, +// if any of these future instructions had side effects (could not be +// speculatively executed), and so do the matched instructions, when we +// cannot reorder those side-effect-producing instructions, and rerolling +// fails. +// +// Finally, we make sure that all loop instructions are either loop increment +// roots, belong to simple latch code, parts of validated reductions, part of +// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions +// have been validated), then we reroll the loop. +bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, + const SCEV *IterCount, + ReductionTracker &Reductions) { + const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); + uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> + getValue()->getZExtValue(); + // The collection of loop increment instructions. + SmallInstructionVector LoopIncs; + uint64_t Scale = Inc; + + // The effective induction variable, IV, is normally also the real induction + // variable. When we're dealing with a loop like: + // for (int i = 0; i < 500; ++i) + // x[3*i] = ...; + // x[3*i+1] = ...; + // x[3*i+2] = ...; + // then the real IV is still i, but the effective IV is (3*i). 
+ Instruction *RealIV = IV; + if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs)) + return false; + + assert(Scale <= MaxInc && "Scale is too large"); + assert(Scale > 1 && "Scale must be at least 2"); + + // The set of increment instructions for each increment value. + SmallVector<SmallInstructionVector, 32> Roots(Scale-1); + SmallInstructionSet AllRoots; + if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs)) + return false; + + DEBUG(dbgs() << "LRR: Found all root induction increments for: " << + *RealIV << "\n"); + + // An array of just the possible reductions for this scale factor. When we + // collect the set of all users of some root instructions, these reduction + // instructions are treated as 'final' (their uses are not considered). + // This is important because we don't want the root use set to search down + // the reduction chain. + SmallInstructionSet PossibleRedSet; + SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet; + Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet, + PossibleRedLastSet); + + // We now need to check for equivalence of the use graph of each root with + // that of the primary induction variable (excluding the roots). Our goal + // here is not to solve the full graph isomorphism problem, but rather to + // catch common cases without a lot of work. As a result, we will assume + // that the relative order of the instructions in each unrolled iteration + // is the same (although we will not make an assumption about how the + // different iterations are intermixed). Note that while the order must be + // the same, the instructions may not be in the same basic block. + SmallInstructionSet Exclude(AllRoots); + Exclude.insert(LoopIncs.begin(), LoopIncs.end()); + + DenseSet<Instruction *> BaseUseSet; + collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet); + + DenseSet<Instruction *> AllRootUses; + std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1); + + bool MatchFailed = false; + for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) { + DenseSet<Instruction *> &RootUseSet = RootUseSets[i]; + collectInLoopUserSet(L, Roots[i], SmallInstructionSet(), + PossibleRedSet, RootUseSet); + + DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() << + " vs. iteration increment " << (i+1) << + " use set size: " << RootUseSet.size() << "\n"); + + if (BaseUseSet.size() != RootUseSet.size()) { + MatchFailed = true; + break; + } + + // In addition to regular aliasing information, we need to look for + // instructions from later (future) iterations that have side effects + // preventing us from reordering them past other instructions with side + // effects. + bool FutureSideEffects = false; + AliasSetTracker AST(*AA); + + // The map between instructions in f(%iv.(i+1)) and f(%iv). + DenseMap<Value *, Value *> BaseMap; + + assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops"); + for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(), + JE = Header->end(); J1 != JE && !MatchFailed; ++J1) { + if (cast<Instruction>(J1) == RealIV) + continue; + if (cast<Instruction>(J1) == IV) + continue; + if (!BaseUseSet.count(J1)) + continue; + if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs. + continue; + + while (J2 != JE && (!RootUseSet.count(J2) || + std::find(Roots[i].begin(), Roots[i].end(), J2) != + Roots[i].end())) { + // As we iterate through the instructions, instructions that don't + // belong to previous iterations (or the base case), must belong to + // future iterations. 
We want to track the alias set of writes from + // previous iterations. + if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) && + !AllRootUses.count(J2)) { + if (J2->mayWriteToMemory()) + AST.add(J2); + + // Note: This is specifically guarded by a check on isa<PHINode>, + // which while a valid (somewhat arbitrary) micro-optimization, is + // needed because otherwise isSafeToSpeculativelyExecute returns + // false on PHI nodes. + if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL)) + FutureSideEffects = true; + } + + ++J2; + } + + if (!J1->isSameOperationAs(J2)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << "\n"); + MatchFailed = true; + break; + } + + // Make sure that this instruction, which is in the use set of this + // root instruction, does not also belong to the base set or the set of + // some previous root instruction. + if (BaseUseSet.count(J2) || AllRootUses.count(J2)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (prev. case overlap)\n"); + MatchFailed = true; + break; + } + + // Make sure that we don't alias with any instruction in the alias set + // tracker. If we do, then we depend on a future iteration, and we + // can't reroll. + if (J2->mayReadFromMemory()) { + for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end(); + K != KE && !MatchFailed; ++K) { + if (K->aliasesUnknownInst(J2, *AA)) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (depends on future store)\n"); + MatchFailed = true; + break; + } + } + } + + // If we've past an instruction from a future iteration that may have + // side effects, and this instruction might also, then we can't reorder + // them, and this matching fails. As an exception, we allow the alias + // set tracker to handle regular (simple) load/store dependencies. + if (FutureSideEffects && + ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) || + (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << + " (side effects prevent reordering)\n"); + MatchFailed = true; + break; + } + + // For instructions that are part of a reduction, if the operation is + // associative, then don't bother matching the operands (because we + // already know that the instructions are isomorphic, and the order + // within the iteration does not matter). For non-associative reductions, + // we do need to match the operands, because we need to reject + // out-of-order instructions within an iteration! + // For example (assume floating-point addition), we need to reject this: + // x += a[i]; x += b[i]; + // x += a[i+1]; x += b[i+1]; + // x += b[i+2]; x += a[i+2]; + bool InReduction = Reductions.isPairInSame(J1, J2); + + if (!(InReduction && J1->isAssociative())) { + bool Swapped = false, SomeOpMatched = false;; + for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) { + Value *Op2 = J2->getOperand(j); + + // If this is part of a reduction (and the operation is not + // associatve), then we match all operands, but not those that are + // part of the reduction. 
+ if (InReduction) + if (Instruction *Op2I = dyn_cast<Instruction>(Op2)) + if (Reductions.isPairInSame(J2, Op2I)) + continue; + + DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2); + if (BMI != BaseMap.end()) + Op2 = BMI->second; + else if (std::find(Roots[i].begin(), Roots[i].end(), + (Instruction*) Op2) != Roots[i].end()) + Op2 = IV; + + if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) { + // If we've not already decided to swap the matched operands, and + // we've not already matched our first operand (note that we could + // have skipped matching the first operand because it is part of a + // reduction above), and the instruction is commutative, then try + // the swapped match. + if (!Swapped && J1->isCommutative() && !SomeOpMatched && + J1->getOperand(!j) == Op2) { + Swapped = true; + } else { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (operand " << j << ")\n"); + MatchFailed = true; + break; + } + } + + SomeOpMatched = true; + } + } + + if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) || + (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) { + DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 << + " vs. " << *J2 << " (uses outside loop)\n"); + MatchFailed = true; + break; + } + + if (!MatchFailed) + BaseMap.insert(std::pair<Value *, Value *>(J2, J1)); + + AllRootUses.insert(J2); + Reductions.recordPair(J1, J2, i+1); + + ++J2; + } + } + + if (MatchFailed) + return false; + + DEBUG(dbgs() << "LRR: Matched all iteration increments for " << + *RealIV << "\n"); + + DenseSet<Instruction *> LoopIncUseSet; + collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(), + SmallInstructionSet(), LoopIncUseSet); + DEBUG(dbgs() << "LRR: Loop increment set size: " << + LoopIncUseSet.size() << "\n"); + + // Make sure that all instructions in the loop have been included in some + // use set. + for (BasicBlock::iterator J = Header->begin(), JE = Header->end(); + J != JE; ++J) { + if (isa<DbgInfoIntrinsic>(J)) + continue; + if (cast<Instruction>(J) == RealIV) + continue; + if (cast<Instruction>(J) == IV) + continue; + if (BaseUseSet.count(J) || AllRootUses.count(J) || + (LoopIncUseSet.count(J) && (J->isTerminator() || + isSafeToSpeculativelyExecute(J, DL)))) + continue; + + if (AllRoots.count(J)) + continue; + + if (Reductions.isSelectedPHI(J)) + continue; + + DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV << + " unprocessed instruction found: " << *J << "\n"); + MatchFailed = true; + break; + } + + if (MatchFailed) + return false; + + DEBUG(dbgs() << "LRR: all instructions processed from " << + *RealIV << "\n"); + + if (!Reductions.validateSelected()) + return false; + + // At this point, we've validated the rerolling, and we're committed to + // making changes! + + Reductions.replaceSelected(); + + // Remove instructions associated with non-base iterations. + for (BasicBlock::reverse_iterator J = Header->rbegin(); + J != Header->rend();) { + if (AllRootUses.count(&*J)) { + Instruction *D = &*J; + DEBUG(dbgs() << "LRR: removing: " << *D << "\n"); + D->eraseFromParent(); + continue; + } + + ++J; + } + + // Insert the new induction variable. + const SCEV *Start = RealIVSCEV->getStart(); + if (Inc == 1) + Start = SE->getMulExpr(Start, + SE->getConstant(Start->getType(), Scale)); + const SCEVAddRecExpr *H = + cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start, + SE->getConstant(RealIVSCEV->getType(), 1), + L, SCEV::FlagAnyWrap)); + { // Limit the lifetime of SCEVExpander. 
+ SCEVExpander Expander(*SE, "reroll"); + PHINode *NewIV = + cast<PHINode>(Expander.expandCodeFor(H, IV->getType(), + Header->begin())); + for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(), + JE = BaseUseSet.end(); J != JE; ++J) + (*J)->replaceUsesOfWith(IV, NewIV); + + if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) { + if (LoopIncUseSet.count(BI)) { + const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); + if (Inc == 1) + ICSCEV = + SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale)); + Value *IC; + if (isa<SCEVConstant>(ICSCEV)) { + IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), BI); + } else { + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) + Preheader = InsertPreheaderForLoop(L, this); + + IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), + Preheader->getTerminator()); + } + + Value *NewIVNext = NewIV->getIncomingValueForBlock(Header); + Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIVNext, IC, + "exitcond"); + BI->setCondition(Cond); + + if (BI->getSuccessor(1) != Header) + BI->swapSuccessors(); + } + } + } + + SimplifyInstructionsInBlock(Header, DL, TLI); + DeleteDeadPHIs(Header, TLI); + ++NumRerolledLoops; + return true; +} + +bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { + AA = &getAnalysis<AliasAnalysis>(); + LI = &getAnalysis<LoopInfo>(); + SE = &getAnalysis<ScalarEvolution>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + DL = getAnalysisIfAvailable<DataLayout>(); + DT = &getAnalysis<DominatorTree>(); + + BasicBlock *Header = L->getHeader(); + DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << + "] Loop %" << Header->getName() << " (" << + L->getNumBlocks() << " block(s))\n"); + + bool Changed = false; + + // For now, we'll handle only single BB loops. + if (L->getNumBlocks() > 1) + return Changed; + + if (!SE->hasLoopInvariantBackedgeTakenCount(L)) + return Changed; + + const SCEV *LIBETC = SE->getBackedgeTakenCount(L); + const SCEV *IterCount = + SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1)); + DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); + + // First, we need to find the induction variable with respect to which we can + // reroll (there may be several possible options). + SmallInstructionVector PossibleIVs; + collectPossibleIVs(L, PossibleIVs); + + if (PossibleIVs.empty()) { + DEBUG(dbgs() << "LRR: No possible IVs found\n"); + return Changed; + } + + ReductionTracker Reductions; + collectPossibleReductions(L, Reductions); + + // For each possible IV, collect the associated possible set of 'root' nodes + // (i+1, i+2, etc.). + for (SmallInstructionVector::iterator I = PossibleIVs.begin(), + IE = PossibleIVs.end(); I != IE; ++I) + if (reroll(*I, L, Header, IterCount, Reductions)) { + Changed = true; + break; + } + + return Changed; +} + diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 73e44d7..eff5268 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -774,6 +774,16 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { } namespace { +class LSRUse; +} +// Check if it is legal to fold 2 base registers. +static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, + const Formula &F); +// Get the cost of the scaling factor used in F for LU. 
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F); + +namespace { /// Cost - This class is used to measure and compare candidate formulae. class Cost { @@ -785,11 +795,12 @@ class Cost { unsigned NumBaseAdds; unsigned ImmCost; unsigned SetupCost; + unsigned ScaleCost; public: Cost() : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0), - SetupCost(0) {} + SetupCost(0), ScaleCost(0) {} bool operator<(const Cost &Other) const; @@ -799,9 +810,9 @@ public: // Once any of the metrics loses, they must all remain losers. bool isValid() { return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds - | ImmCost | SetupCost) != ~0u) + | ImmCost | SetupCost | ScaleCost) != ~0u) || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds - & ImmCost & SetupCost) == ~0u); + & ImmCost & SetupCost & ScaleCost) == ~0u); } #endif @@ -810,12 +821,14 @@ public: return NumRegs == ~0u; } - void RateFormula(const Formula &F, + void RateFormula(const TargetTransformInfo &TTI, + const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, + const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs = 0); void print(raw_ostream &OS) const; @@ -900,12 +913,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg, } } -void Cost::RateFormula(const Formula &F, +void Cost::RateFormula(const TargetTransformInfo &TTI, + const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, + const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs) { // Tally up the registers. if (const SCEV *ScaledReg = F.ScaledReg) { @@ -932,7 +947,12 @@ void Cost::RateFormula(const Formula &F, // Determine how many (unfolded) adds we'll need inside the loop. size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); if (NumBaseParts > 1) - NumBaseAdds += NumBaseParts - 1; + // Do not count the base and a possible second register if the target + // allows to fold 2 registers. + NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F)); + + // Accumulate non-free scaling amounts. + ScaleCost += getScalingFactorCost(TTI, LU, F); // Tally up the non-zero immediates. for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), @@ -955,6 +975,7 @@ void Cost::Loose() { NumBaseAdds = ~0u; ImmCost = ~0u; SetupCost = ~0u; + ScaleCost = ~0u; } /// operator< - Choose the lower cost. @@ -967,6 +988,8 @@ bool Cost::operator<(const Cost &Other) const { return NumIVMuls < Other.NumIVMuls; if (NumBaseAdds != Other.NumBaseAdds) return NumBaseAdds < Other.NumBaseAdds; + if (ScaleCost != Other.ScaleCost) + return ScaleCost < Other.ScaleCost; if (ImmCost != Other.ImmCost) return ImmCost < Other.ImmCost; if (SetupCost != Other.SetupCost) @@ -983,6 +1006,8 @@ void Cost::print(raw_ostream &OS) const { if (NumBaseAdds != 0) OS << ", plus " << NumBaseAdds << " base add" << (NumBaseAdds == 1 ? "" : "s"); + if (ScaleCost != 0) + OS << ", plus " << ScaleCost << " scale cost"; if (ImmCost != 0) OS << ", plus " << ImmCost << " imm cost"; if (SetupCost != 0) @@ -1145,6 +1170,13 @@ public: /// may be used. bool AllFixupsOutsideLoop; + /// RigidFormula is set to true to guarantee that this use will be associated + /// with a single formula--the one that initially matched. Some SCEV + /// expressions cannot be expanded. 
This allows LSR to consider the registers + /// used by those expressions without the need to expand them later after + /// changing the formula. + bool RigidFormula; + /// WidestFixupType - This records the widest use type for any fixup using /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different /// max fixup widths to be equivalent, because the narrower one may be relying @@ -1163,6 +1195,7 @@ public: MinOffset(INT64_MAX), MaxOffset(INT64_MIN), AllFixupsOutsideLoop(true), + RigidFormula(false), WidestFixupType(0) {} bool HasFormulaWithSameRegs(const Formula &F) const; @@ -1189,6 +1222,9 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. bool LSRUse::InsertFormula(const Formula &F) { + if (!Formulae.empty() && RigidFormula) + return false; + SmallVector<const SCEV *, 4> Key = F.BaseRegs; if (F.ScaledReg) Key.push_back(F.ScaledReg); // Unstable sort by host order ok, because this is only used for uniquifying. @@ -1359,6 +1395,66 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, F.BaseOffset, F.HasBaseReg, F.Scale); } +static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, + const Formula &F) { + // If F is used as an Addressing Mode, it may fold one Base plus one + // scaled register. If the scaled register is nil, do as if another + // element of the base regs is a 1-scaled register. + // This is possible if BaseRegs has at least 2 registers. + + // If this is not an address calculation, this is not an addressing mode + // use. + if (LU.Kind != LSRUse::Address) + return false; + + // F is already scaled. + if (F.Scale != 0) + return false; + + // We need to keep one register for the base and one to scale. + if (F.BaseRegs.size() < 2) + return false; + + return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, + F.BaseGV, F.BaseOffset, F.HasBaseReg, 1); + } + +static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, + const LSRUse &LU, const Formula &F) { + if (!F.Scale) + return 0; + assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, + LU.AccessTy, F) && "Illegal formula in use."); + + switch (LU.Kind) { + case LSRUse::Address: { + // Check the scaling factor cost with both the min and max offsets. + int ScaleCostMinOffset = + TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, + F.BaseOffset + LU.MinOffset, + F.HasBaseReg, F.Scale); + int ScaleCostMaxOffset = + TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, + F.BaseOffset + LU.MaxOffset, + F.HasBaseReg, F.Scale); + + assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 && + "Legal addressing mode has an illegal cost!"); + return std::max(ScaleCostMinOffset, ScaleCostMaxOffset); + } + case LSRUse::ICmpZero: + // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg. + // Therefore, return 0 in case F.Scale == -1. 
+ return F.Scale != -1; + + case LSRUse::Basic: + case LSRUse::Special: + return 0; + } + + llvm_unreachable("Invalid LSRUse Kind!"); +} + static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, @@ -1664,7 +1760,7 @@ void LSRInstance::OptimizeShadowIV() { IVUsers::const_iterator CandidateUI = UI; ++UI; Instruction *ShadowUse = CandidateUI->getUser(); - Type *DestTy = NULL; + Type *DestTy = 0; bool IsSigned = false; /* If shadow use is a int->float cast then insert a second IV @@ -1726,7 +1822,7 @@ void LSRInstance::OptimizeShadowIV() { continue; /* Initialize new IV, double d = 0.0 in above example. */ - ConstantInt *C = NULL; + ConstantInt *C = 0; if (Incr->getOperand(0) == PH) C = dyn_cast<ConstantInt>(Incr->getOperand(1)); else if (Incr->getOperand(1) == PH) @@ -2858,7 +2954,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // x == y --> x - y == 0 const SCEV *N = SE.getSCEV(NV); - if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) { + if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) { // S is normalized, so normalize N before folding it into S // to keep the result normalized. N = TransformForPostIncUse(Normalize, N, CI, 0, @@ -2901,6 +2997,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { /// and loop-computable portions. void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { + // Mark uses whose expressions cannot be expanded. + if (!isSafeToExpand(S, SE)) + LU.RigidFormula = true; + Formula F; F.InitialMatch(S, L, SE); bool Inserted = InsertFormula(LU, LUIdx, F); @@ -3048,7 +3148,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, if (Remainder) Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); } - return NULL; + return 0; } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { // Split a non-zero base out of an addrec. if (AR->getStart()->isZero()) @@ -3060,7 +3160,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, // does not pertain to this loop. if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) { Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); - Remainder = NULL; + Remainder = 0; } if (Remainder != AR->getStart()) { if (!Remainder) @@ -3082,7 +3182,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1); if (Remainder) Ops.push_back(SE.getMulExpr(C, Remainder)); - return NULL; + return 0; } } return S; @@ -3607,7 +3707,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { abs64(NewF.BaseOffset)) && (C->getValue()->getValue() + NewF.BaseOffset).countTrailingZeros() >= - CountTrailingZeros_64(NewF.BaseOffset)) + countTrailingZeros<uint64_t>(NewF.BaseOffset)) goto skip_formula; // Ok, looks good. @@ -3690,7 +3790,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // the corresponding bad register from the Regs set. 
Cost CostF; Regs.clear(); - CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, + CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU, &LoserRegs); if (CostF.isLoser()) { // During initial formula generation, undesirable formulae are generated @@ -3726,7 +3826,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { Cost CostBest; Regs.clear(); - CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); + CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE, + DT, LU); if (CostF < CostBest) std::swap(F, Best); DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); @@ -4079,7 +4180,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, // the current best, prune the search at that point. NewCost = CurCost; NewRegs = CurRegs; - NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT); + NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT, + LU); if (NewCost < SolutionCost) { Workspace.push_back(&F); if (Workspace.size() != Uses.size()) { @@ -4266,6 +4368,8 @@ Value *LSRInstance::Expand(const LSRFixup &LF, SCEVExpander &Rewriter, SmallVectorImpl<WeakVH> &DeadInsts) const { const LSRUse &LU = Uses[LF.LUIdx]; + if (LU.RigidFormula) + return LF.OperandValToReplace; // Determine an input position which will be dominated by the operands and // which will dominate the result. diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 80d060b..08ac38d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -49,12 +49,17 @@ namespace { class LoopUnroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopUnroll(int T = -1, int C = -1, int P = -1) : LoopPass(ID) { + LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) { CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T); CurrentCount = (C == -1) ? UnrollCount : unsigned(C); CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P; + CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R; UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); + UserAllowPartial = (P != -1) || + (UnrollAllowPartial.getNumOccurrences() > 0); + UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0); + UserCount = (C != -1) || (UnrollCount.getNumOccurrences() > 0); initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -75,7 +80,11 @@ namespace { unsigned CurrentCount; unsigned CurrentThreshold; bool CurrentAllowPartial; + bool CurrentRuntime; + bool UserCount; // CurrentCount is user-specified. bool UserThreshold; // CurrentThreshold is user-specified. + bool UserAllowPartial; // CurrentAllowPartial is user-specified. + bool UserRuntime; // CurrentRuntime is user-specified. bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -110,8 +119,9 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) -Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { - return new LoopUnroll(Threshold, Count, AllowPartial); +Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, + int Runtime) { + return new LoopUnroll(Threshold, Count, AllowPartial, Runtime); } /// ApproximateLoopSize - Approximate the size of the loop. 
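For context on the new Runtime parameter, a minimal usage sketch (not part of the patch; it assumes the legacy PassManager headers of this LLVM branch): passing -1 leaves a setting to its command-line option or the target's unrolling preferences, while an explicit 0/1 marks it user-specified.

#include "llvm/PassManager.h"
#include "llvm/Transforms/Scalar.h"

// Request runtime unrolling explicitly; count and partial unrolling stay at
// their defaults because they are passed as -1 (not user-set).
void addUnrollPass(llvm::PassManagerBase &PM) {
  PM.add(llvm::createLoopUnrollPass(/*Threshold=*/150, /*Count=*/-1,
                                    /*AllowPartial=*/-1, /*Runtime=*/1));
}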
@@ -145,16 +155,24 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { << "] Loop %" << Header->getName() << "\n"); (void)Header; + TargetTransformInfo::UnrollingPreferences UP; + UP.Threshold = CurrentThreshold; + UP.OptSizeThreshold = OptSizeUnrollThreshold; + UP.Count = CurrentCount; + UP.Partial = CurrentAllowPartial; + UP.Runtime = CurrentRuntime; + TTI.getUnrollingPreferences(L, UP); + // Determine the current unrolling threshold. While this is normally set // from UnrollThreshold, it is overridden to a smaller value if the current // function is marked as optimize-for-size, and the unroll threshold was // not user specified. - unsigned Threshold = CurrentThreshold; + unsigned Threshold = UserThreshold ? CurrentThreshold : UP.Threshold; if (!UserThreshold && Header->getParent()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize)) - Threshold = OptSizeUnrollThreshold; + Threshold = UP.OptSizeThreshold; // Find trip count and trip multiple if count is not available unsigned TripCount = 0; @@ -167,11 +185,14 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { TripCount = SE->getSmallConstantTripCount(L, LatchBlock); TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); } + + bool Runtime = UserRuntime ? CurrentRuntime : UP.Runtime; + // Use a default unroll-count if the user doesn't specify a value // and the trip count is a run-time value. The default is different // for run-time or compile-time trip count loops. - unsigned Count = CurrentCount; - if (UnrollRuntime && CurrentCount == 0 && TripCount == 0) + unsigned Count = UserCount ? CurrentCount : UP.Count; + if (Runtime && Count == 0 && TripCount == 0) Count = UnrollRuntimeCount; if (Count == 0) { @@ -204,7 +225,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (TripCount != 1 && Size > Threshold) { DEBUG(dbgs() << " Too large to fully unroll with count: " << Count << " because size: " << Size << ">" << Threshold << "\n"); - if (!CurrentAllowPartial && !(UnrollRuntime && TripCount == 0)) { + bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial; + if (!AllowPartial && !(Runtime && TripCount == 0)) { DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); return false; @@ -215,7 +237,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { while (Count != 0 && TripCount%Count != 0) Count--; } - else if (UnrollRuntime) { + else if (Runtime) { // Reduce unroll count to be a lower power-of-two value while (Count != 0 && Size > Threshold) { Count >>= 1; @@ -231,7 +253,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Unroll the loop. - if (!UnrollLoop(L, Count, TripCount, UnrollRuntime, TripMultiple, LI, &LPM)) + if (!UnrollLoop(L, Count, TripCount, Runtime, TripMultiple, LI, &LPM)) return false; return true; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index 0e8199f..c4ebfd5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -87,8 +87,8 @@ namespace { typedef LoopPropsMap::iterator LoopPropsMapIt; LoopPropsMap LoopsProperties; - UnswitchedValsMap* CurLoopInstructions; - LoopProperties* CurrentLoopProperties; + UnswitchedValsMap *CurLoopInstructions; + LoopProperties *CurrentLoopProperties; // Max size of code we can produce on remained iterations. 
unsigned MaxSize; @@ -96,30 +96,30 @@ namespace { public: LUAnalysisCache() : - CurLoopInstructions(NULL), CurrentLoopProperties(NULL), + CurLoopInstructions(0), CurrentLoopProperties(0), MaxSize(Threshold) {} // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. - bool countLoop(const Loop* L, const TargetTransformInfo &TTI); + bool countLoop(const Loop *L, const TargetTransformInfo &TTI); // Clean all data related to given loop. - void forgetLoop(const Loop* L); + void forgetLoop(const Loop *L); // Mark case value as unswitched. // Since SI instruction can be partly unswitched, in order to avoid // extra unswitching in cloned loops keep track all unswitched values. - void setUnswitched(const SwitchInst* SI, const Value* V); + void setUnswitched(const SwitchInst *SI, const Value *V); // Check was this case value unswitched before or not. - bool isUnswitched(const SwitchInst* SI, const Value* V); + bool isUnswitched(const SwitchInst *SI, const Value *V); // Clone all loop-unswitch related loop properties. // Redistribute unswitching quotas. // Note, that new loop data is stored inside the VMap. - void cloneData(const Loop* NewLoop, const Loop* OldLoop, - const ValueToValueMapTy& VMap); + void cloneData(const Loop *NewLoop, const Loop *OldLoop, + const ValueToValueMapTy &VMap); }; class LoopUnswitch : public LoopPass { @@ -151,8 +151,8 @@ namespace { static char ID; // Pass ID, replacement for typeid explicit LoopUnswitch(bool Os = false) : LoopPass(ID), OptimizeForSize(Os), redoLoop(false), - currentLoop(NULL), DT(NULL), loopHeader(NULL), - loopPreheader(NULL) { + currentLoop(0), DT(0), loopHeader(0), + loopPreheader(0) { initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); } @@ -196,7 +196,7 @@ namespace { /// Split all of the edges from inside the loop to their exit blocks. /// Update the appropriate Phi nodes as we do so. - void SplitExitEdges(Loop *L, const SmallVector<BasicBlock *, 8> &ExitBlocks); + void SplitExitEdges(Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks); bool UnswitchIfProfitable(Value *LoopCond, Constant *Val); void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, @@ -212,8 +212,6 @@ namespace { Instruction *InsertPt); void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L); - void RemoveBlockIfDead(BasicBlock *BB, - std::vector<Instruction*> &Worklist, Loop *l); void RemoveLoopFromHierarchy(Loop *L); bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0, BasicBlock **LoopExit = 0); @@ -225,12 +223,14 @@ namespace { // it. Returns true if we can unswitch this loop. bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { - std::pair<LoopPropsMapIt, bool> InsertRes = + LoopPropsMapIt PropsIt; + bool Inserted; + llvm::tie(PropsIt, Inserted) = LoopsProperties.insert(std::make_pair(L, LoopProperties())); - LoopProperties& Props = InsertRes.first->second; + LoopProperties &Props = PropsIt->second; - if (InsertRes.second) { + if (Inserted) { // New loop. // Limit the number of instructions to avoid causing significant code @@ -242,8 +242,7 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { // consideration code simplification opportunities and code that can // be shared by the resultant unswitched loops. 
CodeMetrics Metrics; - for (Loop::block_iterator I = L->block_begin(), - E = L->block_end(); + for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) Metrics.analyzeBasicBlock(*I, TTI); @@ -253,17 +252,16 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { if (Metrics.notDuplicatable) { DEBUG(dbgs() << "NOT unswitching loop %" - << L->getHeader()->getName() << ", contents cannot be " - << "duplicated!\n"); + << L->getHeader()->getName() << ", contents cannot be " + << "duplicated!\n"); return false; } } if (!Props.CanBeUnswitchedCount) { DEBUG(dbgs() << "NOT unswitching loop %" - << L->getHeader()->getName() << ", cost too high: " - << L->getBlocks().size() << "\n"); - + << L->getHeader()->getName() << ", cost too high: " + << L->getBlocks().size() << "\n"); return false; } @@ -275,41 +273,41 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { } // Clean all data related to given loop. -void LUAnalysisCache::forgetLoop(const Loop* L) { +void LUAnalysisCache::forgetLoop(const Loop *L) { LoopPropsMapIt LIt = LoopsProperties.find(L); if (LIt != LoopsProperties.end()) { - LoopProperties& Props = LIt->second; + LoopProperties &Props = LIt->second; MaxSize += Props.CanBeUnswitchedCount * Props.SizeEstimation; LoopsProperties.erase(LIt); } - CurrentLoopProperties = NULL; - CurLoopInstructions = NULL; + CurrentLoopProperties = 0; + CurLoopInstructions = 0; } // Mark case value as unswitched. // Since SI instruction can be partly unswitched, in order to avoid // extra unswitching in cloned loops keep track all unswitched values. -void LUAnalysisCache::setUnswitched(const SwitchInst* SI, const Value* V) { +void LUAnalysisCache::setUnswitched(const SwitchInst *SI, const Value *V) { (*CurLoopInstructions)[SI].insert(V); } // Check was this case value unswitched before or not. -bool LUAnalysisCache::isUnswitched(const SwitchInst* SI, const Value* V) { +bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) { return (*CurLoopInstructions)[SI].count(V); } // Clone all loop-unswitch related loop properties. // Redistribute unswitching quotas. // Note, that new loop data is stored inside the VMap. -void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop, - const ValueToValueMapTy& VMap) { +void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop, + const ValueToValueMapTy &VMap) { - LoopProperties& NewLoopProps = LoopsProperties[NewLoop]; - LoopProperties& OldLoopProps = *CurrentLoopProperties; - UnswitchedValsMap& Insts = OldLoopProps.UnswitchedVals; + LoopProperties &NewLoopProps = LoopsProperties[NewLoop]; + LoopProperties &OldLoopProps = *CurrentLoopProperties; + UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals; // Reallocate "can-be-unswitched quota" @@ -324,9 +322,9 @@ void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop, // for new loop switches we clone info about values that was // already unswitched and has redundant successors. 
for (UnswitchedValsIt I = Insts.begin(); I != Insts.end(); ++I) { - const SwitchInst* OldInst = I->first; - Value* NewI = VMap.lookup(OldInst); - const SwitchInst* NewInst = cast_or_null<SwitchInst>(NewI); + const SwitchInst *OldInst = I->first; + Value *NewI = VMap.lookup(OldInst); + const SwitchInst *NewInst = cast_or_null<SwitchInst>(NewI); assert(NewInst && "All instructions that are in SrcBB must be in VMap."); NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst]; @@ -458,14 +456,14 @@ bool LoopUnswitch::processCurrentLoop() { // Find a value to unswitch on: // FIXME: this should chose the most expensive case! // FIXME: scan for a case with a non-critical edge? - Constant *UnswitchVal = NULL; + Constant *UnswitchVal = 0; // Do not process same value again and again. // At this point we have some cases already unswitched and // some not yet unswitched. Let's find the first not yet unswitched one. for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { - Constant* UnswitchValCandidate = i.getCaseValue(); + Constant *UnswitchValCandidate = i.getCaseValue(); if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) { UnswitchVal = UnswitchValCandidate; break; @@ -511,7 +509,8 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, // Already visited. Without more analysis, this could indicate an infinite // loop. return false; - } else if (!L->contains(BB)) { + } + if (!L->contains(BB)) { // Otherwise, this is a loop exit, this is fine so long as this is the // first exit. if (ExitBB != 0) return false; @@ -595,11 +594,11 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, // on already unswitched cases. for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { - BasicBlock* LoopExitCandidate; + BasicBlock *LoopExitCandidate; if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, i.getCaseSuccessor()))) { // Okay, we found a trivial case, remember the value that is trivial. - ConstantInt* CaseVal = i.getCaseValue(); + ConstantInt *CaseVal = i.getCaseValue(); // Check that it was not unswitched before, since already unswitched // trivial vals are looks trivial too. @@ -752,7 +751,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, /// SplitExitEdges - Split all of the edges from inside the loop to their exit /// blocks. Update the appropriate Phi nodes as we do so. void LoopUnswitch::SplitExitEdges(Loop *L, - const SmallVector<BasicBlock *, 8> &ExitBlocks){ + const SmallVectorImpl<BasicBlock *> &ExitBlocks){ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { BasicBlock *ExitBlock = ExitBlocks[i]; @@ -854,9 +853,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // If the successor of the exit block had PHI nodes, add an entry for // NewExit. 
- PHINode *PN; - for (BasicBlock::iterator I = ExitSucc->begin(); isa<PHINode>(I); ++I) { - PN = cast<PHINode>(I); + for (BasicBlock::iterator I = ExitSucc->begin(); + PHINode *PN = dyn_cast<PHINode>(I); ++I) { Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]); ValueToValueMapTy::iterator It = VMap.find(V); if (It != VMap.end()) V = It->second; @@ -864,8 +862,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, } if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { - PN = PHINode::Create(LPad->getType(), 0, "", - ExitSucc->getFirstInsertionPt()); + PHINode *PN = PHINode::Create(LPad->getType(), 0, "", + ExitSucc->getFirstInsertionPt()); for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc); I != E; ++I) { @@ -946,117 +944,6 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V, ++NumSimplify; } -/// RemoveBlockIfDead - If the specified block is dead, remove it, update loop -/// information, and remove any dead successors it has. -/// -void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB, - std::vector<Instruction*> &Worklist, - Loop *L) { - if (pred_begin(BB) != pred_end(BB)) { - // This block isn't dead, since an edge to BB was just removed, see if there - // are any easy simplifications we can do now. - if (BasicBlock *Pred = BB->getSinglePredecessor()) { - // If it has one pred, fold phi nodes in BB. - while (isa<PHINode>(BB->begin())) - ReplaceUsesOfWith(BB->begin(), - cast<PHINode>(BB->begin())->getIncomingValue(0), - Worklist, L, LPM); - - // If this is the header of a loop and the only pred is the latch, we now - // have an unreachable loop. - if (Loop *L = LI->getLoopFor(BB)) - if (loopHeader == BB && L->contains(Pred)) { - // Remove the branch from the latch to the header block, this makes - // the header dead, which will make the latch dead (because the header - // dominates the latch). - LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L); - Pred->getTerminator()->eraseFromParent(); - new UnreachableInst(BB->getContext(), Pred); - - // The loop is now broken, remove it from LI. - RemoveLoopFromHierarchy(L); - - // Reprocess the header, which now IS dead. - RemoveBlockIfDead(BB, Worklist, L); - return; - } - - // If pred ends in a uncond branch, add uncond branch to worklist so that - // the two blocks will get merged. - if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator())) - if (BI->isUnconditional()) - Worklist.push_back(BI); - } - return; - } - - DEBUG(dbgs() << "Nuking dead block: " << *BB); - - // Remove the instructions in the basic block from the worklist. - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - RemoveFromWorklist(I, Worklist); - - // Anything that uses the instructions in this basic block should have their - // uses replaced with undefs. - // If I is not void type then replaceAllUsesWith undef. - // This allows ValueHandlers and custom metadata to adjust itself. - if (!I->getType()->isVoidTy()) - I->replaceAllUsesWith(UndefValue::get(I->getType())); - } - - // If this is the edge to the header block for a loop, remove the loop and - // promote all subloops. - if (Loop *BBLoop = LI->getLoopFor(BB)) { - if (BBLoop->getLoopLatch() == BB) { - RemoveLoopFromHierarchy(BBLoop); - if (currentLoop == BBLoop) { - currentLoop = 0; - redoLoop = false; - } - } - } - - // Remove the block from the loop info, which removes it from any loops it - // was in. - LI->removeBlock(BB); - - - // Remove phi node entries in successors for this block. 
- TerminatorInst *TI = BB->getTerminator(); - SmallVector<BasicBlock*, 4> Succs; - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { - Succs.push_back(TI->getSuccessor(i)); - TI->getSuccessor(i)->removePredecessor(BB); - } - - // Unique the successors, remove anything with multiple uses. - array_pod_sort(Succs.begin(), Succs.end()); - Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end()); - - // Remove the basic block, including all of the instructions contained in it. - LPM->deleteSimpleAnalysisValue(BB, L); - BB->eraseFromParent(); - // Remove successor blocks here that are not dead, so that we know we only - // have dead blocks in this list. Nondead blocks have a way of becoming dead, - // then getting removed before we revisit them, which is badness. - // - for (unsigned i = 0; i != Succs.size(); ++i) - if (pred_begin(Succs[i]) != pred_end(Succs[i])) { - // One exception is loop headers. If this block was the preheader for a - // loop, then we DO want to visit the loop so the loop gets deleted. - // We know that if the successor is a loop header, that this loop had to - // be the preheader: the case where this was the latch block was handled - // above and headers can only have two predecessors. - if (!LI->isLoopHeader(Succs[i])) { - Succs.erase(Succs.begin()+i); - --i; - } - } - - for (unsigned i = 0, e = Succs.size(); i != e; ++i) - RemoveBlockIfDead(Succs[i], Worklist, L); -} - /// RemoveLoopFromHierarchy - We have discovered that the specified loop has /// become unwrapped, either because the backedge was deleted, or because the /// edge into the header was removed. If the edge into the header from the @@ -1088,7 +975,6 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, std::vector<Instruction*> Worklist; LLVMContext &Context = Val->getContext(); - // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC // in the loop with the appropriate one directly. if (IsEqual || (isa<ConstantInt>(Val) && @@ -1108,8 +994,8 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Worklist.push_back(U); } - for (std::vector<Instruction*>::iterator UI = Worklist.begin(); - UI != Worklist.end(); ++UI) + for (std::vector<Instruction*>::iterator UI = Worklist.begin(), + UE = Worklist.end(); UI != UE; ++UI) (*UI)->replaceUsesOfWith(LIC, Replacement); SimplifyCode(Worklist, L); @@ -1266,23 +1152,6 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { continue; } - if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){ - // Conditional branch. Turn it into an unconditional branch, then - // remove dead blocks. - continue; // FIXME: Enable. 
- - DEBUG(dbgs() << "Folded branch: " << *BI); - BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue()); - BasicBlock *LiveSucc = BI->getSuccessor(!CB->getZExtValue()); - DeadSucc->removePredecessor(BI->getParent(), true); - Worklist.push_back(BranchInst::Create(LiveSucc, BI)); - LPM->deleteSimpleAnalysisValue(BI, L); - BI->eraseFromParent(); - RemoveFromWorklist(BI, Worklist); - ++NumSimplify; - - RemoveBlockIfDead(DeadSucc, Worklist, L); - } continue; } } diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index be0f0e8..9912d3d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -170,14 +170,17 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const { // pessimize the llvm optimizer. // // Since we don't have perfect knowledge here, make some assumptions: assume - // the maximum GPR width is the same size as the pointer size and assume that - // this width can be stored. If so, check to see whether we will end up - // actually reducing the number of stores used. + // the maximum GPR width is the same size as the largest legal integer + // size. If so, check to see whether we will end up actually reducing the + // number of stores used. unsigned Bytes = unsigned(End-Start); - unsigned NumPointerStores = Bytes/TD.getPointerSize(); + unsigned MaxIntSize = TD.getLargestLegalIntTypeSize(); + if (MaxIntSize == 0) + MaxIntSize = 1; + unsigned NumPointerStores = Bytes / MaxIntSize; // Assume the remaining bytes if any are done a byte at a time. - unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize(); + unsigned NumByteStores = Bytes - NumPointerStores * MaxIntSize; // If we will reduce the # stores (according to this heuristic), do the // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 @@ -465,7 +468,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); // Zap all the stores. - for (SmallVector<Instruction*, 16>::const_iterator + for (SmallVectorImpl<Instruction *>::const_iterator SI = Range.TheStores.begin(), SE = Range.TheStores.end(); SI != SE; ++SI) { MD->removeInstruction(*SI); @@ -626,8 +629,14 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; Type *StructTy = cast<PointerType>(A->getType())->getElementType(); - uint64_t destSize = TD->getTypeAllocSize(StructTy); + if (!StructTy->isSized()) { + // The call may never return and hence the copy-instruction may never + // be executed, and therefore it's not safe to say "the destination + // has at least <cpyLen> bytes, as implied by the copy-instruction", + return false; + } + uint64_t destSize = TD->getTypeAllocSize(StructTy); if (destSize < srcSize) return false; } else { diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp new file mode 100644 index 0000000..15cee44 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -0,0 +1,156 @@ +//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This pass tries to partially inline the fast path of well-known library +// functions, such as using square-root instructions for cases where sqrt() +// does not need to set errno. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "partially-inline-libcalls" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +namespace { + class PartiallyInlineLibCalls : public FunctionPass { + public: + static char ID; + + PartiallyInlineLibCalls() : + FunctionPass(ID) { + initializePartiallyInlineLibCallsPass(*PassRegistry::getPassRegistry()); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool runOnFunction(Function &F); + + private: + /// Optimize calls to sqrt. + bool optimizeSQRT(CallInst *Call, Function *CalledFunc, + BasicBlock &CurrBB, Function::iterator &BB); + }; + + char PartiallyInlineLibCalls::ID = 0; +} + +INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls", + "Partially inline calls to library functions", false, false) + +void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetLibraryInfo>(); + AU.addRequired<TargetTransformInfo>(); + FunctionPass::getAnalysisUsage(AU); +} + +bool PartiallyInlineLibCalls::runOnFunction(Function &F) { + bool Changed = false; + Function::iterator CurrBB; + TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); + const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfo>(); + for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) { + CurrBB = BB++; + + for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end(); + II != IE; ++II) { + CallInst *Call = dyn_cast<CallInst>(&*II); + Function *CalledFunc; + + if (!Call || !(CalledFunc = Call->getCalledFunction())) + continue; + + // Skip if function either has local linkage or is not a known library + // function. + LibFunc::Func LibFunc; + if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() || + !TLI->getLibFunc(CalledFunc->getName(), LibFunc)) + continue; + + switch (LibFunc) { + case LibFunc::sqrtf: + case LibFunc::sqrt: + if (TTI->haveFastSqrt(Call->getType()) && + optimizeSQRT(Call, CalledFunc, *CurrBB, BB)) + break; + continue; + default: + continue; + } + + Changed = true; + break; + } + } + + return Changed; +} + +bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call, + Function *CalledFunc, + BasicBlock &CurrBB, + Function::iterator &BB) { + // There is no need to change the IR, since backend will emit sqrt + // instruction if the call has already been marked read-only. + if (Call->onlyReadsMemory()) + return false; + + // Do the following transformation: + // + // (before) + // dst = sqrt(src) + // + // (after) + // v0 = sqrt_noreadmem(src) # native sqrt instruction. + // if (v0 is a NaN) + // v1 = sqrt(src) # library call. + // dst = phi(v0, v1) + // + + // Move all instructions following Call to newly created block JoinBB. + // Create phi and replace all uses. 
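Expressed as source-level pseudocode, the control flow that optimizeSQRT builds below is roughly the following. This is a sketch only; hw_sqrt stands in for the readnone sqrt call that the backend can lower to a native square-root instruction:

double v0 = hw_sqrt(src);   // fast path, never touches errno
double dst;
if (v0 == v0)               // ordered compare: false only when v0 is NaN
  dst = v0;                 // native result is valid
else
  dst = sqrt(src);          // slow path: library call handles errno/negative inputs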
+ BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode(), this); + IRBuilder<> Builder(JoinBB, JoinBB->begin()); + PHINode *Phi = Builder.CreatePHI(Call->getType(), 2); + Call->replaceAllUsesWith(Phi); + + // Create basic block LibCallBB and insert a call to library function sqrt. + BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt", + CurrBB.getParent(), JoinBB); + Builder.SetInsertPoint(LibCallBB); + Instruction *LibCall = Call->clone(); + Builder.Insert(LibCall); + Builder.CreateBr(JoinBB); + + // Add attribute "readnone" so that backend can use a native sqrt instruction + // for this call. Insert a FP compare instruction and a conditional branch + // at the end of CurrBB. + Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone); + CurrBB.getTerminator()->eraseFromParent(); + Builder.SetInsertPoint(&CurrBB); + Value *FCmp = Builder.CreateFCmpOEQ(Call, Call); + Builder.CreateCondBr(FCmp, JoinBB, LibCallBB); + + // Add phi operands. + Phi->addIncoming(Call, &CurrBB); + Phi->addIncoming(LibCall, LibCallBB); + + BB = JoinBB; + return true; +} + +FunctionPass *llvm::createPartiallyInlineLibCallsPass() { + return new PartiallyInlineLibCalls(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp index a3c241d..328a9c5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -122,7 +122,6 @@ namespace { class XorOpnd { public: XorOpnd(Value *V); - const XorOpnd &operator=(const XorOpnd &That); bool isInvalid() const { return SymbolicPart == 0; } bool isOrExpr() const { return isOr; } @@ -225,15 +224,6 @@ XorOpnd::XorOpnd(Value *V) { isOr = true; } -const XorOpnd &XorOpnd::operator=(const XorOpnd &That) { - OrigVal = That.OrigVal; - SymbolicPart = That.SymbolicPart; - ConstPart = That.ConstPart; - SymbolicRank = That.SymbolicRank; - isOr = That.isOr; - return *this; -} - char Reassociate::ID = 0; INITIALIZE_PASS(Reassociate, "reassociate", "Reassociate expressions", false, false) @@ -251,21 +241,24 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { } static bool isUnmovableInstruction(Instruction *I) { - if (I->getOpcode() == Instruction::PHI || - I->getOpcode() == Instruction::LandingPad || - I->getOpcode() == Instruction::Alloca || - I->getOpcode() == Instruction::Load || - I->getOpcode() == Instruction::Invoke || - (I->getOpcode() == Instruction::Call && - !isa<DbgInfoIntrinsic>(I)) || - I->getOpcode() == Instruction::UDiv || - I->getOpcode() == Instruction::SDiv || - I->getOpcode() == Instruction::FDiv || - I->getOpcode() == Instruction::URem || - I->getOpcode() == Instruction::SRem || - I->getOpcode() == Instruction::FRem) + switch (I->getOpcode()) { + case Instruction::PHI: + case Instruction::LandingPad: + case Instruction::Alloca: + case Instruction::Load: + case Instruction::Invoke: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: return true; - return false; + case Instruction::Call: + return !isa<DbgInfoIntrinsic>(I); + default: + return false; + } } void Reassociate::BuildRankMap(Function &F) { diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp index e30a274..4364720 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -214,7 +214,7 @@ public: /// This returns true if 
the block was not considered live before. bool MarkBlockExecutable(BasicBlock *BB) { if (!BBExecutable.insert(BB)) return false; - DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << "\n"); + DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n'); BBWorkList.push_back(BB); // Add the block to the work list! return true; } @@ -427,7 +427,7 @@ private: // feasible that wasn't before. Revisit the PHI nodes in the block // because they have potentially new operands. DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName() - << " -> " << Dest->getName() << "\n"); + << " -> " << Dest->getName() << '\n'); PHINode *PN; for (BasicBlock::iterator I = Dest->begin(); @@ -439,7 +439,7 @@ private: // getFeasibleSuccessors - Return a vector of booleans to indicate which // successors are reachable from a given terminator instruction. // - void getFeasibleSuccessors(TerminatorInst &TI, SmallVector<bool, 16> &Succs); + void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs); // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. @@ -501,7 +501,7 @@ private: void visitInstruction(Instruction &I) { // If a new instruction is added to LLVM that we don't handle. - dbgs() << "SCCP: Don't know how to handle: " << I; + dbgs() << "SCCP: Don't know how to handle: " << I << '\n'; markAnythingOverdefined(&I); // Just in case } }; @@ -513,7 +513,7 @@ private: // successors are reachable from a given terminator instruction. // void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, - SmallVector<bool, 16> &Succs) { + SmallVectorImpl<bool> &Succs) { Succs.resize(TI.getNumSuccessors()); if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) { if (BI->isUnconditional()) { @@ -1604,7 +1604,7 @@ bool SCCP::runOnFunction(Function &F) { Constant *Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(Inst->getType()); - DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst); + DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst << '\n'); // Replaces all of the uses of a variable with uses of the constant. Inst->replaceAllUsesWith(Const); @@ -1812,7 +1812,7 @@ bool IPSCCP::runOnModule(Module &M) { Constant *Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(Inst->getType()); - DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst); + DEBUG(dbgs() << " Constant: " << *Const << " = " << *Inst << '\n'); // Replaces all of the uses of a variable with uses of the // constant. 
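Several hunks in this change, including getFeasibleSuccessors above and SplitExitEdges earlier, switch parameters from SmallVector<T, N>& to SmallVectorImpl<T>&. SmallVectorImpl<T> is the capacity-erased base of SmallVector, so the callee no longer hard-codes the caller's inline buffer size. A short illustration of the pattern; markAllFeasible is an invented name used here only for the example:

#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Accepts a SmallVector of any inline capacity because SmallVectorImpl<T>
// is the common base class shared by all of them.
static void markAllFeasible(SmallVectorImpl<bool> &Succs, unsigned NumSuccs) {
  Succs.assign(NumSuccs, true);
}

void caller() {
  SmallVector<bool, 16> Succs;   // the "16" stays a caller-side detail
  markAllFeasible(Succs, 4);
}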
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp index d073e78..9f3fc83 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -47,6 +47,7 @@ #include "llvm/InstVisitor.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -58,9 +59,9 @@ using namespace llvm; STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed"); -STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions"); -STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses found"); -STATISTIC(MaxPartitionUsesPerAlloca, "Maximum number of partition uses"); +STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca"); +STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten"); +STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition"); STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); @@ -110,17 +111,39 @@ typedef llvm::IRBuilder<false, ConstantFolder, } namespace { -/// \brief A common base class for representing a half-open byte range. -struct ByteRange { +/// \brief A used slice of an alloca. +/// +/// This structure represents a slice of an alloca used by some instruction. It +/// stores both the begin and end offsets of this use, a pointer to the use +/// itself, and a flag indicating whether we can classify the use as splittable +/// or not when forming partitions of the alloca. +class Slice { /// \brief The beginning offset of the range. uint64_t BeginOffset; /// \brief The ending offset, not included in the range. uint64_t EndOffset; - ByteRange() : BeginOffset(), EndOffset() {} - ByteRange(uint64_t BeginOffset, uint64_t EndOffset) - : BeginOffset(BeginOffset), EndOffset(EndOffset) {} + /// \brief Storage for both the use of this slice and whether it can be + /// split. + PointerIntPair<Use *, 1, bool> UseAndIsSplittable; + +public: + Slice() : BeginOffset(), EndOffset() {} + Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable) + : BeginOffset(BeginOffset), EndOffset(EndOffset), + UseAndIsSplittable(U, IsSplittable) {} + + uint64_t beginOffset() const { return BeginOffset; } + uint64_t endOffset() const { return EndOffset; } + + bool isSplittable() const { return UseAndIsSplittable.getInt(); } + void makeUnsplittable() { UseAndIsSplittable.setInt(false); } + + Use *getUse() const { return UseAndIsSplittable.getPointer(); } + + bool isDead() const { return getUse() == 0; } + void kill() { UseAndIsSplittable.setPointer(0); } /// \brief Support for ordering ranges. /// @@ -128,173 +151,67 @@ struct ByteRange { /// always increasing, and within equal start offsets, the end offsets are /// decreasing. Thus the spanning range comes first in a cluster with the /// same start position. 
- bool operator<(const ByteRange &RHS) const { - if (BeginOffset < RHS.BeginOffset) return true; - if (BeginOffset > RHS.BeginOffset) return false; - if (EndOffset > RHS.EndOffset) return true; + bool operator<(const Slice &RHS) const { + if (beginOffset() < RHS.beginOffset()) return true; + if (beginOffset() > RHS.beginOffset()) return false; + if (isSplittable() != RHS.isSplittable()) return !isSplittable(); + if (endOffset() > RHS.endOffset()) return true; return false; } /// \brief Support comparison with a single offset to allow binary searches. - friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) { - return LHS.BeginOffset < RHSOffset; + friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS, + uint64_t RHSOffset) { + return LHS.beginOffset() < RHSOffset; } - friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, - const ByteRange &RHS) { - return LHSOffset < RHS.BeginOffset; + const Slice &RHS) { + return LHSOffset < RHS.beginOffset(); } - bool operator==(const ByteRange &RHS) const { - return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset; + bool operator==(const Slice &RHS) const { + return isSplittable() == RHS.isSplittable() && + beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset(); } - bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); } + bool operator!=(const Slice &RHS) const { return !operator==(RHS); } }; - -/// \brief A partition of an alloca. -/// -/// This structure represents a contiguous partition of the alloca. These are -/// formed by examining the uses of the alloca. During formation, they may -/// overlap but once an AllocaPartitioning is built, the Partitions within it -/// are all disjoint. -struct Partition : public ByteRange { - /// \brief Whether this partition is splittable into smaller partitions. - /// - /// We flag partitions as splittable when they are formed entirely due to - /// accesses by trivially splittable operations such as memset and memcpy. - bool IsSplittable; - - /// \brief Test whether a partition has been marked as dead. - bool isDead() const { - if (BeginOffset == UINT64_MAX) { - assert(EndOffset == UINT64_MAX); - return true; - } - return false; - } - - /// \brief Kill a partition. - /// This is accomplished by setting both its beginning and end offset to - /// the maximum possible value. - void kill() { - assert(!isDead() && "He's Dead, Jim!"); - BeginOffset = EndOffset = UINT64_MAX; - } - - Partition() : ByteRange(), IsSplittable() {} - Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) - : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} -}; - -/// \brief A particular use of a partition of the alloca. -/// -/// This structure is used to associate uses of a partition with it. They -/// mark the range of bytes which are referenced by a particular instruction, -/// and includes a handle to the user itself and the pointer value in use. -/// The bounds of these uses are determined by intersecting the bounds of the -/// memory use itself with a particular partition. As a consequence there is -/// intentionally overlap between various uses of the same partition. -class PartitionUse : public ByteRange { - /// \brief Combined storage for both the Use* and split state. 
- PointerIntPair<Use*, 1, bool> UsePtrAndIsSplit; - -public: - PartitionUse() : ByteRange(), UsePtrAndIsSplit() {} - PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U, - bool IsSplit) - : ByteRange(BeginOffset, EndOffset), UsePtrAndIsSplit(U, IsSplit) {} - - /// \brief The use in question. Provides access to both user and used value. - /// - /// Note that this may be null if the partition use is *dead*, that is, it - /// should be ignored. - Use *getUse() const { return UsePtrAndIsSplit.getPointer(); } - - /// \brief Set the use for this partition use range. - void setUse(Use *U) { UsePtrAndIsSplit.setPointer(U); } - - /// \brief Whether this use is split across multiple partitions. - bool isSplit() const { return UsePtrAndIsSplit.getInt(); } -}; -} +} // end anonymous namespace namespace llvm { -template <> struct isPodLike<Partition> : llvm::true_type {}; -template <> struct isPodLike<PartitionUse> : llvm::true_type {}; +template <typename T> struct isPodLike; +template <> struct isPodLike<Slice> { + static const bool value = true; +}; } namespace { -/// \brief Alloca partitioning representation. +/// \brief Representation of the alloca slices. /// -/// This class represents a partitioning of an alloca into slices, and -/// information about the nature of uses of each slice of the alloca. The goal -/// is that this information is sufficient to decide if and how to split the -/// alloca apart and replace slices with scalars. It is also intended that this -/// structure can capture the relevant information needed both to decide about -/// and to enact these transformations. -class AllocaPartitioning { +/// This class represents the slices of an alloca which are formed by its +/// various uses. If a pointer escapes, we can't fully build a representation +/// for the slices used and we reflect that in this structure. The uses are +/// stored, sorted by increasing beginning offset and with unsplittable slices +/// starting at a particular offset before splittable slices. +class AllocaSlices { public: - /// \brief Construct a partitioning of a particular alloca. - /// - /// Construction does most of the work for partitioning the alloca. This - /// performs the necessary walks of users and builds a partitioning from it. - AllocaPartitioning(const DataLayout &TD, AllocaInst &AI); + /// \brief Construct the slices of a particular alloca. + AllocaSlices(const DataLayout &DL, AllocaInst &AI); /// \brief Test whether a pointer to the allocation escapes our analysis. /// - /// If this is true, the partitioning is never fully built and should be + /// If this is true, the slices are never fully built and should be /// ignored. bool isEscaped() const { return PointerEscapingInstr; } - /// \brief Support for iterating over the partitions. + /// \brief Support for iterating over the slices. /// @{ - typedef SmallVectorImpl<Partition>::iterator iterator; - iterator begin() { return Partitions.begin(); } - iterator end() { return Partitions.end(); } + typedef SmallVectorImpl<Slice>::iterator iterator; + iterator begin() { return Slices.begin(); } + iterator end() { return Slices.end(); } - typedef SmallVectorImpl<Partition>::const_iterator const_iterator; - const_iterator begin() const { return Partitions.begin(); } - const_iterator end() const { return Partitions.end(); } - /// @} - - /// \brief Support for iterating over and manipulating a particular - /// partition's uses. 
- /// - /// The iteration support provided for uses is more limited, but also - /// includes some manipulation routines to support rewriting the uses of - /// partitions during SROA. - /// @{ - typedef SmallVectorImpl<PartitionUse>::iterator use_iterator; - use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); } - use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); } - use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); } - use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); } - - typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator; - const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); } - const_use_iterator use_begin(const_iterator I) const { - return Uses[I - begin()].begin(); - } - const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); } - const_use_iterator use_end(const_iterator I) const { - return Uses[I - begin()].end(); - } - - unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); } - unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); } - const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const { - return Uses[PIdx][UIdx]; - } - const PartitionUse &getUse(const_iterator I, unsigned UIdx) const { - return Uses[I - begin()][UIdx]; - } - - void use_push_back(unsigned Idx, const PartitionUse &PU) { - Uses[Idx].push_back(PU); - } - void use_push_back(const_iterator I, const PartitionUse &PU) { - Uses[I - begin()].push_back(PU); - } + typedef SmallVectorImpl<Slice>::const_iterator const_iterator; + const_iterator begin() const { return Slices.begin(); } + const_iterator end() const { return Slices.end(); } /// @} /// \brief Allow iterating the dead users for this alloca. @@ -320,66 +237,12 @@ public: dead_op_iterator dead_op_end() const { return DeadOperands.end(); } /// @} - /// \brief MemTransferInst auxiliary data. - /// This struct provides some auxiliary data about memory transfer - /// intrinsics such as memcpy and memmove. These intrinsics can use two - /// different ranges within the same alloca, and provide other challenges to - /// correctly represent. We stash extra data to help us untangle this - /// after the partitioning is complete. - struct MemTransferOffsets { - /// The destination begin and end offsets when the destination is within - /// this alloca. If the end offset is zero the destination is not within - /// this alloca. - uint64_t DestBegin, DestEnd; - - /// The source begin and end offsets when the source is within this alloca. - /// If the end offset is zero, the source is not within this alloca. - uint64_t SourceBegin, SourceEnd; - - /// Flag for whether an alloca is splittable. - bool IsSplittable; - }; - MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const { - return MemTransferInstData.lookup(&II); - } - - /// \brief Map from a PHI or select operand back to a partition. - /// - /// When manipulating PHI nodes or selects, they can use more than one - /// partition of an alloca. We store a special mapping to allow finding the - /// partition referenced by each of these operands, if any. - iterator findPartitionForPHIOrSelectOperand(Use *U) { - SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt - = PHIOrSelectOpMap.find(U); - if (MapIt == PHIOrSelectOpMap.end()) - return end(); - - return begin() + MapIt->second.first; - } - - /// \brief Map from a PHI or select operand back to the specific use of - /// a partition. 
- /// - /// Similar to mapping these operands back to the partitions, this maps - /// directly to the use structure of that partition. - use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) { - SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt - = PHIOrSelectOpMap.find(U); - assert(MapIt != PHIOrSelectOpMap.end()); - return Uses[MapIt->second.first].begin() + MapIt->second.second; - } - - /// \brief Compute a common type among the uses of a particular partition. - /// - /// This routines walks all of the uses of a particular partition and tries - /// to find a common type between them. Untyped operations such as memset and - /// memcpy are ignored. - Type *getCommonType(iterator I) const; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; - void printUsers(raw_ostream &OS, const_iterator I, + void printSlice(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; + void printUse(raw_ostream &OS, const_iterator I, + StringRef Indent = " ") const; void print(raw_ostream &OS) const; void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const; void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const; @@ -387,47 +250,36 @@ public: private: template <typename DerivedT, typename RetT = void> class BuilderBase; - class PartitionBuilder; - friend class AllocaPartitioning::PartitionBuilder; - class UseBuilder; - friend class AllocaPartitioning::UseBuilder; + class SliceBuilder; + friend class AllocaSlices::SliceBuilder; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// \brief Handle to alloca instruction to simplify method interfaces. AllocaInst &AI; #endif - /// \brief The instruction responsible for this alloca having no partitioning. + /// \brief The instruction responsible for this alloca not having a known set + /// of slices. /// /// When an instruction (potentially) escapes the pointer to the alloca, we - /// store a pointer to that here and abort trying to partition the alloca. - /// This will be null if the alloca is partitioned successfully. + /// store a pointer to that here and abort trying to form slices of the + /// alloca. This will be null if the alloca slices are analyzed successfully. Instruction *PointerEscapingInstr; - /// \brief The partitions of the alloca. + /// \brief The slices of the alloca. /// - /// We store a vector of the partitions over the alloca here. This vector is - /// sorted by increasing begin offset, and then by decreasing end offset. See - /// the Partition inner class for more details. Initially (during - /// construction) there are overlaps, but we form a disjoint sequence of - /// partitions while finishing construction and a fully constructed object is - /// expected to always have this as a disjoint space. - SmallVector<Partition, 8> Partitions; - - /// \brief The uses of the partitions. - /// - /// This is essentially a mapping from each partition to a list of uses of - /// that partition. The mapping is done with a Uses vector that has the exact - /// same number of entries as the partition vector. Each entry is itself - /// a vector of the uses. - SmallVector<SmallVector<PartitionUse, 2>, 8> Uses; + /// We store a vector of the slices formed by uses of the alloca here. This + /// vector is sorted by increasing begin offset, and then the unsplittable + /// slices before the splittable ones. See the Slice inner class for more + /// details. 
+ SmallVector<Slice, 8> Slices; /// \brief Instructions which will become dead if we rewrite the alloca. /// - /// Note that these are not separated by partition. This is because we expect - /// a partitioned alloca to be completely rewritten or not rewritten at all. - /// If rewritten, all these instructions can simply be removed and replaced - /// with undef as they come from outside of the allocated space. + /// Note that these are not separated by slice. This is because we expect an + /// alloca to be completely rewritten or not rewritten at all. If rewritten, + /// all these instructions can simply be removed and replaced with undef as + /// they come from outside of the allocated space. SmallVector<Instruction *, 8> DeadUsers; /// \brief Operands which will become dead if we rewrite the alloca. @@ -439,26 +291,6 @@ private: /// want to swap this particular input for undef to simplify the use lists of /// the alloca. SmallVector<Use *, 8> DeadOperands; - - /// \brief The underlying storage for auxiliary memcpy and memset info. - SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData; - - /// \brief A side datastructure used when building up the partitions and uses. - /// - /// This mapping is only really used during the initial building of the - /// partitioning so that we can retain information about PHI and select nodes - /// processed. - SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes; - - /// \brief Auxiliary information for particular PHI or select operands. - SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap; - - /// \brief A utility routine called from the constructor. - /// - /// This does what it says on the tin. It is the key of the alloca partition - /// splitting and merging. After it is called we have the desired disjoint - /// collection of partitions. - void splitAndMergePartitions(); }; } @@ -474,29 +306,35 @@ static Value *foldSelectInst(SelectInst &SI) { return 0; } -/// \brief Builder for the alloca partitioning. +/// \brief Builder for the alloca slices. /// -/// This class builds an alloca partitioning by recursively visiting the uses -/// of an alloca and splitting the partitions for each load and store at each -/// offset. -class AllocaPartitioning::PartitionBuilder - : public PtrUseVisitor<PartitionBuilder> { - friend class PtrUseVisitor<PartitionBuilder>; - friend class InstVisitor<PartitionBuilder>; - typedef PtrUseVisitor<PartitionBuilder> Base; +/// This class builds a set of alloca slices by recursively visiting the uses +/// of an alloca and making a slice for each load and store at each offset. +class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { + friend class PtrUseVisitor<SliceBuilder>; + friend class InstVisitor<SliceBuilder>; + typedef PtrUseVisitor<SliceBuilder> Base; const uint64_t AllocSize; - AllocaPartitioning &P; + AllocaSlices &S; + + SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap; + SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes; - SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap; + /// \brief Set to de-duplicate dead instructions found in the use walk. 
+ SmallPtrSet<Instruction *, 4> VisitedDeadInsts; public: - PartitionBuilder(const DataLayout &DL, AllocaInst &AI, AllocaPartitioning &P) - : PtrUseVisitor<PartitionBuilder>(DL), - AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), - P(P) {} + SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &S) + : PtrUseVisitor<SliceBuilder>(DL), + AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), S(S) {} private: + void markAsDead(Instruction &I) { + if (VisitedDeadInsts.insert(&I)) + S.DeadUsers.push_back(&I); + } + void insertUse(Instruction &I, const APInt &Offset, uint64_t Size, bool IsSplittable = false) { // Completely skip uses which have a zero size or start either before or @@ -505,9 +343,9 @@ private: DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset << " which has zero size or starts outside of the " << AllocSize << " byte alloca:\n" - << " alloca: " << P.AI << "\n" + << " alloca: " << S.AI << "\n" << " use: " << I << "\n"); - return; + return markAsDead(I); } uint64_t BeginOffset = Offset.getZExtValue(); @@ -523,13 +361,26 @@ private: if (Size > AllocSize - BeginOffset) { DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset << " to remain within the " << AllocSize << " byte alloca:\n" - << " alloca: " << P.AI << "\n" + << " alloca: " << S.AI << "\n" << " use: " << I << "\n"); EndOffset = AllocSize; } - Partition New(BeginOffset, EndOffset, IsSplittable); - P.Partitions.push_back(New); + S.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable)); + } + + void visitBitCastInst(BitCastInst &BC) { + if (BC.use_empty()) + return markAsDead(BC); + + return Base::visitBitCastInst(BC); + } + + void visitGetElementPtrInst(GetElementPtrInst &GEPI) { + if (GEPI.use_empty()) + return markAsDead(GEPI); + + return Base::visitGetElementPtrInst(GEPI); } void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, @@ -580,9 +431,9 @@ private: DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset << " which extends past the end of the " << AllocSize << " byte alloca:\n" - << " alloca: " << P.AI << "\n" + << " alloca: " << S.AI << "\n" << " use: " << SI << "\n"); - return; + return markAsDead(SI); } assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && @@ -597,7 +448,7 @@ private: if ((Length && Length->getValue() == 0) || (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) // Zero-length mem transfer intrinsics can be ignored entirely. - return; + return markAsDead(II); if (!IsOffsetKnown) return PI.setAborted(&II); @@ -613,7 +464,7 @@ private: if ((Length && Length->getValue() == 0) || (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) // Zero-length mem transfer intrinsics can be ignored entirely. - return; + return markAsDead(II); if (!IsOffsetKnown) return PI.setAborted(&II); @@ -622,63 +473,44 @@ private: uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset; - MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; - - // Only intrinsics with a constant length can be split. - Offsets.IsSplittable = Length; + // Check for the special case where the same exact value is used for both + // source and dest. + if (*U == II.getRawDest() && *U == II.getRawSource()) { + // For non-volatile transfers this is a no-op. 
+ if (!II.isVolatile()) + return markAsDead(II); - if (*U == II.getRawDest()) { - Offsets.DestBegin = RawOffset; - Offsets.DestEnd = RawOffset + Size; - } - if (*U == II.getRawSource()) { - Offsets.SourceBegin = RawOffset; - Offsets.SourceEnd = RawOffset + Size; + return insertUse(II, Offset, Size, /*IsSplittable=*/false); } - // If we have set up end offsets for both the source and the destination, - // we have found both sides of this transfer pointing at the same alloca. - bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd; - if (SeenBothEnds && II.getRawDest() != II.getRawSource()) { - unsigned PrevIdx = MemTransferPartitionMap[&II]; + // If we have seen both source and destination for a mem transfer, then + // they both point to the same alloca. + bool Inserted; + SmallDenseMap<Instruction *, unsigned>::iterator MTPI; + llvm::tie(MTPI, Inserted) = + MemTransferSliceMap.insert(std::make_pair(&II, S.Slices.size())); + unsigned PrevIdx = MTPI->second; + if (!Inserted) { + Slice &PrevP = S.Slices[PrevIdx]; // Check if the begin offsets match and this is a non-volatile transfer. // In that case, we can completely elide the transfer. - if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) { - P.Partitions[PrevIdx].kill(); - return; + if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) { + PrevP.kill(); + return markAsDead(II); } // Otherwise we have an offset transfer within the same alloca. We can't // split those. - P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false; - } else if (SeenBothEnds) { - // Handle the case where this exact use provides both ends of the - // operation. - assert(II.getRawDest() == II.getRawSource()); - - // For non-volatile transfers this is a no-op. - if (!II.isVolatile()) - return; - - // Otherwise just suppress splitting. - Offsets.IsSplittable = false; + PrevP.makeUnsplittable(); } - // Insert the use now that we've fixed up the splittable nature. - insertUse(II, Offset, Size, Offsets.IsSplittable); - - // Setup the mapping from intrinsic to partition of we've not seen both - // ends of this transfer. - if (!SeenBothEnds) { - unsigned NewIdx = P.Partitions.size() - 1; - bool Inserted - = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second; - assert(Inserted && - "Already have intrinsic in map but haven't seen both ends"); - (void)Inserted; - } + insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length); + + // Check that we ended up with a valid index in the map. + assert(S.Slices[PrevIdx].getUse()->getUser() == &II && + "Map index doesn't point back to a slice with this user."); } // Disable SRoA for any intrinsics except for lifetime invariants. @@ -702,7 +534,7 @@ private: Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { // We consider any PHI or select that results in a direct load or store of - // the same offset to be a viable use for partitioning purposes. These uses + // the same offset to be a viable use for slicing purposes. These uses // are considered unsplittable and the size is the maximum loaded or stored // size. SmallPtrSet<Instruction *, 4> Visited; @@ -747,234 +579,36 @@ private: void visitPHINode(PHINode &PN) { if (PN.use_empty()) - return; + return markAsDead(PN); if (!IsOffsetKnown) return PI.setAborted(&PN); // See if we already have computed info on this node. 
- std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN]; - if (PHIInfo.first) { - PHIInfo.second = true; - insertUse(PN, Offset, PHIInfo.first); - return; + uint64_t &PHISize = PHIOrSelectSizes[&PN]; + if (!PHISize) { + // This is a new PHI node, check for an unsafe use of the PHI node. + if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHISize)) + return PI.setAborted(UnsafeI); } - // Check for an unsafe use of the PHI node. - if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first)) - return PI.setAborted(UnsafeI); - - insertUse(PN, Offset, PHIInfo.first); - } - - void visitSelectInst(SelectInst &SI) { - if (SI.use_empty()) - return; - if (Value *Result = foldSelectInst(SI)) { - if (Result == *U) - // If the result of the constant fold will be the pointer, recurse - // through the select as if we had RAUW'ed it. - enqueueUsers(SI); - - return; - } - if (!IsOffsetKnown) - return PI.setAborted(&SI); - - // See if we already have computed info on this node. - std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI]; - if (SelectInfo.first) { - SelectInfo.second = true; - insertUse(SI, Offset, SelectInfo.first); - return; - } - - // Check for an unsafe use of the PHI node. - if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first)) - return PI.setAborted(UnsafeI); - - insertUse(SI, Offset, SelectInfo.first); - } - - /// \brief Disable SROA entirely if there are unhandled users of the alloca. - void visitInstruction(Instruction &I) { - PI.setAborted(&I); - } -}; - -/// \brief Use adder for the alloca partitioning. -/// -/// This class adds the uses of an alloca to all of the partitions which they -/// use. For splittable partitions, this can end up doing essentially a linear -/// walk of the partitions, but the number of steps remains bounded by the -/// total result instruction size: -/// - The number of partitions is a result of the number unsplittable -/// instructions using the alloca. -/// - The number of users of each partition is at worst the total number of -/// splittable instructions using the alloca. -/// Thus we will produce N * M instructions in the end, where N are the number -/// of unsplittable uses and M are the number of splittable. This visitor does -/// the exact same number of updates to the partitioning. -/// -/// In the more common case, this visitor will leverage the fact that the -/// partition space is pre-sorted, and do a logarithmic search for the -/// partition needed, making the total visit a classical ((N + M) * log(N)) -/// complexity operation. -class AllocaPartitioning::UseBuilder : public PtrUseVisitor<UseBuilder> { - friend class PtrUseVisitor<UseBuilder>; - friend class InstVisitor<UseBuilder>; - typedef PtrUseVisitor<UseBuilder> Base; - - const uint64_t AllocSize; - AllocaPartitioning &P; - - /// \brief Set to de-duplicate dead instructions found in the use walk. - SmallPtrSet<Instruction *, 4> VisitedDeadInsts; - -public: - UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) - : PtrUseVisitor<UseBuilder>(TD), - AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())), - P(P) {} - -private: - void markAsDead(Instruction &I) { - if (VisitedDeadInsts.insert(&I)) - P.DeadUsers.push_back(&I); - } - - void insertUse(Instruction &User, const APInt &Offset, uint64_t Size) { - // If the use has a zero size or extends outside of the allocation, record - // it as a dead use for elimination later. 
- if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) - return markAsDead(User); - - uint64_t BeginOffset = Offset.getZExtValue(); - uint64_t EndOffset = BeginOffset + Size; - - // Clamp the end offset to the end of the allocation. Note that this is - // formulated to handle even the case where "BeginOffset + Size" overflows. - assert(AllocSize >= BeginOffset); // Established above. - if (Size > AllocSize - BeginOffset) - EndOffset = AllocSize; - - // NB: This only works if we have zero overlapping partitions. - iterator I = std::lower_bound(P.begin(), P.end(), BeginOffset); - if (I != P.begin() && llvm::prior(I)->EndOffset > BeginOffset) - I = llvm::prior(I); - iterator E = P.end(); - bool IsSplit = llvm::next(I) != E && llvm::next(I)->BeginOffset < EndOffset; - for (; I != E && I->BeginOffset < EndOffset; ++I) { - PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset), - std::min(I->EndOffset, EndOffset), U, IsSplit); - P.use_push_back(I, NewPU); - if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser())) - P.PHIOrSelectOpMap[U] - = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1); - } - } - - void visitBitCastInst(BitCastInst &BC) { - if (BC.use_empty()) - return markAsDead(BC); - - return Base::visitBitCastInst(BC); - } - - void visitGetElementPtrInst(GetElementPtrInst &GEPI) { - if (GEPI.use_empty()) - return markAsDead(GEPI); - - return Base::visitGetElementPtrInst(GEPI); - } - - void visitLoadInst(LoadInst &LI) { - assert(IsOffsetKnown); - uint64_t Size = DL.getTypeStoreSize(LI.getType()); - insertUse(LI, Offset, Size); - } - - void visitStoreInst(StoreInst &SI) { - assert(IsOffsetKnown); - uint64_t Size = DL.getTypeStoreSize(SI.getOperand(0)->getType()); - - // If this memory access can be shown to *statically* extend outside the - // bounds of of the allocation, it's behavior is undefined, so simply - // ignore it. Note that this is more strict than the generic clamping - // behavior of insertUse. - if (Offset.isNegative() || Size > AllocSize || - Offset.ugt(AllocSize - Size)) - return markAsDead(SI); - - insertUse(SI, Offset, Size); - } - - void visitMemSetInst(MemSetInst &II) { - ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); - if ((Length && Length->getValue() == 0) || - (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) - return markAsDead(II); - - assert(IsOffsetKnown); - insertUse(II, Offset, Length ? Length->getLimitedValue() - : AllocSize - Offset.getLimitedValue()); - } - - void visitMemTransferInst(MemTransferInst &II) { - ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); - if ((Length && Length->getValue() == 0) || - (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize))) - return markAsDead(II); - - assert(IsOffsetKnown); - uint64_t Size = Length ? Length->getLimitedValue() - : AllocSize - Offset.getLimitedValue(); - - const MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; - if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd && - Offsets.DestBegin == Offsets.SourceBegin) - return markAsDead(II); // Skip identity transfers without side-effects. 
- - insertUse(II, Offset, Size); - } - - void visitIntrinsicInst(IntrinsicInst &II) { - assert(IsOffsetKnown); - assert(II.getIntrinsicID() == Intrinsic::lifetime_start || - II.getIntrinsicID() == Intrinsic::lifetime_end); - - ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); - insertUse(II, Offset, std::min(Length->getLimitedValue(), - AllocSize - Offset.getLimitedValue())); - } - - void insertPHIOrSelect(Instruction &User, const APInt &Offset) { - uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first; - // For PHI and select operands outside the alloca, we can't nuke the entire // phi or select -- the other side might still be relevant, so we special // case them here and use a separate structure to track the operands // themselves which should be replaced with undef. - if ((Offset.isNegative() && Offset.uge(Size)) || + // FIXME: This should instead be escaped in the event we're instrumenting + // for address sanitization. + if ((Offset.isNegative() && (-Offset).uge(PHISize)) || (!Offset.isNegative() && Offset.uge(AllocSize))) { - P.DeadOperands.push_back(U); + S.DeadOperands.push_back(U); return; } - insertUse(User, Offset, Size); - } - - void visitPHINode(PHINode &PN) { - if (PN.use_empty()) - return markAsDead(PN); - - assert(IsOffsetKnown); - insertPHIOrSelect(PN, Offset); + insertUse(PN, Offset, PHISize); } void visitSelectInst(SelectInst &SI) { if (SI.use_empty()) return markAsDead(SI); - if (Value *Result = foldSelectInst(SI)) { if (Result == *U) // If the result of the constant fold will be the pointer, recurse @@ -983,276 +617,106 @@ private: else // Otherwise the operand to the select is dead, and we can replace it // with undef. - P.DeadOperands.push_back(U); + S.DeadOperands.push_back(U); return; } + if (!IsOffsetKnown) + return PI.setAborted(&SI); - assert(IsOffsetKnown); - insertPHIOrSelect(SI, Offset); - } - - /// \brief Unreachable, we've already visited the alloca once. - void visitInstruction(Instruction &I) { - llvm_unreachable("Unhandled instruction in use builder."); - } -}; - -void AllocaPartitioning::splitAndMergePartitions() { - size_t NumDeadPartitions = 0; - - // Track the range of splittable partitions that we pass when accumulating - // overlapping unsplittable partitions. - uint64_t SplitEndOffset = 0ull; - - Partition New(0ull, 0ull, false); - - for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) { - ++j; - - if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) { - assert(New.BeginOffset == New.EndOffset); - New = Partitions[i]; - } else { - assert(New.IsSplittable); - New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset); - } - assert(New.BeginOffset != New.EndOffset); - - // Scan the overlapping partitions. - while (j != e && New.EndOffset > Partitions[j].BeginOffset) { - // If the new partition we are forming is splittable, stop at the first - // unsplittable partition. - if (New.IsSplittable && !Partitions[j].IsSplittable) - break; - - // Grow the new partition to include any equally splittable range. 'j' is - // always equally splittable when New is splittable, but when New is not - // splittable, we may subsume some (or part of some) splitable partition - // without growing the new one. 
- if (New.IsSplittable == Partitions[j].IsSplittable) { - New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset); - } else { - assert(!New.IsSplittable); - assert(Partitions[j].IsSplittable); - SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset); - } - - Partitions[j].kill(); - ++NumDeadPartitions; - ++j; - } - - // If the new partition is splittable, chop off the end as soon as the - // unsplittable subsequent partition starts and ensure we eventually cover - // the splittable area. - if (j != e && New.IsSplittable) { - SplitEndOffset = std::max(SplitEndOffset, New.EndOffset); - New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); + // See if we already have computed info on this node. + uint64_t &SelectSize = PHIOrSelectSizes[&SI]; + if (!SelectSize) { + // This is a new Select, check for an unsafe use of it. + if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectSize)) + return PI.setAborted(UnsafeI); } - // Add the new partition if it differs from the original one and is - // non-empty. We can end up with an empty partition here if it was - // splittable but there is an unsplittable one that starts at the same - // offset. - if (New != Partitions[i]) { - if (New.BeginOffset != New.EndOffset) - Partitions.push_back(New); - // Mark the old one for removal. - Partitions[i].kill(); - ++NumDeadPartitions; + // For PHI and select operands outside the alloca, we can't nuke the entire + // phi or select -- the other side might still be relevant, so we special + // case them here and use a separate structure to track the operands + // themselves which should be replaced with undef. + // FIXME: This should instead be escaped in the event we're instrumenting + // for address sanitization. + if ((Offset.isNegative() && Offset.uge(SelectSize)) || + (!Offset.isNegative() && Offset.uge(AllocSize))) { + S.DeadOperands.push_back(U); + return; } - New.BeginOffset = New.EndOffset; - if (!New.IsSplittable) { - New.EndOffset = std::max(New.EndOffset, SplitEndOffset); - if (j != e && !Partitions[j].IsSplittable) - New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); - New.IsSplittable = true; - // If there is a trailing splittable partition which won't be fused into - // the next splittable partition go ahead and add it onto the partitions - // list. - if (New.BeginOffset < New.EndOffset && - (j == e || !Partitions[j].IsSplittable || - New.EndOffset < Partitions[j].BeginOffset)) { - Partitions.push_back(New); - New.BeginOffset = New.EndOffset = 0ull; - } - } + insertUse(SI, Offset, SelectSize); } - // Re-sort the partitions now that they have been split and merged into - // disjoint set of partitions. Also remove any of the dead partitions we've - // replaced in the process. - std::sort(Partitions.begin(), Partitions.end()); - if (NumDeadPartitions) { - assert(Partitions.back().isDead()); - assert((ptrdiff_t)NumDeadPartitions == - std::count(Partitions.begin(), Partitions.end(), Partitions.back())); + /// \brief Disable SROA entirely if there are unhandled users of the alloca. 
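  ///
  /// Any pointer user we did not explicitly recognize above may read or write
  /// the memory in ways this slice model cannot describe, so the only safe
  /// response is to record the offending instruction and give up on the
  /// alloca entirely.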
+ void visitInstruction(Instruction &I) { + PI.setAborted(&I); } - Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end()); -} +}; -AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI) +AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) : #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) AI(AI), #endif PointerEscapingInstr(0) { - PartitionBuilder PB(TD, AI, *this); - PartitionBuilder::PtrInfo PtrI = PB.visitPtr(AI); + SliceBuilder PB(DL, AI, *this); + SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); if (PtrI.isEscaped() || PtrI.isAborted()) { // FIXME: We should sink the escape vs. abort info into the caller nicely, - // possibly by just storing the PtrInfo in the AllocaPartitioning. + // possibly by just storing the PtrInfo in the AllocaSlices. PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst() : PtrI.getAbortingInst(); assert(PointerEscapingInstr && "Did not track a bad instruction"); return; } + Slices.erase(std::remove_if(Slices.begin(), Slices.end(), + std::mem_fun_ref(&Slice::isDead)), + Slices.end()); + // Sort the uses. This arranges for the offsets to be in ascending order, // and the sizes to be in descending order. - std::sort(Partitions.begin(), Partitions.end()); - - // Remove any partitions from the back which are marked as dead. - while (!Partitions.empty() && Partitions.back().isDead()) - Partitions.pop_back(); - - if (Partitions.size() > 1) { - // Intersect splittability for all partitions with equal offsets and sizes. - // Then remove all but the first so that we have a sequence of non-equal but - // potentially overlapping partitions. - for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E; - I = J) { - ++J; - while (J != E && *I == *J) { - I->IsSplittable &= J->IsSplittable; - ++J; - } - } - Partitions.erase(std::unique(Partitions.begin(), Partitions.end()), - Partitions.end()); - - // Split splittable and merge unsplittable partitions into a disjoint set - // of partitions over the used space of the allocation. - splitAndMergePartitions(); - } - - // Record how many partitions we end up with. - NumAllocaPartitions += Partitions.size(); - MaxPartitionsPerAlloca = std::max<unsigned>(Partitions.size(), MaxPartitionsPerAlloca); - - // Now build up the user lists for each of these disjoint partitions by - // re-walking the recursive users of the alloca. - Uses.resize(Partitions.size()); - UseBuilder UB(TD, AI, *this); - PtrI = UB.visitPtr(AI); - assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!"); - assert(!PtrI.isAborted() && "Early aborted the visit of the pointer."); - - unsigned NumUses = 0; -#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS) - for (unsigned Idx = 0, Size = Uses.size(); Idx != Size; ++Idx) - NumUses += Uses[Idx].size(); -#endif - NumAllocaPartitionUses += NumUses; - MaxPartitionUsesPerAlloca = std::max<unsigned>(NumUses, MaxPartitionUsesPerAlloca); + std::sort(Slices.begin(), Slices.end()); } -Type *AllocaPartitioning::getCommonType(iterator I) const { - Type *Ty = 0; - for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { - Use *U = UI->getUse(); - if (!U) - continue; // Skip dead uses. 
- if (isa<IntrinsicInst>(*U->getUser())) - continue; - if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset) - continue; - - Type *UserTy = 0; - if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) - UserTy = LI->getType(); - else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) - UserTy = SI->getValueOperand()->getType(); - else - return 0; // Bail if we have weird uses. - - if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { - // If the type is larger than the partition, skip it. We only encounter - // this for split integer operations where we want to use the type of the - // entity causing the split. - if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8) - continue; - - // If we have found an integer type use covering the alloca, use that - // regardless of the other types, as integers are often used for a "bucket - // of bits" type. - return ITy; - } - - if (Ty && Ty != UserTy) - return 0; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - Ty = UserTy; - } - return Ty; +void AllocaSlices::print(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + printSlice(OS, I, Indent); + printUse(OS, I, Indent); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - -void AllocaPartitioning::print(raw_ostream &OS, const_iterator I, - StringRef Indent) const { - OS << Indent << "partition #" << (I - begin()) - << " [" << I->BeginOffset << "," << I->EndOffset << ")" - << (I->IsSplittable ? " (splittable)" : "") - << (Uses[I - begin()].empty() ? " (zero uses)" : "") - << "\n"; +void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")" + << " slice #" << (I - begin()) + << (I->isSplittable() ? " (splittable)" : "") << "\n"; } -void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, - StringRef Indent) const { - for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { - if (!UI->getUse()) - continue; // Skip dead uses. - OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " - << "used by: " << *UI->getUse()->getUser() << "\n"; - if (MemTransferInst *II = - dyn_cast<MemTransferInst>(UI->getUse()->getUser())) { - const MemTransferOffsets &MTO = MemTransferInstData.lookup(II); - bool IsDest; - if (!MTO.IsSplittable) - IsDest = UI->BeginOffset == MTO.DestBegin; - else - IsDest = MTO.DestBegin != 0u; - OS << Indent << " (original " << (IsDest ? "dest" : "source") << ": " - << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin) - << "," << (IsDest ? 
MTO.DestEnd : MTO.SourceEnd) << ")\n"; - } - } +void AllocaSlices::printUse(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + OS << Indent << " used by: " << *I->getUse()->getUser() << "\n"; } -void AllocaPartitioning::print(raw_ostream &OS) const { +void AllocaSlices::print(raw_ostream &OS) const { if (PointerEscapingInstr) { - OS << "No partitioning for alloca: " << AI << "\n" + OS << "Can't analyze slices for alloca: " << AI << "\n" << " A pointer to this alloca escaped by:\n" << " " << *PointerEscapingInstr << "\n"; return; } - OS << "Partitioning of alloca: " << AI << "\n"; - for (const_iterator I = begin(), E = end(); I != E; ++I) { + OS << "Slices of alloca: " << AI << "\n"; + for (const_iterator I = begin(), E = end(); I != E; ++I) print(OS, I); - printUsers(OS, I); - } } -void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); } -void AllocaPartitioning::dump() const { print(dbgs()); } +void AllocaSlices::dump(const_iterator I) const { print(dbgs(), I); } +void AllocaSlices::dump() const { print(dbgs()); } #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - namespace { /// \brief Implementation of LoadAndStorePromoter for promoting allocas. /// @@ -1269,12 +733,13 @@ class AllocaPromoter : public LoadAndStorePromoter { SmallVector<DbgValueInst *, 4> DVIs; public: - AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, + AllocaPromoter(const SmallVectorImpl<Instruction *> &Insts, SSAUpdater &S, AllocaInst &AI, DIBuilder &DIB) - : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} + : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} void run(const SmallVectorImpl<Instruction*> &Insts) { - // Remember which alloca we're promoting (for isInstInList). + // Retain the debug information attached to the alloca for use when + // rewriting loads and stores. if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { for (Value::use_iterator UI = DebugNode->use_begin(), UE = DebugNode->use_end(); @@ -1286,7 +751,9 @@ public: } LoadAndStorePromoter::run(Insts); - AI.eraseFromParent(); + + // While we have the debug information, clear it off of the alloca. The + // caller takes care of deleting the alloca. while (!DDIs.empty()) DDIs.pop_back_val()->eraseFromParent(); while (!DVIs.empty()) @@ -1295,13 +762,34 @@ public: virtual bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction*> &Insts) const { + Value *Ptr; if (LoadInst *LI = dyn_cast<LoadInst>(I)) - return LI->getOperand(0) == &AI; - return cast<StoreInst>(I)->getPointerOperand() == &AI; + Ptr = LI->getOperand(0); + else + Ptr = cast<StoreInst>(I)->getPointerOperand(); + + // Only used to detect cycles, which will be rare and quickly found as + // we're walking up a chain of defs rather than down through uses. 
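      // The loop below walks up the definition chain, stripping bitcasts and
      // GEPs off the pointer operand; the load or store is treated as
      // belonging to this alloca when that chain bottoms out at &AI. The
      // visited set merely guards against a degenerate cycle of such casts.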
+ SmallPtrSet<Value *, 4> Visited; + + do { + if (Ptr == &AI) + return true; + + if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr)) + Ptr = BCI->getOperand(0); + else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) + Ptr = GEPI->getPointerOperand(); + else + return false; + + } while (Visited.insert(Ptr)); + + return false; } virtual void updateDebugInfo(Instruction *Inst) const { - for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), + for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) @@ -1309,7 +797,7 @@ public: else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) ConvertDebugDeclareToDebugValue(DDI, LI, DIB); } - for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), + for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; Value *Arg = 0; @@ -1360,7 +848,7 @@ class SROA : public FunctionPass { const bool RequiresDomTree; LLVMContext *C; - const DataLayout *TD; + const DataLayout *DL; DominatorTree *DT; /// \brief Worklist of alloca instructions to simplify. @@ -1390,10 +878,25 @@ class SROA : public FunctionPass { /// \brief A collection of alloca instructions we can directly promote. std::vector<AllocaInst *> PromotableAllocas; + /// \brief A worklist of PHIs to speculate prior to promoting allocas. + /// + /// All of these PHIs have been checked for the safety of speculation and by + /// being speculated will allow promoting allocas currently in the promotable + /// queue. + SetVector<PHINode *, SmallVector<PHINode *, 2> > SpeculatablePHIs; + + /// \brief A worklist of select instructions to speculate prior to promoting + /// allocas. + /// + /// All of these select instructions have been checked for the safety of + /// speculation and by being speculated will allow promoting allocas + /// currently in the promotable queue. + SetVector<SelectInst *, SmallVector<SelectInst *, 2> > SpeculatableSelects; + public: SROA(bool RequiresDomTree = true) : FunctionPass(ID), RequiresDomTree(RequiresDomTree), - C(0), TD(0), DT(0) { + C(0), DL(0), DT(0) { initializeSROAPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F); @@ -1404,13 +907,13 @@ public: private: friend class PHIOrSelectSpeculator; - friend class AllocaPartitionRewriter; - friend class AllocaPartitionVectorRewriter; + friend class AllocaSliceRewriter; - bool rewriteAllocaPartition(AllocaInst &AI, - AllocaPartitioning &P, - AllocaPartitioning::iterator PI); - bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P); + bool rewritePartition(AllocaInst &AI, AllocaSlices &S, + AllocaSlices::iterator B, AllocaSlices::iterator E, + int64_t BeginOffset, int64_t EndOffset, + ArrayRef<AllocaSlices::iterator> SplitUses); + bool splitAlloca(AllocaInst &AI, AllocaSlices &S); bool runOnAlloca(AllocaInst &AI); void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); bool promoteAllocas(Function &F); @@ -1429,286 +932,255 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTree) INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, false) -namespace { -/// \brief Visitor to speculate PHIs and Selects where possible. -class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> { - // Befriend the base class so it can delegate to private visit methods. 
- friend class llvm::InstVisitor<PHIOrSelectSpeculator>; - - const DataLayout &TD; - AllocaPartitioning &P; - SROA &Pass; +/// Walk the range of a partitioning looking for a common type to cover this +/// sequence of slices. +static Type *findCommonType(AllocaSlices::const_iterator B, + AllocaSlices::const_iterator E, + uint64_t EndOffset) { + Type *Ty = 0; + bool IgnoreNonIntegralTypes = false; + for (AllocaSlices::const_iterator I = B; I != E; ++I) { + Use *U = I->getUse(); + if (isa<IntrinsicInst>(*U->getUser())) + continue; + if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset) + continue; -public: - PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass) - : TD(TD), P(P), Pass(Pass) {} - - /// \brief Visit the users of an alloca partition and rewrite them. - void visitUsers(AllocaPartitioning::const_iterator PI) { - // Note that we need to use an index here as the underlying vector of uses - // may be grown during speculation. However, we never need to re-visit the - // new uses, and so we can use the initial size bound. - for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) { - const PartitionUse &PU = P.getUse(PI, Idx); - if (!PU.getUse()) - continue; // Skip dead use. - - visit(cast<Instruction>(PU.getUse()->getUser())); + Type *UserTy = 0; + if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { + UserTy = LI->getType(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { + UserTy = SI->getValueOperand()->getType(); + } else { + IgnoreNonIntegralTypes = true; // Give up on anything but an iN type. + continue; } - } -private: - // By default, skip this instruction. - void visitInstruction(Instruction &I) {} - - /// PHI instructions that use an alloca and are subsequently loaded can be - /// rewritten to load both input pointers in the pred blocks and then PHI the - /// results, allowing the load of the alloca to be promoted. - /// From this: - /// %P2 = phi [i32* %Alloca, i32* %Other] - /// %V = load i32* %P2 - /// to: - /// %V1 = load i32* %Alloca -> will be mem2reg'd - /// ... - /// %V2 = load i32* %Other - /// ... - /// %V = phi [i32 %V1, i32 %V2] - /// - /// We can do this to a select if its only uses are loads and if the operands - /// to the select can be loaded unconditionally. - /// - /// FIXME: This should be hoisted into a generic utility, likely in - /// Transforms/Util/Local.h - bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) { - // For now, we can only do this promotion if the load is in the same block - // as the PHI, and if there are no stores between the phi and load. - // TODO: Allow recursive phi users. - // TODO: Allow stores. - BasicBlock *BB = PN.getParent(); - unsigned MaxAlign = 0; - for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); - UI != UE; ++UI) { - LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || !LI->isSimple()) return false; - - // For now we only allow loads in the same block as the PHI. This is - // a common case that happens when instcombine merges two loads through - // a PHI. - if (LI->getParent() != BB) return false; - - // Ensure that there are no instructions between the PHI and the load that - // could store. - for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) - if (BBI->mayWriteToMemory()) - return false; - - MaxAlign = std::max(MaxAlign, LI->getAlignment()); - Loads.push_back(LI); + if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { + // If the type is larger than the partition, skip it. 
We only encounter + // this for split integer operations where we want to use the type of the + // entity causing the split. Also skip if the type is not a byte width + // multiple. + if (ITy->getBitWidth() % 8 != 0 || + ITy->getBitWidth() / 8 > (EndOffset - B->beginOffset())) + continue; + + // If we have found an integer type use covering the alloca, use that + // regardless of the other types, as integers are often used for + // a "bucket of bits" type. + // + // NB: This *must* be the only return from inside the loop so that the + // order of slices doesn't impact the computed type. + return ITy; + } else if (IgnoreNonIntegralTypes) { + continue; } - // We can only transform this if it is safe to push the loads into the - // predecessor blocks. The only thing to watch out for is that we can't put - // a possibly trapping load in the predecessor if it is a critical edge. - for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { - TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); - Value *InVal = PN.getIncomingValue(Idx); - - // If the value is produced by the terminator of the predecessor (an - // invoke) or it has side-effects, there is no valid place to put a load - // in the predecessor. - if (TI == InVal || TI->mayHaveSideEffects()) - return false; + if (Ty && Ty != UserTy) + IgnoreNonIntegralTypes = true; // Give up on anything but an iN type. - // If the predecessor has a single successor, then the edge isn't - // critical. - if (TI->getNumSuccessors() == 1) - continue; + Ty = UserTy; + } + return Ty; +} - // If this pointer is always safe to load, or if we can prove that there - // is already a load in the block, then we can move the load to the pred - // block. - if (InVal->isDereferenceablePointer() || - isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD)) - continue; +/// PHI instructions that use an alloca and are subsequently loaded can be +/// rewritten to load both input pointers in the pred blocks and then PHI the +/// results, allowing the load of the alloca to be promoted. +/// From this: +/// %P2 = phi [i32* %Alloca, i32* %Other] +/// %V = load i32* %P2 +/// to: +/// %V1 = load i32* %Alloca -> will be mem2reg'd +/// ... +/// %V2 = load i32* %Other +/// ... +/// %V = phi [i32 %V1, i32 %V2] +/// +/// We can do this to a select if its only uses are loads and if the operands +/// to the select can be loaded unconditionally. +/// +/// FIXME: This should be hoisted into a generic utility, likely in +/// Transforms/Util/Local.h +static bool isSafePHIToSpeculate(PHINode &PN, + const DataLayout *DL = 0) { + // For now, we can only do this promotion if the load is in the same block + // as the PHI, and if there are no stores between the phi and load. + // TODO: Allow recursive phi users. + // TODO: Allow stores. + BasicBlock *BB = PN.getParent(); + unsigned MaxAlign = 0; + bool HaveLoad = false; + for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); UI != UE; + ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || !LI->isSimple()) + return false; + // For now we only allow loads in the same block as the PHI. This is + // a common case that happens when instcombine merges two loads through + // a PHI. + if (LI->getParent() != BB) return false; - } - return true; + // Ensure that there are no instructions between the PHI and the load that + // could store. 
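      // Speculation will hoist this load above everything that currently sits
      // between the PHI and the load; if any of those instructions may write
      // to memory, the hoisted load could observe a different value, so we
      // refuse to speculate.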
+ for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) + if (BBI->mayWriteToMemory()) + return false; + + MaxAlign = std::max(MaxAlign, LI->getAlignment()); + HaveLoad = true; } - void visitPHINode(PHINode &PN) { - DEBUG(dbgs() << " original: " << PN << "\n"); + if (!HaveLoad) + return false; - SmallVector<LoadInst *, 4> Loads; - if (!isSafePHIToSpeculate(PN, Loads)) - return; + // We can only transform this if it is safe to push the loads into the + // predecessor blocks. The only thing to watch out for is that we can't put + // a possibly trapping load in the predecessor if it is a critical edge. + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { + TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); + Value *InVal = PN.getIncomingValue(Idx); + + // If the value is produced by the terminator of the predecessor (an + // invoke) or it has side-effects, there is no valid place to put a load + // in the predecessor. + if (TI == InVal || TI->mayHaveSideEffects()) + return false; - assert(!Loads.empty()); + // If the predecessor has a single successor, then the edge isn't + // critical. + if (TI->getNumSuccessors() == 1) + continue; - Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); - IRBuilderTy PHIBuilder(&PN); - PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), - PN.getName() + ".sroa.speculated"); + // If this pointer is always safe to load, or if we can prove that there + // is already a load in the block, then we can move the load to the pred + // block. + if (InVal->isDereferenceablePointer() || + isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL)) + continue; - // Get the TBAA tag and alignment to use from one of the loads. It doesn't - // matter which one we get and if any differ. - LoadInst *SomeLoad = cast<LoadInst>(Loads.back()); - MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); - unsigned Align = SomeLoad->getAlignment(); + return false; + } - // Rewrite all loads of the PN to use the new PHI. - do { - LoadInst *LI = Loads.pop_back_val(); - LI->replaceAllUsesWith(NewPN); - Pass.DeadInsts.insert(LI); - } while (!Loads.empty()); - - // Inject loads into all of the pred blocks. - for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { - BasicBlock *Pred = PN.getIncomingBlock(Idx); - TerminatorInst *TI = Pred->getTerminator(); - Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx)); - Value *InVal = PN.getIncomingValue(Idx); - IRBuilderTy PredBuilder(TI); - - LoadInst *Load - = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." + - Pred->getName())); - ++NumLoadsSpeculated; - Load->setAlignment(Align); - if (TBAATag) - Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); - NewPN->addIncoming(Load, Pred); - - Instruction *Ptr = dyn_cast<Instruction>(InVal); - if (!Ptr) - // No uses to rewrite. - continue; + return true; +} - // Try to lookup and rewrite any partition uses corresponding to this phi - // input. - AllocaPartitioning::iterator PI - = P.findPartitionForPHIOrSelectOperand(InUse); - if (PI == P.end()) - continue; +static void speculatePHINodeLoads(PHINode &PN) { + DEBUG(dbgs() << " original: " << PN << "\n"); + + Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); + IRBuilderTy PHIBuilder(&PN); + PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), + PN.getName() + ".sroa.speculated"); + + // Get the TBAA tag and alignment to use from one of the loads. 
It doesn't + // matter which one we get and if any differ. + LoadInst *SomeLoad = cast<LoadInst>(*PN.use_begin()); + MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); + unsigned Align = SomeLoad->getAlignment(); + + // Rewrite all loads of the PN to use the new PHI. + while (!PN.use_empty()) { + LoadInst *LI = cast<LoadInst>(*PN.use_begin()); + LI->replaceAllUsesWith(NewPN); + LI->eraseFromParent(); + } + + // Inject loads into all of the pred blocks. + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { + BasicBlock *Pred = PN.getIncomingBlock(Idx); + TerminatorInst *TI = Pred->getTerminator(); + Value *InVal = PN.getIncomingValue(Idx); + IRBuilderTy PredBuilder(TI); + + LoadInst *Load = PredBuilder.CreateLoad( + InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName())); + ++NumLoadsSpeculated; + Load->setAlignment(Align); + if (TBAATag) + Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); + NewPN->addIncoming(Load, Pred); + } + + DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); + PN.eraseFromParent(); +} - // Replace the Use in the PartitionUse for this operand with the Use - // inside the load. - AllocaPartitioning::use_iterator UI - = P.findPartitionUseForPHIOrSelectOperand(InUse); - assert(isa<PHINode>(*UI->getUse()->getUser())); - UI->setUse(&Load->getOperandUse(Load->getPointerOperandIndex())); - } - DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); - } - - /// Select instructions that use an alloca and are subsequently loaded can be - /// rewritten to load both input pointers and then select between the result, - /// allowing the load of the alloca to be promoted. - /// From this: - /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other - /// %V = load i32* %P2 - /// to: - /// %V1 = load i32* %Alloca -> will be mem2reg'd - /// %V2 = load i32* %Other - /// %V = select i1 %cond, i32 %V1, i32 %V2 - /// - /// We can do this to a select if its only uses are loads and if the operand - /// to the select can be loaded unconditionally. - bool isSafeSelectToSpeculate(SelectInst &SI, - SmallVectorImpl<LoadInst *> &Loads) { - Value *TValue = SI.getTrueValue(); - Value *FValue = SI.getFalseValue(); - bool TDerefable = TValue->isDereferenceablePointer(); - bool FDerefable = FValue->isDereferenceablePointer(); - - for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); - UI != UE; ++UI) { - LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || !LI->isSimple()) return false; - - // Both operands to the select need to be dereferencable, either - // absolutely (e.g. allocas) or at this point because we can see other - // accesses to it. - if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI, - LI->getAlignment(), &TD)) - return false; - if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI, - LI->getAlignment(), &TD)) - return false; - Loads.push_back(LI); - } +/// Select instructions that use an alloca and are subsequently loaded can be +/// rewritten to load both input pointers and then select between the result, +/// allowing the load of the alloca to be promoted. +/// From this: +/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other +/// %V = load i32* %P2 +/// to: +/// %V1 = load i32* %Alloca -> will be mem2reg'd +/// %V2 = load i32* %Other +/// %V = select i1 %cond, i32 %V1, i32 %V2 +/// +/// We can do this to a select if its only uses are loads and if the operand +/// to the select can be loaded unconditionally. 
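///
/// Note that after the rewrite both loads execute unconditionally, so each
/// select operand must be known dereferenceable (an alloca, or a pointer that
/// is otherwise provably safe to load) for the speculation to be legal; that
/// is exactly what the checks below establish.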
+static bool isSafeSelectToSpeculate(SelectInst &SI, const DataLayout *DL = 0) { + Value *TValue = SI.getTrueValue(); + Value *FValue = SI.getFalseValue(); + bool TDerefable = TValue->isDereferenceablePointer(); + bool FDerefable = FValue->isDereferenceablePointer(); + + for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); UI != UE; + ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || !LI->isSimple()) + return false; - return true; + // Both operands to the select need to be dereferencable, either + // absolutely (e.g. allocas) or at this point because we can see other + // accesses to it. + if (!TDerefable && + !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment(), DL)) + return false; + if (!FDerefable && + !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment(), DL)) + return false; } - void visitSelectInst(SelectInst &SI) { - DEBUG(dbgs() << " original: " << SI << "\n"); - - // If the select isn't safe to speculate, just use simple logic to emit it. - SmallVector<LoadInst *, 4> Loads; - if (!isSafeSelectToSpeculate(SI, Loads)) - return; + return true; +} - IRBuilderTy IRB(&SI); - Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) }; - AllocaPartitioning::iterator PIs[2]; - PartitionUse PUs[2]; - for (unsigned i = 0, e = 2; i != e; ++i) { - PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]); - if (PIs[i] != P.end()) { - // If the pointer is within the partitioning, remove the select from - // its uses. We'll add in the new loads below. - AllocaPartitioning::use_iterator UI - = P.findPartitionUseForPHIOrSelectOperand(Ops[i]); - PUs[i] = *UI; - // Clear out the use here so that the offsets into the use list remain - // stable but this use is ignored when rewriting. - UI->setUse(0); - } - } +static void speculateSelectInstLoads(SelectInst &SI) { + DEBUG(dbgs() << " original: " << SI << "\n"); - Value *TV = SI.getTrueValue(); - Value *FV = SI.getFalseValue(); - // Replace the loads of the select with a select of two loads. - while (!Loads.empty()) { - LoadInst *LI = Loads.pop_back_val(); + IRBuilderTy IRB(&SI); + Value *TV = SI.getTrueValue(); + Value *FV = SI.getFalseValue(); + // Replace the loads of the select with a select of two loads. + while (!SI.use_empty()) { + LoadInst *LI = cast<LoadInst>(*SI.use_begin()); + assert(LI->isSimple() && "We only speculate simple loads"); - IRB.SetInsertPoint(LI); - LoadInst *TL = + IRB.SetInsertPoint(LI); + LoadInst *TL = IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); - LoadInst *FL = + LoadInst *FL = IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); - NumLoadsSpeculated += 2; - - // Transfer alignment and TBAA info if present. - TL->setAlignment(LI->getAlignment()); - FL->setAlignment(LI->getAlignment()); - if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { - TL->setMetadata(LLVMContext::MD_tbaa, Tag); - FL->setMetadata(LLVMContext::MD_tbaa, Tag); - } + NumLoadsSpeculated += 2; - Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, - LI->getName() + ".sroa.speculated"); + // Transfer alignment and TBAA info if present. 
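    // Propagate the original load's alignment and TBAA tag to both of the new
    // loads so no metadata is lost in the rewrite.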
+ TL->setAlignment(LI->getAlignment()); + FL->setAlignment(LI->getAlignment()); + if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { + TL->setMetadata(LLVMContext::MD_tbaa, Tag); + FL->setMetadata(LLVMContext::MD_tbaa, Tag); + } - LoadInst *Loads[2] = { TL, FL }; - for (unsigned i = 0, e = 2; i != e; ++i) { - if (PIs[i] != P.end()) { - Use *LoadUse = &Loads[i]->getOperandUse(0); - assert(PUs[i].getUse()->get() == LoadUse->get()); - PUs[i].setUse(LoadUse); - P.use_push_back(PIs[i], PUs[i]); - } - } + Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, + LI->getName() + ".sroa.speculated"); - DEBUG(dbgs() << " speculated to: " << *V << "\n"); - LI->replaceAllUsesWith(V); - Pass.DeadInsts.insert(LI); - } + DEBUG(dbgs() << " speculated to: " << *V << "\n"); + LI->replaceAllUsesWith(V); + LI->eraseFromParent(); } -}; + SI.eraseFromParent(); } /// \brief Build a GEP out of a base pointer and indices. @@ -1737,7 +1209,7 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, /// TargetTy. If we can't find one with the same type, we at least try to use /// one with the same size. If none of that works, we just produce the GEP as /// indicated by Indices to have the correct offset. -static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD, +static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, Value *BasePtr, Type *Ty, Type *TargetTy, SmallVectorImpl<Value *> &Indices) { if (Ty == TargetTy) @@ -1754,7 +1226,7 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD, ElementTy = SeqTy->getElementType(); // Note that we use the default address space as this index is over an // array or a vector, not a pointer. - Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0))); + Indices.push_back(IRB.getInt(APInt(DL.getPointerSizeInBits(0), 0))); } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { if (STy->element_begin() == STy->element_end()) break; // Nothing left to descend into. @@ -1775,12 +1247,12 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD, /// /// This is the recursive step for getNaturalGEPWithOffset that walks down the /// element types adding appropriate indices for the GEP. -static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, +static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, Type *Ty, APInt &Offset, Type *TargetTy, SmallVectorImpl<Value *> &Indices) { if (Offset == 0) - return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices); + return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices); // We can't recurse through pointer types. if (Ty->isPointerTy()) @@ -1790,7 +1262,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, // extremely poorly defined currently. The long-term goal is to remove GEPing // over a vector from the IR completely. if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { - unsigned ElementSizeInBits = TD.getTypeSizeInBits(VecTy->getScalarType()); + unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType()); if (ElementSizeInBits % 8) return 0; // GEPs over non-multiple of 8 size vector elements are invalid. 
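    // Work in bytes: dividing the remaining offset by the element size tells
    // us how many whole vector elements to step over; the recursion below then
    // resolves whatever offset is left within a single element.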
APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); @@ -1799,20 +1271,20 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, return 0; Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); - return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), + return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(), Offset, TargetTy, Indices); } if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { Type *ElementTy = ArrTy->getElementType(); - APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); + APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); APInt NumSkippedElements = Offset.sdiv(ElementSize); if (NumSkippedElements.ugt(ArrTy->getNumElements())) return 0; Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); - return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, Indices); } @@ -1820,18 +1292,18 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, if (!STy) return 0; - const StructLayout *SL = TD.getStructLayout(STy); + const StructLayout *SL = DL.getStructLayout(STy); uint64_t StructOffset = Offset.getZExtValue(); if (StructOffset >= SL->getSizeInBytes()) return 0; unsigned Index = SL->getElementContainingOffset(StructOffset); Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); Type *ElementTy = STy->getElementType(Index); - if (Offset.uge(TD.getTypeAllocSize(ElementTy))) + if (Offset.uge(DL.getTypeAllocSize(ElementTy))) return 0; // The offset points into alignment padding. Indices.push_back(IRB.getInt32(Index)); - return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, Indices); } @@ -1845,7 +1317,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, /// Indices, and setting Ty to the result subtype. /// /// If no natural GEP can be constructed, this function returns null. -static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD, +static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Offset, Type *TargetTy, SmallVectorImpl<Value *> &Indices) { PointerType *Ty = cast<PointerType>(Ptr->getType()); @@ -1858,14 +1330,14 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD, Type *ElementTy = Ty->getElementType(); if (!ElementTy->isSized()) return 0; // We can't GEP through an unsized element. - APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); + APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); if (ElementSize == 0) return 0; // Zero-length arrays can't help us build a natural GEP. APInt NumSkippedElements = Offset.sdiv(ElementSize); Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); - return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, Indices); } @@ -1884,7 +1356,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD, /// properties. The algorithm tries to fold as many constant indices into /// a single GEP as possible, thus making each GEP more independent of the /// surrounding code. 
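///
/// As an illustrative (hypothetical) example with 64-bit pointers: adjusting a
/// pointer to a [4 x i32] alloca by a constant offset of 8 bytes to type i32*
/// can be expressed as the natural GEP
///   %gep = getelementptr [4 x i32]* %alloca, i64 0, i64 2
/// rather than falling back to raw byte arithmetic over an i8*.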
-static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD, +static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, APInt Offset, Type *PointerTy) { // Even though we don't look through PHI nodes, we could be called on an // instruction in an unreachable block, which may be on a cycle. @@ -1908,7 +1380,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD, // First fold any existing GEPs into the offset. while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { APInt GEPOffset(Offset.getBitWidth(), 0); - if (!GEP->accumulateConstantOffset(TD, GEPOffset)) + if (!GEP->accumulateConstantOffset(DL, GEPOffset)) break; Offset += GEPOffset; Ptr = GEP->getPointerOperand(); @@ -1918,7 +1390,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD, // See if we can perform a natural GEP here. Indices.clear(); - if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy, + if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy, Indices)) { if (P->getType() == PointerTy) { // Zap any offset pointer that we ended up computing in previous rounds. @@ -1989,6 +1461,10 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) return false; + // We can convert pointers to integers and vice-versa. Same for vectors + // of pointers and integers. + OldTy = OldTy->getScalarType(); + NewTy = NewTy->getScalarType(); if (NewTy->isPointerTy() || OldTy->isPointerTy()) { if (NewTy->isPointerTy() && OldTy->isPointerTy()) return true; @@ -2007,24 +1483,126 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test /// two types for viability with this routine. static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, - Type *Ty) { - assert(canConvertValue(DL, V->getType(), Ty) && - "Value not convertable to type"); - if (V->getType() == Ty) + Type *NewTy) { + Type *OldTy = V->getType(); + assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type"); + + if (OldTy == NewTy) return V; - if (IntegerType *OldITy = dyn_cast<IntegerType>(V->getType())) - if (IntegerType *NewITy = dyn_cast<IntegerType>(Ty)) + + if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy)) + if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy)) if (NewITy->getBitWidth() > OldITy->getBitWidth()) return IRB.CreateZExt(V, NewITy); - if (V->getType()->isIntegerTy() && Ty->isPointerTy()) - return IRB.CreateIntToPtr(V, Ty); - if (V->getType()->isPointerTy() && Ty->isIntegerTy()) - return IRB.CreatePtrToInt(V, Ty); - return IRB.CreateBitCast(V, Ty); + // See if we need inttoptr for this type pair. A cast involving both scalars + // and vectors requires and additional bitcast. + if (OldTy->getScalarType()->isIntegerTy() && + NewTy->getScalarType()->isPointerTy()) { + // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8* + if (OldTy->isVectorTy() && !NewTy->isVectorTy()) + return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), + NewTy); + + // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*> + if (!OldTy->isVectorTy() && NewTy->isVectorTy()) + return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), + NewTy); + + return IRB.CreateIntToPtr(V, NewTy); + } + + // See if we need ptrtoint for this type pair. A cast involving both scalars + // and vectors requires and additional bitcast. 
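  // (ptrtoint on a vector of pointers yields a vector of integers, and a
  // bitcast cannot change the total bit width, so collapsing that to or from a
  // single scalar integer needs the extra bitcast used in the expansions
  // below.)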
+ if (OldTy->getScalarType()->isPointerTy() && + NewTy->getScalarType()->isIntegerTy()) { + // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128 + if (OldTy->isVectorTy() && !NewTy->isVectorTy()) + return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), + NewTy); + + // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32> + if (!OldTy->isVectorTy() && NewTy->isVectorTy()) + return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), + NewTy); + + return IRB.CreatePtrToInt(V, NewTy); + } + + return IRB.CreateBitCast(V, NewTy); } -/// \brief Test whether the given alloca partition can be promoted to a vector. +/// \brief Test whether the given slice use can be promoted to a vector. +/// +/// This function is called to test each entry in a partioning which is slated +/// for a single slice. +static bool isVectorPromotionViableForSlice( + const DataLayout &DL, AllocaSlices &S, uint64_t SliceBeginOffset, + uint64_t SliceEndOffset, VectorType *Ty, uint64_t ElementSize, + AllocaSlices::const_iterator I) { + // First validate the slice offsets. + uint64_t BeginOffset = + std::max(I->beginOffset(), SliceBeginOffset) - SliceBeginOffset; + uint64_t BeginIndex = BeginOffset / ElementSize; + if (BeginIndex * ElementSize != BeginOffset || + BeginIndex >= Ty->getNumElements()) + return false; + uint64_t EndOffset = + std::min(I->endOffset(), SliceEndOffset) - SliceBeginOffset; + uint64_t EndIndex = EndOffset / ElementSize; + if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements()) + return false; + + assert(EndIndex > BeginIndex && "Empty vector!"); + uint64_t NumElements = EndIndex - BeginIndex; + Type *SliceTy = + (NumElements == 1) ? Ty->getElementType() + : VectorType::get(Ty->getElementType(), NumElements); + + Type *SplitIntTy = + Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8); + + Use *U = I->getUse(); + + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { + if (MI->isVolatile()) + return false; + if (!I->isSplittable()) + return false; // Skip any unsplittable intrinsics. + } else if (U->get()->getType()->getPointerElementType()->isStructTy()) { + // Disable vector promotion when there are loads or stores of an FCA. + return false; + } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { + if (LI->isVolatile()) + return false; + Type *LTy = LI->getType(); + if (SliceBeginOffset > I->beginOffset() || + SliceEndOffset < I->endOffset()) { + assert(LTy->isIntegerTy()); + LTy = SplitIntTy; + } + if (!canConvertValue(DL, SliceTy, LTy)) + return false; + } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { + if (SI->isVolatile()) + return false; + Type *STy = SI->getValueOperand()->getType(); + if (SliceBeginOffset > I->beginOffset() || + SliceEndOffset < I->endOffset()) { + assert(STy->isIntegerTy()); + STy = SplitIntTy; + } + if (!canConvertValue(DL, STy, SliceTy)) + return false; + } else { + return false; + } + + return true; +} + +/// \brief Test whether the given alloca partitioning and range of slices can be +/// promoted to a vector. /// /// This is a quick test to check whether we can rewrite a particular alloca /// partition (and its newly formed alloca) into a vector alloca with only @@ -2032,75 +1610,103 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. 
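///
/// A minimal standalone sketch (illustrative names, not the code below) of the
/// per-slice geometry check performed by isVectorPromotionViableForSlice: it
/// assumes the slice has already been clamped and rebased to the partition and
/// deliberately ignores the splittability, volatility, and type-conversion
/// requirements.
///
///   #include <cstdint>  // only needed if the sketch is compiled on its own
///
///   static bool sliceCoversWholeElements(uint64_t SliceBegin,
///                                        uint64_t SliceEnd,
///                                        uint64_t ElementSize,
///                                        uint64_t NumElements) {
///     uint64_t BeginIndex = SliceBegin / ElementSize;
///     uint64_t EndIndex = SliceEnd / ElementSize;
///     return BeginIndex * ElementSize == SliceBegin && // starts on an element
///            EndIndex * ElementSize == SliceEnd &&     // ends on an element
///            EndIndex > BeginIndex &&                  // and is non-empty
///            BeginIndex < NumElements && EndIndex <= NumElements;
///   }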
-static bool isVectorPromotionViable(const DataLayout &TD, - Type *AllocaTy, - AllocaPartitioning &P, - uint64_t PartitionBeginOffset, - uint64_t PartitionEndOffset, - AllocaPartitioning::const_use_iterator I, - AllocaPartitioning::const_use_iterator E) { +static bool +isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, AllocaSlices &S, + uint64_t SliceBeginOffset, uint64_t SliceEndOffset, + AllocaSlices::const_iterator I, + AllocaSlices::const_iterator E, + ArrayRef<AllocaSlices::iterator> SplitUses) { VectorType *Ty = dyn_cast<VectorType>(AllocaTy); if (!Ty) return false; - uint64_t ElementSize = TD.getTypeSizeInBits(Ty->getScalarType()); + uint64_t ElementSize = DL.getTypeSizeInBits(Ty->getScalarType()); // While the definition of LLVM vectors is bitpacked, we don't support sizes // that aren't byte sized. if (ElementSize % 8) return false; - assert((TD.getTypeSizeInBits(Ty) % 8) == 0 && + assert((DL.getTypeSizeInBits(Ty) % 8) == 0 && "vector size not a multiple of element size?"); ElementSize /= 8; - for (; I != E; ++I) { - Use *U = I->getUse(); - if (!U) - continue; // Skip dead use. - - uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset; - uint64_t BeginIndex = BeginOffset / ElementSize; - if (BeginIndex * ElementSize != BeginOffset || - BeginIndex >= Ty->getNumElements()) + for (; I != E; ++I) + if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset, + SliceEndOffset, Ty, ElementSize, I)) return false; - uint64_t EndOffset = I->EndOffset - PartitionBeginOffset; - uint64_t EndIndex = EndOffset / ElementSize; - if (EndIndex * ElementSize != EndOffset || - EndIndex > Ty->getNumElements()) + + for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), + SUE = SplitUses.end(); + SUI != SUE; ++SUI) + if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset, + SliceEndOffset, Ty, ElementSize, *SUI)) return false; - assert(EndIndex > BeginIndex && "Empty vector!"); - uint64_t NumElements = EndIndex - BeginIndex; - Type *PartitionTy - = (NumElements == 1) ? Ty->getElementType() - : VectorType::get(Ty->getElementType(), NumElements); + return true; +} - if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { - if (MI->isVolatile()) - return false; - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) { - const AllocaPartitioning::MemTransferOffsets &MTO - = P.getMemTransferOffsets(*MTI); - if (!MTO.IsSplittable) - return false; - } - } else if (U->get()->getType()->getPointerElementType()->isStructTy()) { - // Disable vector promotion when there are loads or stores of an FCA. +/// \brief Test whether a slice of an alloca is valid for integer widening. +/// +/// This implements the necessary checking for the \c isIntegerWideningViable +/// test below on a single slice of the alloca. +static bool isIntegerWideningViableForSlice(const DataLayout &DL, + Type *AllocaTy, + uint64_t AllocBeginOffset, + uint64_t Size, AllocaSlices &S, + AllocaSlices::const_iterator I, + bool &WholeAllocaOp) { + uint64_t RelBegin = I->beginOffset() - AllocBeginOffset; + uint64_t RelEnd = I->endOffset() - AllocBeginOffset; + + // We can't reasonably handle cases where the load or store extends past + // the end of the aloca's type and into its padding. 
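    // RelBegin and RelEnd rebase the slice to the start of this partition so
    // they can be compared directly against Size, the store size of the
    // candidate type whose accesses we would widen.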
+ if (RelEnd > Size) + return false; + + Use *U = I->getUse(); + + if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { + if (LI->isVolatile()) return false; - } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { - if (LI->isVolatile()) - return false; - if (!canConvertValue(TD, PartitionTy, LI->getType())) - return false; - } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { - if (SI->isVolatile()) + if (RelBegin == 0 && RelEnd == Size) + WholeAllocaOp = true; + if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { + if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) return false; - if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy)) + } else if (RelBegin != 0 || RelEnd != Size || + !canConvertValue(DL, AllocaTy, LI->getType())) { + // Non-integer loads need to be convertible from the alloca type so that + // they are promotable. + return false; + } + } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { + Type *ValueTy = SI->getValueOperand()->getType(); + if (SI->isVolatile()) + return false; + if (RelBegin == 0 && RelEnd == Size) + WholeAllocaOp = true; + if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { + if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) return false; - } else { + } else if (RelBegin != 0 || RelEnd != Size || + !canConvertValue(DL, ValueTy, AllocaTy)) { + // Non-integer stores need to be convertible to the alloca type so that + // they are promotable. return false; } + } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { + if (MI->isVolatile() || !isa<Constant>(MI->getLength())) + return false; + if (!I->isSplittable()) + return false; // Skip any unsplittable intrinsics. + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) + return false; + } else { + return false; } + return true; } @@ -2110,97 +1716,50 @@ static bool isVectorPromotionViable(const DataLayout &TD, /// This is a quick test to check whether we can rewrite the integer loads and /// stores to a particular alloca into wider loads and stores and be able to /// promote the resulting alloca. -static bool isIntegerWideningViable(const DataLayout &TD, - Type *AllocaTy, - uint64_t AllocBeginOffset, - AllocaPartitioning &P, - AllocaPartitioning::const_use_iterator I, - AllocaPartitioning::const_use_iterator E) { - uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy); +static bool +isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy, + uint64_t AllocBeginOffset, AllocaSlices &S, + AllocaSlices::const_iterator I, + AllocaSlices::const_iterator E, + ArrayRef<AllocaSlices::iterator> SplitUses) { + uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy); // Don't create integer types larger than the maximum bitwidth. if (SizeInBits > IntegerType::MAX_INT_BITS) return false; // Don't try to handle allocas with bit-padding. - if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy)) + if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy)) return false; // We need to ensure that an integer type with the appropriate bitwidth can // be converted to the alloca type, whatever that is. We don't want to force // the alloca itself to have an integer type if there is a more suitable one. 
Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits); - if (!canConvertValue(TD, AllocaTy, IntTy) || - !canConvertValue(TD, IntTy, AllocaTy)) + if (!canConvertValue(DL, AllocaTy, IntTy) || + !canConvertValue(DL, IntTy, AllocaTy)) return false; - uint64_t Size = TD.getTypeStoreSize(AllocaTy); - - // Check the uses to ensure the uses are (likely) promotable integer uses. - // Also ensure that the alloca has a covering load or store. We don't want - // to widen the integer operations only to fail to promote due to some other - // unsplittable entry (which we may make splittable later). - bool WholeAllocaOp = false; - for (; I != E; ++I) { - Use *U = I->getUse(); - if (!U) - continue; // Skip dead use. + uint64_t Size = DL.getTypeStoreSize(AllocaTy); - uint64_t RelBegin = I->BeginOffset - AllocBeginOffset; - uint64_t RelEnd = I->EndOffset - AllocBeginOffset; + // While examining uses, we ensure that the alloca has a covering load or + // store. We don't want to widen the integer operations only to fail to + // promote due to some other unsplittable entry (which we may make splittable + // later). However, if there are only splittable uses, go ahead and assume + // that we cover the alloca. + bool WholeAllocaOp = (I != E) ? false : DL.isLegalInteger(SizeInBits); - // We can't reasonably handle cases where the load or store extends past - // the end of the aloca's type and into its padding. - if (RelEnd > Size) + for (; I != E; ++I) + if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, + S, I, WholeAllocaOp)) return false; - if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { - if (LI->isVolatile()) - return false; - if (RelBegin == 0 && RelEnd == Size) - WholeAllocaOp = true; - if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { - if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy)) - return false; - continue; - } - // Non-integer loads need to be convertible from the alloca type so that - // they are promotable. - if (RelBegin != 0 || RelEnd != Size || - !canConvertValue(TD, AllocaTy, LI->getType())) - return false; - } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { - Type *ValueTy = SI->getValueOperand()->getType(); - if (SI->isVolatile()) - return false; - if (RelBegin == 0 && RelEnd == Size) - WholeAllocaOp = true; - if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { - if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy)) - return false; - continue; - } - // Non-integer stores need to be convertible to the alloca type so that - // they are promotable. 
- // they are promotable. - if (RelBegin != 0 || RelEnd != Size || - !canConvertValue(TD, ValueTy, AllocaTy)) - return false; - } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { - if (MI->isVolatile() || !isa<Constant>(MI->getLength())) - return false; - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) { - const AllocaPartitioning::MemTransferOffsets &MTO - = P.getMemTransferOffsets(*MTI); - if (!MTO.IsSplittable) - return false; - } - } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { - if (II->getIntrinsicID() != Intrinsic::lifetime_start && - II->getIntrinsicID() != Intrinsic::lifetime_end) - return false; - } else { + for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), + SUE = SplitUses.end(); + SUI != SUE; ++SUI) + if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, + S, *SUI, WholeAllocaOp)) return false; - } - } + return WholeAllocaOp; } @@ -2335,19 +1894,19 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, } namespace { -/// \brief Visitor to rewrite instructions using a partition of an alloca to -/// use a new alloca. +/// \brief Visitor to rewrite instructions using a particular slice of an alloca +/// to use a new alloca. /// /// Also implements the rewriting to vector-based accesses when the partition /// passes the isVectorPromotionViable predicate. Most of the rewriting logic /// lives here. -class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, - bool> { +class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. - friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>; + friend class llvm::InstVisitor<AllocaSliceRewriter, bool>; + typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base; - const DataLayout &TD; - AllocaPartitioning &P; + const DataLayout &DL; + AllocaSlices &S; SROA &Pass; AllocaInst &OldAI, &NewAI; const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; @@ -2372,106 +1931,112 @@ class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, // integer type will be stored here for easy access during rewriting. IntegerType *IntTy; - // The offset of the partition user currently being rewritten. + // The offset of the slice currently being rewritten. uint64_t BeginOffset, EndOffset; + bool IsSplittable; bool IsSplit; Use *OldUse; Instruction *OldPtr; + // Output members carrying state about the result of visiting and rewriting + // the slice of the alloca. + bool IsUsedByRewrittenSpeculatableInstructions; + // Utility IR builder, whose name prefix is set up for each visited use, and // the insertion point is set to point to the user. IRBuilderTy IRB; public: - AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P, - AllocaPartitioning::iterator PI, - SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, - uint64_t NewBeginOffset, uint64_t NewEndOffset) - : TD(TD), P(P), Pass(Pass), - OldAI(OldAI), NewAI(NewAI), - NewAllocaBeginOffset(NewBeginOffset), - NewAllocaEndOffset(NewEndOffset), - NewAllocaTy(NewAI.getAllocatedType()), - VecTy(), ElementTy(), ElementSize(), IntTy(), - BeginOffset(), EndOffset(), IsSplit(), OldUse(), OldPtr(), - IRB(NewAI.getContext(), ConstantFolder()) { - } - - /// \brief Visit the users of the alloca partition and rewrite them.
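For contrast with the vector check, here is a rough standalone sketch of the covering-access rule enforced by the integer-widening test that concludes above: every use must stay within the alloca's store size, and some load or store must span the whole alloca for widening to pay off. The struct and helper names are illustrative, not the pass's real data structures.

#include <cstdint>
#include <iostream>
#include <vector>

struct UseRange {
  uint64_t Begin, End; // Offsets relative to the alloca.
  bool IsLoadOrStore;  // Only loads/stores can act as the covering access.
};

// Simplified model: reject any use that runs past the alloca's store size and
// require at least one load or store spanning the whole alloca. (The real
// test also handles memset/memcpy, lifetime intrinsics and split uses.)
static bool wideningViable(const std::vector<UseRange> &Uses,
                           uint64_t AllocaSize) {
  bool WholeAllocaOp = false;
  for (size_t i = 0, e = Uses.size(); i != e; ++i) {
    if (Uses[i].End > AllocaSize)
      return false; // Extends into padding past the alloca's type.
    if (Uses[i].IsLoadOrStore && Uses[i].Begin == 0 &&
        Uses[i].End == AllocaSize)
      WholeAllocaOp = true;
  }
  return WholeAllocaOp;
}

int main() {
  std::vector<UseRange> Uses;
  UseRange A = {0, 8, true}, B = {4, 8, true};
  Uses.push_back(A);
  Uses.push_back(B);
  std::cout << wideningViable(Uses, 8) << '\n';  // 1: the first use covers it
  std::cout << wideningViable(Uses, 16) << '\n'; // 0: nothing spans all 16 bytes
}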
- bool visitUsers(AllocaPartitioning::const_use_iterator I, - AllocaPartitioning::const_use_iterator E) { - if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P, - NewAllocaBeginOffset, NewAllocaEndOffset, - I, E)) { - ++NumVectorized; - VecTy = cast<VectorType>(NewAI.getAllocatedType()); - ElementTy = VecTy->getElementType(); - assert((TD.getTypeSizeInBits(VecTy->getScalarType()) % 8) == 0 && + AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &S, SROA &Pass, + AllocaInst &OldAI, AllocaInst &NewAI, + uint64_t NewBeginOffset, uint64_t NewEndOffset, + bool IsVectorPromotable = false, + bool IsIntegerPromotable = false) + : DL(DL), S(S), Pass(Pass), OldAI(OldAI), NewAI(NewAI), + NewAllocaBeginOffset(NewBeginOffset), NewAllocaEndOffset(NewEndOffset), + NewAllocaTy(NewAI.getAllocatedType()), + VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : 0), + ElementTy(VecTy ? VecTy->getElementType() : 0), + ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), + IntTy(IsIntegerPromotable + ? Type::getIntNTy( + NewAI.getContext(), + DL.getTypeSizeInBits(NewAI.getAllocatedType())) + : 0), + BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), + OldPtr(), IsUsedByRewrittenSpeculatableInstructions(false), + IRB(NewAI.getContext(), ConstantFolder()) { + if (VecTy) { + assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 && "Only multiple-of-8 sized vector elements are viable"); - ElementSize = TD.getTypeSizeInBits(VecTy->getScalarType()) / 8; - } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(), - NewAllocaBeginOffset, P, I, E)) { - IntTy = Type::getIntNTy(NewAI.getContext(), - TD.getTypeSizeInBits(NewAI.getAllocatedType())); + ++NumVectorized; } + assert((!IsVectorPromotable && !IsIntegerPromotable) || + IsVectorPromotable != IsIntegerPromotable); + } + + bool visit(AllocaSlices::const_iterator I) { bool CanSROA = true; - for (; I != E; ++I) { - if (!I->getUse()) - continue; // Skip dead uses. - BeginOffset = I->BeginOffset; - EndOffset = I->EndOffset; - IsSplit = I->isSplit(); - OldUse = I->getUse(); - OldPtr = cast<Instruction>(OldUse->get()); - - Instruction *OldUserI = cast<Instruction>(OldUse->getUser()); - IRB.SetInsertPoint(OldUserI); - IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc()); - IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + - "."); - - CanSROA &= visit(cast<Instruction>(OldUse->getUser())); - } - if (VecTy) { + BeginOffset = I->beginOffset(); + EndOffset = I->endOffset(); + IsSplittable = I->isSplittable(); + IsSplit = + BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; + + OldUse = I->getUse(); + OldPtr = cast<Instruction>(OldUse->get()); + + Instruction *OldUserI = cast<Instruction>(OldUse->getUser()); + IRB.SetInsertPoint(OldUserI); + IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc()); + IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + "."); + + CanSROA &= visit(cast<Instruction>(OldUse->getUser())); + if (VecTy || IntTy) assert(CanSROA); - VecTy = 0; - ElementTy = 0; - ElementSize = 0; - } - if (IntTy) { - assert(CanSROA); - IntTy = 0; - } return CanSROA; } + /// \brief Query whether this slice is used by speculatable instructions after + /// rewriting. + /// + /// These instructions (PHIs and Selects currently) require the alloca slice + /// to run back through the rewriter. Thus, they are promotable, but not on + /// this iteration. 
This is distinct from a slice which is unpromotable for + /// some other reason, in which case we don't even want to perform the + /// speculation. This can be queried at any time and reflects whether (at + /// that point) a visit call has rewritten a speculatable instruction on the + /// current slice. + bool isUsedByRewrittenSpeculatableInstructions() const { + return IsUsedByRewrittenSpeculatableInstructions; + } + private: + // Make sure the other visit overloads are visible. + using Base::visit; + // Every instruction which can end up as a user must have a rewrite rule. bool visitInstruction(Instruction &I) { DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); llvm_unreachable("No rewrite rule for this instruction!"); } - Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, Type *PointerTy) { - assert(BeginOffset >= NewAllocaBeginOffset); - APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset); - return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy); + Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, uint64_t Offset, + Type *PointerTy) { + assert(Offset >= NewAllocaBeginOffset); + return getAdjustedPtr(IRB, DL, &NewAI, APInt(DL.getPointerSizeInBits(), + Offset - NewAllocaBeginOffset), + PointerTy); } /// \brief Compute suitable alignment to access an offset into the new alloca. unsigned getOffsetAlign(uint64_t Offset) { unsigned NewAIAlign = NewAI.getAlignment(); if (!NewAIAlign) - NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType()); + NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); return MinAlign(NewAIAlign, Offset); } - /// \brief Compute suitable alignment to access this partition of the new - /// alloca. - unsigned getPartitionAlign() { - return getOffsetAlign(BeginOffset - NewAllocaBeginOffset); - } - /// \brief Compute suitable alignment to access a type at an offset of the /// new alloca. /// @@ -2479,15 +2044,7 @@ private: /// otherwise returns the maximal suitable alignment. unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { unsigned Align = getOffsetAlign(Offset); - return Align == TD.getABITypeAlignment(Ty) ? 0 : Align; - } - - /// \brief Compute suitable alignment to access a type at the beginning of - /// this partition of the new alloca. - /// - /// See \c getOffsetTypeAlign for details; this routine delegates to it. - unsigned getPartitionTypeAlign(Type *Ty) { - return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset); + return Align == DL.getABITypeAlignment(Ty) ?
0 : Align; } unsigned getIndex(uint64_t Offset) { @@ -2505,9 +2062,10 @@ private: Pass.DeadInsts.insert(I); } - Value *rewriteVectorizedLoadInst() { - unsigned BeginIndex = getIndex(BeginOffset); - unsigned EndIndex = getIndex(EndOffset); + Value *rewriteVectorizedLoadInst(uint64_t NewBeginOffset, + uint64_t NewEndOffset) { + unsigned BeginIndex = getIndex(NewBeginOffset); + unsigned EndIndex = getIndex(NewEndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), @@ -2515,16 +2073,17 @@ private: return extractVector(IRB, V, BeginIndex, EndIndex, "vec"); } - Value *rewriteIntegerLoad(LoadInst &LI) { + Value *rewriteIntegerLoad(LoadInst &LI, uint64_t NewBeginOffset, + uint64_t NewEndOffset) { assert(IntTy && "We cannot insert an integer to the alloca"); assert(!LI.isVolatile()); Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); - V = convertValue(TD, IRB, V, IntTy); - assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - if (Offset > 0 || EndOffset < NewAllocaEndOffset) - V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset, + V = convertValue(DL, IRB, V, IntTy); + assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; + if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) + V = extractInteger(DL, IRB, V, cast<IntegerType>(LI.getType()), Offset, "extract"); return V; } @@ -2534,37 +2093,44 @@ private: Value *OldOp = LI.getOperand(0); assert(OldOp == OldPtr); - uint64_t Size = EndOffset - BeginOffset; + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + + uint64_t Size = NewEndOffset - NewBeginOffset; Type *TargetTy = IsSplit ? 
Type::getIntNTy(LI.getContext(), Size * 8) : LI.getType(); bool IsPtrAdjusted = false; Value *V; if (VecTy) { - V = rewriteVectorizedLoadInst(); + V = rewriteVectorizedLoadInst(NewBeginOffset, NewEndOffset); } else if (IntTy && LI.getType()->isIntegerTy()) { - V = rewriteIntegerLoad(LI); - } else if (BeginOffset == NewAllocaBeginOffset && - canConvertValue(TD, NewAllocaTy, LI.getType())) { + V = rewriteIntegerLoad(LI, NewBeginOffset, NewEndOffset); + } else if (NewBeginOffset == NewAllocaBeginOffset && + canConvertValue(DL, NewAllocaTy, LI.getType())) { V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(), "load"); } else { Type *LTy = TargetTy->getPointerTo(); - V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy), - getPartitionTypeAlign(TargetTy), - LI.isVolatile(), "load"); + V = IRB.CreateAlignedLoad( + getAdjustedAllocaPtr(IRB, NewBeginOffset, LTy), + getOffsetTypeAlign(TargetTy, NewBeginOffset - NewAllocaBeginOffset), + LI.isVolatile(), "load"); IsPtrAdjusted = true; } - V = convertValue(TD, IRB, V, TargetTy); + V = convertValue(DL, IRB, V, TargetTy); if (IsSplit) { assert(!LI.isVolatile()); assert(LI.getType()->isIntegerTy() && "Only integer type loads and stores are split"); - assert(Size < TD.getTypeStoreSize(LI.getType()) && + assert(Size < DL.getTypeStoreSize(LI.getType()) && "Split load isn't smaller than original load"); assert(LI.getType()->getIntegerBitWidth() == - TD.getTypeStoreSizeInBits(LI.getType()) && + DL.getTypeStoreSizeInBits(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI))); @@ -2574,7 +2140,7 @@ private: // LI only used for this computation. Value *Placeholder = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); - V = insertInteger(TD, IRB, Placeholder, V, BeginOffset, + V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset, "insert"); LI.replaceAllUsesWith(V); Placeholder->replaceAllUsesWith(&LI); @@ -2589,24 +2155,26 @@ private: return !LI.isVolatile() && !IsPtrAdjusted; } - bool rewriteVectorizedStoreInst(Value *V, - StoreInst &SI, Value *OldOp) { - unsigned BeginIndex = getIndex(BeginOffset); - unsigned EndIndex = getIndex(EndOffset); - assert(EndIndex > BeginIndex && "Empty vector!"); - unsigned NumElements = EndIndex - BeginIndex; - assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); - Type *PartitionTy - = (NumElements == 1) ? ElementTy - : VectorType::get(ElementTy, NumElements); - if (V->getType() != PartitionTy) - V = convertValue(TD, IRB, V, PartitionTy); - - // Mix in the existing elements. - Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - "load"); - V = insertVector(IRB, Old, V, BeginIndex, "vec"); + bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp, + uint64_t NewBeginOffset, + uint64_t NewEndOffset) { + if (V->getType() != VecTy) { + unsigned BeginIndex = getIndex(NewBeginOffset); + unsigned EndIndex = getIndex(NewEndOffset); + assert(EndIndex > BeginIndex && "Empty vector!"); + unsigned NumElements = EndIndex - BeginIndex; + assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); + Type *SliceTy = + (NumElements == 1) ? ElementTy + : VectorType::get(ElementTy, NumElements); + if (V->getType() != SliceTy) + V = convertValue(DL, IRB, V, SliceTy); + // Mix in the existing elements. 
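// Illustrative example (not taken from this patch): storing an i64 that
// covers lanes [1,3) of a <4 x i32> alloca reloads the current <4 x i32>
// value, overwrites lanes 1 and 2 with the converted store value, and writes
// the merged vector back, leaving lanes 0 and 3 untouched.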
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + "load"); + V = insertVector(IRB, Old, V, BeginIndex, "vec"); + } StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); Pass.DeadInsts.insert(&SI); @@ -2615,19 +2183,20 @@ private: return true; } - bool rewriteIntegerStore(Value *V, StoreInst &SI) { + bool rewriteIntegerStore(Value *V, StoreInst &SI, + uint64_t NewBeginOffset, uint64_t NewEndOffset) { assert(IntTy && "We cannot extract an integer from the alloca"); assert(!SI.isVolatile()); - if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { + if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); - Old = convertValue(TD, IRB, Old, IntTy); + Old = convertValue(DL, IRB, Old, IntTy); assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset, + V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert"); } - V = convertValue(TD, IRB, V, NewAllocaTy); + V = convertValue(DL, IRB, V, NewAllocaTy); StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); Pass.DeadInsts.insert(&SI); (void)Store; @@ -2648,37 +2217,45 @@ private: if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) Pass.PostPromotionWorklist.insert(AI); - uint64_t Size = EndOffset - BeginOffset; - if (Size < TD.getTypeStoreSize(V->getType())) { + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + + uint64_t Size = NewEndOffset - NewBeginOffset; + if (Size < DL.getTypeStoreSize(V->getType())) { assert(!SI.isVolatile()); - assert(IsSplit && "A seemingly split store isn't splittable"); assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); assert(V->getType()->getIntegerBitWidth() == - TD.getTypeStoreSizeInBits(V->getType()) && + DL.getTypeStoreSizeInBits(V->getType()) && "Non-byte-multiple bit width"); IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8); - V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset, + V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, "extract"); } if (VecTy) - return rewriteVectorizedStoreInst(V, SI, OldOp); + return rewriteVectorizedStoreInst(V, SI, OldOp, NewBeginOffset, + NewEndOffset); if (IntTy && V->getType()->isIntegerTy()) - return rewriteIntegerStore(V, SI); + return rewriteIntegerStore(V, SI, NewBeginOffset, NewEndOffset); StoreInst *NewSI; - if (BeginOffset == NewAllocaBeginOffset && - EndOffset == NewAllocaEndOffset && - canConvertValue(TD, V->getType(), NewAllocaTy)) { - V = convertValue(TD, IRB, V, NewAllocaTy); + if (NewBeginOffset == NewAllocaBeginOffset && + NewEndOffset == NewAllocaEndOffset && + canConvertValue(DL, V->getType(), NewAllocaTy)) { + V = convertValue(DL, IRB, V, NewAllocaTy); NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), SI.isVolatile()); } else { - Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo()); - NewSI = IRB.CreateAlignedStore(V, NewPtr, - getPartitionTypeAlign(V->getType()), - SI.isVolatile()); + Value *NewPtr = getAdjustedAllocaPtr(IRB, NewBeginOffset, + V->getType()->getPointerTo()); + NewSI = IRB.CreateAlignedStore( + V, NewPtr, 
getOffsetTypeAlign( + V->getType(), NewBeginOffset - NewAllocaBeginOffset), + SI.isVolatile()); } (void)NewSI; Pass.DeadInsts.insert(&SI); @@ -2729,9 +2306,12 @@ private: // If the memset has a variable size, it cannot be split, just adjust the // pointer to the new alloca. if (!isa<Constant>(II.getLength())) { - II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); + assert(!IsSplit); + assert(BeginOffset >= NewAllocaBeginOffset); + II.setDest( + getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType())); Type *CstTy = II.getAlignmentCst()->getType(); - II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign())); + II.setAlignment(ConstantInt::get(CstTy, getOffsetAlign(BeginOffset))); deleteIfTriviallyDead(OldPtr); return false; @@ -2743,21 +2323,26 @@ private: Type *AllocaTy = NewAI.getAllocatedType(); Type *ScalarTy = AllocaTy->getScalarType(); + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset; + // If this doesn't map cleanly onto the alloca type, and that type isn't // a single value type, just emit a memset. if (!VecTy && !IntTy && - (BeginOffset != NewAllocaBeginOffset || - EndOffset != NewAllocaEndOffset || + (BeginOffset > NewAllocaBeginOffset || + EndOffset < NewAllocaEndOffset || !AllocaTy->isSingleValueType() || - !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)) || - TD.getTypeSizeInBits(ScalarTy)%8 != 0)) { + !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) || + DL.getTypeSizeInBits(ScalarTy)%8 != 0)) { Type *SizeTy = II.getLength()->getType(); - Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); - CallInst *New - = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB, - II.getRawDest()->getType()), - II.getValue(), Size, getPartitionAlign(), - II.isVolatile()); + Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); + CallInst *New = IRB.CreateMemSet( + getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getRawDest()->getType()), + II.getValue(), Size, getOffsetAlign(SliceOffset), II.isVolatile()); (void)New; DEBUG(dbgs() << " to: " << *New << "\n"); return false; @@ -2774,15 +2359,15 @@ private: // If this is a memset of a vectorized alloca, insert it. assert(ElementTy == ScalarTy); - unsigned BeginIndex = getIndex(BeginOffset); - unsigned EndIndex = getIndex(EndOffset); + unsigned BeginIndex = getIndex(NewBeginOffset); + unsigned EndIndex = getIndex(NewEndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); unsigned NumElements = EndIndex - BeginIndex; assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); Value *Splat = - getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ElementTy) / 8); - Splat = convertValue(TD, IRB, Splat, ElementTy); + getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8); + Splat = convertValue(DL, IRB, Splat, ElementTy); if (NumElements > 1) Splat = getVectorSplat(Splat, NumElements); @@ -2794,32 +2379,31 @@ private: // set integer. 
assert(!II.isVolatile()); - uint64_t Size = EndOffset - BeginOffset; + uint64_t Size = NewEndOffset - NewBeginOffset; V = getIntegerSplat(II.getValue(), Size); if (IntTy && (BeginOffset != NewAllocaBeginOffset || EndOffset != NewAllocaBeginOffset)) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); - Old = convertValue(TD, IRB, Old, IntTy); - assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - V = insertInteger(TD, IRB, Old, V, Offset, "insert"); + Old = convertValue(DL, IRB, Old, IntTy); + uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; + V = insertInteger(DL, IRB, Old, V, Offset, "insert"); } else { assert(V->getType() == IntTy && "Wrong type for an alloca wide integer!"); } - V = convertValue(TD, IRB, V, AllocaTy); + V = convertValue(DL, IRB, V, AllocaTy); } else { // Established these invariants above. - assert(BeginOffset == NewAllocaBeginOffset); - assert(EndOffset == NewAllocaEndOffset); + assert(NewBeginOffset == NewAllocaBeginOffset); + assert(NewEndOffset == NewAllocaEndOffset); - V = getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ScalarTy) / 8); + V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8); if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy)) V = getVectorSplat(V, AllocaVecTy->getNumElements()); - V = convertValue(TD, IRB, V, AllocaTy); + V = convertValue(DL, IRB, V, AllocaTy); } Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), @@ -2835,21 +2419,25 @@ private: DEBUG(dbgs() << " original: " << II << "\n"); + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr); bool IsDest = II.getRawDest() == OldPtr; - const AllocaPartitioning::MemTransferOffsets &MTO - = P.getMemTransferOffsets(II); - // Compute the relative offset within the transfer. - unsigned IntPtrWidth = TD.getPointerSizeInBits(); - APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin - : MTO.SourceBegin)); + unsigned IntPtrWidth = DL.getPointerSizeInBits(); + APInt RelOffset(IntPtrWidth, NewBeginOffset - BeginOffset); unsigned Align = II.getAlignment(); + uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset; if (Align > 1) - Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), - MinAlign(II.getAlignment(), getPartitionAlign())); + Align = + MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), + MinAlign(II.getAlignment(), getOffsetAlign(SliceOffset))); // For unsplit intrinsics, we simply modify the source and destination // pointers in place. This isn't just an optimization, it is a matter of @@ -2858,12 +2446,14 @@ private: // a variable length. We may also be dealing with memmove instead of // memcpy, and so simply updating the pointers is the necessary for us to // update both source and dest of a single call. - if (!MTO.IsSplittable) { + if (!IsSplittable) { Value *OldOp = IsDest ? 
II.getRawDest() : II.getRawSource(); if (IsDest) - II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); + II.setDest( + getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType())); else - II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType())); + II.setSource(getAdjustedAllocaPtr(IRB, BeginOffset, + II.getRawSource()->getType())); Type *CstTy = II.getAlignmentCst()->getType(); II.setAlignment(ConstantInt::get(CstTy, Align)); @@ -2881,24 +2471,21 @@ private: // If this doesn't map cleanly onto the alloca type, and that type isn't // a single value type, just emit a memcpy. bool EmitMemCpy - = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset || - EndOffset != NewAllocaEndOffset || + = !VecTy && !IntTy && (BeginOffset > NewAllocaBeginOffset || + EndOffset < NewAllocaEndOffset || !NewAI.getAllocatedType()->isSingleValueType()); // If we're just going to emit a memcpy, the alloca hasn't changed, and the // size hasn't been shrunk based on analysis of the viable range, this is // a no-op. if (EmitMemCpy && &OldAI == &NewAI) { - uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin; - uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd; // Ensure the start lines up. - assert(BeginOffset == OrigBegin); - (void)OrigBegin; + assert(NewBeginOffset == BeginOffset); // Rewrite the size as needed. - if (EndOffset != OrigEnd) + if (NewEndOffset != EndOffset) II.setLength(ConstantInt::get(II.getLength()->getType(), - EndOffset - BeginOffset)); + NewEndOffset - NewBeginOffset)); return false; } // Record this instruction for deletion. @@ -2917,13 +2504,13 @@ private: // Compute the other pointer, folding as much as possible to produce // a single, simple GEP in most cases. - OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy); + OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy); - Value *OurPtr - = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() - : II.getRawSource()->getType()); + Value *OurPtr = getAdjustedAllocaPtr( + IRB, NewBeginOffset, + IsDest ? II.getRawDest()->getType() : II.getRawSource()->getType()); Type *SizeTy = II.getLength()->getType(); - Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); + Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, @@ -2939,11 +2526,11 @@ private: if (!Align) Align = 1; - bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && - EndOffset == NewAllocaEndOffset; - uint64_t Size = EndOffset - BeginOffset; - unsigned BeginIndex = VecTy ? getIndex(BeginOffset) : 0; - unsigned EndIndex = VecTy ? getIndex(EndOffset) : 0; + bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset && + NewEndOffset == NewAllocaEndOffset; + uint64_t Size = NewEndOffset - NewBeginOffset; + unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0; + unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0; unsigned NumElements = EndIndex - BeginIndex; IntegerType *SubIntTy = IntTy ? 
Type::getIntNTy(IntTy->getContext(), Size*8) : 0; @@ -2960,7 +2547,7 @@ private: OtherPtrTy = SubIntTy->getPointerTo(); } - Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy); + Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy); Value *DstPtr = &NewAI; if (!IsDest) std::swap(SrcPtr, DstPtr); @@ -2973,10 +2560,9 @@ private: } else if (IntTy && !IsWholeAlloca && !IsDest) { Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load"); - Src = convertValue(TD, IRB, Src, IntTy); - assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, "extract"); + Src = convertValue(DL, IRB, Src, IntTy); + uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; + Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); } else { Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), "copyload"); @@ -2989,11 +2575,10 @@ private: } else if (IntTy && !IsWholeAlloca && IsDest) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload"); - Old = convertValue(TD, IRB, Old, IntTy); - assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - Src = insertInteger(TD, IRB, Old, Src, Offset, "insert"); - Src = convertValue(TD, IRB, Src, NewAllocaTy); + Old = convertValue(DL, IRB, Old, IntTy); + uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; + Src = insertInteger(DL, IRB, Old, Src, Offset, "insert"); + Src = convertValue(DL, IRB, Src, NewAllocaTy); } StoreInst *Store = cast<StoreInst>( @@ -3009,13 +2594,20 @@ private: DEBUG(dbgs() << " original: " << II << "\n"); assert(II.getArgOperand(1) == OldPtr); + // Compute the intersecting offset range. + assert(BeginOffset < NewAllocaEndOffset); + assert(EndOffset > NewAllocaBeginOffset); + uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); + uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); + // Record this instruction for deletion. Pass.DeadInsts.insert(&II); ConstantInt *Size = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), - EndOffset - BeginOffset); - Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType()); + NewEndOffset - NewBeginOffset); + Value *Ptr = + getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getArgOperand(1)->getType()); Value *New; if (II.getIntrinsicID() == Intrinsic::lifetime_start) New = IRB.CreateLifetimeStart(Ptr, Size); @@ -3029,30 +2621,45 @@ private: bool visitPHINode(PHINode &PN) { DEBUG(dbgs() << " original: " << PN << "\n"); + assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable"); + assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable"); // We would like to compute a new pointer in only one place, but have it be // as local as possible to the PHI. To do that, we re-use the location of // the old pointer, which necessarily must be in the right position to // dominate the PHI. - IRBuilderTy PtrBuilder(cast<Instruction>(OldPtr)); + IRBuilderTy PtrBuilder(OldPtr); PtrBuilder.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + "."); - Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); + Value *NewPtr = + getAdjustedAllocaPtr(PtrBuilder, BeginOffset, OldPtr->getType()); // Replace the operands which were using the old pointer. 
std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); DEBUG(dbgs() << " to: " << PN << "\n"); deleteIfTriviallyDead(OldPtr); - return false; + + // Check whether we can speculate this PHI node, and if so remember that + // fact and queue it up for another iteration after the speculation + // occurs. + if (isSafePHIToSpeculate(PN, &DL)) { + Pass.SpeculatablePHIs.insert(&PN); + IsUsedByRewrittenSpeculatableInstructions = true; + return true; + } + + return false; // PHIs can't be promoted on their own. } bool visitSelectInst(SelectInst &SI) { DEBUG(dbgs() << " original: " << SI << "\n"); assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) && "Pointer isn't an operand!"); + assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable"); + assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable"); - Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType()); + Value *NewPtr = getAdjustedAllocaPtr(IRB, BeginOffset, OldPtr->getType()); // Replace the operands which were using the old pointer. if (SI.getOperand(1) == OldPtr) SI.setOperand(1, NewPtr); @@ -3061,7 +2668,17 @@ private: DEBUG(dbgs() << " to: " << SI << "\n"); deleteIfTriviallyDead(OldPtr); - return false; + + // Check whether we can speculate this select instruction, and if so + // remember that fact and queue it up for another iteration after the + // speculation occurs. + if (isSafeSelectToSpeculate(SI, &DL)) { + Pass.SpeculatableSelects.insert(&SI); + IsUsedByRewrittenSpeculatableInstructions = true; + return true; + } + + return false; // Selects can't be promoted on their own. } }; @@ -3077,7 +2694,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; - const DataLayout &TD; + const DataLayout &DL; /// Queue of pointer uses to analyze and potentially rewrite. SmallVector<Use *, 8> Queue; @@ -3090,7 +2707,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { Use *U; public: - AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {} + AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {} /// Rewrite loads and stores through a pointer and all pointers derived from /// it. @@ -3319,12 +2936,12 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { /// when the size or offset cause either end of type-based partition to be off. /// Also, this is a best-effort routine. It is reasonable to give up and not /// return a type if necessary. 
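A couple of worked cases may make this best-effort behaviour clearer; the values assume a typical 64-bit data layout and are illustrative only, not taken from the patch or its tests:

// Given %T = type { i32, i32, i64 } (alloc size 16, the i64 at offset 8):
//   getTypePartition(DL, %T, /*Offset=*/4, /*Size=*/4)  --> i32
//     (the range lands exactly on the second field and covers it)
//   getTypePartition(DL, %T, /*Offset=*/4, /*Size=*/12) --> null
//     (a sub-struct { i32, i64 } would be padded out to 16 bytes, so the
//      routine gives up and the caller falls back to a legal integer type
//      or an i8 array of the right size)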
-static Type *getTypePartition(const DataLayout &TD, Type *Ty, +static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, uint64_t Size) { - if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size) - return stripAggregateTypeWrapping(TD, Ty); - if (Offset > TD.getTypeAllocSize(Ty) || - (TD.getTypeAllocSize(Ty) - Offset) < Size) + if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size) + return stripAggregateTypeWrapping(DL, Ty); + if (Offset > DL.getTypeAllocSize(Ty) || + (DL.getTypeAllocSize(Ty) - Offset) < Size) return 0; if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { @@ -3333,7 +2950,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, return 0; Type *ElementTy = SeqTy->getElementType(); - uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); + uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); uint64_t NumSkippedElements = Offset / ElementSize; if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) { if (NumSkippedElements >= ArrTy->getNumElements()) @@ -3350,12 +2967,12 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, if ((Offset + Size) > ElementSize) return 0; // Recurse through the element type trying to peel off offset bytes. - return getTypePartition(TD, ElementTy, Offset, Size); + return getTypePartition(DL, ElementTy, Offset, Size); } assert(Offset == 0); if (Size == ElementSize) - return stripAggregateTypeWrapping(TD, ElementTy); + return stripAggregateTypeWrapping(DL, ElementTy); assert(Size > ElementSize); uint64_t NumElements = Size / ElementSize; if (NumElements * ElementSize != Size) @@ -3367,7 +2984,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, if (!STy) return 0; - const StructLayout *SL = TD.getStructLayout(STy); + const StructLayout *SL = DL.getStructLayout(STy); if (Offset >= SL->getSizeInBytes()) return 0; uint64_t EndOffset = Offset + Size; @@ -3378,7 +2995,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, Offset -= SL->getElementOffset(Index); Type *ElementTy = STy->getElementType(Index); - uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); + uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); if (Offset >= ElementSize) return 0; // The offset points into alignment padding. @@ -3386,12 +3003,12 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, if (Offset > 0 || Size < ElementSize) { if ((Offset + Size) > ElementSize) return 0; - return getTypePartition(TD, ElementTy, Offset, Size); + return getTypePartition(DL, ElementTy, Offset, Size); } assert(Offset == 0); if (Size == ElementSize) - return stripAggregateTypeWrapping(TD, ElementTy); + return stripAggregateTypeWrapping(DL, ElementTy); StructType::element_iterator EI = STy->element_begin() + Index, EE = STy->element_end(); @@ -3414,7 +3031,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, // Try to build up a sub-structure. StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked()); - const StructLayout *SubSL = TD.getStructLayout(SubTy); + const StructLayout *SubSL = DL.getStructLayout(SubTy); if (Size != SubSL->getSizeInBytes()) return 0; // The sub-struct doesn't have quite the size needed. @@ -3431,113 +3048,280 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, /// appropriate new offsets. It also evaluates how successful the rewrite was /// at enabling promotion and if it was successful queues the alloca to be /// promoted. 
-bool SROA::rewriteAllocaPartition(AllocaInst &AI, - AllocaPartitioning &P, - AllocaPartitioning::iterator PI) { - uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset; - bool IsLive = false; - for (AllocaPartitioning::use_iterator UI = P.use_begin(PI), - UE = P.use_end(PI); - UI != UE && !IsLive; ++UI) - if (UI->getUse()) - IsLive = true; - if (!IsLive) - return false; // No live uses left of this partition. - - DEBUG(dbgs() << "Speculating PHIs and selects in partition " - << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n"); - - PHIOrSelectSpeculator Speculator(*TD, P, *this); - DEBUG(dbgs() << " speculating "); - DEBUG(P.print(dbgs(), PI, "")); - Speculator.visitUsers(PI); +bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, + AllocaSlices::iterator B, AllocaSlices::iterator E, + int64_t BeginOffset, int64_t EndOffset, + ArrayRef<AllocaSlices::iterator> SplitUses) { + assert(BeginOffset < EndOffset); + uint64_t SliceSize = EndOffset - BeginOffset; // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. - Type *AllocaTy = 0; - if (Type *PartitionTy = P.getCommonType(PI)) - if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize) - AllocaTy = PartitionTy; - if (!AllocaTy) - if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(), - PI->BeginOffset, AllocaSize)) - AllocaTy = PartitionTy; - if ((!AllocaTy || - (AllocaTy->isArrayTy() && - AllocaTy->getArrayElementType()->isIntegerTy())) && - TD->isLegalInteger(AllocaSize * 8)) - AllocaTy = Type::getIntNTy(*C, AllocaSize * 8); - if (!AllocaTy) - AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize); - assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize); + Type *SliceTy = 0; + if (Type *CommonUseTy = findCommonType(B, E, EndOffset)) + if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize) + SliceTy = CommonUseTy; + if (!SliceTy) + if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(), + BeginOffset, SliceSize)) + SliceTy = TypePartitionTy; + if ((!SliceTy || (SliceTy->isArrayTy() && + SliceTy->getArrayElementType()->isIntegerTy())) && + DL->isLegalInteger(SliceSize * 8)) + SliceTy = Type::getIntNTy(*C, SliceSize * 8); + if (!SliceTy) + SliceTy = ArrayType::get(Type::getInt8Ty(*C), SliceSize); + assert(DL->getTypeAllocSize(SliceTy) >= SliceSize); + + bool IsVectorPromotable = isVectorPromotionViable( + *DL, SliceTy, S, BeginOffset, EndOffset, B, E, SplitUses); + + bool IsIntegerPromotable = + !IsVectorPromotable && + isIntegerWideningViable(*DL, SliceTy, BeginOffset, S, B, E, SplitUses); // Check for the case where we're going to rewrite to a new alloca of the // exact same type as the original, and with the same access offsets. In that // case, re-use the existing alloca, but still run through the rewriter to // perform phi and select speculation. AllocaInst *NewAI; - if (AllocaTy == AI.getAllocatedType()) { - assert(PI->BeginOffset == 0 && + if (SliceTy == AI.getAllocatedType()) { + assert(BeginOffset == 0 && "Non-zero begin offset but same alloca type"); - assert(PI == P.begin() && "Begin offset is zero on later partition"); NewAI = &AI; + // FIXME: We should be able to bail at this point with "nothing changed". + // FIXME: We might want to defer PHI speculation until after here. 
} else { unsigned Alignment = AI.getAlignment(); if (!Alignment) { // The minimum alignment which users can rely on when the explicit // alignment is omitted or zero is that required by the ABI for this // type. - Alignment = TD->getABITypeAlignment(AI.getAllocatedType()); + Alignment = DL->getABITypeAlignment(AI.getAllocatedType()); } - Alignment = MinAlign(Alignment, PI->BeginOffset); + Alignment = MinAlign(Alignment, BeginOffset); // If we will get at least this much alignment from the type alone, leave // the alloca's alignment unconstrained. - if (Alignment <= TD->getABITypeAlignment(AllocaTy)) + if (Alignment <= DL->getABITypeAlignment(SliceTy)) Alignment = 0; - NewAI = new AllocaInst(AllocaTy, 0, Alignment, - AI.getName() + ".sroa." + Twine(PI - P.begin()), - &AI); + NewAI = new AllocaInst(SliceTy, 0, Alignment, + AI.getName() + ".sroa." + Twine(B - S.begin()), &AI); ++NumNewAllocas; } DEBUG(dbgs() << "Rewriting alloca partition " - << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: " - << *NewAI << "\n"); + << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI + << "\n"); - // Track the high watermark of the post-promotion worklist. We will reset it - // to this point if the alloca is not in fact scheduled for promotion. + // Track the high watermark on several worklists that are only relevant for + // promoted allocas. We will reset it to this point if the alloca is not in + // fact scheduled for promotion. unsigned PPWOldSize = PostPromotionWorklist.size(); + unsigned SPOldSize = SpeculatablePHIs.size(); + unsigned SSOldSize = SpeculatableSelects.size(); + unsigned NumUses = 0; + + AllocaSliceRewriter Rewriter(*DL, S, *this, AI, *NewAI, BeginOffset, + EndOffset, IsVectorPromotable, + IsIntegerPromotable); + bool Promotable = true; + for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), + SUE = SplitUses.end(); + SUI != SUE; ++SUI) { + DEBUG(dbgs() << " rewriting split "); + DEBUG(S.printSlice(dbgs(), *SUI, "")); + Promotable &= Rewriter.visit(*SUI); + ++NumUses; + } + for (AllocaSlices::iterator I = B; I != E; ++I) { + DEBUG(dbgs() << " rewriting "); + DEBUG(S.printSlice(dbgs(), I, "")); + Promotable &= Rewriter.visit(I); + ++NumUses; + } + + NumAllocaPartitionUses += NumUses; + MaxUsesPerAllocaPartition = + std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition); - AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI, - PI->BeginOffset, PI->EndOffset); - DEBUG(dbgs() << " rewriting "); - DEBUG(P.print(dbgs(), PI, "")); - bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI)); - if (Promotable) { + if (Promotable && !Rewriter.isUsedByRewrittenSpeculatableInstructions()) { DEBUG(dbgs() << " and queuing for promotion\n"); PromotableAllocas.push_back(NewAI); - } else if (NewAI != &AI) { + } else if (NewAI != &AI || + (Promotable && + Rewriter.isUsedByRewrittenSpeculatableInstructions())) { // If we can't promote the alloca, iterate on it to check for new // refinements exposed by splitting the current alloca. Don't iterate on an // alloca which didn't actually change and didn't get promoted. + // + // Alternatively, if we could promote the alloca but have speculatable + // instructions then we will speculate them after finishing our processing + // of the original alloca. Mark the new one for re-visiting in the next + // iteration so the speculated operations can be rewritten. + // + // FIXME: We should actually track whether the rewriter changed anything. 
Worklist.insert(NewAI); } // Drop any post-promotion work items if promotion didn't happen. - if (!Promotable) + if (!Promotable) { while (PostPromotionWorklist.size() > PPWOldSize) PostPromotionWorklist.pop_back(); + while (SpeculatablePHIs.size() > SPOldSize) + SpeculatablePHIs.pop_back(); + while (SpeculatableSelects.size() > SSOldSize) + SpeculatableSelects.pop_back(); + } return true; } -/// \brief Walks the partitioning of an alloca rewriting uses of each partition. -bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) { +namespace { +struct IsSliceEndLessOrEqualTo { + uint64_t UpperBound; + + IsSliceEndLessOrEqualTo(uint64_t UpperBound) : UpperBound(UpperBound) {} + + bool operator()(const AllocaSlices::iterator &I) { + return I->endOffset() <= UpperBound; + } +}; +} + +static void +removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses, + uint64_t &MaxSplitUseEndOffset, uint64_t Offset) { + if (Offset >= MaxSplitUseEndOffset) { + SplitUses.clear(); + MaxSplitUseEndOffset = 0; + return; + } + + size_t SplitUsesOldSize = SplitUses.size(); + SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(), + IsSliceEndLessOrEqualTo(Offset)), + SplitUses.end()); + if (SplitUsesOldSize == SplitUses.size()) + return; + + // Recompute the max. While this is linear, so is remove_if. + MaxSplitUseEndOffset = 0; + for (SmallVectorImpl<AllocaSlices::iterator>::iterator + SUI = SplitUses.begin(), + SUE = SplitUses.end(); + SUI != SUE; ++SUI) + MaxSplitUseEndOffset = std::max((*SUI)->endOffset(), MaxSplitUseEndOffset); +} + +/// \brief Walks the slices of an alloca and form partitions based on them, +/// rewriting each of their uses. +bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) { + if (S.begin() == S.end()) + return false; + + unsigned NumPartitions = 0; bool Changed = false; - for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE; - ++PI) - Changed |= rewriteAllocaPartition(AI, P, PI); + SmallVector<AllocaSlices::iterator, 4> SplitUses; + uint64_t MaxSplitUseEndOffset = 0; + + uint64_t BeginOffset = S.begin()->beginOffset(); + + for (AllocaSlices::iterator SI = S.begin(), SJ = llvm::next(SI), SE = S.end(); + SI != SE; SI = SJ) { + uint64_t MaxEndOffset = SI->endOffset(); + + if (!SI->isSplittable()) { + // When we're forming an unsplittable region, it must always start at the + // first slice and will extend through its end. + assert(BeginOffset == SI->beginOffset()); + + // Form a partition including all of the overlapping slices with this + // unsplittable slice. + while (SJ != SE && SJ->beginOffset() < MaxEndOffset) { + if (!SJ->isSplittable()) + MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset()); + ++SJ; + } + } else { + assert(SI->isSplittable()); // Established above. + + // Collect all of the overlapping splittable slices. + while (SJ != SE && SJ->beginOffset() < MaxEndOffset && + SJ->isSplittable()) { + MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset()); + ++SJ; + } + + // Back up MaxEndOffset and SJ if we ended the span early when + // encountering an unsplittable slice. + if (SJ != SE && SJ->beginOffset() < MaxEndOffset) { + assert(!SJ->isSplittable()); + MaxEndOffset = SJ->beginOffset(); + } + } + + // Check if we have managed to move the end offset forward yet. If so, + // we'll have to rewrite uses and erase old split uses. + if (BeginOffset < MaxEndOffset) { + // Rewrite a sequence of overlapping slices. 
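// Illustrative trace (not from the patch): for slices [0,8)S, [4,16)S and
// [16,20)U (S = splittable, U = unsplittable), the first pass through this
// loop forms and rewrites the partition [0,16) from the two splittable
// slices, and [16,20) becomes its own unsplittable partition on the next
// iteration. A splittable slice such as [4,24) that extends past the current
// partition is instead carried in SplitUses and re-visited by each later
// partition it overlaps, including a post-split tail if it runs past the
// last slice.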
+ Changed |= + rewritePartition(AI, S, SI, SJ, BeginOffset, MaxEndOffset, SplitUses); + ++NumPartitions; + + removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, MaxEndOffset); + } + + // Accumulate all the splittable slices from the [SI,SJ) region which + // overlap going forward. + for (AllocaSlices::iterator SK = SI; SK != SJ; ++SK) + if (SK->isSplittable() && SK->endOffset() > MaxEndOffset) { + SplitUses.push_back(SK); + MaxSplitUseEndOffset = std::max(SK->endOffset(), MaxSplitUseEndOffset); + } + + // If we're already at the end and we have no split uses, we're done. + if (SJ == SE && SplitUses.empty()) + break; + + // If we have no split uses or no gap in offsets, we're ready to move to + // the next slice. + if (SplitUses.empty() || (SJ != SE && MaxEndOffset == SJ->beginOffset())) { + BeginOffset = SJ->beginOffset(); + continue; + } + + // Even if we have split slices, if the next slice is splittable and the + // split slices reach it, we can simply set up the beginning offset of the + // next iteration to bridge between them. + if (SJ != SE && SJ->isSplittable() && + MaxSplitUseEndOffset > SJ->beginOffset()) { + BeginOffset = MaxEndOffset; + continue; + } + + // Otherwise, we have a tail of split slices. Rewrite them with an empty + // range of slices. + uint64_t PostSplitEndOffset = + SJ == SE ? MaxSplitUseEndOffset : SJ->beginOffset(); + + Changed |= rewritePartition(AI, S, SJ, SJ, MaxEndOffset, PostSplitEndOffset, + SplitUses); + ++NumPartitions; + + if (SJ == SE) + break; // Skip the rest, we don't need to do any cleanup. + + removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, + PostSplitEndOffset); + + // Now just reset the begin offset for the next iteration. + BeginOffset = SJ->beginOffset(); + } + + NumAllocaPartitions += NumPartitions; + MaxPartitionsPerAlloca = + std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca); return Changed; } @@ -3545,7 +3329,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) { /// \brief Analyze an alloca for SROA. /// /// This analyzes the alloca to ensure we can reason about it, builds -/// a partitioning of the alloca, and then hands it off to be split and +/// the slices of the alloca, and then hands it off to be split and /// rewritten as needed. bool SROA::runOnAlloca(AllocaInst &AI) { DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); @@ -3559,32 +3343,32 @@ bool SROA::runOnAlloca(AllocaInst &AI) { // Skip alloca forms that this analysis can't handle. if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || - TD->getTypeAllocSize(AI.getAllocatedType()) == 0) + DL->getTypeAllocSize(AI.getAllocatedType()) == 0) return false; bool Changed = false; // First, split any FCA loads and stores touching this alloca to promote // better splitting and promotion opportunities. - AggLoadStoreRewriter AggRewriter(*TD); + AggLoadStoreRewriter AggRewriter(*DL); Changed |= AggRewriter.rewrite(AI); - // Build the partition set using a recursive instruction-visiting builder. - AllocaPartitioning P(*TD, AI); - DEBUG(P.print(dbgs())); - if (P.isEscaped()) + // Build the slices using a recursive instruction-visiting builder. + AllocaSlices S(*DL, AI); + DEBUG(S.print(dbgs())); + if (S.isEscaped()) return Changed; // Delete all the dead users of this alloca before splitting and rewriting it. 
- for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(), - DE = P.dead_user_end(); + for (AllocaSlices::dead_user_iterator DI = S.dead_user_begin(), + DE = S.dead_user_end(); DI != DE; ++DI) { Changed = true; (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType())); DeadInsts.insert(*DI); } - for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(), - DE = P.dead_op_end(); + for (AllocaSlices::dead_op_iterator DO = S.dead_op_begin(), + DE = S.dead_op_end(); DO != DE; ++DO) { Value *OldV = **DO; // Clobber the use with an undef value. @@ -3596,11 +3380,21 @@ bool SROA::runOnAlloca(AllocaInst &AI) { } } - // No partitions to split. Leave the dead alloca for a later pass to clean up. - if (P.begin() == P.end()) + // No slices to split. Leave the dead alloca for a later pass to clean up. + if (S.begin() == S.end()) return Changed; - return splitAlloca(AI, P) || Changed; + Changed |= splitAlloca(AI, S); + + DEBUG(dbgs() << " Speculating PHIs\n"); + while (!SpeculatablePHIs.empty()) + speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val()); + + DEBUG(dbgs() << " Speculating Selects\n"); + while (!SpeculatableSelects.empty()) + speculateSelectInstLoads(*SpeculatableSelects.pop_back_val()); + + return Changed; } /// \brief Delete the dead instructions accumulated in this run. @@ -3635,6 +3429,15 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { } } +static void enqueueUsersInWorklist(Instruction &I, + SmallVectorImpl<Instruction *> &Worklist, + SmallPtrSet<Instruction *, 8> &Visited) { + for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; + ++UI) + if (Visited.insert(cast<Instruction>(*UI))) + Worklist.push_back(cast<Instruction>(*UI)); +} + /// \brief Promote the allocas, using the best available technique. /// /// This attempts to promote whatever allocas have been identified as viable in @@ -3659,25 +3462,28 @@ bool SROA::promoteAllocas(Function &F) { DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); SSAUpdater SSA; DIBuilder DIB(*F.getParent()); - SmallVector<Instruction*, 64> Insts; + SmallVector<Instruction *, 64> Insts; + + // We need a worklist to walk the uses of each alloca. + SmallVector<Instruction *, 8> Worklist; + SmallPtrSet<Instruction *, 8> Visited; + SmallVector<Instruction *, 32> DeadInsts; for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { AllocaInst *AI = PromotableAllocas[Idx]; - for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); - UI != UE;) { - Instruction *I = cast<Instruction>(*UI++); + Insts.clear(); + Worklist.clear(); + Visited.clear(); + + enqueueUsersInWorklist(*AI, Worklist, Visited); + + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + // FIXME: Currently the SSAUpdater infrastructure doesn't reason about // lifetime intrinsics and so we strip them (and the bitcasts+GEPs // leading to them) here. Eventually it should use them to optimize the // scalar values produced. 
- if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) { - assert(onlyUsedByLifetimeMarkers(I) && - "Found a bitcast used outside of a lifetime marker."); - while (!I->use_empty()) - cast<Instruction>(*I->use_begin())->eraseFromParent(); - I->eraseFromParent(); - continue; - } if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { assert(II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end); @@ -3685,10 +3491,30 @@ bool SROA::promoteAllocas(Function &F) { continue; } - Insts.push_back(I); + // Push the loads and stores we find onto the list. SROA will already + // have validated that all loads and stores are viable candidates for + // promotion. + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + assert(LI->getType() == AI->getAllocatedType()); + Insts.push_back(LI); + continue; + } + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + assert(SI->getValueOperand()->getType() == AI->getAllocatedType()); + Insts.push_back(SI); + continue; + } + + // For everything else, we know that only no-op bitcasts and GEPs will + // make it this far, just recurse through them and recall them for later + // removal. + DeadInsts.push_back(I); + enqueueUsersInWorklist(*I, Worklist, Visited); } AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); - Insts.clear(); + while (!DeadInsts.empty()) + DeadInsts.pop_back_val()->eraseFromParent(); + AI->eraseFromParent(); } PromotableAllocas.clear(); @@ -3712,8 +3538,8 @@ namespace { bool SROA::runOnFunction(Function &F) { DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); - TD = getAnalysisIfAvailable<DataLayout>(); - if (!TD) { + DL = getAnalysisIfAvailable<DataLayout>(); + if (!DL) { DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); return false; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp b/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp new file mode 100644 index 0000000..9bcd702 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp @@ -0,0 +1,479 @@ +//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the SampleProfileLoader transformation. This pass +// reads a profile file generated by a sampling profiler (e.g. Linux Perf - +// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the +// profile information in the given profile. +// +// This pass generates branch weight annotations on the IR: +// +// - prof: Represents branch weights. This annotation is added to branches +// to indicate the weights of each edge coming out of the branch. +// The weight of each edge is the weight of the target block for +// that edge. The weight of a block B is computed as the maximum +// number of samples found in B. 
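As a hedged illustration of the annotation this pass produces (the weights and block names are invented, not taken from a real profile), attaching branch weights to a conditional branch looks roughly like this:

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/MDBuilder.h"
    using namespace llvm;

    // Tag BI so its first successor is treated as ~16x hotter than the
    // second. In textual IR this shows up approximately as:
    //   br i1 %cmp, label %hot, label %cold, !prof !0
    //   !0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
    static void annotateBranchWeights(BranchInst *BI) {
      MDBuilder MDB(BI->getContext());
      BI->setMetadata(LLVMContext::MD_prof,
                      MDB.createBranchWeights(/*TrueWeight=*/64,
                                              /*FalseWeight=*/4));
    }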
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sample-profile" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/DIContext.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Regex.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; + +// Command line option to specify the file to read samples from. This is +// mainly used for debugging. +static cl::opt<std::string> SampleProfileFile( + "sample-profile-file", cl::init(""), cl::value_desc("filename"), + cl::desc("Profile file loaded by -sample-profile"), cl::Hidden); + +namespace { +/// \brief Sample-based profile reader. +/// +/// Each profile contains sample counts for all the functions +/// executed. Inside each function, statements are annotated with the +/// collected samples on all the instructions associated with that +/// statement. +/// +/// For this to produce meaningful data, the program needs to be +/// compiled with some debug information (at minimum, line numbers: +/// -gline-tables-only). Otherwise, it will be impossible to match IR +/// instructions to the line numbers collected by the profiler. +/// +/// From the profile file, we are interested in collecting the +/// following information: +/// +/// * A list of functions included in the profile (mangled names). +/// +/// * For each function F: +/// 1. The total number of samples collected in F. +/// +/// 2. The samples collected at each line in F. To provide some +/// protection against source code shuffling, line numbers should +/// be relative to the start of the function. +class SampleProfile { +public: + SampleProfile(StringRef F) : Profiles(0), Filename(F) {} + + void dump(); + void loadText(); + void loadNative() { llvm_unreachable("not implemented"); } + bool emitAnnotations(Function &F); + void printFunctionProfile(raw_ostream &OS, StringRef FName); + void dumpFunctionProfile(StringRef FName); + +protected: + typedef DenseMap<uint32_t, uint32_t> BodySampleMap; + typedef DenseMap<BasicBlock *, uint32_t> BlockWeightMap; + + /// \brief Representation of the runtime profile for a function. + /// + /// This data structure contains the runtime profile for a given + /// function. It contains the total number of samples collected + /// in the function and a map of samples collected in every statement. + struct FunctionProfile { + /// \brief Total number of samples collected inside this function. + /// + /// Samples are cumulative, they include all the samples collected + /// inside this function and all its inlined callees. + unsigned TotalSamples; + + // \brief Total number of samples collected at the head of the function. + unsigned TotalHeadSamples; + + /// \brief Map line offsets to collected samples. + /// + /// Each entry in this map contains the number of samples + /// collected at the corresponding line offset. All line locations + /// are an offset from the start of the function. + BodySampleMap BodySamples; + + /// \brief Map basic blocks to their computed weights. 
+ /// + /// The weight of a basic block is defined to be the maximum + /// of all the instruction weights in that block. + BlockWeightMap BlockWeights; + }; + + uint32_t getInstWeight(Instruction &I, unsigned FirstLineno, + BodySampleMap &BodySamples); + uint32_t computeBlockWeight(BasicBlock *B, unsigned FirstLineno, + BodySampleMap &BodySamples); + + /// \brief Map every function to its associated profile. + /// + /// The profile of every function executed at runtime is collected + /// in the structure FunctionProfile. This maps function objects + /// to their corresponding profiles. + StringMap<FunctionProfile> Profiles; + + /// \brief Path name to the file holding the profile data. + /// + /// The format of this file is defined by each profiler + /// independently. If possible, the profiler should have a text + /// version of the profile format to be used in constructing test + /// cases and debugging. + StringRef Filename; +}; + +/// \brief Loader class for text-based profiles. +/// +/// This class defines a simple interface to read text files containing +/// profiles. It keeps track of line number information and location of +/// the file pointer. Users of this class are responsible for actually +/// parsing the lines returned by the readLine function. +/// +/// TODO - This does not really belong here. It is a generic text file +/// reader. It should be moved to the Support library and made more general. +class ExternalProfileTextLoader { +public: + ExternalProfileTextLoader(StringRef F) : Filename(F) { + error_code EC; + EC = MemoryBuffer::getFile(Filename, Buffer); + if (EC) + report_fatal_error("Could not open profile file " + Filename + ": " + + EC.message()); + FP = Buffer->getBufferStart(); + Lineno = 0; + } + + /// \brief Read a line from the mapped file. + StringRef readLine() { + size_t Length = 0; + const char *start = FP; + while (FP != Buffer->getBufferEnd() && *FP != '\n') { + Length++; + FP++; + } + if (FP != Buffer->getBufferEnd()) + FP++; + Lineno++; + return StringRef(start, Length); + } + + /// \brief Return true, if we've reached EOF. + bool atEOF() const { return FP == Buffer->getBufferEnd(); } + + /// \brief Report a parse error message and stop compilation. + void reportParseError(Twine Msg) const { + report_fatal_error(Filename + ":" + Twine(Lineno) + ": " + Msg + "\n"); + } + +private: + /// \brief Memory buffer holding the text file. + OwningPtr<MemoryBuffer> Buffer; + + /// \brief Current position into the memory buffer. + const char *FP; + + /// \brief Current line number. + int64_t Lineno; + + /// \brief Path name where to the profile file. + StringRef Filename; +}; + +/// \brief Sample profile pass. +/// +/// This pass reads profile data from the file specified by +/// -sample-profile-file and annotates every affected function with the +/// profile information found in that file. +class SampleProfileLoader : public FunctionPass { +public: + // Class identification, replacement for typeinfo + static char ID; + + SampleProfileLoader(StringRef Name = SampleProfileFile) + : FunctionPass(ID), Profiler(0), Filename(Name) { + initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry()); + } + + virtual bool doInitialization(Module &M); + + void dump() { Profiler->dump(); } + + virtual const char *getPassName() const { return "Sample profile pass"; } + + virtual bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + } + +protected: + /// \brief Profile reader object. 
+ OwningPtr<SampleProfile> Profiler; + + /// \brief Name of the profile file to load. + StringRef Filename; +}; +} + +/// \brief Print the function profile for \p FName on stream \p OS. +/// +/// \param OS Stream to emit the output to. +/// \param FName Name of the function to print. +void SampleProfile::printFunctionProfile(raw_ostream &OS, StringRef FName) { + FunctionProfile FProfile = Profiles[FName]; + OS << "Function: " << FName << ", " << FProfile.TotalSamples << ", " + << FProfile.TotalHeadSamples << ", " << FProfile.BodySamples.size() + << " sampled lines\n"; + for (BodySampleMap::const_iterator SI = FProfile.BodySamples.begin(), + SE = FProfile.BodySamples.end(); + SI != SE; ++SI) + OS << "\tline offset: " << SI->first + << ", number of samples: " << SI->second << "\n"; + OS << "\n"; +} + +/// \brief Dump the function profile for \p FName. +/// +/// \param FName Name of the function to print. +void SampleProfile::dumpFunctionProfile(StringRef FName) { + printFunctionProfile(dbgs(), FName); +} + +/// \brief Dump all the function profiles found. +void SampleProfile::dump() { + for (StringMap<FunctionProfile>::const_iterator I = Profiles.begin(), + E = Profiles.end(); + I != E; ++I) + dumpFunctionProfile(I->getKey()); +} + +/// \brief Load samples from a text file. +/// +/// The file is divided in two segments: +/// +/// Symbol table (represented with the string "symbol table") +/// Number of symbols in the table +/// symbol 1 +/// symbol 2 +/// ... +/// symbol N +/// +/// Function body profiles +/// function1:total_samples:total_head_samples:number_of_locations +/// location_offset_1: number_of_samples +/// location_offset_2: number_of_samples +/// ... +/// location_offset_N: number_of_samples +/// +/// Function names must be mangled in order for the profile loader to +/// match them in the current translation unit. +/// +/// Since this is a flat profile, a function that shows up more than +/// once gets all its samples aggregated across all its instances. +/// TODO - flat profiles are too imprecise to provide good optimization +/// opportunities. Convert them to context-sensitive profile. +/// +/// This textual representation is useful to generate unit tests and +/// for debugging purposes, but it should not be used to generate +/// profiles for large programs, as the representation is extremely +/// inefficient. +void SampleProfile::loadText() { + ExternalProfileTextLoader Loader(Filename); + + // Read the symbol table. + StringRef Line = Loader.readLine(); + if (Line != "symbol table") + Loader.reportParseError("Expected 'symbol table', found " + Line); + int NumSymbols; + Line = Loader.readLine(); + if (Line.getAsInteger(10, NumSymbols)) + Loader.reportParseError("Expected a number, found " + Line); + for (int I = 0; I < NumSymbols; I++) { + StringRef FName = Loader.readLine(); + FunctionProfile &FProfile = Profiles[FName]; + FProfile.BodySamples.clear(); + FProfile.TotalSamples = 0; + FProfile.TotalHeadSamples = 0; + } + + // Read the profile of each function. Since each function may be + // mentioned more than once, and we are collecting flat profiles, + // accumulate samples as we parse them. 
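For concreteness, a tiny hand-written profile in the text format documented above could look like the following (the mangled name and every count are invented); the function header line is matched by HeadRE below and each body line by LineSample:

    symbol table
    1
    _Z3fooi
    _Z3fooi:1200:10:2
    1: 700
    3: 500

Here _Z3fooi reports 1200 total samples, 10 head samples, and two sampled line offsets, with 700 samples at offset 1 and 500 at offset 3.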
+ Regex HeadRE("^([^:]+):([0-9]+):([0-9]+):([0-9]+)$"); + Regex LineSample("^([0-9]+): ([0-9]+)$"); + while (!Loader.atEOF()) { + SmallVector<StringRef, 4> Matches; + Line = Loader.readLine(); + if (!HeadRE.match(Line, &Matches)) + Loader.reportParseError("Expected 'mangled_name:NUM:NUM:NUM', found " + + Line); + assert(Matches.size() == 5); + StringRef FName = Matches[1]; + unsigned NumSamples, NumHeadSamples, NumSampledLines; + Matches[2].getAsInteger(10, NumSamples); + Matches[3].getAsInteger(10, NumHeadSamples); + Matches[4].getAsInteger(10, NumSampledLines); + FunctionProfile &FProfile = Profiles[FName]; + FProfile.TotalSamples += NumSamples; + FProfile.TotalHeadSamples += NumHeadSamples; + BodySampleMap &SampleMap = FProfile.BodySamples; + unsigned I; + for (I = 0; I < NumSampledLines && !Loader.atEOF(); I++) { + Line = Loader.readLine(); + if (!LineSample.match(Line, &Matches)) + Loader.reportParseError("Expected 'NUM: NUM', found " + Line); + assert(Matches.size() == 3); + unsigned LineOffset, NumSamples; + Matches[1].getAsInteger(10, LineOffset); + Matches[2].getAsInteger(10, NumSamples); + SampleMap[LineOffset] += NumSamples; + } + + if (I < NumSampledLines) + Loader.reportParseError("Unexpected end of file"); + } +} + +/// \brief Get the weight for an instruction. +/// +/// The "weight" of an instruction \p Inst is the number of samples +/// collected on that instruction at runtime. To retrieve it, we +/// need to compute the line number of \p Inst relative to the start of its +/// function. We use \p FirstLineno to compute the offset. We then +/// look up the samples collected for \p Inst using \p BodySamples. +/// +/// \param Inst Instruction to query. +/// \param FirstLineno Line number of the first instruction in the function. +/// \param BodySamples Map of relative source line locations to samples. +/// +/// \returns The profiled weight of I. +uint32_t SampleProfile::getInstWeight(Instruction &Inst, unsigned FirstLineno, + BodySampleMap &BodySamples) { + unsigned LOffset = Inst.getDebugLoc().getLine() - FirstLineno + 1; + return BodySamples.lookup(LOffset); +} + +/// \brief Compute the weight of a basic block. +/// +/// The weight of basic block \p B is the maximum weight of all the +/// instructions in B. +/// +/// \param B The basic block to query. +/// \param FirstLineno The line number for the first line in the +/// function holding B. +/// \param BodySamples The map containing all the samples collected in that +/// function. +/// +/// \returns The computed weight of B. +uint32_t SampleProfile::computeBlockWeight(BasicBlock *B, unsigned FirstLineno, + BodySampleMap &BodySamples) { + // If we've computed B's weight before, return it. + Function *F = B->getParent(); + FunctionProfile &FProfile = Profiles[F->getName()]; + std::pair<BlockWeightMap::iterator, bool> Entry = + FProfile.BlockWeights.insert(std::make_pair(B, 0)); + if (!Entry.second) + return Entry.first->second; + + // Otherwise, compute and cache B's weight. + uint32_t Weight = 0; + for (BasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) { + uint32_t InstWeight = getInstWeight(*I, FirstLineno, BodySamples); + if (InstWeight > Weight) + Weight = InstWeight; + } + Entry.first->second = Weight; + return Weight; +} + +/// \brief Generate branch weight metadata for all branches in \p F. +/// +/// For every branch instruction B in \p F, we compute the weight of the +/// target block for each of the edges out of B. This is the weight +/// that we associate with that branch. 
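As a hedged concrete example of getInstWeight above (line numbers invented): if the first instruction of the function sits on source line 10 (FirstLineno = 10) and Inst carries debug line 12, the lookup key is LOffset = 12 - 10 + 1 = 3, so the instruction's weight is whatever BodySamples holds for offset 3, or 0 if that offset was never sampled.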
+/// +/// TODO - This weight assignment will most likely be wrong if the +/// target branch has more than two predecessors. This needs to be done +/// using some form of flow propagation. +/// +/// Once all the branch weights are computed, we emit the MD_prof +/// metadata on B using the computed values. +/// +/// \param F The function to query. +bool SampleProfile::emitAnnotations(Function &F) { + bool Changed = false; + FunctionProfile &FProfile = Profiles[F.getName()]; + unsigned FirstLineno = inst_begin(F)->getDebugLoc().getLine(); + MDBuilder MDB(F.getContext()); + + // Clear the block weights cache. + FProfile.BlockWeights.clear(); + + // When we find a branch instruction: For each edge E out of the branch, + // the weight of E is the weight of the target block. + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *B = I; + TerminatorInst *TI = B->getTerminator(); + if (TI->getNumSuccessors() == 1) + continue; + if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)) + continue; + + SmallVector<uint32_t, 4> Weights; + unsigned NSuccs = TI->getNumSuccessors(); + for (unsigned I = 0; I < NSuccs; ++I) { + BasicBlock *Succ = TI->getSuccessor(I); + uint32_t Weight = + computeBlockWeight(Succ, FirstLineno, FProfile.BodySamples); + Weights.push_back(Weight); + } + + TI->setMetadata(llvm::LLVMContext::MD_prof, + MDB.createBranchWeights(Weights)); + Changed = true; + } + + return Changed; +} + +char SampleProfileLoader::ID = 0; +INITIALIZE_PASS(SampleProfileLoader, "sample-profile", "Sample Profile loader", + false, false) + +bool SampleProfileLoader::runOnFunction(Function &F) { + return Profiler->emitAnnotations(F); +} + +bool SampleProfileLoader::doInitialization(Module &M) { + Profiler.reset(new SampleProfile(Filename)); + Profiler->loadText(); + return true; +} + +FunctionPass *llvm::createSampleProfileLoaderPass() { + return new SampleProfileLoader(SampleProfileFile); +} + +FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) { + return new SampleProfileLoader(Name); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index 8a9c7da..857597e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -28,7 +28,7 @@ using namespace llvm; /// ScalarOpts library. 
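A hedged sketch of how a client might schedule the loader defined above, using its factory function (the profile path is made up, and the factory is assumed to be declared in llvm/Transforms/Scalar.h alongside the other pass factories):

    #include "llvm/PassManager.h"
    #include "llvm/Transforms/Scalar.h"
    using namespace llvm;

    static void addSampleProfile(PassManager &PM) {
      // Roughly what `opt -sample-profile -sample-profile-file=perf.prof`
      // sets up from the command line.
      PM.add(createSampleProfileLoaderPass("perf.prof"));
    }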
void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCEPass(Registry); - initializeBlockPlacementPass(Registry); + initializeSampleProfileLoaderPass(Registry); initializeCodeGenPreparePass(Registry); initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); @@ -44,12 +44,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopInstSimplifyPass(Registry); initializeLoopRotatePass(Registry); initializeLoopStrengthReducePass(Registry); + initializeLoopRerollPass(Registry); initializeLoopUnrollPass(Registry); initializeLoopUnswitchPass(Registry); initializeLoopIdiomRecognizePass(Registry); initializeLowerAtomicPass(Registry); initializeLowerExpectIntrinsicPass(Registry); initializeMemCpyOptPass(Registry); + initializePartiallyInlineLibCallsPass(Registry); initializeReassociatePass(Registry); initializeRegToMemPass(Registry); initializeSCCPPass(Registry); @@ -58,7 +60,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeSROA_DTPass(Registry); initializeSROA_SSAUpPass(Registry); initializeCFGSimplifyPassPass(Registry); - initializeSimplifyLibCallsPass(Registry); + initializeStructurizeCFGPass(Registry); initializeSinkingPass(Registry); initializeTailCallElimPass(Registry); } @@ -111,6 +113,10 @@ void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopRotatePass()); } +void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopRerollPass()); +} + void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopUnrollPass()); } @@ -123,6 +129,10 @@ void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createMemCpyOptPass()); } +void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createPartiallyInlineLibCallsPass()); +} + void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createPromoteMemoryToRegisterPass()); } @@ -149,7 +159,7 @@ void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM, } void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createSimplifyLibCallsPass()); + // NOTE: The simplify-libcalls pass has been removed. 
} void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) { diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp index bfde334..57b290e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -166,21 +166,21 @@ namespace { void DeleteDeadInstructions(); void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts); + SmallVectorImpl<AllocaInst *> &NewElts); bool ShouldAttemptScalarRepl(AllocaInst *AI); }; @@ -963,7 +963,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth)); else if (SV->getType()->isPointerTy()) - SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext())); + SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getType())); // Zero extend or truncate the value if needed. if (SV->getType() != AllocaType) { @@ -1066,12 +1066,12 @@ public: LoadAndStorePromoter::run(Insts); AI->eraseFromParent(); - for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(), + for (SmallVectorImpl<DbgDeclareInst *>::iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; DDI->eraseFromParent(); } - for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(), + for (SmallVectorImpl<DbgValueInst *>::iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; DVI->eraseFromParent(); @@ -1086,7 +1086,7 @@ public: } virtual void updateDebugInfo(Instruction *Inst) const { - for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), + for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) @@ -1094,7 +1094,7 @@ public: else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) ConvertDebugDeclareToDebugValue(DDI, LI, *DIB); } - for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), + for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; Value *Arg = NULL; @@ -1865,7 +1865,7 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { /// Offset indicates the position within AI that is referenced by this /// instruction. 
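The recurring change above from SmallVector<AllocaInst*, 32>& to SmallVectorImpl<AllocaInst*>& parameters decouples each callee from the caller's inline capacity. A minimal sketch of the idiom (names and element type are illustrative):

    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    // The callee no longer hard-codes an inline size...
    static void fillElts(SmallVectorImpl<int> &Elts) { Elts.push_back(42); }

    static void callers() {
      SmallVector<int, 4> A;   // ...so callers may pick any inline capacity
      SmallVector<int, 128> B; // without a copy or conversion at the call site.
      fillElts(A);
      fillElts(B);
    }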
void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts) { + SmallVectorImpl<AllocaInst *> &NewElts) { for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) { Use &TheUse = UI.getUse(); Instruction *User = cast<Instruction>(*UI++); @@ -1979,7 +1979,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, /// RewriteBitCast - Update a bitcast reference to the alloca being replaced /// and recursively continue updating all of its uses. void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts) { + SmallVectorImpl<AllocaInst *> &NewElts) { RewriteForScalarRepl(BC, AI, Offset, NewElts); if (BC->getOperand(0) != AI) return; @@ -2037,7 +2037,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, /// elements of the alloca that are being split apart, and if so, rewrite /// the GEP to be relative to the new element. void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts) { + SmallVectorImpl<AllocaInst *> &NewElts) { uint64_t OldOffset = Offset; SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); // If the GEP was dynamic then it must have been a dynamic vector lookup. @@ -2099,7 +2099,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, /// to mark the lifetime of the scalarized memory. void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, uint64_t Offset, - SmallVector<AllocaInst*, 32> &NewElts) { + SmallVectorImpl<AllocaInst *> &NewElts) { ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0)); // Put matching lifetime markers on everything from Offset up to // Offset+OldSize. @@ -2153,9 +2153,10 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, /// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI. /// Rewrite it to copy or set the elements of the scalarized memory. -void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, - AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts) { +void +SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, + AllocaInst *AI, + SmallVectorImpl<AllocaInst *> &NewElts) { // If this is a memcpy/memmove, construct the other pointer as the // appropriate type. The "Other" pointer is the pointer that goes to memory // that doesn't have anything to do with the alloca that we are promoting. For @@ -2189,7 +2190,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, if (OtherPtr == AI || OtherPtr == NewElts[0]) { // This code will run twice for a no-op memcpy -- once for each operand. // Put only one reference to MI on the DeadInsts list. - for (SmallVector<Value*, 32>::const_iterator I = DeadInsts.begin(), + for (SmallVectorImpl<Value *>::const_iterator I = DeadInsts.begin(), E = DeadInsts.end(); I != E; ++I) if (*I == MI) return; DeadInsts.push_back(MI); @@ -2326,8 +2327,9 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, /// RewriteStoreUserOfWholeAlloca - We found a store of an integer that /// overwrites the entire allocation. Extract out the pieces of the stored /// integer and store them individually. 
-void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts){ +void +SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, + SmallVectorImpl<AllocaInst *> &NewElts) { // Extract each element out of the integer according to its structure offset // and store the element value to the individual alloca. Value *SrcVal = SI->getOperand(0); @@ -2440,8 +2442,9 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, /// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to /// an integer. Load the individual pieces to form the aggregate value. -void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, - SmallVector<AllocaInst*, 32> &NewElts) { +void +SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, + SmallVectorImpl<AllocaInst *> &NewElts) { // Extract each element out of the NewElts according to its structure offset // and form the result value. Type *AllocaEltTy = AI->getAllocatedType(); diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index c243d34..8371f6d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -41,187 +41,31 @@ using namespace llvm; STATISTIC(NumSimpl, "Number of blocks simplified"); namespace { - struct CFGSimplifyPass : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - CFGSimplifyPass() : FunctionPass(ID) { - initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); - } - - virtual bool runOnFunction(Function &F); +struct CFGSimplifyPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + CFGSimplifyPass() : FunctionPass(ID) { + initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); + } + virtual bool runOnFunction(Function &F); - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<TargetTransformInfo>(); - } - }; + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetTransformInfo>(); + } +}; } char CFGSimplifyPass::ID = 0; -INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", - false, false) +INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, + false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", - false, false) +INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, + false) // Public interface to the CFGSimplification pass FunctionPass *llvm::createCFGSimplificationPass() { return new CFGSimplifyPass(); } -/// changeToUnreachable - Insert an unreachable instruction before the specified -/// instruction, making it and the rest of the code in the block dead. -static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { - BasicBlock *BB = I->getParent(); - // Loop over all of the successors, removing BB's entry from any PHI - // nodes. - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - (*SI)->removePredecessor(BB); - - // Insert a call to llvm.trap right before this. This turns the undefined - // behavior into a hard fail instead of falling through into random code. 
- if (UseLLVMTrap) { - Function *TrapFn = - Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap); - CallInst *CallTrap = CallInst::Create(TrapFn, "", I); - CallTrap->setDebugLoc(I->getDebugLoc()); - } - new UnreachableInst(I->getContext(), I); - - // All instructions after this are dead. - BasicBlock::iterator BBI = I, BBE = BB->end(); - while (BBI != BBE) { - if (!BBI->use_empty()) - BBI->replaceAllUsesWith(UndefValue::get(BBI->getType())); - BB->getInstList().erase(BBI++); - } -} - -/// changeToCall - Convert the specified invoke into a normal call. -static void changeToCall(InvokeInst *II) { - SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); - CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); - NewCall->takeName(II); - NewCall->setCallingConv(II->getCallingConv()); - NewCall->setAttributes(II->getAttributes()); - NewCall->setDebugLoc(II->getDebugLoc()); - II->replaceAllUsesWith(NewCall); - - // Follow the call by a branch to the normal destination. - BranchInst::Create(II->getNormalDest(), II); - - // Update PHI nodes in the unwind destination - II->getUnwindDest()->removePredecessor(II->getParent()); - II->eraseFromParent(); -} - -static bool markAliveBlocks(BasicBlock *BB, - SmallPtrSet<BasicBlock*, 128> &Reachable) { - - SmallVector<BasicBlock*, 128> Worklist; - Worklist.push_back(BB); - Reachable.insert(BB); - bool Changed = false; - do { - BB = Worklist.pop_back_val(); - - // Do a quick scan of the basic block, turning any obviously unreachable - // instructions into LLVM unreachable insts. The instruction combining pass - // canonicalizes unreachable insts into stores to null or undef. - for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){ - if (CallInst *CI = dyn_cast<CallInst>(BBI)) { - if (CI->doesNotReturn()) { - // If we found a call to a no-return function, insert an unreachable - // instruction after it. Make sure there isn't *already* one there - // though. - ++BBI; - if (!isa<UnreachableInst>(BBI)) { - // Don't insert a call to llvm.trap right before the unreachable. - changeToUnreachable(BBI, false); - Changed = true; - } - break; - } - } - - // Store to undef and store to null are undefined and used to signal that - // they should be changed to unreachable by passes that can't modify the - // CFG. - if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) { - // Don't touch volatile stores. - if (SI->isVolatile()) continue; - - Value *Ptr = SI->getOperand(1); - - if (isa<UndefValue>(Ptr) || - (isa<ConstantPointerNull>(Ptr) && - SI->getPointerAddressSpace() == 0)) { - changeToUnreachable(SI, true); - Changed = true; - break; - } - } - } - - // Turn invokes that call 'nounwind' functions into ordinary calls. - if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { - Value *Callee = II->getCalledValue(); - if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { - changeToUnreachable(II, true); - Changed = true; - } else if (II->doesNotThrow()) { - if (II->use_empty() && II->onlyReadsMemory()) { - // jump to the normal destination branch. 
- BranchInst::Create(II->getNormalDest(), II); - II->getUnwindDest()->removePredecessor(II->getParent()); - II->eraseFromParent(); - } else - changeToCall(II); - Changed = true; - } - } - - Changed |= ConstantFoldTerminator(BB, true); - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - if (Reachable.insert(*SI)) - Worklist.push_back(*SI); - } while (!Worklist.empty()); - return Changed; -} - -/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even -/// if they are in a dead cycle. Return true if a change was made, false -/// otherwise. -static bool removeUnreachableBlocksFromFn(Function &F) { - SmallPtrSet<BasicBlock*, 128> Reachable; - bool Changed = markAliveBlocks(F.begin(), Reachable); - - // If there are unreachable blocks in the CFG... - if (Reachable.size() == F.size()) - return Changed; - - assert(Reachable.size() < F.size()); - NumSimpl += F.size()-Reachable.size(); - - // Loop over all of the basic blocks that are not reachable, dropping all of - // their internal references... - for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) { - if (Reachable.count(BB)) - continue; - - for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - if (Reachable.count(*SI)) - (*SI)->removePredecessor(BB); - BB->dropAllReferences(); - } - - for (Function::iterator I = ++F.begin(); I != F.end();) - if (!Reachable.count(I)) - I = F.getBasicBlockList().erase(I); - else - ++I; - - return true; -} - /// mergeEmptyReturnBlocks - If we have more than one empty (other than phi /// node) return blocks, merge them together to promote recursive block merging. static bool mergeEmptyReturnBlocks(Function &F) { @@ -326,7 +170,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, bool CFGSimplifyPass::runOnFunction(Function &F) { const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>(); const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); - bool EverChanged = removeUnreachableBlocksFromFn(F); + bool EverChanged = removeUnreachableBlocks(F); EverChanged |= mergeEmptyReturnBlocks(F); EverChanged |= iterativelySimplifyCFG(F, TTI, TD); @@ -334,16 +178,16 @@ bool CFGSimplifyPass::runOnFunction(Function &F) { if (!EverChanged) return false; // iterativelySimplifyCFG can (rarely) make some loops dead. If this happens, - // removeUnreachableBlocksFromFn is needed to nuke them, which means we should + // removeUnreachableBlocks is needed to nuke them, which means we should // iterate between the two optimizations. We structure the code like this to // avoid reruning iterativelySimplifyCFG if the second pass of - // removeUnreachableBlocksFromFn doesn't do anything. - if (!removeUnreachableBlocksFromFn(F)) + // removeUnreachableBlocks doesn't do anything. + if (!removeUnreachableBlocks(F)) return true; do { EverChanged = iterativelySimplifyCFG(F, TTI, TD); - EverChanged |= removeUnreachableBlocksFromFn(F); + EverChanged |= removeUnreachableBlocks(F); } while (EverChanged); return true; diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp deleted file mode 100644 index 3514e6c..0000000 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ /dev/null @@ -1,247 +0,0 @@ -//===- SimplifyLibCalls.cpp - Optimize specific well-known library calls --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file implements a simple pass that applies a variety of small -// optimizations for calls to specific well-known function calls (e.g. runtime -// library functions). Any optimization that takes the very simple form -// "replace call to library function with simpler code that provides the same -// result" belongs in this file. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "simplify-libcalls" -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Config/config.h" // FIXME: Shouldn't depend on host! -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/BuildLibCalls.h" -using namespace llvm; - - -//===----------------------------------------------------------------------===// -// Optimizer Base Class -//===----------------------------------------------------------------------===// - -/// This class is the abstract base class for the set of optimizations that -/// corresponds to one library call. -namespace { -class LibCallOptimization { -protected: - Function *Caller; - const DataLayout *TD; - const TargetLibraryInfo *TLI; - LLVMContext* Context; -public: - LibCallOptimization() { } - virtual ~LibCallOptimization() {} - - /// CallOptimizer - This pure virtual method is implemented by base classes to - /// do various optimizations. If this returns null then no transformation was - /// performed. If it returns CI, then it transformed the call and CI is to be - /// deleted. If it returns something else, replace CI with the new value and - /// delete CI. - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) - =0; - - Value *OptimizeCall(CallInst *CI, const DataLayout *TD, - const TargetLibraryInfo *TLI, IRBuilder<> &B) { - Caller = CI->getParent()->getParent(); - this->TD = TD; - this->TLI = TLI; - if (CI->getCalledFunction()) - Context = &CI->getCalledFunction()->getContext(); - - // We never change the calling convention. - if (CI->getCallingConv() != llvm::CallingConv::C) - return NULL; - - return CallOptimizer(CI->getCalledFunction(), CI, B); - } -}; -} // End anonymous namespace. - - -//===----------------------------------------------------------------------===// -// SimplifyLibCalls Pass Implementation -//===----------------------------------------------------------------------===// - -namespace { - /// This pass optimizes well known library functions from libc and libm. 
- /// - class SimplifyLibCalls : public FunctionPass { - TargetLibraryInfo *TLI; - - StringMap<LibCallOptimization*> Optimizations; - public: - static char ID; // Pass identification - SimplifyLibCalls() : FunctionPass(ID) { - initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); - } - void AddOpt(LibFunc::Func F, LibCallOptimization* Opt); - void AddOpt(LibFunc::Func F1, LibFunc::Func F2, LibCallOptimization* Opt); - - void InitOptimizations(); - bool runOnFunction(Function &F); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<TargetLibraryInfo>(); - } - }; -} // end anonymous namespace. - -char SimplifyLibCalls::ID = 0; - -INITIALIZE_PASS_BEGIN(SimplifyLibCalls, "simplify-libcalls", - "Simplify well-known library calls", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) -INITIALIZE_PASS_END(SimplifyLibCalls, "simplify-libcalls", - "Simplify well-known library calls", false, false) - -// Public interface to the Simplify LibCalls pass. -FunctionPass *llvm::createSimplifyLibCallsPass() { - return new SimplifyLibCalls(); -} - -void SimplifyLibCalls::AddOpt(LibFunc::Func F, LibCallOptimization* Opt) { - if (TLI->has(F)) - Optimizations[TLI->getName(F)] = Opt; -} - -void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2, - LibCallOptimization* Opt) { - if (TLI->has(F1) && TLI->has(F2)) - Optimizations[TLI->getName(F1)] = Opt; -} - -/// Optimizations - Populate the Optimizations map with all the optimizations -/// we know. -void SimplifyLibCalls::InitOptimizations() { -} - - -/// runOnFunction - Top level algorithm. -/// -bool SimplifyLibCalls::runOnFunction(Function &F) { - TLI = &getAnalysis<TargetLibraryInfo>(); - - if (Optimizations.empty()) - InitOptimizations(); - - const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); - - IRBuilder<> Builder(F.getContext()); - - bool Changed = false; - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - // Ignore non-calls. - CallInst *CI = dyn_cast<CallInst>(I++); - if (!CI || CI->hasFnAttr(Attribute::NoBuiltin)) continue; - - // Ignore indirect calls and calls to non-external functions. - Function *Callee = CI->getCalledFunction(); - if (Callee == 0 || !Callee->isDeclaration() || - !(Callee->hasExternalLinkage() || Callee->hasDLLImportLinkage())) - continue; - - // Ignore unknown calls. - LibCallOptimization *LCO = Optimizations.lookup(Callee->getName()); - if (!LCO) continue; - - // Set the builder to the instruction after the call. - Builder.SetInsertPoint(BB, I); - - // Use debug location of CI for all new instructions. - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - // Try to optimize this call. - Value *Result = LCO->OptimizeCall(CI, TD, TLI, Builder); - if (Result == 0) continue; - - DEBUG(dbgs() << "SimplifyLibCalls simplified: " << *CI; - dbgs() << " into: " << *Result << "\n"); - - // Something changed! - Changed = true; - - // Inspect the instruction after the call (which was potentially just - // added) next. 
- I = CI; ++I; - - if (CI != Result && !CI->use_empty()) { - CI->replaceAllUsesWith(Result); - if (!Result->hasName()) - Result->takeName(CI); - } - CI->eraseFromParent(); - } - } - return Changed; -} - -// TODO: -// Additional cases that we need to add to this file: -// -// cbrt: -// * cbrt(expN(X)) -> expN(x/3) -// * cbrt(sqrt(x)) -> pow(x,1/6) -// * cbrt(sqrt(x)) -> pow(x,1/9) -// -// exp, expf, expl: -// * exp(log(x)) -> x -// -// log, logf, logl: -// * log(exp(x)) -> x -// * log(x**y) -> y*log(x) -// * log(exp(y)) -> y*log(e) -// * log(exp2(y)) -> y*log(2) -// * log(exp10(y)) -> y*log(10) -// * log(sqrt(x)) -> 0.5*log(x) -// * log(pow(x,y)) -> y*log(x) -// -// lround, lroundf, lroundl: -// * lround(cnst) -> cnst' -// -// pow, powf, powl: -// * pow(exp(x),y) -> exp(x*y) -// * pow(sqrt(x),y) -> pow(x,y*0.5) -// * pow(pow(x,y),z)-> pow(x,y*z) -// -// round, roundf, roundl: -// * round(cnst) -> cnst' -// -// signbit: -// * signbit(cnst) -> cnst' -// * signbit(nncst) -> 0 (if pstv is a non-negative constant) -// -// sqrt, sqrtf, sqrtl: -// * sqrt(expN(x)) -> expN(x*0.5) -// * sqrt(Nroot(x)) -> pow(x,1/(2*N)) -// * sqrt(pow(x,y)) -> pow(|x|,y*0.5) -// -// strchr: -// * strchr(p, 0) -> strlen(p) -// tan, tanf, tanl: -// * tan(atan(x)) -> x -// -// trunc, truncf, truncl: -// * trunc(cnst) -> cnst' -// -// diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp new file mode 100644 index 0000000..5045ff8f --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -0,0 +1,906 @@ +//===-- StructurizeCFG.cpp ------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "structurizecfg" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RegionIterator.h" +#include "llvm/Analysis/RegionPass.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/PatternMatch.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +namespace { + +// Definition of the complex types used in this pass. + +typedef std::pair<BasicBlock *, Value *> BBValuePair; + +typedef SmallVector<RegionNode*, 8> RNVector; +typedef SmallVector<BasicBlock*, 8> BBVector; +typedef SmallVector<BranchInst*, 8> BranchVector; +typedef SmallVector<BBValuePair, 2> BBValueVector; + +typedef SmallPtrSet<BasicBlock *, 8> BBSet; + +typedef MapVector<PHINode *, BBValueVector> PhiMap; +typedef MapVector<BasicBlock *, BBVector> BB2BBVecMap; + +typedef DenseMap<DomTreeNode *, unsigned> DTN2UnsignedMap; +typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap; +typedef DenseMap<BasicBlock *, Value *> BBPredicates; +typedef DenseMap<BasicBlock *, BBPredicates> PredMap; +typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap; + +// The name for newly created blocks. 
+ +static const char *const FlowBlockName = "Flow"; + +/// @brief Find the nearest common dominator for multiple BasicBlocks +/// +/// Helper class for StructurizeCFG +/// TODO: Maybe move into common code +class NearestCommonDominator { + DominatorTree *DT; + + DTN2UnsignedMap IndexMap; + + BasicBlock *Result; + unsigned ResultIndex; + bool ExplicitMentioned; + +public: + /// \brief Start a new query + NearestCommonDominator(DominatorTree *DomTree) { + DT = DomTree; + Result = 0; + } + + /// \brief Add BB to the resulting dominator + void addBlock(BasicBlock *BB, bool Remember = true) { + DomTreeNode *Node = DT->getNode(BB); + + if (Result == 0) { + unsigned Numbering = 0; + for (;Node;Node = Node->getIDom()) + IndexMap[Node] = ++Numbering; + Result = BB; + ResultIndex = 1; + ExplicitMentioned = Remember; + return; + } + + for (;Node;Node = Node->getIDom()) + if (IndexMap.count(Node)) + break; + else + IndexMap[Node] = 0; + + assert(Node && "Dominator tree invalid!"); + + unsigned Numbering = IndexMap[Node]; + if (Numbering > ResultIndex) { + Result = Node->getBlock(); + ResultIndex = Numbering; + ExplicitMentioned = Remember && (Result == BB); + } else if (Numbering == ResultIndex) { + ExplicitMentioned |= Remember; + } + } + + /// \brief Is "Result" one of the BBs added with "Remember" = True? + bool wasResultExplicitMentioned() { + return ExplicitMentioned; + } + + /// \brief Get the query result + BasicBlock *getResult() { + return Result; + } +}; + +/// @brief Transforms the control flow graph on one single entry/exit region +/// at a time. +/// +/// After the transform all "If"/"Then"/"Else" style control flow looks like +/// this: +/// +/// \verbatim +/// 1 +/// || +/// | | +/// 2 | +/// | / +/// |/ +/// 3 +/// || Where: +/// | | 1 = "If" block, calculates the condition +/// 4 | 2 = "Then" subregion, runs if the condition is true +/// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow +/// |/ 4 = "Else" optional subregion, runs if the condition is false +/// 5 5 = "End" block, also rejoins the control flow +/// \endverbatim +/// +/// Control flow is expressed as a branch where the true exit goes into the +/// "Then"/"Else" region, while the false exit skips the region +/// The condition for the optional "Else" region is expressed as a PHI node. +/// The incomming values of the PHI node are true for the "If" edge and false +/// for the "Then" edge. +/// +/// Additionally to that even complicated loops look like this: +/// +/// \verbatim +/// 1 +/// || +/// | | +/// 2 ^ Where: +/// | / 1 = "Entry" block +/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block +/// 3 3 = "Flow" block, with back edge to entry block +/// | +/// \endverbatim +/// +/// The back edge of the "Flow" block is always on the false side of the branch +/// while the true side continues the general flow. So the loop condition +/// consist of a network of PHI nodes where the true incoming values expresses +/// breaks and the false values expresses continue states. 
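A hedged usage sketch of the NearestCommonDominator helper defined above (DT is assumed to be an up-to-date DominatorTree and the block pointers valid; the function name is made up):

    static BasicBlock *commonDom(DominatorTree *DT, BasicBlock *SeedBB,
                                 BasicBlock *B1, BasicBlock *B2) {
      NearestCommonDominator Dominator(DT);
      Dominator.addBlock(SeedBB, /*Remember=*/false); // seed, not remembered
      Dominator.addBlock(B1);
      Dominator.addBlock(B2);
      // Roughly: true iff the resulting dominator is itself one of the
      // remembered blocks (B1 or B2).
      (void)Dominator.wasResultExplicitMentioned();
      return Dominator.getResult();
    }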
+class StructurizeCFG : public RegionPass { + Type *Boolean; + ConstantInt *BoolTrue; + ConstantInt *BoolFalse; + UndefValue *BoolUndef; + + Function *Func; + Region *ParentRegion; + + DominatorTree *DT; + + RNVector Order; + BBSet Visited; + + BBPhiMap DeletedPhis; + BB2BBVecMap AddedPhis; + + PredMap Predicates; + BranchVector Conditions; + + BB2BBMap Loops; + PredMap LoopPreds; + BranchVector LoopConds; + + RegionNode *PrevNode; + + void orderNodes(); + + void analyzeLoops(RegionNode *N); + + Value *invert(Value *Condition); + + Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert); + + void gatherPredicates(RegionNode *N); + + void collectInfos(); + + void insertConditions(bool Loops); + + void delPhiValues(BasicBlock *From, BasicBlock *To); + + void addPhiValues(BasicBlock *From, BasicBlock *To); + + void setPhiValues(); + + void killTerminator(BasicBlock *BB); + + void changeExit(RegionNode *Node, BasicBlock *NewExit, + bool IncludeDominator); + + BasicBlock *getNextFlow(BasicBlock *Dominator); + + BasicBlock *needPrefix(bool NeedEmpty); + + BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed); + + void setPrevNode(BasicBlock *BB); + + bool dominatesPredicates(BasicBlock *BB, RegionNode *Node); + + bool isPredictableTrue(RegionNode *Node); + + void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd); + + void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd); + + void createFlow(); + + void rebuildSSA(); + +public: + static char ID; + + StructurizeCFG() : + RegionPass(ID) { + initializeStructurizeCFGPass(*PassRegistry::getPassRegistry()); + } + + using Pass::doInitialization; + virtual bool doInitialization(Region *R, RGPassManager &RGM); + + virtual bool runOnRegion(Region *R, RGPassManager &RGM); + + virtual const char *getPassName() const { + return "Structurize control flow"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequiredID(LowerSwitchID); + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + RegionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +char StructurizeCFG::ID = 0; + +INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG", + false, false) +INITIALIZE_PASS_DEPENDENCY(LowerSwitch) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(RegionInfo) +INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG", + false, false) + +/// \brief Initialize the types and constants used in the pass +bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) { + LLVMContext &Context = R->getEntry()->getContext(); + + Boolean = Type::getInt1Ty(Context); + BoolTrue = ConstantInt::getTrue(Context); + BoolFalse = ConstantInt::getFalse(Context); + BoolUndef = UndefValue::get(Boolean); + + return false; +} + +/// \brief Build up the general order of nodes +void StructurizeCFG::orderNodes() { + scc_iterator<Region *> I = scc_begin(ParentRegion), + E = scc_end(ParentRegion); + for (Order.clear(); I != E; ++I) { + std::vector<RegionNode *> &Nodes = *I; + Order.append(Nodes.begin(), Nodes.end()); + } +} + +/// \brief Determine the end of the loops +void StructurizeCFG::analyzeLoops(RegionNode *N) { + if (N->isSubRegion()) { + // Test for exit as back edge + BasicBlock *Exit = N->getNodeAs<Region>()->getExit(); + if (Visited.count(Exit)) + Loops[Exit] = N->getEntry(); + + } else { + // Test for sucessors as back edge + BasicBlock *BB = N->getNodeAs<BasicBlock>(); + BranchInst *Term = cast<BranchInst>(BB->getTerminator()); + + for 
(unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); + + if (Visited.count(Succ)) + Loops[Succ] = BB; + } + } +} + +/// \brief Invert the given condition +Value *StructurizeCFG::invert(Value *Condition) { + // First: Check if it's a constant + if (Condition == BoolTrue) + return BoolFalse; + + if (Condition == BoolFalse) + return BoolTrue; + + if (Condition == BoolUndef) + return BoolUndef; + + // Second: If the condition is already inverted, return the original value + if (match(Condition, m_Not(m_Value(Condition)))) + return Condition; + + if (Instruction *Inst = dyn_cast<Instruction>(Condition)) { + // Third: Check all the users for an invert + BasicBlock *Parent = Inst->getParent(); + for (Value::use_iterator I = Condition->use_begin(), + E = Condition->use_end(); I != E; ++I) { + + Instruction *User = dyn_cast<Instruction>(*I); + if (!User || User->getParent() != Parent) + continue; + + if (match(*I, m_Not(m_Specific(Condition)))) + return *I; + } + + // Last option: Create a new instruction + return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); + } + + if (Argument *Arg = dyn_cast<Argument>(Condition)) { + BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock(); + return BinaryOperator::CreateNot(Condition, + Arg->getName() + ".inv", + EntryBlock.getTerminator()); + } + + llvm_unreachable("Unhandled condition to invert"); +} + +/// \brief Build the condition for one edge +Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx, + bool Invert) { + Value *Cond = Invert ? BoolFalse : BoolTrue; + if (Term->isConditional()) { + Cond = Term->getCondition(); + + if (Idx != (unsigned)Invert) + Cond = invert(Cond); + } + return Cond; +} + +/// \brief Analyze the predecessors of each block and build up predicates +void StructurizeCFG::gatherPredicates(RegionNode *N) { + RegionInfo *RI = ParentRegion->getRegionInfo(); + BasicBlock *BB = N->getEntry(); + BBPredicates &Pred = Predicates[BB]; + BBPredicates &LPred = LoopPreds[BB]; + + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + PI != PE; ++PI) { + + // Ignore it if it's a branch from outside into our region entry + if (!ParentRegion->contains(*PI)) + continue; + + Region *R = RI->getRegionFor(*PI); + if (R == ParentRegion) { + + // It's a top level block in our region + BranchInst *Term = cast<BranchInst>((*PI)->getTerminator()); + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); + if (Succ != BB) + continue; + + if (Visited.count(*PI)) { + // Normal forward edge + if (Term->isConditional()) { + // Try to treat it like an ELSE block + BasicBlock *Other = Term->getSuccessor(!i); + if (Visited.count(Other) && !Loops.count(Other) && + !Pred.count(Other) && !Pred.count(*PI)) { + + Pred[Other] = BoolFalse; + Pred[*PI] = BoolTrue; + continue; + } + } + Pred[*PI] = buildCondition(Term, i, false); + + } else { + // Back edge + LPred[*PI] = buildCondition(Term, i, true); + } + } + + } else { + + // It's an exit from a sub region + while(R->getParent() != ParentRegion) + R = R->getParent(); + + // Edge from inside a subregion to its entry, ignore it + if (R == N) + continue; + + BasicBlock *Entry = R->getEntry(); + if (Visited.count(Entry)) + Pred[Entry] = BoolTrue; + else + LPred[Entry] = BoolFalse; + } + } +} + +/// \brief Collect various loop and predicate infos +void StructurizeCFG::collectInfos() { + // Reset predicate + Predicates.clear(); + + // and loop infos + Loops.clear(); + 
LoopPreds.clear(); + + // Reset the visited nodes + Visited.clear(); + + for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); + OI != OE; ++OI) { + + // Analyze all the conditions leading to a node + gatherPredicates(*OI); + + // Remember that we've seen this node + Visited.insert((*OI)->getEntry()); + + // Find the last back edges + analyzeLoops(*OI); + } +} + +/// \brief Insert the missing branch conditions +void StructurizeCFG::insertConditions(bool Loops) { + BranchVector &Conds = Loops ? LoopConds : Conditions; + Value *Default = Loops ? BoolTrue : BoolFalse; + SSAUpdater PhiInserter; + + for (BranchVector::iterator I = Conds.begin(), + E = Conds.end(); I != E; ++I) { + + BranchInst *Term = *I; + assert(Term->isConditional()); + + BasicBlock *Parent = Term->getParent(); + BasicBlock *SuccTrue = Term->getSuccessor(0); + BasicBlock *SuccFalse = Term->getSuccessor(1); + + PhiInserter.Initialize(Boolean, ""); + PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default); + PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default); + + BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue]; + + NearestCommonDominator Dominator(DT); + Dominator.addBlock(Parent, false); + + Value *ParentValue = 0; + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { + + if (PI->first == Parent) { + ParentValue = PI->second; + break; + } + PhiInserter.AddAvailableValue(PI->first, PI->second); + Dominator.addBlock(PI->first); + } + + if (ParentValue) { + Term->setCondition(ParentValue); + } else { + if (!Dominator.wasResultExplicitMentioned()) + PhiInserter.AddAvailableValue(Dominator.getResult(), Default); + + Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent)); + } + } +} + +/// \brief Remove all PHI values coming from "From" into "To" and remember +/// them in DeletedPhis +void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) { + PhiMap &Map = DeletedPhis[To]; + for (BasicBlock::iterator I = To->begin(), E = To->end(); + I != E && isa<PHINode>(*I);) { + + PHINode &Phi = cast<PHINode>(*I++); + while (Phi.getBasicBlockIndex(From) != -1) { + Value *Deleted = Phi.removeIncomingValue(From, false); + Map[&Phi].push_back(std::make_pair(From, Deleted)); + } + } +} + +/// \brief Add a dummy PHI value as soon as we knew the new predecessor +void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { + for (BasicBlock::iterator I = To->begin(), E = To->end(); + I != E && isa<PHINode>(*I);) { + + PHINode &Phi = cast<PHINode>(*I++); + Value *Undef = UndefValue::get(Phi.getType()); + Phi.addIncoming(Undef, From); + } + AddedPhis[To].push_back(From); +} + +/// \brief Add the real PHI value as soon as everything is set up +void StructurizeCFG::setPhiValues() { + SSAUpdater Updater; + for (BB2BBVecMap::iterator AI = AddedPhis.begin(), AE = AddedPhis.end(); + AI != AE; ++AI) { + + BasicBlock *To = AI->first; + BBVector &From = AI->second; + + if (!DeletedPhis.count(To)) + continue; + + PhiMap &Map = DeletedPhis[To]; + for (PhiMap::iterator PI = Map.begin(), PE = Map.end(); + PI != PE; ++PI) { + + PHINode *Phi = PI->first; + Value *Undef = UndefValue::get(Phi->getType()); + Updater.Initialize(Phi->getType(), ""); + Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); + Updater.AddAvailableValue(To, Undef); + + NearestCommonDominator Dominator(DT); + Dominator.addBlock(To, false); + for (BBValueVector::iterator VI = PI->second.begin(), + VE = PI->second.end(); VI != VE; ++VI) { + + 
Updater.AddAvailableValue(VI->first, VI->second); + Dominator.addBlock(VI->first); + } + + if (!Dominator.wasResultExplicitMentioned()) + Updater.AddAvailableValue(Dominator.getResult(), Undef); + + for (BBVector::iterator FI = From.begin(), FE = From.end(); + FI != FE; ++FI) { + + int Idx = Phi->getBasicBlockIndex(*FI); + assert(Idx != -1); + Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(*FI)); + } + } + + DeletedPhis.erase(To); + } + assert(DeletedPhis.empty()); +} + +/// \brief Remove phi values from all successors and then remove the terminator. +void StructurizeCFG::killTerminator(BasicBlock *BB) { + TerminatorInst *Term = BB->getTerminator(); + if (!Term) + return; + + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); + SI != SE; ++SI) { + + delPhiValues(BB, *SI); + } + + Term->eraseFromParent(); +} + +/// \brief Let node exit(s) point to NewExit +void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit, + bool IncludeDominator) { + if (Node->isSubRegion()) { + Region *SubRegion = Node->getNodeAs<Region>(); + BasicBlock *OldExit = SubRegion->getExit(); + BasicBlock *Dominator = 0; + + // Find all the edges from the sub region to the exit + for (pred_iterator I = pred_begin(OldExit), E = pred_end(OldExit); + I != E;) { + + BasicBlock *BB = *I++; + if (!SubRegion->contains(BB)) + continue; + + // Modify the edges to point to the new exit + delPhiValues(BB, OldExit); + BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit); + addPhiValues(BB, NewExit); + + // Find the new dominator (if requested) + if (IncludeDominator) { + if (!Dominator) + Dominator = BB; + else + Dominator = DT->findNearestCommonDominator(Dominator, BB); + } + } + + // Change the dominator (if requested) + if (Dominator) + DT->changeImmediateDominator(NewExit, Dominator); + + // Update the region info + SubRegion->replaceExit(NewExit); + + } else { + BasicBlock *BB = Node->getNodeAs<BasicBlock>(); + killTerminator(BB); + BranchInst::Create(NewExit, BB); + addPhiValues(BB, NewExit); + if (IncludeDominator) + DT->changeImmediateDominator(NewExit, BB); + } +} + +/// \brief Create a new flow node and update dominator tree and region info +BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) { + LLVMContext &Context = Func->getContext(); + BasicBlock *Insert = Order.empty() ? 
ParentRegion->getExit() : + Order.back()->getEntry(); + BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName, + Func, Insert); + DT->addNewBlock(Flow, Dominator); + ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion); + return Flow; +} + +/// \brief Create a new or reuse the previous node as flow node +BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) { + BasicBlock *Entry = PrevNode->getEntry(); + + if (!PrevNode->isSubRegion()) { + killTerminator(Entry); + if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end()) + return Entry; + + } + + // create a new flow node + BasicBlock *Flow = getNextFlow(Entry); + + // and wire it up + changeExit(PrevNode, Flow, true); + PrevNode = ParentRegion->getBBNode(Flow); + return Flow; +} + +/// \brief Returns the region exit if possible, otherwise just a new flow node +BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow, + bool ExitUseAllowed) { + if (Order.empty() && ExitUseAllowed) { + BasicBlock *Exit = ParentRegion->getExit(); + DT->changeImmediateDominator(Exit, Flow); + addPhiValues(Flow, Exit); + return Exit; + } + return getNextFlow(Flow); +} + +/// \brief Set the previous node +void StructurizeCFG::setPrevNode(BasicBlock *BB) { + PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) : 0; +} + +/// \brief Does BB dominate all the predicates of Node ? +bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) { + BBPredicates &Preds = Predicates[Node->getEntry()]; + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { + + if (!DT->dominates(BB, PI->first)) + return false; + } + return true; +} + +/// \brief Can we predict that this node will always be called? +bool StructurizeCFG::isPredictableTrue(RegionNode *Node) { + BBPredicates &Preds = Predicates[Node->getEntry()]; + bool Dominated = false; + + // Regionentry is always true + if (PrevNode == 0) + return true; + + for (BBPredicates::iterator I = Preds.begin(), E = Preds.end(); + I != E; ++I) { + + if (I->second != BoolTrue) + return false; + + if (!Dominated && DT->dominates(I->first, PrevNode->getEntry())) + Dominated = true; + } + + // TODO: The dominator check is too strict + return Dominated; +} + +/// Take one node from the order vector and wire it up +void StructurizeCFG::wireFlow(bool ExitUseAllowed, + BasicBlock *LoopEnd) { + RegionNode *Node = Order.pop_back_val(); + Visited.insert(Node->getEntry()); + + if (isPredictableTrue(Node)) { + // Just a linear flow + if (PrevNode) { + changeExit(PrevNode, Node->getEntry(), true); + } + PrevNode = Node; + + } else { + // Insert extra prefix node (or reuse last one) + BasicBlock *Flow = needPrefix(false); + + // Insert extra postfix node (or use exit instead) + BasicBlock *Entry = Node->getEntry(); + BasicBlock *Next = needPostfix(Flow, ExitUseAllowed); + + // let it point to entry and next block + Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow)); + addPhiValues(Flow, Entry); + DT->changeImmediateDominator(Entry, Flow); + + PrevNode = Node; + while (!Order.empty() && !Visited.count(LoopEnd) && + dominatesPredicates(Entry, Order.back())) { + handleLoops(false, LoopEnd); + } + + changeExit(PrevNode, Next, false); + setPrevNode(Next); + } +} + +void StructurizeCFG::handleLoops(bool ExitUseAllowed, + BasicBlock *LoopEnd) { + RegionNode *Node = Order.back(); + BasicBlock *LoopStart = Node->getEntry(); + + if (!Loops.count(LoopStart)) { + wireFlow(ExitUseAllowed, LoopEnd); + return; + } + + if (!isPredictableTrue(Node)) + 
LoopStart = needPrefix(true); + + LoopEnd = Loops[Node->getEntry()]; + wireFlow(false, LoopEnd); + while (!Visited.count(LoopEnd)) { + handleLoops(false, LoopEnd); + } + + // If the start of the loop is the entry block, we can't branch to it so + // insert a new dummy entry block. + Function *LoopFunc = LoopStart->getParent(); + if (LoopStart == &LoopFunc->getEntryBlock()) { + LoopStart->setName("entry.orig"); + + BasicBlock *NewEntry = + BasicBlock::Create(LoopStart->getContext(), + "entry", + LoopFunc, + LoopStart); + BranchInst::Create(LoopStart, NewEntry); + } + + // Create an extra loop end node + LoopEnd = needPrefix(false); + BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed); + LoopConds.push_back(BranchInst::Create(Next, LoopStart, + BoolUndef, LoopEnd)); + addPhiValues(LoopEnd, LoopStart); + setPrevNode(Next); +} + +/// After this function control flow looks like it should be, but +/// branches and PHI nodes only have undefined conditions. +void StructurizeCFG::createFlow() { + BasicBlock *Exit = ParentRegion->getExit(); + bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit); + + DeletedPhis.clear(); + AddedPhis.clear(); + Conditions.clear(); + LoopConds.clear(); + + PrevNode = 0; + Visited.clear(); + + while (!Order.empty()) { + handleLoops(EntryDominatesExit, 0); + } + + if (PrevNode) + changeExit(PrevNode, Exit, EntryDominatesExit); + else + assert(EntryDominatesExit); +} + +/// Handle a rare case where the disintegrated nodes instructions +/// no longer dominate all their uses. Not sure if this is really nessasary +void StructurizeCFG::rebuildSSA() { + SSAUpdater Updater; + for (Region::block_iterator I = ParentRegion->block_begin(), + E = ParentRegion->block_end(); + I != E; ++I) { + + BasicBlock *BB = *I; + for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); + II != IE; ++II) { + + bool Initialized = false; + for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) { + + Next = I->getNext(); + + Instruction *User = cast<Instruction>(I->getUser()); + if (User->getParent() == BB) { + continue; + + } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) { + if (UserPN->getIncomingBlock(*I) == BB) + continue; + } + + if (DT->dominates(II, User)) + continue; + + if (!Initialized) { + Value *Undef = UndefValue::get(II->getType()); + Updater.Initialize(II->getType(), ""); + Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); + Updater.AddAvailableValue(BB, II); + Initialized = true; + } + Updater.RewriteUseAfterInsertions(*I); + } + } + } +} + +/// \brief Run the transformation for each region found +bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) { + if (R->isTopLevelRegion()) + return false; + + Func = R->getEntry()->getParent(); + ParentRegion = R; + + DT = &getAnalysis<DominatorTree>(); + + orderNodes(); + collectInfos(); + createFlow(); + insertConditions(false); + insertConditions(true); + setPhiValues(); + rebuildSSA(); + + // Cleanup + Order.clear(); + Visited.clear(); + DeletedPhis.clear(); + AddedPhis.clear(); + Predicates.clear(); + Conditions.clear(); + Loops.clear(); + LoopPreds.clear(); + LoopConds.clear(); + + return true; +} + +/// \brief Create the pass +Pass *llvm::createStructurizeCFGPass() { + return new StructurizeCFG(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 2002e68..9fb8ddc 100644 --- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ 
b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -53,6 +53,7 @@ #define DEBUG_TYPE "tailcallelim" #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InlineCost.h" @@ -69,6 +70,7 @@ #include "llvm/Support/CFG.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" @@ -97,16 +99,16 @@ namespace { bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail); bool FoldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail); bool ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail); bool CanMoveAboveCall(Instruction *I, CallInst *CI); Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI); @@ -129,34 +131,44 @@ void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetTransformInfo>(); } -/// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by -/// callees of this function. We only do very simple analysis right now, this -/// could be expanded in the future to use mod/ref information for particular -/// call sites if desired. -static bool AllocaMightEscapeToCalls(AllocaInst *AI) { - // FIXME: do simple 'address taken' analysis. - return true; +/// CanTRE - Scan the specified basic block for alloca instructions. +/// If it contains any that are variable-sized or not in the entry block, +/// returns false. +static bool CanTRE(AllocaInst *AI) { + // Because of PR962, we don't TRE allocas outside the entry block. + + // If this alloca is in the body of the function, or if it is a variable + // sized allocation, we cannot tail call eliminate calls marked 'tail' + // with this mechanism. + BasicBlock *BB = AI->getParent(); + return BB == &BB->getParent()->getEntryBlock() && + isa<ConstantInt>(AI->getArraySize()); } -/// CheckForEscapingAllocas - Scan the specified basic block for alloca -/// instructions. If it contains any that might be accessed by calls, return -/// true. -static bool CheckForEscapingAllocas(BasicBlock *BB, - bool &CannotTCETailMarkedCall) { - bool RetVal = false; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { - RetVal |= AllocaMightEscapeToCalls(AI); - - // If this alloca is in the body of the function, or if it is a variable - // sized allocation, we cannot tail call eliminate calls marked 'tail' - // with this mechanism. 
- if (BB != &BB->getParent()->getEntryBlock() || - !isa<ConstantInt>(AI->getArraySize())) - CannotTCETailMarkedCall = true; - } - return RetVal; -} +namespace { +struct AllocaCaptureTracker : public CaptureTracker { + AllocaCaptureTracker() : Captured(false) {} + + void tooManyUses() LLVM_OVERRIDE { Captured = true; } + + bool shouldExplore(Use *U) LLVM_OVERRIDE { + Value *V = U->getUser(); + if (isa<CallInst>(V) || isa<InvokeInst>(V)) + UsesAlloca.insert(V); + return true; + } + + bool captured(Use *U) LLVM_OVERRIDE { + if (isa<ReturnInst>(U->getUser())) + return false; + Captured = true; + return true; + } + + bool Captured; + SmallPtrSet<const Value *, 16> UsesAlloca; +}; +} // end anonymous namespace bool TailCallElim::runOnFunction(Function &F) { // If this function is a varargs function, we won't be able to PHI the args @@ -168,41 +180,44 @@ bool TailCallElim::runOnFunction(Function &F) { bool TailCallsAreMarkedTail = false; SmallVector<PHINode*, 8> ArgumentPHIs; bool MadeChange = false; - bool FunctionContainsEscapingAllocas = false; - // CannotTCETailMarkedCall - If true, we cannot perform TCE on tail calls + // CanTRETailMarkedCall - If false, we cannot perform TRE on tail calls // marked with the 'tail' attribute, because doing so would cause the stack - // size to increase (real TCE would deallocate variable sized allocas, TCE + // size to increase (real TRE would deallocate variable sized allocas, TRE // doesn't). - bool CannotTCETailMarkedCall = false; - - // Loop over the function, looking for any returning blocks, and keeping track - // of whether this function has any non-trivially used allocas. - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (FunctionContainsEscapingAllocas && CannotTCETailMarkedCall) - break; - - FunctionContainsEscapingAllocas |= - CheckForEscapingAllocas(BB, CannotTCETailMarkedCall); + bool CanTRETailMarkedCall = true; + + // Find calls that can be marked tail. + AllocaCaptureTracker ACT; + for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { + CanTRETailMarkedCall &= CanTRE(AI); + PointerMayBeCaptured(AI, &ACT); + // If any allocas are captured, exit. + if (ACT.Captured) + return false; + } + } } - /// FIXME: The code generator produces really bad code when an 'escaping - /// alloca' is changed from being a static alloca to being a dynamic alloca. - /// Until this is resolved, disable this transformation if that would ever - /// happen. This bug is PR962. - if (FunctionContainsEscapingAllocas) - return false; - - // Second pass, change any tail calls to loops. - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { - bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs,CannotTCETailMarkedCall); - if (!Change && BB->getFirstNonPHIOrDbg() == Ret) - Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, - TailCallsAreMarkedTail, ArgumentPHIs, - CannotTCETailMarkedCall); - MadeChange |= Change; + // Second pass, change any tail recursive calls to loops. + // + // FIXME: The code generator produces really bad code when an 'escaping + // alloca' is changed from being a static alloca to being a dynamic alloca. + // Until this is resolved, disable this transformation if that would ever + // happen. This bug is PR962. 
+ if (ACT.UsesAlloca.empty()) { + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { + bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, + ArgumentPHIs, !CanTRETailMarkedCall); + if (!Change && BB->getFirstNonPHIOrDbg() == Ret) + Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, + TailCallsAreMarkedTail, ArgumentPHIs, + !CanTRETailMarkedCall); + MadeChange |= Change; + } } } @@ -223,16 +238,24 @@ bool TailCallElim::runOnFunction(Function &F) { } } - // Finally, if this function contains no non-escaping allocas, or calls - // setjmp, mark all calls in the function as eligible for tail calls - //(there is no stack memory for them to access). - if (!FunctionContainsEscapingAllocas && !F.callsFunctionThatReturnsTwice()) - for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + // At this point, we know that the function does not have any captured + // allocas. If additionally the function does not call setjmp, mark all calls + // in the function that do not access stack memory with the tail keyword. This + // implies ensuring that there does not exist any path from a call that takes + // in an alloca but does not capture it and the call which we wish to mark + // with "tail". + if (!F.callsFunctionThatReturnsTwice()) { + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { if (CallInst *CI = dyn_cast<CallInst>(I)) { - CI->setTailCall(); - MadeChange = true; + if (!ACT.UsesAlloca.count(CI)) { + CI->setTailCall(); + MadeChange = true; + } } + } + } + } return MadeChange; } @@ -424,7 +447,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail) { // If we are introducing accumulator recursion to eliminate operations after // the call instruction that are both associative and commutative, the initial @@ -600,7 +623,7 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, + SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail) { bool Change = false; @@ -634,10 +657,11 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB, return Change; } -bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, - bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, - bool CannotTailCallElimCallsMarkedTail) { +bool +TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVectorImpl<PHINode *> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail) { CallInst *CI = FindTRECandidate(Ret, CannotTailCallElimCallsMarkedTail); if (!CI) return false; |
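
The StructurizeCFG pass added above registers itself under the name "structurizecfg" and exposes the factory createStructurizeCFGPass(), so once this patch is in the tree it should be selectable directly as: opt -structurizecfg -S in.ll. Below is a minimal sketch of driving it from C++ with the legacy pass manager of this LLVM branch; the header paths and the assumption that the factory is declared in llvm/Transforms/Scalar.h follow how the other scalar passes of this era are exposed and are not shown by the patch itself.

// Sketch only: schedule the new pass over a module with the legacy
// PassManager. The pass declares its LowerSwitch / DominatorTree (and, via
// RegionPass, RegionInfo) requirements in getAnalysisUsage(), so the pass
// manager is expected to pull those in for us.
#include "llvm/IR/Module.h"
#include "llvm/PassManager.h"            // legacy pass manager (pre-3.5 path)
#include "llvm/Transforms/Scalar.h"      // assumed home of the factory

void structurizeModule(llvm::Module &M) {
  llvm::PassManager PM;
  PM.add(llvm::createStructurizeCFGPass());
  PM.run(M);                             // runs on every non-top-level region
}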
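
At the source level, the TailRecursionElimination changes in the second half of this diff keep doing what the pass has always done: a self-recursive call in tail position is replaced by a branch back to (a split copy of) the old entry block, with the formal arguments turned into the PHI nodes collected in ArgumentPHIs. A small illustrative sketch of the before/after shape, not taken from the patch:

// Tail-recursive form: nothing happens after the recursive call, so
// ProcessReturningBlock() can turn it into a loop.
static unsigned sumTo(unsigned N, unsigned Acc) {
  if (N == 0)
    return Acc;
  return sumTo(N - 1, Acc + N);
}

// Roughly what the rewritten function computes: the arguments become
// loop-carried values (argument PHIs in the IR) and the call disappears.
static unsigned sumToLoop(unsigned N, unsigned Acc) {
  for (;;) {
    if (N == 0)
      return Acc;
    unsigned NextN = N - 1;    // new incoming values for the argument PHIs
    unsigned NextAcc = Acc + N;
    N = NextN;
    Acc = NextAcc;
  }
}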
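
The AllocaCaptureTracker in that same hunk separates two facts that the old CheckForEscapingAllocas logic folded into one: Captured is set when an alloca's address genuinely escapes (any capturing use other than a return), which makes runOnFunction() give up immediately, while UsesAlloca merely records every call or invoke the capture walk passes through, and only those calls are later denied the 'tail' marker. A rough source-level illustration of the two outcomes; leak() and finish() are hypothetical names, and the comments describe the patched behaviour as written above:

int finish(void);
void leak(int *p);          // assume it may stash 'p' somewhere: a capture

int withEscapingAlloca(void) {
  int local = 0;
  leak(&local);             // address escapes: ACT.Captured becomes true and
                            // the patched runOnFunction() returns false
                            // before changing anything
  return finish();
}

int withoutAllocas(void) {
  // No allocas, so ACT.UsesAlloca stays empty; provided the function does not
  // call setjmp, every call here (finish) is eligible for the 'tail' marker.
  return finish();
}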