Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar')
26 files changed, 4576 insertions, 1522 deletions
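One of the larger functional changes below is in CodeGenPrepare, which now asks the target whether integer division is slow enough to be worth bypassing and, if so, runs bypassSlowDivision over each block. At the source level the emitted check is roughly the sketch below (illustrative only: the pass works on IR, handles udiv/urem, and takes the bypass widths from the target rather than hard-coding 64-to-32):

    #include <cstdint>

    // Hypothetical source-level picture of the bypass for unsigned division,
    // assuming a target that maps a 64-bit divide onto a cheaper 32-bit one.
    uint64_t div_bypassed(uint64_t a, uint64_t b) {
      // Cheap runtime test: do both operands fit in the narrow width?
      if (((a | b) >> 32) == 0)
        return static_cast<uint32_t>(a) / static_cast<uint32_t>(b);
      return a / b;  // full-width divide on the slow path
    }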
diff --git a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp index a3c426a..123ed0f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -27,6 +27,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/DominatorInternals.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Assembly/Writer.h" @@ -37,12 +38,13 @@ #include "llvm/Support/PatternMatch.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Transforms/Utils/AddrModeMatcher.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/Transforms/Utils/BypassSlowDivision.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -146,9 +148,18 @@ bool CodeGenPrepare::runOnFunction(Function &F) { TLInfo = &getAnalysis<TargetLibraryInfo>(); DT = getAnalysisIfAvailable<DominatorTree>(); PFI = getAnalysisIfAvailable<ProfileInfo>(); - OptSize = F.hasFnAttr(Attribute::OptimizeForSize); + OptSize = F.getFnAttributes().hasAttribute(Attributes::OptimizeForSize); + + /// This optimization identifies DIV instructions that can be + /// profitably bypassed and carried out with a shorter, faster divide. + if (TLI && TLI->isSlowDivBypassed()) { + const DenseMap<unsigned int, unsigned int> &BypassWidths = + TLI->getBypassSlowDivWidths(); + for (Function::iterator I = F.begin(); I != F.end(); I++) + EverMadeChange |= bypassSlowDivision(F, I, BypassWidths); + } - // First pass, eliminate blocks that contain only PHI nodes and an + // Eliminate blocks that contain only PHI nodes and an // unconditional branch. EverMadeChange |= EliminateMostlyEmptyBlocks(F); @@ -160,7 +171,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { bool MadeChange = true; while (MadeChange) { MadeChange = false; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ) { + for (Function::iterator I = F.begin(); I != F.end(); ) { BasicBlock *BB = I++; MadeChange |= OptimizeBlock(*BB); } @@ -215,11 +226,13 @@ bool CodeGenPrepare::EliminateFallThrough(Function &F) { // edge, just collapse it. BasicBlock *SinglePred = BB->getSinglePredecessor(); - if (!SinglePred || SinglePred == BB) continue; + // Don't merge if BB's address is taken. + if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue; BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator()); if (Term && !Term->isConditional()) { Changed = true; + DEBUG(dbgs() << "To merge:\n"<< *SinglePred << "\n\n\n"); // Remember if SinglePred was the entry block of the function. // If so, we will need to move BB back to the entry position. bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); @@ -230,7 +243,6 @@ bool CodeGenPrepare::EliminateFallThrough(Function &F) { // We have erased a block. Update the iterator. I = BB; - DEBUG(dbgs() << "Merged:\n"<< *SinglePred << "\n\n\n"); } } return Changed; @@ -610,7 +622,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // happens. WeakVH IterHandle(CurInstIterator); - replaceAndRecursivelySimplify(CI, RetVal, TLI ? 
TLI->getTargetData() : 0, + replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getDataLayout() : 0, TLInfo, ModifiedDT ? 0 : DT); // If the iterator instruction was recursively deleted, start over at the @@ -634,8 +646,8 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // From here on out we're working with named functions. if (CI->getCalledFunction() == 0) return false; - // We'll need TargetData from here on out. - const TargetData *TD = TLI ? TLI->getTargetData() : 0; + // We'll need DataLayout from here on out. + const DataLayout *TD = TLI ? TLI->getDataLayout() : 0; if (!TD) return false; // Lower all default uses of _chk calls. This is very similar @@ -649,6 +661,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { /// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return /// instructions to the predecessor to enable tail call optimizations. The /// case it is currently looking for is: +/// @code /// bb0: /// %tmp0 = tail call i32 @f0() /// br label %return @@ -661,9 +674,11 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { /// return: /// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ] /// ret i32 %retval +/// @endcode /// /// => /// +/// @code /// bb0: /// %tmp0 = tail call i32 @f0() /// ret i32 %tmp0 @@ -673,7 +688,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { /// bb2: /// %tmp2 = tail call i32 @f2() /// ret i32 %tmp2 -/// +/// @endcode bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { if (!TLI) return false; @@ -699,7 +714,8 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { // See llvm::isInTailCallPosition(). const Function *F = BB->getParent(); Attributes CallerRetAttr = F->getAttributes().getRetAttributes(); - if ((CallerRetAttr & Attribute::ZExt) || (CallerRetAttr & Attribute::SExt)) + if (CallerRetAttr.hasAttribute(Attributes::ZExt) || + CallerRetAttr.hasAttribute(Attributes::SExt)) return false; // Make sure there are no instructions between the PHI and return, or that the @@ -757,7 +773,10 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { // Conservatively require the attributes of the call to match those of the // return. Ignore noalias because it doesn't affect the call sequence. Attributes CalleeRetAttr = CS.getAttributes().getRetAttributes(); - if ((CalleeRetAttr ^ CallerRetAttr) & ~Attribute::NoAlias) + if (AttrBuilder(CalleeRetAttr). + removeAttribute(Attributes::NoAlias) != + AttrBuilder(CallerRetAttr). + removeAttribute(Attributes::NoAlias)) continue; // Make sure the call instruction is followed by an unconditional branch to @@ -774,7 +793,7 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { } // If we eliminated all predecessors of the block, delete the block now. 
- if (Changed && pred_begin(BB) == pred_end(BB)) + if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB)) BB->eraseFromParent(); return Changed; @@ -914,7 +933,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); Type *IntPtrTy = - TLI->getTargetData()->getIntPtrType(AccessTy->getContext()); + TLI->getDataLayout()->getIntPtrType(AccessTy->getContext()); Value *Result = 0; @@ -988,7 +1007,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, WeakVH IterHandle(CurInstIterator); BasicBlock *BB = CurInstIterator->getParent(); - RecursivelyDeleteTriviallyDeadInstructions(Repl); + RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo); if (IterHandle != CurInstIterator) { // If the iterator instruction was recursively deleted, start over at the @@ -1174,17 +1193,32 @@ static bool isFormingBranchFromSelectProfitable(SelectInst *SI) { } +/// If we have a SelectInst that will likely profit from branch prediction, +/// turn it into a branch. bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) { - // If we have a SelectInst that will likely profit from branch prediction, - // turn it into a branch. - if (DisableSelectToBranch || OptSize || !TLI || - !TLI->isPredictableSelectExpensive()) - return false; + bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); - if (!SI->getCondition()->getType()->isIntegerTy(1) || - !isFormingBranchFromSelectProfitable(SI)) + // Can we convert the 'select' to CF ? + if (DisableSelectToBranch || OptSize || !TLI || VectorCond) return false; + TargetLowering::SelectSupportKind SelectKind; + if (VectorCond) + SelectKind = TargetLowering::VectorMaskSelect; + else if (SI->getType()->isVectorTy()) + SelectKind = TargetLowering::ScalarCondVectorVal; + else + SelectKind = TargetLowering::ScalarValSelect; + + // Do we have efficient codegen support for this kind of 'selects' ? + if (TLI->isSelectSupported(SelectKind)) { + // We have efficient codegen support for the select instruction. + // Check if it is profitable to keep this 'select'. + if (!TLI->isPredictableSelectExpensive() || + !isFormingBranchFromSelectProfitable(SI)) + return false; + } + ModifiedDT = true; // First, we split the block containing the select into 2 blocks. 
@@ -1302,7 +1336,7 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { bool MadeChange = false; CurInstIterator = BB.begin(); - for (BasicBlock::iterator E = BB.end(); CurInstIterator != E; ) + while (CurInstIterator != BB.end()) MadeChange |= OptimizeInst(CurInstIterator++); return MadeChange; diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp index 5430f62..369720b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp @@ -24,7 +24,7 @@ #include "llvm/Constant.h" #include "llvm/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Support/InstIterator.h" #include "llvm/ADT/Statistic.h" @@ -67,7 +67,7 @@ bool ConstantPropagation::runOnFunction(Function &F) { WorkList.insert(&*i); } bool Changed = false; - TargetData *TD = getAnalysisIfAvailable<TargetData>(); + DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); while (!WorkList.empty()) { diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 9b0aadb..3ec6f3d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -235,6 +235,11 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { // This case never fires - remove it. CI.getCaseSuccessor()->removePredecessor(BB); SI->removeCase(CI); // Does not invalidate the iterator. + + // The condition can be modified by removePredecessor's PHI simplification + // logic. + Cond = SI->getCondition(); + ++NumDeadCases; Changed = true; } else if (State == LazyValueInfo::True) { diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp index 8dbcc23..a2e074f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp @@ -22,6 +22,7 @@ #include "llvm/Instruction.h" #include "llvm/Pass.h" #include "llvm/Support/InstIterator.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/ADT/Statistic.h" using namespace llvm; @@ -38,10 +39,11 @@ namespace { initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); } virtual bool runOnBasicBlock(BasicBlock &BB) { + TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); bool Changed = false; for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { Instruction *Inst = DI++; - if (isInstructionTriviallyDead(Inst)) { + if (isInstructionTriviallyDead(Inst, TLI)) { Inst->eraseFromParent(); Changed = true; ++DIEEliminated; @@ -87,6 +89,8 @@ char DCE::ID = 0; INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false) bool DCE::runOnFunction(Function &F) { + TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + // Start out with all of the instructions in the worklist... std::vector<Instruction*> WorkList; for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) @@ -101,7 +105,7 @@ bool DCE::runOnFunction(Function &F) { Instruction *I = WorkList.back(); WorkList.pop_back(); - if (isInstructionTriviallyDead(I)) { // If the instruction is dead. + if (isInstructionTriviallyDead(I, TLI)) { // If the instruction is dead. 
// Loop over all of the values that the instruction uses, if there are // instructions being used, add them to the worklist, because they might // go dead after this one is removed. @@ -114,13 +118,8 @@ bool DCE::runOnFunction(Function &F) { I->eraseFromParent(); // Remove the instruction from the worklist if it still exists in it. - for (std::vector<Instruction*>::iterator WI = WorkList.begin(); - WI != WorkList.end(); ) { - if (*WI == I) - WI = WorkList.erase(WI); - else - ++WI; - } + WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I), + WorkList.end()); MadeChange = true; ++DCEEliminated; diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 8b1283f..736cc05 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -29,7 +29,8 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/Debug.h" #include "llvm/ADT/SetVector.h" @@ -45,6 +46,7 @@ namespace { AliasAnalysis *AA; MemoryDependenceAnalysis *MD; DominatorTree *DT; + const TargetLibraryInfo *TLI; static char ID; // Pass identification, replacement for typeid DSE() : FunctionPass(ID), AA(0), MD(0), DT(0) { @@ -55,6 +57,7 @@ namespace { AA = &getAnalysis<AliasAnalysis>(); MD = &getAnalysis<MemoryDependenceAnalysis>(); DT = &getAnalysis<DominatorTree>(); + TLI = AA->getTargetLibraryInfo(); bool Changed = false; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) @@ -106,6 +109,7 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } /// static void DeleteDeadInstruction(Instruction *I, MemoryDependenceAnalysis &MD, + const TargetLibraryInfo *TLI, SmallSetVector<Value*, 16> *ValueSet = 0) { SmallVector<Instruction*, 32> NowDeadInsts; @@ -130,7 +134,7 @@ static void DeleteDeadInstruction(Instruction *I, if (!Op->use_empty()) continue; if (Instruction *OpI = dyn_cast<Instruction>(Op)) - if (isInstructionTriviallyDead(OpI)) + if (isInstructionTriviallyDead(OpI, TLI)) NowDeadInsts.push_back(OpI); } @@ -143,7 +147,7 @@ static void DeleteDeadInstruction(Instruction *I, /// hasMemoryWrite - Does this instruction write some memory? This only returns /// true for things that we can analyze with other helpers below. 
-static bool hasMemoryWrite(Instruction *I) { +static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { if (isa<StoreInst>(I)) return true; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { @@ -158,6 +162,26 @@ static bool hasMemoryWrite(Instruction *I) { return true; } } + if (CallSite CS = I) { + if (Function *F = CS.getCalledFunction()) { + if (TLI && TLI->has(LibFunc::strcpy) && + F->getName() == TLI->getName(LibFunc::strcpy)) { + return true; + } + if (TLI && TLI->has(LibFunc::strncpy) && + F->getName() == TLI->getName(LibFunc::strncpy)) { + return true; + } + if (TLI && TLI->has(LibFunc::strcat) && + F->getName() == TLI->getName(LibFunc::strcat)) { + return true; + } + if (TLI && TLI->has(LibFunc::strncat) && + F->getName() == TLI->getName(LibFunc::strncat)) { + return true; + } + } + } return false; } @@ -175,7 +199,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // If we don't have target data around, an unknown size in Location means // that we should use the size of the pointee type. This isn't valid for // memset/memcpy, which writes more than an i8. - if (Loc.Size == AliasAnalysis::UnknownSize && AA.getTargetData() == 0) + if (Loc.Size == AliasAnalysis::UnknownSize && AA.getDataLayout() == 0) return AliasAnalysis::Location(); return Loc; } @@ -189,7 +213,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // If we don't have target data around, an unknown size in Location means // that we should use the size of the pointee type. This isn't valid for // init.trampoline, which writes more than an i8. - if (AA.getTargetData() == 0) return AliasAnalysis::Location(); + if (AA.getDataLayout() == 0) return AliasAnalysis::Location(); // FIXME: We don't know the size of the trampoline, so we can't really // handle it here. @@ -205,7 +229,8 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { /// instruction if any. static AliasAnalysis::Location getLocForRead(Instruction *Inst, AliasAnalysis &AA) { - assert(hasMemoryWrite(Inst) && "Unknown instruction case"); + assert(hasMemoryWrite(Inst, AA.getTargetLibraryInfo()) && + "Unknown instruction case"); // The only instructions that both read and write are the mem transfer // instructions (memcpy/memmove). @@ -222,23 +247,29 @@ static bool isRemovable(Instruction *I) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->isUnordered(); - IntrinsicInst *II = cast<IntrinsicInst>(I); - switch (II->getIntrinsicID()) { - default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate"); - case Intrinsic::lifetime_end: - // Never remove dead lifetime_end's, e.g. because it is followed by a - // free. - return false; - case Intrinsic::init_trampoline: - // Always safe to remove init_trampoline. - return true; + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate"); + case Intrinsic::lifetime_end: + // Never remove dead lifetime_end's, e.g. because it is followed by a + // free. + return false; + case Intrinsic::init_trampoline: + // Always safe to remove init_trampoline. + return true; - case Intrinsic::memset: - case Intrinsic::memmove: - case Intrinsic::memcpy: - // Don't remove volatile memory intrinsics. - return !cast<MemIntrinsic>(II)->isVolatile(); + case Intrinsic::memset: + case Intrinsic::memmove: + case Intrinsic::memcpy: + // Don't remove volatile memory intrinsics. 
+ return !cast<MemIntrinsic>(II)->isVolatile(); + } } + + if (CallSite CS = I) + return CS.getInstruction()->use_empty(); + + return false; } @@ -249,14 +280,19 @@ static bool isShortenable(Instruction *I) { if (isa<StoreInst>(I)) return false; - IntrinsicInst *II = cast<IntrinsicInst>(I); - switch (II->getIntrinsicID()) { - default: return false; - case Intrinsic::memset: - case Intrinsic::memcpy: - // Do shorten memory intrinsics. - return true; + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: return false; + case Intrinsic::memset: + case Intrinsic::memcpy: + // Do shorten memory intrinsics. + return true; + } } + + // Don't shorten libcalls calls for now. + + return false; } /// getStoredPointerOperand - Return the pointer that is being written to. @@ -266,17 +302,23 @@ static Value *getStoredPointerOperand(Instruction *I) { if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) return MI->getDest(); - IntrinsicInst *II = cast<IntrinsicInst>(I); - switch (II->getIntrinsicID()) { - default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::init_trampoline: - return II->getArgOperand(0); + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::init_trampoline: + return II->getArgOperand(0); + } } + + CallSite CS = I; + // All the supported functions so far happen to have dest as their first + // argument. + return CS.getArgument(0); } static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) { uint64_t Size; - if (getObjectSize(V, Size, AA.getTargetData())) + if (getObjectSize(V, Size, AA.getDataLayout(), AA.getTargetLibraryInfo())) return Size; return AliasAnalysis::UnknownSize; } @@ -309,10 +351,10 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // comparison. if (Later.Size == AliasAnalysis::UnknownSize || Earlier.Size == AliasAnalysis::UnknownSize) { - // If we have no TargetData information around, then the size of the store + // If we have no DataLayout information around, then the size of the store // is inferrable from the pointee type. If they are the same type, then // we know that the store is safe. - if (AA.getTargetData() == 0 && + if (AA.getDataLayout() == 0 && Later.Ptr->getType() == Earlier.Ptr->getType()) return OverwriteComplete; @@ -328,13 +370,13 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // larger than the earlier one. if (Later.Size == AliasAnalysis::UnknownSize || Earlier.Size == AliasAnalysis::UnknownSize || - AA.getTargetData() == 0) + AA.getDataLayout() == 0) return OverwriteUnknown; // Check to see if the later store is to the entire object (either a global, // an alloca, or a byval argument). If so, then it clearly overwrites any // other store to the same object. - const TargetData &TD = *AA.getTargetData(); + const DataLayout &TD = *AA.getDataLayout(); const Value *UO1 = GetUnderlyingObject(P1, &TD), *UO2 = GetUnderlyingObject(P2, &TD); @@ -454,13 +496,13 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { Instruction *Inst = BBI++; // Handle 'free' calls specially. - if (CallInst *F = isFreeCall(Inst)) { + if (CallInst *F = isFreeCall(Inst, TLI)) { MadeChange |= HandleFree(F); continue; } // If we find something that writes memory, get its memory dependence. 
- if (!hasMemoryWrite(Inst)) + if (!hasMemoryWrite(Inst, TLI)) continue; MemDepResult InstDep = MD->getDependency(Inst); @@ -483,7 +525,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // in case we need it. WeakVH NextInst(BBI); - DeleteDeadInstruction(SI, *MD); + DeleteDeadInstruction(SI, *MD, TLI); if (NextInst == 0) // Next instruction deleted. BBI = BB.begin(); @@ -530,7 +572,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { << *DepWrite << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. - DeleteDeadInstruction(DepWrite, *MD); + DeleteDeadInstruction(DepWrite, *MD, TLI); ++NumFastStores; MadeChange = true; @@ -627,7 +669,7 @@ bool DSE::HandleFree(CallInst *F) { MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt, BB); while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); - if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency)) + if (!hasMemoryWrite(Dependency, TLI) || !isRemovable(Dependency)) break; Value *DepPointer = @@ -640,7 +682,7 @@ bool DSE::HandleFree(CallInst *F) { Instruction *Next = llvm::next(BasicBlock::iterator(Dependency)); // DCE instructions only used to calculate that store - DeleteDeadInstruction(Dependency, *MD); + DeleteDeadInstruction(Dependency, *MD, TLI); ++NumFastStores; MadeChange = true; @@ -659,6 +701,22 @@ bool DSE::HandleFree(CallInst *F) { return MadeChange; } +namespace { + struct CouldRef { + typedef Value *argument_type; + const CallSite CS; + AliasAnalysis *AA; + + bool operator()(Value *I) { + // See if the call site touches the value. + AliasAnalysis::ModRefResult A = + AA->getModRefInfo(CS, I, getPointerSize(I, *AA)); + + return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref; + } + }; +} + /// handleEndBlock - Remove dead stores to stack-allocated locations in the /// function end block. Ex: /// %A = alloca i32 @@ -680,7 +738,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Okay, so these are dead heap objects, but if the pointer never escapes // then it's leaked by this function anyways. - else if (isAllocLikeFn(I) && !PointerMayBeCaptured(I, true, true)) + else if (isAllocLikeFn(I, TLI) && !PointerMayBeCaptured(I, true, true)) DeadStackObjects.insert(I); } @@ -696,7 +754,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { --BBI; // If we find a store, check to see if it points into a dead stack value. - if (hasMemoryWrite(BBI) && isRemovable(BBI)) { + if (hasMemoryWrite(BBI, TLI) && isRemovable(BBI)) { // See through pointer-to-pointer bitcasts SmallVector<Value *, 4> Pointers; GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers); @@ -724,7 +782,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { dbgs() << '\n'); // DCE instructions only used to calculate that store. - DeleteDeadInstruction(Dead, *MD, &DeadStackObjects); + DeleteDeadInstruction(Dead, *MD, TLI, &DeadStackObjects); ++NumFastStores; MadeChange = true; continue; @@ -732,9 +790,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } // Remove any dead non-memory-mutating instructions. - if (isInstructionTriviallyDead(BBI)) { + if (isInstructionTriviallyDead(BBI, TLI)) { Instruction *Inst = BBI++; - DeleteDeadInstruction(Inst, *MD, &DeadStackObjects); + DeleteDeadInstruction(Inst, *MD, TLI, &DeadStackObjects); ++NumFastOther; MadeChange = true; continue; @@ -750,7 +808,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (CallSite CS = cast<Value>(BBI)) { // Remove allocation function calls from the list of dead stack objects; // there can't be any references before the definition. 
- if (isAllocLikeFn(BBI)) + if (isAllocLikeFn(BBI, TLI)) DeadStackObjects.remove(BBI); // If this call does not access memory, it can't be loading any of our @@ -760,20 +818,8 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // If the call might load from any of our allocas, then any store above // the call is live. - SmallVector<Value*, 8> LiveAllocas; - for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(), - E = DeadStackObjects.end(); I != E; ++I) { - // See if the call site touches it. - AliasAnalysis::ModRefResult A = - AA->getModRefInfo(CS, *I, getPointerSize(*I, *AA)); - - if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref) - LiveAllocas.push_back(*I); - } - - for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), - E = LiveAllocas.end(); I != E; ++I) - DeadStackObjects.remove(*I); + CouldRef Pred = { CS, AA }; + DeadStackObjects.remove_if(Pred); // If all of the allocas were clobbered by the call then we're not going // to find anything else to process. @@ -816,6 +862,20 @@ bool DSE::handleEndBlock(BasicBlock &BB) { return MadeChange; } +namespace { + struct CouldAlias { + typedef Value *argument_type; + const AliasAnalysis::Location &LoadedLoc; + AliasAnalysis *AA; + + bool operator()(Value *I) { + // See if the loaded location could alias the stack location. + AliasAnalysis::Location StackLoc(I, getPointerSize(I, *AA)); + return !AA->isNoAlias(StackLoc, LoadedLoc); + } + }; +} + /// RemoveAccessedObjects - Check to see if the specified location may alias any /// of the stack objects in the DeadStackObjects set. If so, they become live /// because the location is being loaded. @@ -834,16 +894,7 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, return; } - SmallVector<Value*, 16> NowLive; - for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(), - E = DeadStackObjects.end(); I != E; ++I) { - // See if the loaded location could alias the stack location. - AliasAnalysis::Location StackLoc(*I, getPointerSize(*I, *AA)); - if (!AA->isNoAlias(StackLoc, LoadedLoc)) - NowLive.push_back(*I); - } - - for (SmallVector<Value*, 16>::iterator I = NowLive.begin(), E = NowLive.end(); - I != E; ++I) - DeadStackObjects.remove(*I); + // Remove objects that could alias LoadedLoc. + CouldAlias Pred = { LoadedLoc, AA }; + DeadStackObjects.remove_if(Pred); } diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 9759549..101009d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -18,11 +18,12 @@ #include "llvm/Pass.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/Debug.h" #include "llvm/Support/RecyclingAllocator.h" +#include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/Statistic.h" #include <deque> @@ -90,35 +91,56 @@ template<> struct DenseMapInfo<SimpleValue> { unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { Instruction *Inst = Val.Inst; - // Hash in all of the operands as pointers. 
- unsigned Res = 0; - for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) - Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); + if (BinaryOperator* BinOp = dyn_cast<BinaryOperator>(Inst)) { + Value *LHS = BinOp->getOperand(0); + Value *RHS = BinOp->getOperand(1); + if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1)) + std::swap(LHS, RHS); + + if (isa<OverflowingBinaryOperator>(BinOp)) { + // Hash the overflow behavior + unsigned Overflow = + BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap | + BinOp->hasNoUnsignedWrap() * OverflowingBinaryOperator::NoUnsignedWrap; + return hash_combine(BinOp->getOpcode(), Overflow, LHS, RHS); + } - if (CastInst *CI = dyn_cast<CastInst>(Inst)) - Res ^= getHash(CI->getType()); - else if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) - Res ^= CI->getPredicate(); - else if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst)) { - for (ExtractValueInst::idx_iterator I = EVI->idx_begin(), - E = EVI->idx_end(); I != E; ++I) - Res ^= *I; - } else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst)) { - for (InsertValueInst::idx_iterator I = IVI->idx_begin(), - E = IVI->idx_end(); I != E; ++I) - Res ^= *I; - } else { - // nothing extra to hash in. - assert((isa<CallInst>(Inst) || - isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) || - isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || - isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst)) && - "Invalid/unknown instruction"); + return hash_combine(BinOp->getOpcode(), LHS, RHS); } + if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) { + Value *LHS = CI->getOperand(0); + Value *RHS = CI->getOperand(1); + CmpInst::Predicate Pred = CI->getPredicate(); + if (Inst->getOperand(0) > Inst->getOperand(1)) { + std::swap(LHS, RHS); + Pred = CI->getSwappedPredicate(); + } + return hash_combine(Inst->getOpcode(), Pred, LHS, RHS); + } + + if (CastInst *CI = dyn_cast<CastInst>(Inst)) + return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0)); + + if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst)) + return hash_combine(EVI->getOpcode(), EVI->getOperand(0), + hash_combine_range(EVI->idx_begin(), EVI->idx_end())); + + if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst)) + return hash_combine(IVI->getOpcode(), IVI->getOperand(0), + IVI->getOperand(1), + hash_combine_range(IVI->idx_begin(), IVI->idx_end())); + + assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) || + isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) || + isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) || + isa<ShuffleVectorInst>(Inst)) && "Invalid/unknown instruction"); + // Mix in the opcode. 
- return (Res << 1) ^ Inst->getOpcode(); + return hash_combine(Inst->getOpcode(), + hash_combine_range(Inst->value_op_begin(), + Inst->value_op_end())); } bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { @@ -128,7 +150,41 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { return LHSI == RHSI; if (LHSI->getOpcode() != RHSI->getOpcode()) return false; - return LHSI->isIdenticalTo(RHSI); + if (LHSI->isIdenticalTo(RHSI)) return true; + + // If we're not strictly identical, we still might be a commutable instruction + if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) { + if (!LHSBinOp->isCommutative()) + return false; + + assert(isa<BinaryOperator>(RHSI) + && "same opcode, but different instruction type?"); + BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI); + + // Check overflow attributes + if (isa<OverflowingBinaryOperator>(LHSBinOp)) { + assert(isa<OverflowingBinaryOperator>(RHSBinOp) + && "same opcode, but different operator type?"); + if (LHSBinOp->hasNoUnsignedWrap() != RHSBinOp->hasNoUnsignedWrap() || + LHSBinOp->hasNoSignedWrap() != RHSBinOp->hasNoSignedWrap()) + return false; + } + + // Commuted equality + return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) && + LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0); + } + if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) { + assert(isa<CmpInst>(RHSI) + && "same opcode, but different instruction type?"); + CmpInst *RHSCmp = cast<CmpInst>(RHSI); + // Commuted equality + return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) && + LHSCmp->getOperand(1) == RHSCmp->getOperand(0) && + LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate(); + } + + return false; } //===----------------------------------------------------------------------===// @@ -216,7 +272,7 @@ namespace { /// cases. class EarlyCSE : public FunctionPass { public: - const TargetData *TD; + const DataLayout *TD; const TargetLibraryInfo *TLI; DominatorTree *DT; typedef RecyclingAllocator<BumpPtrAllocator, @@ -274,7 +330,8 @@ private: CallScope(*availableCalls) {} private: - NodeScope(const NodeScope&); // DO NOT IMPLEMENT + NodeScope(const NodeScope&) LLVM_DELETED_FUNCTION; + void operator=(const NodeScope&) LLVM_DELETED_FUNCTION; ScopedHTType::ScopeTy Scope; LoadHTType::ScopeTy LoadScope; @@ -313,7 +370,8 @@ private: void process() { Processed = true; } private: - StackNode(const StackNode&); // DO NOT IMPLEMENT + StackNode(const StackNode&) LLVM_DELETED_FUNCTION; + void operator=(const StackNode&) LLVM_DELETED_FUNCTION; // Members. unsigned CurrentGeneration; @@ -374,7 +432,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { Instruction *Inst = I++; // Dead instructions should just be removed. 
- if (isInstructionTriviallyDead(Inst)) { + if (isInstructionTriviallyDead(Inst, TLI)) { DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n'); Inst->eraseFromParent(); Changed = true; @@ -506,7 +564,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { bool EarlyCSE::runOnFunction(Function &F) { std::deque<StackNode *> nodesToProcess; - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); DT = &getAnalysis<DominatorTree>(); diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index 4822fd0..f003e06 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -41,7 +41,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/PatternMatch.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -271,16 +271,16 @@ void ValueTable::add(Value *V, uint32_t num) { valueNumbering.insert(std::make_pair(V, num)); } -uint32_t ValueTable::lookup_or_add_call(CallInst* C) { +uint32_t ValueTable::lookup_or_add_call(CallInst *C) { if (AA->doesNotAccessMemory(C)) { Expression exp = create_expression(C); - uint32_t& e = expressionNumbering[exp]; + uint32_t &e = expressionNumbering[exp]; if (!e) e = nextValueNumber++; valueNumbering[C] = e; return e; } else if (AA->onlyReadsMemory(C)) { Expression exp = create_expression(C); - uint32_t& e = expressionNumbering[exp]; + uint32_t &e = expressionNumbering[exp]; if (!e) { e = nextValueNumber++; valueNumbering[C] = e; @@ -413,7 +413,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) { case Instruction::LShr: case Instruction::AShr: case Instruction::And: - case Instruction::Or : + case Instruction::Or: case Instruction::Xor: case Instruction::ICmp: case Instruction::FCmp: @@ -503,7 +503,7 @@ namespace { bool NoLoads; MemoryDependenceAnalysis *MD; DominatorTree *DT; - const TargetData *TD; + const DataLayout *TD; const TargetLibraryInfo *TLI; ValueTable VN; @@ -535,7 +535,7 @@ namespace { InstrsToErase.push_back(I); } - const TargetData *getTargetData() const { return TD; } + const DataLayout *getDataLayout() const { return TD; } DominatorTree &getDominatorTree() const { return *DT; } AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); } MemoryDependenceAnalysis &getMemDep() const { return *MD; } @@ -632,6 +632,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void GVN::dump(DenseMap<uint32_t, Value*>& d) { errs() << "{\n"; for (DenseMap<uint32_t, Value*>::iterator I = d.begin(), @@ -641,6 +642,7 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) { } errs() << "}\n"; } +#endif /// IsValueFullyAvailableInBlock - Return true if we can prove that the value /// we're analyzing is fully available in the specified block. As we go, keep @@ -728,7 +730,7 @@ SpeculationFailure: /// CoerceAvailableValueToLoadType will succeed. static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy, - const TargetData &TD) { + const DataLayout &TD) { // If the loaded or stored value is an first class array or struct, don't try // to transform them. We need to be able to bitcast to integer. 
if (LoadTy->isStructTy() || LoadTy->isArrayTy() || @@ -744,7 +746,6 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, return true; } - /// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and /// then a load from a must-aliased pointer of a different type, try to coerce /// the stored value. LoadedTy is the type of the load we want to replace and @@ -754,7 +755,7 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, Instruction *InsertPt, - const TargetData &TD) { + const DataLayout &TD) { if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD)) return 0; @@ -767,24 +768,25 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, // If the store and reload are the same size, we can always reuse it. if (StoreSize == LoadSize) { // Pointer to Pointer -> use bitcast. - if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy()) + if (StoredValTy->getScalarType()->isPointerTy() && + LoadedTy->getScalarType()->isPointerTy()) return new BitCastInst(StoredVal, LoadedTy, "", InsertPt); // Convert source pointers to integers, which can be bitcast. - if (StoredValTy->isPointerTy()) { - StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); + if (StoredValTy->getScalarType()->isPointerTy()) { + StoredValTy = TD.getIntPtrType(StoredValTy); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } Type *TypeToCastTo = LoadedTy; - if (TypeToCastTo->isPointerTy()) - TypeToCastTo = TD.getIntPtrType(StoredValTy->getContext()); + if (TypeToCastTo->getScalarType()->isPointerTy()) + TypeToCastTo = TD.getIntPtrType(TypeToCastTo); if (StoredValTy != TypeToCastTo) StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt); // Cast to pointer if the load needs a pointer type. - if (LoadedTy->isPointerTy()) + if (LoadedTy->getScalarType()->isPointerTy()) StoredVal = new IntToPtrInst(StoredVal, LoadedTy, "", InsertPt); return StoredVal; @@ -796,8 +798,8 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail"); // Convert source pointers to integers, which can be manipulated. - if (StoredValTy->isPointerTy()) { - StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); + if (StoredValTy->getScalarType()->isPointerTy()) { + StoredValTy = TD.getIntPtrType(StoredValTy); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } @@ -822,7 +824,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, return StoredVal; // If the result is a pointer, inttoptr. - if (LoadedTy->isPointerTy()) + if (LoadedTy->getScalarType()->isPointerTy()) return new IntToPtrInst(StoredVal, LoadedTy, "inttoptr", InsertPt); // Otherwise, bitcast. @@ -840,7 +842,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, Value *WritePtr, uint64_t WriteSizeInBits, - const TargetData &TD) { + const DataLayout &TD) { // If the loaded or stored value is a first class array or struct, don't try // to transform them. We need to be able to bitcast to integer. if (LoadTy->isStructTy() || LoadTy->isArrayTy()) @@ -913,7 +915,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, /// memdep query of a load that ends up being a clobbering store. 
static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, StoreInst *DepSI, - const TargetData &TD) { + const DataLayout &TD) { // Cannot handle reading from store of first-class aggregate yet. if (DepSI->getValueOperand()->getType()->isStructTy() || DepSI->getValueOperand()->getType()->isArrayTy()) @@ -929,7 +931,7 @@ static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, /// memdep query of a load that ends up being clobbered by another load. See if /// the other load can feed into the second load. static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, - LoadInst *DepLI, const TargetData &TD){ + LoadInst *DepLI, const DataLayout &TD){ // Cannot handle reading from store of first-class aggregate yet. if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy()) return -1; @@ -957,7 +959,7 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, MemIntrinsic *MI, - const TargetData &TD) { + const DataLayout &TD) { // If the mem operation is a non-constant size, we can't handle it. ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength()); if (SizeCst == 0) return -1; @@ -1007,7 +1009,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, /// before we give up. static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy, - Instruction *InsertPt, const TargetData &TD){ + Instruction *InsertPt, const DataLayout &TD){ LLVMContext &Ctx = SrcVal->getType()->getContext(); uint64_t StoreSize = (TD.getTypeSizeInBits(SrcVal->getType()) + 7) / 8; @@ -1017,8 +1019,9 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, // Compute which bits of the stored value are being used by the load. Convert // to an integer type to start with. - if (SrcVal->getType()->isPointerTy()) - SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx)); + if (SrcVal->getType()->getScalarType()->isPointerTy()) + SrcVal = Builder.CreatePtrToInt(SrcVal, + TD.getIntPtrType(SrcVal->getType())); if (!SrcVal->getType()->isIntegerTy()) SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8)); @@ -1046,7 +1049,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy, Instruction *InsertPt, GVN &gvn) { - const TargetData &TD = *gvn.getTargetData(); + const DataLayout &TD = *gvn.getDataLayout(); // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to // widen SrcVal out to a larger load. unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType()); @@ -1105,7 +1108,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, /// memdep query of a load that ends up being a clobbering mem intrinsic. 
static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, Type *LoadTy, Instruction *InsertPt, - const TargetData &TD){ + const DataLayout &TD){ LLVMContext &Ctx = LoadTy->getContext(); uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8; @@ -1229,7 +1232,7 @@ struct AvailableValueInBlock { if (isSimpleValue()) { Res = getSimpleValue(); if (Res->getType() != LoadTy) { - const TargetData *TD = gvn.getTargetData(); + const DataLayout *TD = gvn.getDataLayout(); assert(TD && "Need target data to handle type mismatch case"); Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), *TD); @@ -1251,7 +1254,7 @@ struct AvailableValueInBlock { << *Res << '\n' << "\n\n\n"); } } else { - const TargetData *TD = gvn.getTargetData(); + const DataLayout *TD = gvn.getDataLayout(); assert(TD && "Need target data to handle type mismatch case"); Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy, BB->getTerminator(), *TD); @@ -1299,7 +1302,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); // If new PHI nodes were created, notify alias analysis. - if (V->getType()->isPointerTy()) { + if (V->getType()->getScalarType()->isPointerTy()) { AliasAnalysis *AA = gvn.getAliasAnalysis(); for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) @@ -1436,7 +1439,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { Instruction *DepInst = DepInfo.getInst(); // Loading the allocation -> undef. - if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst) || + if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) || // Loading immediately after lifetime begin -> undef. isLifetimeStart(DepInst)) { ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, @@ -1496,7 +1499,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { if (isa<PHINode>(V)) V->takeName(LI); - if (V->getType()->isPointerTy()) + if (V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(LI); ++NumGVNLoad; @@ -1728,7 +1731,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { LI->replaceAllUsesWith(V); if (isa<PHINode>(V)) V->takeName(LI); - if (V->getType()->isPointerTy()) + if (V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(LI); ++NumPRELoad; @@ -1855,7 +1858,7 @@ bool GVN::processLoad(LoadInst *L) { // Replace the load! L->replaceAllUsesWith(AvailVal); - if (AvailVal->getType()->isPointerTy()) + if (AvailVal->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(AvailVal); markInstructionForDeletion(L); ++NumGVNLoad; @@ -1912,7 +1915,7 @@ bool GVN::processLoad(LoadInst *L) { // Remove it! L->replaceAllUsesWith(StoredVal); - if (StoredVal->getType()->isPointerTy()) + if (StoredVal->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(StoredVal); markInstructionForDeletion(L); ++NumGVNLoad; @@ -1941,7 +1944,7 @@ bool GVN::processLoad(LoadInst *L) { // Remove it! patchAndReplaceAllUsesWith(AvailableVal, L); - if (DepLI->getType()->isPointerTy()) + if (DepLI->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(DepLI); markInstructionForDeletion(L); ++NumGVNLoad; @@ -1951,7 +1954,7 @@ bool GVN::processLoad(LoadInst *L) { // If this load really doesn't depend on anything, then we must be loading an // undef value. This can happen when loading for a fresh allocation with no // intervening stores, for example. 
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst)) { + if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) { L->replaceAllUsesWith(UndefValue::get(L->getType())); markInstructionForDeletion(L); ++NumGVNLoad; @@ -2182,7 +2185,7 @@ bool GVN::processInstruction(Instruction *I) { // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) { I->replaceAllUsesWith(V); - if (MD && V->getType()->isPointerTy()) + if (MD && V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(I); ++NumGVNSimpl; @@ -2231,12 +2234,20 @@ bool GVN::processInstruction(Instruction *I) { Value *SwitchCond = SI->getCondition(); BasicBlock *Parent = SI->getParent(); bool Changed = false; + + // Remember how many outgoing edges there are to every successor. + SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges; + for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i) + ++SwitchEdges[SI->getSuccessor(i)]; + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { BasicBlock *Dst = i.getCaseSuccessor(); - BasicBlockEdge E(Parent, Dst); - if (E.isSingleEdge()) + // If there is only a single edge, propagate the case value into it. + if (SwitchEdges.lookup(Dst) == 1) { + BasicBlockEdge E(Parent, Dst); Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E); + } } return Changed; } @@ -2274,7 +2285,7 @@ bool GVN::processInstruction(Instruction *I) { // Remove it! patchAndReplaceAllUsesWith(repl, I); - if (MD && repl->getType()->isPointerTy()) + if (MD && repl->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(repl); markInstructionForDeletion(I); return true; @@ -2285,7 +2296,7 @@ bool GVN::runOnFunction(Function& F) { if (!NoLoads) MD = &getAnalysis<MemoryDependenceAnalysis>(); DT = &getAnalysis<DominatorTree>(); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); VN.setMemDep(MD); @@ -2522,7 +2533,7 @@ bool GVN::performPRE(Function &F) { addToLeaderTable(ValNo, Phi, CurrentBlock); Phi->setDebugLoc(CurInst->getDebugLoc()); CurInst->replaceAllUsesWith(Phi); - if (Phi->getType()->isPointerTy()) { + if (Phi->getType()->getScalarType()->isPointerTy()) { // Because we have added a PHI-use of the pointer value, it has now // "escaped" from alias analysis' perspective. We need to inform // AA of this. 
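The GVN hunks above are largely the TargetData-to-DataLayout rename plus vector-of-pointer fixes, but the functions they touch (GetStoreValueForLoad and friends) implement a simple idea: a wider store can satisfy a narrower load at a known byte offset by shifting and truncating the stored bits. A minimal sketch of that computation, assuming a little-endian target (the real code flips the shift amount for big-endian); the function name is illustrative, not LLVM's:

    #include <cstdint>

    // Value a 4-byte load at byte_offset (0..4) would observe from an
    // 8-byte store of `stored`, little-endian layout assumed.
    uint32_t forward_from_store(uint64_t stored, unsigned byte_offset) {
      return static_cast<uint32_t>(stored >> (8 * byte_offset));
    }

    // e.g. forward_from_store(0x1122334455667788ULL, 4) == 0x11223344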
diff --git a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp index b36a3cb..6301aad 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp @@ -62,7 +62,7 @@ #include "llvm/Intrinsics.h" #include "llvm/Module.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/ADT/Statistic.h" @@ -98,9 +98,9 @@ namespace { } struct GlobalCmp { - const TargetData *TD; + const DataLayout *TD; - GlobalCmp(const TargetData *td) : TD(td) { } + GlobalCmp(const DataLayout *td) : TD(td) { } bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) { Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType(); @@ -119,7 +119,7 @@ INITIALIZE_PASS(GlobalMerge, "global-merge", bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst) const { - const TargetData *TD = TLI->getTargetData(); + const DataLayout *TD = TLI->getDataLayout(); // FIXME: Infer the maximum possible offset depending on the actual users // (these max offsets are different for the users inside Thumb or ARM @@ -170,7 +170,7 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, bool GlobalMerge::doInitialization(Module &M) { SmallVector<GlobalVariable*, 16> Globals, ConstGlobals, BSSGlobals; - const TargetData *TD = TLI->getTargetData(); + const DataLayout *TD = TLI->getDataLayout(); unsigned MaxOffset = TLI->getMaximalGlobalOffset(); bool Changed = false; diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 37f8bdf..310fd61 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -43,7 +43,8 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -67,7 +68,8 @@ namespace { LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; - TargetData *TD; + DataLayout *TD; + TargetLibraryInfo *TLI; SmallVector<WeakVH, 16> DeadInsts; bool Changed; @@ -218,8 +220,6 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def, /// ConvertToSInt - Convert APF to an integer, if possible. static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { bool isExact = false; - if (&APF.getSemantics() == &APFloat::PPCDoubleDouble) - return false; // See if we can convert this to an int64_t uint64_t UIntVal; if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero, @@ -414,11 +414,11 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // new comparison. NewCompare->takeName(Compare); Compare->replaceAllUsesWith(NewCompare); - RecursivelyDeleteTriviallyDeadInstructions(Compare); + RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI); // Delete the old floating point increment. Incr->replaceAllUsesWith(UndefValue::get(Incr->getType())); - RecursivelyDeleteTriviallyDeadInstructions(Incr); + RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI); // If the FP induction variable still has uses, this is because something else // in the loop uses its value. 
In order to canonicalize the induction @@ -431,7 +431,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", PN->getParent()->getFirstInsertionPt()); PN->replaceAllUsesWith(Conv); - RecursivelyDeleteTriviallyDeadInstructions(PN); + RecursivelyDeleteTriviallyDeadInstructions(PN, TLI); } Changed = true; } @@ -549,15 +549,17 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { PN->setIncomingValue(i, ExitVal); - // If this instruction is dead now, delete it. - RecursivelyDeleteTriviallyDeadInstructions(Inst); + // If this instruction is dead now, delete it. Don't do it now to avoid + // invalidating iterators. + if (isInstructionTriviallyDead(Inst, TLI)) + DeadInsts.push_back(Inst); if (NumPreds == 1) { // Completely replace a single-pred PHI. This is safe, because the // NewVal won't be variant in the loop, so we don't need an LCSSA phi // node anymore. PN->replaceAllUsesWith(ExitVal); - RecursivelyDeleteTriviallyDeadInstructions(PN); + PN->eraseFromParent(); } } if (NumPreds != 1) { @@ -595,13 +597,13 @@ namespace { class WideIVVisitor : public IVVisitor { ScalarEvolution *SE; - const TargetData *TD; + const DataLayout *TD; public: WideIVInfo WI; WideIVVisitor(PHINode *NarrowIV, ScalarEvolution *SCEV, - const TargetData *TData) : + const DataLayout *TData) : SE(SCEV), TD(TData) { WI.NarrowIV = NarrowIV; } // Implement the interface used by simplifyUsersOfIV. @@ -1259,8 +1261,13 @@ static bool needsLFTR(Loop *L, DominatorTree *DT) { if (!Phi) return true; + // Do LFTR if PHI node is defined in the loop, but is *not* a counter. + int Idx = Phi->getBasicBlockIndex(L->getLoopLatch()); + if (Idx < 0) + return true; + // Do LFTR if the exit condition's IV is *not* a simple counter. - Value *IncV = Phi->getIncomingValueForBlock(L->getLoopLatch()); + Value *IncV = Phi->getIncomingValue(Idx); return Phi != getLoopPhiForCounter(IncV, L, DT); } @@ -1339,7 +1346,7 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { /// could at least handle constant BECounts. static PHINode * FindLoopCounter(Loop *L, const SCEV *BECount, - ScalarEvolution *SE, DominatorTree *DT, const TargetData *TD) { + ScalarEvolution *SE, DominatorTree *DT, const DataLayout *TD) { uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType()); Value *Cond = @@ -1696,7 +1703,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); DT = &getAnalysis<DominatorTree>(); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); + TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); DeadInsts.clear(); Changed = false; @@ -1763,7 +1771,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { while (!DeadInsts.empty()) if (Instruction *Inst = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) - RecursivelyDeleteTriviallyDeadInstructions(Inst); + RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI); // The Rewriter may not be used from this point on. @@ -1772,7 +1780,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { SinkUnusedInvariants(L); // Clean up dead instructions. - Changed |= DeleteDeadPHIs(L->getHeader()); + Changed |= DeleteDeadPHIs(L->getHeader(), TLI); // Check a post-condition. 
assert(L->isLCSSAForm(*DT) && "Indvars did not leave the loop in lcssa form!"); diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index dd42c59..e7ffa09 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -23,7 +23,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -75,7 +75,7 @@ namespace { /// revectored to the false side of the second if. /// class JumpThreading : public FunctionPass { - TargetData *TD; + DataLayout *TD; TargetLibraryInfo *TLI; LazyValueInfo *LVI; #ifdef NDEBUG @@ -147,7 +147,7 @@ FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); } /// bool JumpThreading::runOnFunction(Function &F) { DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); LVI = &getAnalysis<LazyValueInfo>(); @@ -1455,7 +1455,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // At this point, the IR is fully up to date and consistent. Do a quick scan // over the new instructions and zap any that are constants or dead. This // frequently happens because of phi translation. - SimplifyInstructionsInBlock(NewBB, TD); + SimplifyInstructionsInBlock(NewBB, TD, TLI); // Threaded an edge! ++NumThreads; diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index 0192e92..4818437 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -46,7 +46,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" @@ -100,7 +100,7 @@ namespace { LoopInfo *LI; // Current LoopInfo DominatorTree *DT; // Dominator Tree for the current Loop. - TargetData *TD; // TargetData for constant folding. + DataLayout *TD; // DataLayout for constant folding. TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding. // State that is updated as we process loops. @@ -108,6 +108,9 @@ namespace { BasicBlock *Preheader; // The preheader block of the current loop... Loop *CurLoop; // The current loop we are working on... AliasSetTracker *CurAST; // AliasSet information for the current loop... + bool MayThrow; // The current loop contains an instruction which + // may throw, thus preventing code motion of + // instructions with side effects. DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap; /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info. 
@@ -204,7 +207,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { AA = &getAnalysis<AliasAnalysis>(); DT = &getAnalysis<DominatorTree>(); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); CurAST = new AliasSetTracker(*AA); @@ -240,6 +243,15 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { CurAST->add(*BB); // Incorporate the specified basic block } + MayThrow = false; + // TODO: We've already searched for instructions which may throw in subloops. + // We may want to reuse this information. + for (Loop::block_iterator BB = L->block_begin(), BBE = L->block_end(); + (BB != BBE) && !MayThrow ; ++BB) + for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); + (I != E) && !MayThrow; ++I) + MayThrow |= I->mayThrow(); + // We want to visit all of the instructions in this loop... that are not parts // of our subloops (they have already had their invariants hoisted out of // their loop, into this loop, so there is no need to process the BODIES of @@ -307,7 +319,7 @@ void LICM::SinkRegion(DomTreeNode *N) { // If the instruction is dead, we would try to sink it because it isn't used // in the loop, instead, just delete it. - if (isInstructionTriviallyDead(&I)) { + if (isInstructionTriviallyDead(&I, TLI)) { DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n'); ++II; CurAST->deleteValue(&I); @@ -418,17 +430,22 @@ bool LICM::canSinkOrHoistInst(Instruction &I) { if (!FoundMod) return true; } - // FIXME: This should use mod/ref information to see if we can hoist or sink - // the call. + // FIXME: This should use mod/ref information to see if we can hoist or + // sink the call. return false; } - // Otherwise these instructions are hoistable/sinkable - return isa<BinaryOperator>(I) || isa<CastInst>(I) || - isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) || - isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || - isa<ShuffleVectorInst>(I); + // Only these instructions are hoistable/sinkable. + bool HoistableKind = (isa<BinaryOperator>(I) || isa<CastInst>(I) || + isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || + isa<CmpInst>(I) || isa<InsertElementInst>(I) || + isa<ExtractElementInst>(I) || + isa<ShuffleVectorInst>(I)); + if (!HoistableKind) + return false; + + return isSafeToExecuteUnconditionally(I); } /// isNotUsedInLoop - Return true if the only users of this instruction are @@ -604,6 +621,12 @@ bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { } bool LICM::isGuaranteedToExecute(Instruction &Inst) { + + // Somewhere in this loop there is an instruction which may throw and make us + // exit the loop. + if (MayThrow) + return false; + // Otherwise we have to check to make sure that the instruction dominates all // of the exit blocks. If it doesn't, then there is a path out of the loop // which does not execute this instruction, so we can't hoist it. 
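The LICM change above precomputes a single MayThrow flag with one scan over the loop body, and isGuaranteedToExecute now returns false immediately whenever that flag is set: a potentially throwing instruction can leave the loop early, so nothing after it is guaranteed to run on every iteration. A minimal standalone C++ sketch of that conservatism follows; InstSketch, loopMayThrow and isGuaranteedToExecuteSketch are illustrative stand-ins, not LLVM APIs.

#include <cstdio>
#include <vector>

struct InstSketch {
  bool MayThrow; // models Instruction::mayThrow()
};

// One up-front scan over the whole loop body, as in LICM::runOnLoop above.
static bool loopMayThrow(const std::vector<InstSketch> &LoopBody) {
  for (unsigned i = 0; i != LoopBody.size(); ++i)
    if (LoopBody[i].MayThrow)
      return true;
  return false;
}

// Models the new early return in LICM::isGuaranteedToExecute: once the loop
// contains a potentially throwing instruction, no instruction can be assumed
// to run on every iteration, so the dominance-based argument is skipped and
// speculation-sensitive code motion is declined.
static bool isGuaranteedToExecuteSketch(bool LoopMayThrow,
                                        bool DominatesAllExitBlocks) {
  if (LoopMayThrow)
    return false;
  return DominatesAllExitBlocks;
}

int main() {
  std::vector<InstSketch> Body(2);
  Body[0].MayThrow = false;
  Body[1].MayThrow = true; // e.g. a call that may unwind
  bool MayThrow = loopMayThrow(Body);
  std::printf("guaranteed to execute: %d\n",
              isGuaranteedToExecuteSketch(MayThrow, true));
  return 0;
}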
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index ac1082c..a44e798 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -54,7 +54,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -65,7 +65,7 @@ STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); namespace { class LoopIdiomRecognize : public LoopPass { Loop *CurLoop; - const TargetData *TD; + const DataLayout *TD; DominatorTree *DT; ScalarEvolution *SE; TargetLibraryInfo *TLI; @@ -132,7 +132,8 @@ Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); } /// and zero out all the operands of this instruction. If any of them become /// dead, delete them and the computation tree that feeds them. /// -static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) { +static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE, + const TargetLibraryInfo *TLI) { SmallVector<Instruction*, 32> NowDeadInsts; NowDeadInsts.push_back(I); @@ -153,7 +154,7 @@ static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) { if (!Op->use_empty()) continue; if (Instruction *OpI = dyn_cast<Instruction>(Op)) - if (isInstructionTriviallyDead(OpI)) + if (isInstructionTriviallyDead(OpI, TLI)) NowDeadInsts.push_back(OpI); } @@ -164,15 +165,21 @@ static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) { /// deleteIfDeadInstruction - If the specified value is a dead instruction, /// delete it and any recursively used instructions. -static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE) { +static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE, + const TargetLibraryInfo *TLI) { if (Instruction *I = dyn_cast<Instruction>(V)) - if (isInstructionTriviallyDead(I)) - deleteDeadInstruction(I, SE); + if (isInstructionTriviallyDead(I, TLI)) + deleteDeadInstruction(I, SE, TLI); } bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { CurLoop = L; + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!L->getLoopPreheader()) + return false; + // Disable loop idiom recognition if the function's name is a common idiom. StringRef Name = L->getHeader()->getParent()->getName(); if (Name == "memset" || Name == "memcpy") @@ -192,7 +199,7 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { return false; // We require target data for now. - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); if (TD == 0) return false; DT = &getAnalysis<DominatorTree>(); @@ -401,7 +408,7 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, /// /// Note that we don't ever attempt to use memset_pattern8 or 4, because these /// just replicate their input array and then pass on to memset_pattern16. -static Constant *getMemSetPatternValue(Value *V, const TargetData &TD) { +static Constant *getMemSetPatternValue(Value *V, const DataLayout &TD) { // If the value isn't a constant, we can't promote it to being in a constant // array. We could theoretically do a store to an alloca or something, but // that doesn't seem worthwhile. 
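As the comment above getMemSetPatternValue notes, the pass only ever targets memset_pattern16, since the 8- and 4-byte variants would just replicate their input and forward to the 16-byte routine. A small standalone sketch of that replication step, assuming a plain byte buffer rather than the ConstantArray the pass actually builds:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Replicate a 1/2/4/8-byte store value into the 16-byte pattern that
// memset_pattern16 expects. Returns false for sizes that do not divide 16.
static bool buildPattern16(const void *Value, unsigned Size,
                           uint8_t Pattern[16]) {
  if (Size == 0 || 16 % Size != 0)
    return false;
  for (unsigned Off = 0; Off != 16; Off += Size)
    std::memcpy(Pattern + Off, Value, Size);
  return true;
}

int main() {
  uint32_t V = 0xDEADBEEFu;
  uint8_t Pattern[16];
  if (buildPattern16(&V, sizeof(V), Pattern))
    for (unsigned i = 0; i != 16; ++i)
      std::printf("%02x ", Pattern[i]);
  std::printf("\n");
  return 0;
}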
@@ -490,7 +497,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){ Expander.clear(); // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(BasePtr, *SE); + deleteIfDeadInstruction(BasePtr, *SE, TLI); return false; } @@ -538,7 +545,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // Okay, the memset has been formed. Zap the original store and anything that // feeds into it. - deleteDeadInstruction(TheStore, *SE); + deleteDeadInstruction(TheStore, *SE, TLI); ++NumMemSet; return true; } @@ -579,7 +586,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, getAnalysis<AliasAnalysis>(), SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(StoreBasePtr, *SE); + deleteIfDeadInstruction(StoreBasePtr, *SE, TLI); return false; } @@ -594,8 +601,8 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, StoreSize, getAnalysis<AliasAnalysis>(), SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. - deleteIfDeadInstruction(LoadBasePtr, *SE); - deleteIfDeadInstruction(StoreBasePtr, *SE); + deleteIfDeadInstruction(LoadBasePtr, *SE, TLI); + deleteIfDeadInstruction(StoreBasePtr, *SE, TLI); return false; } @@ -628,7 +635,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // Okay, the memset has been formed. Zap the original store and anything that // feeds into it. - deleteDeadInstruction(SI, *SE); + deleteDeadInstruction(SI, *SE, TLI); ++NumMemCpy; return true; } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index 982400c..558f62e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -18,7 +18,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" @@ -66,7 +66,7 @@ Pass *llvm::createLoopInstSimplifyPass() { bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>(); LoopInfo *LI = &getAnalysis<LoopInfo>(); - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SmallVector<BasicBlock*, 8> ExitBlocks; @@ -120,7 +120,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { ++NumSimplified; } } - LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I); + LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI); if (IsSubloopHeader && !isa<PHINode>(I)) break; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp index 7eeb152..abe07aa 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -24,6 +24,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/ADT/Statistic.h" using namespace llvm; @@ -256,6 +257,7 @@ bool LoopRotate::rotateLoop(Loop *L) { return 
false; BasicBlock *OrigHeader = L->getHeader(); + BasicBlock *OrigLatch = L->getLoopLatch(); BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); if (BI == 0 || BI->isUnconditional()) @@ -267,13 +269,9 @@ bool LoopRotate::rotateLoop(Loop *L) { if (!L->isLoopExiting(OrigHeader)) return false; - // Updating PHInodes in loops with multiple exits adds complexity. - // Keep it simple, and restrict loop rotation to loops with one exit only. - // In future, lift this restriction and support for multiple exits if - // required. - SmallVector<BasicBlock*, 8> ExitBlocks; - L->getExitBlocks(ExitBlocks); - if (ExitBlocks.size() > 1) + // If the loop latch already contains a branch that leaves the loop then the + // loop is already rotated. + if (OrigLatch == 0 || L->isLoopExiting(OrigLatch)) return false; // Check size of original header and reject loop if it is very big. @@ -286,11 +284,10 @@ bool LoopRotate::rotateLoop(Loop *L) { // Now, this loop is suitable for rotation. BasicBlock *OrigPreheader = L->getLoopPreheader(); - BasicBlock *OrigLatch = L->getLoopLatch(); // If the loop could not be converted to canonical form, it must have an // indirectbr in it, just give up. - if (OrigPreheader == 0 || OrigLatch == 0) + if (OrigPreheader == 0) return false; // Anything ScalarEvolution may know about this loop or the PHI nodes @@ -298,6 +295,8 @@ bool LoopRotate::rotateLoop(Loop *L) { if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) SE->forgetLoop(L); + DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); + // Find new Loop header. NewHeader is a Header's one and only successor // that is inside loop. Header's other successor is outside the // loop. Otherwise loop is not suitable for rotation. @@ -408,10 +407,19 @@ bool LoopRotate::rotateLoop(Loop *L) { // Update DominatorTree to reflect the CFG change we just made. Then split // edges as necessary to preserve LoopSimplify form. if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) { - // Since OrigPreheader now has the conditional branch to Exit block, it is - // the dominator of Exit. - DT->changeImmediateDominator(Exit, OrigPreheader); - DT->changeImmediateDominator(NewHeader, OrigPreheader); + // Everything that was dominated by the old loop header is now dominated + // by the original loop preheader. Conceptually the header was merged + // into the preheader, even though we reuse the actual block as a new + // loop latch. + DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); + SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), + OrigHeaderNode->end()); + DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader); + for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) + DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode); + + assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode); + assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode); // Update OrigHeader to be dominated by the new header block. DT->changeImmediateDominator(OrigHeader, OrigLatch); @@ -440,6 +448,35 @@ bool LoopRotate::rotateLoop(Loop *L) { // Update OrigHeader to be dominated by the new header block. DT->changeImmediateDominator(NewHeader, OrigPreheader); DT->changeImmediateDominator(OrigHeader, OrigLatch); + + // Brute force incremental dominator tree update. Call + // findNearestCommonDominator on all CFG predecessors of each child of the + // original header. 
+ DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader); + SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(), + OrigHeaderNode->end()); + bool Changed; + do { + Changed = false; + for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) { + DomTreeNode *Node = HeaderChildren[I]; + BasicBlock *BB = Node->getBlock(); + + pred_iterator PI = pred_begin(BB); + BasicBlock *NearestDom = *PI; + for (pred_iterator PE = pred_end(BB); PI != PE; ++PI) + NearestDom = DT->findNearestCommonDominator(NearestDom, *PI); + + // Remember if this changes the DomTree. + if (Node->getIDom()->getBlock() != NearestDom) { + DT->changeImmediateDominator(BB, NearestDom); + Changed = true; + } + } + + // If the dominator changed, this may have an effect on other + // predecessors, continue until we reach a fixpoint. + } while (Changed); } } @@ -452,6 +489,8 @@ bool LoopRotate::rotateLoop(Loop *L) { // emitted code isn't too gross in this common case. MergeBlockIntoPredecessor(OrigHeader, this); + DEBUG(dbgs() << "LoopRotation: into "; L->dump()); + ++NumRotated; return true; } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index b14a713..958348d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -54,7 +54,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "loop-reduce" -#include "llvm/Transforms/Scalar.h" +#include "llvm/AddressingMode.h" #include "llvm/Constants.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" @@ -64,6 +64,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Assembly/Writer.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/SmallBitVector.h" @@ -121,9 +122,11 @@ void RegSortData::print(raw_ostream &OS) const { OS << "[NumUses=" << UsedByIndices.count() << ']'; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void RegSortData::dump() const { print(errs()); errs() << '\n'; } +#endif namespace { @@ -223,7 +226,7 @@ namespace { struct Formula { /// AM - This is used to represent complex addressing, as well as other kinds /// of interesting uses. - TargetLowering::AddrMode AM; + AddrMode AM; /// BaseRegs - The list of "base" registers for this use. When this is /// non-empty, AM.HasBaseReg should be set to true. @@ -414,9 +417,11 @@ void Formula::print(raw_ostream &OS) const { } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void Formula::dump() const { print(errs()); errs() << '\n'; } +#endif /// isAddRecSExtable - Return true if the given addrec can be sign-extended /// without changing its value. 
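The Formula::AM member above now uses the standalone AddrMode type instead of TargetLowering::AddrMode; later hunks fill in its BaseGV, BaseOffs, HasBaseReg and Scale fields and ask isLegalUse whether the resulting shape folds into a single target addressing mode. A worked decomposition using a stand-in struct (not the LLVM type), assuming the base + offset + scale*index form those fields describe:

#include <cstdint>
#include <cstdio>

// Stand-in for the AddrMode fields used in this patch:
// BaseGV + BaseOffs + BaseReg + Scale * IndexReg.
struct AddrModeSketch {
  int64_t BaseOffs;  // constant displacement
  bool HasBaseReg;   // a (non-constant) base register is present
  int64_t Scale;     // 0 means no scaled index register
};

int main() {
  // p + 8 + 4*i, e.g. a 4-byte element access starting 8 bytes past p:
  // base register p, displacement 8, index register i scaled by 4.
  AddrModeSketch AM;
  AM.BaseOffs = 8;
  AM.HasBaseReg = true;
  AM.Scale = 4;
  std::printf("[base] + %lld + %lld*[index]\n",
              (long long)AM.BaseOffs, (long long)AM.Scale);
  return 0;
}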
@@ -738,7 +743,8 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { bool Changed = false; while (!DeadInsts.empty()) { - Instruction *I = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()); + Value *V = DeadInsts.pop_back_val(); + Instruction *I = dyn_cast_or_null<Instruction>(V); if (I == 0 || !isInstructionTriviallyDead(I)) continue; @@ -973,9 +979,11 @@ void Cost::print(raw_ostream &OS) const { OS << ", plus " << SetupCost << " setup cost"; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void Cost::dump() const { print(errs()); errs() << '\n'; } +#endif namespace { @@ -1059,9 +1067,11 @@ void LSRFixup::print(raw_ostream &OS) const { OS << ", Offset=" << Offset; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRFixup::dump() const { print(errs()); errs() << '\n'; } +#endif namespace { @@ -1251,14 +1261,16 @@ void LSRUse::print(raw_ostream &OS) const { OS << ", widest fixup type: " << *WidestFixupType; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRUse::dump() const { print(errs()); errs() << '\n'; } +#endif /// isLegalUse - Test whether the use described by AM is "legal", meaning it can /// be completely folded into the user instruction at isel time. This includes /// address-mode folding and special icmp tricks. -static bool isLegalUse(const TargetLowering::AddrMode &AM, +static bool isLegalUse(const AddrMode &AM, LSRUse::KindType Kind, Type *AccessTy, const TargetLowering *TLI) { switch (Kind) { @@ -1315,7 +1327,7 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, llvm_unreachable("Invalid LSRUse Kind!"); } -static bool isLegalUse(TargetLowering::AddrMode AM, +static bool isLegalUse(AddrMode AM, int64_t MinOffset, int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, const TargetLowering *TLI) { @@ -1346,7 +1358,7 @@ static bool isAlwaysFoldable(int64_t BaseOffs, // Conservatively, create an address with an immediate and a // base and a scale. - TargetLowering::AddrMode AM; + AddrMode AM; AM.BaseOffs = BaseOffs; AM.BaseGV = BaseGV; AM.HasBaseReg = HasBaseReg; @@ -1384,7 +1396,7 @@ static bool isAlwaysFoldable(const SCEV *S, // Conservatively, create an address with an immediate and a // base and a scale. - TargetLowering::AddrMode AM; + AddrMode AM; AM.BaseOffs = BaseOffs; AM.BaseGV = BaseGV; AM.HasBaseReg = HasBaseReg; @@ -2009,7 +2021,7 @@ LSRInstance::OptimizeLoopTermCond() { goto decline_post_inc; // Check for possible scaled-address reuse. Type *AccessTy = getAccessType(UI->getUser()); - TargetLowering::AddrMode AM; + AddrMode AM; AM.Scale = C->getSExtValue(); if (TLI->isLegalAddressingMode(AM, AccessTy)) goto decline_post_inc; @@ -3435,9 +3447,11 @@ void WorkItem::print(raw_ostream &OS) const { << " , add offset " << Imm; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void WorkItem::dump() const { print(errs()); errs() << '\n'; } +#endif /// GenerateCrossUseConstantOffsets - Look for registers which are a constant /// distance apart and try to form reuse opportunities between them. @@ -4451,17 +4465,21 @@ void LSRInstance::RewriteForPHI(PHINode *PN, SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs); NewBB = NewBBs[0]; } - - // If PN is outside of the loop and BB is in the loop, we want to - // move the block to be immediately before the PHI block, not - // immediately after BB. - if (L->contains(BB) && !L->contains(PN)) - NewBB->moveBefore(PN->getParent()); - - // Splitting the edge can reduce the number of PHI entries we have. 
- e = PN->getNumIncomingValues(); - BB = NewBB; - i = PN->getBasicBlockIndex(BB); + // If NewBB==NULL, then SplitCriticalEdge refused to split because all + // phi predecessors are identical. The simple thing to do is skip + // splitting in this case rather than complicate the API. + if (NewBB) { + // If PN is outside of the loop and BB is in the loop, we want to + // move the block to be immediately before the PHI block, not + // immediately after BB. + if (L->contains(BB) && !L->contains(PN)) + NewBB->moveBefore(PN->getParent()); + + // Splitting the edge can reduce the number of PHI entries we have. + e = PN->getNumIncomingValues(); + BB = NewBB; + i = PN->getBasicBlockIndex(BB); + } } } @@ -4730,9 +4748,11 @@ void LSRInstance::print(raw_ostream &OS) const { print_uses(OS); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LSRInstance::dump() const { print(errs()); errs() << '\n'; } +#endif namespace { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 09a186f..0d781ac 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -22,7 +22,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/UnrollLoop.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include <climits> using namespace llvm; @@ -113,7 +113,7 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { /// ApproximateLoopSize - Approximate the size of the loop. static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, - const TargetData *TD) { + const DataLayout *TD) { CodeMetrics Metrics; for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) @@ -145,7 +145,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // not user specified. unsigned Threshold = CurrentThreshold; if (!UserThreshold && - Header->getParent()->hasFnAttr(Attribute::OptimizeForSize)) + Header->getParent()->getFnAttributes(). + hasAttribute(Attributes::OptimizeForSize)) Threshold = OptSizeUnrollThreshold; // Find trip count and trip multiple if count is not available @@ -178,7 +179,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Enforce the threshold. if (Threshold != NoThreshold) { - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); unsigned NumInlineCandidates; unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, TD); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index 58f7739..047b43e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -638,7 +638,8 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { // Check to see if it would be profitable to unswitch current loop. // Do not do non-trivial unswitch while optimizing for size. - if (OptimizeForSize || F->hasFnAttr(Attribute::OptimizeForSize)) + if (OptimizeForSize || + F->getFnAttributes().hasAttribute(Attributes::OptimizeForSize)) return false; UnswitchNontrivialCondition(LoopCond, Val, currentLoop); @@ -906,13 +907,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, /// specified. 
static void RemoveFromWorklist(Instruction *I, std::vector<Instruction*> &Worklist) { - std::vector<Instruction*>::iterator WI = std::find(Worklist.begin(), - Worklist.end(), I); - while (WI != Worklist.end()) { - unsigned Offset = WI-Worklist.begin(); - Worklist.erase(WI); - WI = std::find(Worklist.begin()+Offset, Worklist.end(), I); - } + + Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), I), + Worklist.end()); } /// ReplaceUsesOfWith - When we find that I really equals V, remove I from the diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 2a5ee33..517657cf 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -27,7 +27,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" #include <list> @@ -38,8 +38,8 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred"); STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); STATISTIC(NumCpyToSet, "Number of memcpys converted to memset"); -static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, - bool &VariableIdxFound, const TargetData &TD){ +static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, + bool &VariableIdxFound, const DataLayout &TD){ // Skip over the first indices. gep_type_iterator GTI = gep_type_begin(GEP); for (unsigned i = 1; i != Idx; ++i, ++GTI) @@ -72,11 +72,11 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, /// constant offset, and return that constant offset. For example, Ptr1 might /// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8. static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, - const TargetData &TD) { + const DataLayout &TD) { Ptr1 = Ptr1->stripPointerCasts(); Ptr2 = Ptr2->stripPointerCasts(); - GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1); - GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2); + GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1); + GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2); bool VariableIdxFound = false; @@ -141,12 +141,12 @@ struct MemsetRange { /// TheStores - The actual stores that make up this range. SmallVector<Instruction*, 16> TheStores; - bool isProfitableToUseMemset(const TargetData &TD) const; + bool isProfitableToUseMemset(const DataLayout &TD) const; }; } // end anon namespace -bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { +bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const { // If we found more than 4 stores to merge or 16 bytes, use memset. if (TheStores.size() >= 4 || End-Start >= 16) return true; @@ -192,9 +192,9 @@ class MemsetRanges { /// because each element is relatively large and expensive to copy. 
std::list<MemsetRange> Ranges; typedef std::list<MemsetRange>::iterator range_iterator; - const TargetData &TD; + const DataLayout &TD; public: - MemsetRanges(const TargetData &td) : TD(td) {} + MemsetRanges(const DataLayout &td) : TD(td) {} typedef std::list<MemsetRange>::const_iterator const_iterator; const_iterator begin() const { return Ranges.begin(); } @@ -302,7 +302,7 @@ namespace { class MemCpyOpt : public FunctionPass { MemoryDependenceAnalysis *MD; TargetLibraryInfo *TLI; - const TargetData *TD; + const DataLayout *TD; public: static char ID; // Pass identification, replacement for typeid MemCpyOpt() : FunctionPass(ID) { @@ -332,7 +332,7 @@ namespace { bool processMemCpy(MemCpyInst *M); bool processMemMove(MemMoveInst *M); bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc, - uint64_t cpyLen, CallInst *C); + uint64_t cpyLen, unsigned cpyAlign, CallInst *C); bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, uint64_t MSize); bool processByValArgument(CallSite CS, unsigned ArgNo); @@ -509,10 +509,18 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { } if (C) { + unsigned storeAlign = SI->getAlignment(); + if (!storeAlign) + storeAlign = TD->getABITypeAlignment(SI->getOperand(0)->getType()); + unsigned loadAlign = LI->getAlignment(); + if (!loadAlign) + loadAlign = TD->getABITypeAlignment(LI->getType()); + bool changed = performCallSlotOptzn(LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), - TD->getTypeStoreSize(SI->getOperand(0)->getType()), C); + TD->getTypeStoreSize(SI->getOperand(0)->getType()), + std::min(storeAlign, loadAlign), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); @@ -559,7 +567,8 @@ bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { /// the call write its result directly into the destination of the memcpy. bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, Value *cpyDest, Value *cpySrc, - uint64_t cpyLen, CallInst *C) { + uint64_t cpyLen, unsigned cpyAlign, + CallInst *C) { // The general transformation to keep in mind is // // call @func(..., src, ...) @@ -625,6 +634,16 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return false; } + // Check that dest points to memory that is at least as aligned as src. + unsigned srcAlign = srcAlloca->getAlignment(); + if (!srcAlign) + srcAlign = TD->getABITypeAlignment(srcAlloca->getAllocatedType()); + bool isDestSufficientlyAligned = srcAlign <= cpyAlign; + // If dest is not aligned enough and we can't increase its alignment then + // bail out. + if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest)) + return false; + // Check that src is not accessed except via the call and the memcpy. This // guarantees that it holds only undefined values when passed in (so the final // memcpy can be dropped), that it is not read or written between the call and @@ -673,20 +692,26 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, bool changedArgument = false; for (unsigned i = 0; i < CS.arg_size(); ++i) if (CS.getArgument(i)->stripPointerCasts() == cpySrc) { - if (cpySrc->getType() != cpyDest->getType()) - cpyDest = CastInst::CreatePointerCast(cpyDest, cpySrc->getType(), - cpyDest->getName(), C); + Value *Dest = cpySrc->getType() == cpyDest->getType() ? 
cpyDest + : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(), + cpyDest->getName(), C); changedArgument = true; - if (CS.getArgument(i)->getType() == cpyDest->getType()) - CS.setArgument(i, cpyDest); + if (CS.getArgument(i)->getType() == Dest->getType()) + CS.setArgument(i, Dest); else - CS.setArgument(i, CastInst::CreatePointerCast(cpyDest, - CS.getArgument(i)->getType(), cpyDest->getName(), C)); + CS.setArgument(i, CastInst::CreatePointerCast(Dest, + CS.getArgument(i)->getType(), Dest->getName(), C)); } if (!changedArgument) return false; + // If the destination wasn't sufficiently aligned then increase its alignment. + if (!isDestSufficientlyAligned) { + assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!"); + cast<AllocaInst>(cpyDest)->setAlignment(srcAlign); + } + // Drop any cached information about the call, because we may have changed // its dependence information by changing its parameter. MD->removeInstruction(C); @@ -813,7 +838,8 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { if (DepInfo.isClobber()) { if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { if (performCallSlotOptzn(M, M->getDest(), M->getSource(), - CopySize->getZExtValue(), C)) { + CopySize->getZExtValue(), M->getAlignment(), + C)) { MD->removeInstruction(M); M->eraseFromParent(); return true; @@ -974,7 +1000,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { bool MemCpyOpt::runOnFunction(Function &F) { bool MadeChange = false; MD = &getAnalysis<MemoryDependenceAnalysis>(); - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); TLI = &getAnalysis<TargetLibraryInfo>(); // If we don't have at least memset and memcpy, there is little point of doing diff --git a/contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp b/contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp index 3222f20..dfdf505 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp @@ -29,6 +29,7 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "objc-arc" +#include "llvm/Support/raw_ostream.h" #include "llvm/Support/CommandLine.h" #include "llvm/ADT/DenseMap.h" using namespace llvm; @@ -1120,9 +1121,8 @@ namespace { bool relatedSelect(const SelectInst *A, const Value *B); bool relatedPHI(const PHINode *A, const Value *B); - // Do not implement. - void operator=(const ProvenanceAnalysis &); - ProvenanceAnalysis(const ProvenanceAnalysis &); + void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; + ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION; public: ProvenanceAnalysis() {} @@ -1236,16 +1236,19 @@ bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B) { // An ObjC-Identified object can't alias a load if it is never locally stored. if (AIsIdentified) { + // Check for an obvious escape. + if (isa<LoadInst>(B)) + return isStoredObjCPointer(A); if (BIsIdentified) { - // If both pointers have provenance, they can be directly compared. - if (A != B) - return false; - } else { - if (isa<LoadInst>(B)) - return isStoredObjCPointer(A); + // Check for an obvious escape. + if (isa<LoadInst>(A)) + return isStoredObjCPointer(B); + // Both pointers are identified and escapes aren't an evident problem. + return false; } - } else { - if (BIsIdentified && isa<LoadInst>(A)) + } else if (BIsIdentified) { + // Check for an obvious escape. 
+ if (isa<LoadInst>(A)) return isStoredObjCPointer(B); } @@ -1381,9 +1384,6 @@ namespace { /// PtrState - This class summarizes several per-pointer runtime properties /// which are propagated through the flow graph. class PtrState { - /// NestCount - The known minimum level of retain+release nesting. - unsigned NestCount; - /// KnownPositiveRefCount - True if the reference count is known to /// be incremented. bool KnownPositiveRefCount; @@ -1401,7 +1401,7 @@ /// TODO: Encapsulate this better. RRInfo RRI; - PtrState() : NestCount(0), KnownPositiveRefCount(false), Partial(false), + PtrState() : KnownPositiveRefCount(false), Partial(false), Seq(S_None) {} void SetKnownPositiveRefCount() { @@ -1416,18 +1416,6 @@ return KnownPositiveRefCount; } - void IncrementNestCount() { - if (NestCount != UINT_MAX) ++NestCount; - } - - void DecrementNestCount() { - if (NestCount != 0) --NestCount; - } - - bool IsKnownNested() const { - return NestCount > 0; - } - void SetSeq(Sequence NewSeq) { Seq = NewSeq; } @@ -1454,7 +1442,6 @@ void PtrState::Merge(const PtrState &Other, bool TopDown) { Seq = MergeSeqs(Seq, Other.Seq, TopDown); KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount; - NestCount = std::min(NestCount, Other.NestCount); // We can't merge a plain objc_retain with an objc_retainBlock. if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock) @@ -1610,6 +1597,12 @@ void BBState::MergePred(const BBState &Other) { // loop backedge. Loop backedges are special. TopDownPathCount += Other.TopDownPathCount; + // Check for overflow. If we have overflow, fall back to conservative behavior. + if (TopDownPathCount < Other.TopDownPathCount) { + clearTopDownPointers(); + return; + } + // For each entry in the other set, if our set has an entry with the same key, // merge the entries. Otherwise, copy the entry and merge it with an empty // entry. @@ -1635,6 +1628,12 @@ void BBState::MergeSucc(const BBState &Other) { // loop backedge. Loop backedges are special. BottomUpPathCount += Other.BottomUpPathCount; + // Check for overflow. If we have overflow, fall back to conservative behavior. + if (BottomUpPathCount < Other.BottomUpPathCount) { + clearBottomUpPointers(); + return; + } + // For each entry in the other set, if our set has an entry with the // same key, merge the entries. Otherwise, copy the entry and merge // it with an empty entry.
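The MergePred/MergeSucc hunks above guard the path-count accumulation with an unsigned wraparound test: after X += Y, the stored sum is smaller than Y exactly when the addition overflowed, and the merge then falls back to clearing the per-pointer state. The same idiom in isolation (standalone C++, not the BBState types; the saturating fallback here is only for the sketch):

#include <cstdint>
#include <cstdio>

// Mirrors the check added to BBState::MergePred/MergeSucc: unsigned addition
// wraps, and the wrapped sum is smaller than either addend, so comparing the
// result against the value just added detects overflow after the fact.
static bool addPathCount(uint32_t &Total, uint32_t Other) {
  Total += Other;
  if (Total < Other) {  // wrapped: the true sum does not fit in 32 bits
    Total = UINT32_MAX; // conservative fallback, for this sketch only
    return false;
  }
  return true;
}

int main() {
  uint32_t Count = 0xFFFFFFF0u;
  bool OK = addPathCount(Count, 0x20u);
  std::printf("ok=%d count=%u\n", OK, (unsigned)Count);
  return 0;
}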
@@ -1789,7 +1788,9 @@ Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *Params[] = { I8X }; FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttrListPtr Attributes = + AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex, + Attributes::get(C, Attributes::NoUnwind)); RetainRVCallee = M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy, Attributes); @@ -1803,7 +1804,9 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *Params[] = { I8X }; FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttrListPtr Attributes = + AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex, + Attributes::get(C, Attributes::NoUnwind)); AutoreleaseRVCallee = M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy, Attributes); @@ -1815,7 +1818,9 @@ Constant *ObjCARCOpt::getReleaseCallee(Module *M) { if (!ReleaseCallee) { LLVMContext &C = M->getContext(); Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttrListPtr Attributes = + AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex, + Attributes::get(C, Attributes::NoUnwind)); ReleaseCallee = M->getOrInsertFunction( "objc_release", @@ -1829,7 +1834,9 @@ Constant *ObjCARCOpt::getRetainCallee(Module *M) { if (!RetainCallee) { LLVMContext &C = M->getContext(); Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttrListPtr Attributes = + AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex, + Attributes::get(C, Attributes::NoUnwind)); RetainCallee = M->getOrInsertFunction( "objc_retain", @@ -1858,7 +1865,9 @@ Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { if (!AutoreleaseCallee) { LLVMContext &C = M->getContext(); Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttrListPtr Attributes = + AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex, + Attributes::get(C, Attributes::NoUnwind)); AutoreleaseCallee = M->getOrInsertFunction( "objc_autorelease", @@ -1868,6 +1877,26 @@ Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { return AutoreleaseCallee; } +/// IsPotentialUse - Test whether the given value is possible a +/// reference-counted pointer, including tests which utilize AliasAnalysis. +static bool IsPotentialUse(const Value *Op, AliasAnalysis &AA) { + // First make the rudimentary check. + if (!IsPotentialUse(Op)) + return false; + + // Objects in constant memory are not reference-counted. + if (AA.pointsToConstantMemory(Op)) + return false; + + // Pointers in constant memory are not pointing to reference-counted objects. + if (const LoadInst *LI = dyn_cast<LoadInst>(Op)) + if (AA.pointsToConstantMemory(LI->getPointerOperand())) + return false; + + // Otherwise assume the worst. + return true; +} + /// CanAlterRefCount - Test whether the given instruction can result in a /// reference count modification (positive or negative) for the pointer's /// object. 
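The new IsPotentialUse overload above layers a cheap structural test in front of the AliasAnalysis queries: only values that pass the rudimentary check pay for pointsToConstantMemory, which then rules out objects in (or loaded from) constant memory. A standalone sketch of that cheap-check-first layering, with placeholder predicates rather than the ObjC ARC ones:

#include <cstdio>

struct ValueSketch {
  bool LooksRefCounted;  // stands in for the rudimentary IsPotentialUse(Op)
  bool InConstantMemory; // stands in for AA.pointsToConstantMemory(Op)
};

// Cheap structural filter first; the (conceptually expensive) memory query
// only runs for values that survive it. Constant memory can never hold a
// reference-counted object, so such values are filtered out; otherwise the
// code assumes the worst, as in the patch.
static bool isPotentialUseSketch(const ValueSketch &V) {
  if (!V.LooksRefCounted)
    return false;
  if (V.InConstantMemory)
    return false;
  return true;
}

int main() {
  ValueSketch StackObj = {true, false};
  ValueSketch ConstGlobal = {true, true};
  std::printf("%d %d\n", isPotentialUseSketch(StackObj),
              isPotentialUseSketch(ConstGlobal));
  return 0;
}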
@@ -1894,7 +1923,7 @@ CanAlterRefCount(const Instruction *Inst, const Value *Ptr, for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E; ++I) { const Value *Op = *I; - if (IsPotentialUse(Op) && PA.related(Ptr, Op)) + if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op)) return true; } return false; @@ -1919,14 +1948,14 @@ CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, // Comparing a pointer with null, or any other constant, isn't really a use, // because we don't care what the pointer points to, or about the values // of any other dynamic reference-counted pointers. - if (!IsPotentialUse(ICI->getOperand(1))) + if (!IsPotentialUse(ICI->getOperand(1), *PA.getAA())) return false; } else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) { // For calls, just check the arguments (and not the callee operand). for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(), OE = CS.arg_end(); OI != OE; ++OI) { const Value *Op = *OI; - if (IsPotentialUse(Op) && PA.related(Ptr, Op)) + if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op)) return true; } return false; @@ -1936,14 +1965,14 @@ CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA, const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand()); // If we can't tell what the underlying object was, assume there is a // dependence. - return IsPotentialUse(Op) && PA.related(Op, Ptr); + return IsPotentialUse(Op, *PA.getAA()) && PA.related(Op, Ptr); } // Check each operand for a match. for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end(); OI != OE; ++OI) { const Value *Op = *OI; - if (IsPotentialUse(Op) && PA.related(Ptr, Op)) + if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op)) return true; } return false; @@ -2612,11 +2641,11 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release); S.RRI.ReleaseMetadata = ReleaseMetadata; - S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented(); + S.RRI.KnownSafe = S.IsKnownIncremented(); S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); S.RRI.Calls.insert(Inst); - S.IncrementNestCount(); + S.SetKnownPositiveRefCount(); break; } case IC_RetainBlock: @@ -2631,7 +2660,6 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, PtrState &S = MyStates.getPtrBottomUpState(Arg); S.SetKnownPositiveRefCount(); - S.DecrementNestCount(); switch (S.GetSeq()) { case S_Stop: @@ -2747,8 +2775,9 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, // Merge the states from each successor to compute the initial state // for the current block. - for (BBState::edge_iterator SI(MyStates.succ_begin()), - SE(MyStates.succ_end()); SI != SE; ++SI) { + BBState::edge_iterator SI(MyStates.succ_begin()), + SE(MyStates.succ_end()); + if (SI != SE) { const BasicBlock *Succ = *SI; DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ); assert(I != BBStates.end()); @@ -2760,7 +2789,6 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, assert(I != BBStates.end()); MyStates.MergeSucc(I->second); } - break; } // Visit all the instructions, bottom-up. @@ -2823,12 +2851,11 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, S.ResetSequenceProgress(S_Retain); S.RRI.IsRetainBlock = Class == IC_RetainBlock; - // Don't check S.IsKnownIncremented() here because it's not sufficient. 
- S.RRI.KnownSafe = S.IsKnownNested(); + S.RRI.KnownSafe = S.IsKnownIncremented(); S.RRI.Calls.insert(Inst); } - S.IncrementNestCount(); + S.SetKnownPositiveRefCount(); // A retain can be a potential use; procede to the generic checking // code below. @@ -2838,7 +2865,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, Arg = GetObjCArg(Inst); PtrState &S = MyStates.getPtrTopDownState(Arg); - S.DecrementNestCount(); + S.ClearRefCount(); switch (S.GetSeq()) { case S_Retain: @@ -2935,8 +2962,9 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, // Merge the states from each predecessor to compute the initial state // for the current block. - for (BBState::edge_iterator PI(MyStates.pred_begin()), - PE(MyStates.pred_end()); PI != PE; ++PI) { + BBState::edge_iterator PI(MyStates.pred_begin()), + PE(MyStates.pred_end()); + if (PI != PE) { const BasicBlock *Pred = *PI; DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); assert(I != BBStates.end()); @@ -2948,7 +2976,6 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, assert(I != BBStates.end()); MyStates.MergePred(I->second); } - break; } // Visit all the instructions, top-down. @@ -3532,19 +3559,19 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) { } /// OptimizeReturns - Look for this pattern: -/// +/// \code /// %call = call i8* @something(...) /// %2 = call i8* @objc_retain(i8* %call) /// %3 = call i8* @objc_autorelease(i8* %2) /// ret i8* %3 -/// +/// \endcode /// And delete the retain and autorelease. /// /// Otherwise if it's just this: -/// +/// \code /// %3 = call i8* @objc_autorelease(i8* %2) /// ret i8* %3 -/// +/// \endcode /// convert the autorelease to autoreleaseRV. void ObjCARCOpt::OptimizeReturns(Function &F) { if (!F.getReturnType()->isPointerTy()) @@ -3814,8 +3841,9 @@ Constant *ObjCARCContract::getStoreStrongCallee(Module *M) { Type *Params[] = { I8XX, I8X }; AttrListPtr Attributes = AttrListPtr() - .addAttr(~0u, Attribute::NoUnwind) - .addAttr(1, Attribute::NoCapture); + .addAttr(M->getContext(), AttrListPtr::FunctionIndex, + Attributes::get(C, Attributes::NoUnwind)) + .addAttr(M->getContext(), 1, Attributes::get(C, Attributes::NoCapture)); StoreStrongCallee = M->getOrInsertFunction( @@ -3832,7 +3860,9 @@ Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *Params[] = { I8X }; FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttrListPtr Attributes = + AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex, + Attributes::get(C, Attributes::NoUnwind)); RetainAutoreleaseCallee = M->getOrInsertFunction("objc_retainAutorelease", FTy, Attributes); } @@ -3845,7 +3875,9 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *Params[] = { I8X }; FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); + AttrListPtr Attributes = + AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex, + Attributes::get(C, Attributes::NoUnwind)); RetainAutoreleaseRVCallee = M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy, Attributes); diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp index 09687d8..7a40797 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ 
b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -339,36 +339,6 @@ static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { } } -/// EvaluateRepeatedConstant - Compute C op C op ... op C where the constant C -/// is repeated Weight times. -static Constant *EvaluateRepeatedConstant(unsigned Opcode, Constant *C, - APInt Weight) { - // For addition the result can be efficiently computed as the product of the - // constant and the weight. - if (Opcode == Instruction::Add) - return ConstantExpr::getMul(C, ConstantInt::get(C->getContext(), Weight)); - - // The weight might be huge, so compute by repeated squaring to ensure that - // compile time is proportional to the logarithm of the weight. - Constant *Result = 0; - Constant *Power = C; // Successively C, C op C, (C op C) op (C op C) etc. - // Visit the bits in Weight. - while (Weight != 0) { - // If the current bit in Weight is non-zero do Result = Result op Power. - if (Weight[0]) - Result = Result ? ConstantExpr::get(Opcode, Result, Power) : Power; - // Move on to the next bit if any more are non-zero. - Weight = Weight.lshr(1); - if (Weight.isMinValue()) - break; - // Square the power. - Power = ConstantExpr::get(Opcode, Power, Power); - } - - assert(Result && "Only positive weights supported!"); - return Result; -} - typedef std::pair<Value*, APInt> RepeatedValue; /// LinearizeExprTree - Given an associative binary expression, return the leaf @@ -382,9 +352,7 @@ typedef std::pair<Value*, APInt> RepeatedValue; /// op /// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times /// -/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct, and -/// they are all non-constant except possibly for the last one, which if it is -/// constant will have weight one (Ops[N].second === 1). +/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct. /// /// This routine may modify the function, in which case it returns 'true'. The /// changes it makes may well be destructive, changing the value computed by 'I' @@ -604,7 +572,6 @@ static bool LinearizeExprTree(BinaryOperator *I, // The leaves, repeated according to their weights, represent the linearized // form of the expression. - Constant *Cst = 0; // Accumulate constants here. for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) { Value *V = LeafOrder[i]; LeafMap::iterator It = Leaves.find(V); @@ -618,31 +585,14 @@ static bool LinearizeExprTree(BinaryOperator *I, continue; // Ensure the leaf is only output once. It->second = 0; - // Glob all constants together into Cst. - if (Constant *C = dyn_cast<Constant>(V)) { - C = EvaluateRepeatedConstant(Opcode, C, Weight); - Cst = Cst ? ConstantExpr::get(Opcode, Cst, C) : C; - continue; - } - // Add non-constant Ops.push_back(std::make_pair(V, Weight)); } - // Add any constants back into Ops, all globbed together and reduced to having - // weight 1 for the convenience of users. - Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType()); - if (Cst && Cst != Identity) { - // If combining multiple constants resulted in the absorber then the entire - // expression must evaluate to the absorber. - if (Cst == Absorber) - Ops.clear(); - Ops.push_back(std::make_pair(Cst, APInt(Bitwidth, 1))); - } - // For nilpotent operations or addition there may be no operands, for example // because the expression was "X xor X" or consisted of 2^Bitwidth additions: // in both cases the weight reduces to 0 causing the value to be skipped. 
if (Ops.empty()) { + Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType()); assert(Identity && "Associative operation without identity!"); Ops.push_back(std::make_pair(Identity, APInt(Bitwidth, 1))); } @@ -656,8 +606,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops) { assert(Ops.size() > 1 && "Single values should be used directly!"); - // Since our optimizations never increase the number of operations, the new - // expression can always be written by reusing the existing binary operators + // Since our optimizations should never increase the number of operations, the + // new expression can usually be written reusing the existing binary operators // from the original expression tree, without creating any new instructions, // though the rewritten expression may have a completely different topology. // We take care to not change anything if the new expression will be the same @@ -671,6 +621,20 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, unsigned Opcode = I->getOpcode(); BinaryOperator *Op = I; + /// NotRewritable - The operands being written will be the leaves of the new + /// expression and must not be used as inner nodes (via NodesToRewrite) by + /// mistake. Inner nodes are always reassociable, and usually leaves are not + /// (if they were they would have been incorporated into the expression and so + /// would not be leaves), so most of the time there is no danger of this. But + /// in rare cases a leaf may become reassociable if an optimization kills uses + /// of it, or it may momentarily become reassociable during rewriting (below) + /// due to it being removed as an operand of one of its uses. Ensure that misuse + /// of leaf nodes as inner nodes cannot occur by remembering all of the future + /// leaves and refusing to reuse any of them as inner nodes. + SmallPtrSet<Value*, 8> NotRewritable; + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + NotRewritable.insert(Ops[i].Op); + // ExpressionChanged - Non-null if the rewritten expression differs from the // original in some non-trivial way, requiring the clearing of optional flags. // Flags are cleared from the operator in ExpressionChanged up to I inclusive. @@ -703,12 +667,14 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, // the old operands with the new ones. DEBUG(dbgs() << "RA: " << *Op << '\n'); if (NewLHS != OldLHS) { - if (BinaryOperator *BO = isReassociableOp(OldLHS, Opcode)) + BinaryOperator *BO = isReassociableOp(OldLHS, Opcode); + if (BO && !NotRewritable.count(BO)) NodesToRewrite.push_back(BO); Op->setOperand(0, NewLHS); } if (NewRHS != OldRHS) { - if (BinaryOperator *BO = isReassociableOp(OldRHS, Opcode)) + BinaryOperator *BO = isReassociableOp(OldRHS, Opcode); + if (BO && !NotRewritable.count(BO)) NodesToRewrite.push_back(BO); Op->setOperand(1, NewRHS); } @@ -732,7 +698,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, Op->swapOperands(); } else { // Overwrite with the new right-hand side. - if (BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode)) + BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode); + if (BO && !NotRewritable.count(BO)) NodesToRewrite.push_back(BO); Op->setOperand(1, NewRHS); ExpressionChanged = Op; @@ -745,7 +712,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, // Now deal with the left-hand side. If this is already an operation node // from the original expression then just rewrite the rest of the expression // into it.
- if (BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode)) { + BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode); + if (BO && !NotRewritable.count(BO)) { Op = BO; continue; } @@ -1446,9 +1414,26 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops) { // Now that we have the linearized expression tree, try to optimize it. // Start by folding any constants that we found. - if (Ops.size() == 1) return Ops[0].Op; - + Constant *Cst = 0; unsigned Opcode = I->getOpcode(); + while (!Ops.empty() && isa<Constant>(Ops.back().Op)) { + Constant *C = cast<Constant>(Ops.pop_back_val().Op); + Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C; + } + // If there was nothing but constants then we are done. + if (Ops.empty()) + return Cst; + + // Put the combined constant back at the end of the operand list, except if + // there is no point. For example, an add of 0 gets dropped here, while a + // multiplication by zero turns the whole expression into zero. + if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) { + if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType())) + return Cst; + Ops.push_back(ValueEntry(0, Cst)); + } + + if (Ops.size() == 1) return Ops[0].Op; // Handle destructive annihilation due to identities between elements in the // argument list here. diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp index 2c39aab..686520e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -26,7 +26,7 @@ #include "llvm/Pass.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" @@ -153,7 +153,7 @@ namespace { /// Constant Propagation. /// class SCCPSolver : public InstVisitor<SCCPSolver> { - const TargetData *TD; + const DataLayout *TD; const TargetLibraryInfo *TLI; SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable. DenseMap<Value*, LatticeVal> ValueState; // The state each value is in. 
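The Reassociate::OptimizeExpression hunk above (just before the SCCP.cpp diff) moves constant handling out of LinearizeExprTree: trailing constants are popped off the operand list, folded pairwise with the expression's opcode, dropped when the result is the identity, and short-circuit the whole expression when it is the absorber. A standalone sketch of that policy for integer add and mul, using plain int64_t values rather than Constant/APInt:

#include <cstdint>
#include <cstdio>
#include <vector>

struct Operand { bool IsConst; int64_t Val; };
enum Opcode { Add, Mul };

static int64_t apply(Opcode Op, int64_t A, int64_t B) {
  return Op == Add ? A + B : A * B;
}

// Glob trailing constants together, drop the identity, and let the absorber
// (0 for mul) collapse the whole expression, following the new logic in
// OptimizeExpression. Returns true with Result set if everything folded.
static bool foldTrailingConstants(Opcode Op, std::vector<Operand> &Ops,
                                  int64_t &Result) {
  const int64_t Identity = (Op == Add) ? 0 : 1;
  bool HaveCst = false;
  int64_t Cst = Identity;
  while (!Ops.empty() && Ops.back().IsConst) {
    Cst = apply(Op, Ops.back().Val, Cst);
    Ops.pop_back();
    HaveCst = true;
  }
  if (Ops.empty()) { // nothing but constants: the expression is a constant
    Result = Cst;
    return true;
  }
  if (HaveCst && Cst != Identity) {
    if (Op == Mul && Cst == 0) { // absorber: x * 0 == 0
      Result = 0;
      return true;
    }
    Ops.push_back(Operand()); // keep a single combined constant operand
    Ops.back().IsConst = true;
    Ops.back().Val = Cst;
  }
  return false;
}

int main() {
  std::vector<Operand> Ops;
  Operand X = {false, 0}; // a non-constant leaf
  Operand C1 = {true, 5};
  Operand C2 = {true, 7};
  Ops.push_back(X); Ops.push_back(C1); Ops.push_back(C2);
  int64_t Result = 0;
  bool Folded = foldTrailingConstants(Add, Ops, Result);
  // x + 5 + 7 becomes x + 12: not fully folded, two operands remain.
  std::printf("folded=%d operands=%u\n", Folded, (unsigned)Ops.size());
  return 0;
}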
@@ -205,7 +205,7 @@ class SCCPSolver : public InstVisitor<SCCPSolver> { typedef std::pair<BasicBlock*, BasicBlock*> Edge; DenseSet<Edge> KnownFeasibleEdges; public: - SCCPSolver(const TargetData *td, const TargetLibraryInfo *tli) + SCCPSolver(const DataLayout *td, const TargetLibraryInfo *tli) : TD(td), TLI(tli) {} /// MarkBlockExecutable - This method can be used by clients to mark all of @@ -1564,7 +1564,7 @@ static void DeleteInstructionInBlock(BasicBlock *BB) { // bool SCCP::runOnFunction(Function &F) { DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n"); - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SCCPSolver Solver(TD, TLI); @@ -1693,7 +1693,7 @@ static bool AddressIsTaken(const GlobalValue *GV) { } bool IPSCCP::runOnModule(Module &M) { - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>(); SCCPSolver Solver(TD, TLI); diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp new file mode 100644 index 0000000..ccc2f7a --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -0,0 +1,3697 @@ +//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This transformation implements the well known scalar replacement of +/// aggregates transformation. It tries to identify promotable elements of an +/// aggregate alloca, and promote them to registers. It will also try to +/// convert uses of an element (or set of elements) of an alloca into a vector +/// or bitfield-style integer scalar if appropriate. +/// +/// It works to do this with minimal slicing of the alloca so that regions +/// which are merely transferred in and out of external memory remain unchanged +/// and are not decomposed to scalar code. +/// +/// Because this also performs alloca promotion, it can be thought of as also +/// serving the purpose of SSA formation. The algorithm iterates on the +/// function until all opportunities for promotion have been realized. 
+/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sroa" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Constants.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Operator.h" +#include "llvm/Pass.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/InstVisitor.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/DataLayout.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +using namespace llvm; + +STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); +STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); +STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); +STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); +STATISTIC(NumDeleted, "Number of instructions deleted"); +STATISTIC(NumVectorized, "Number of vectorized aggregates"); + +/// Hidden option to force the pass to not use DomTree and mem2reg, instead +/// forming SSA values through the SSAUpdater infrastructure. +static cl::opt<bool> +ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); + +namespace { +/// \brief Alloca partitioning representation. +/// +/// This class represents a partitioning of an alloca into slices, and +/// information about the nature of uses of each slice of the alloca. The goal +/// is that this information is sufficient to decide if and how to split the +/// alloca apart and replace slices with scalars. It is also intended that this +/// structure can capture the relevant information needed both to decide about +/// and to enact these transformations. +class AllocaPartitioning { +public: + /// \brief A common base class for representing a half-open byte range. + struct ByteRange { + /// \brief The beginning offset of the range. + uint64_t BeginOffset; + + /// \brief The ending offset, not included in the range. + uint64_t EndOffset; + + ByteRange() : BeginOffset(), EndOffset() {} + ByteRange(uint64_t BeginOffset, uint64_t EndOffset) + : BeginOffset(BeginOffset), EndOffset(EndOffset) {} + + /// \brief Support for ordering ranges. + /// + /// This provides an ordering over ranges such that start offsets are + /// always increasing, and within equal start offsets, the end offsets are + /// decreasing. Thus the spanning range comes first in a cluster with the + /// same start position. + bool operator<(const ByteRange &RHS) const { + if (BeginOffset < RHS.BeginOffset) return true; + if (BeginOffset > RHS.BeginOffset) return false; + if (EndOffset > RHS.EndOffset) return true; + return false; + } + + /// \brief Support comparison with a single offset to allow binary searches. 
+ friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) { + return LHS.BeginOffset < RHSOffset; + } + + friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, + const ByteRange &RHS) { + return LHSOffset < RHS.BeginOffset; + } + + bool operator==(const ByteRange &RHS) const { + return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset; + } + bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); } + }; + + /// \brief A partition of an alloca. + /// + /// This structure represents a contiguous partition of the alloca. These are + /// formed by examining the uses of the alloca. During formation, they may + /// overlap but once an AllocaPartitioning is built, the Partitions within it + /// are all disjoint. + struct Partition : public ByteRange { + /// \brief Whether this partition is splittable into smaller partitions. + /// + /// We flag partitions as splittable when they are formed entirely due to + /// accesses by trivially splittable operations such as memset and memcpy. + bool IsSplittable; + + /// \brief Test whether a partition has been marked as dead. + bool isDead() const { + if (BeginOffset == UINT64_MAX) { + assert(EndOffset == UINT64_MAX); + return true; + } + return false; + } + + /// \brief Kill a partition. + /// This is accomplished by setting both its beginning and end offset to + /// the maximum possible value. + void kill() { + assert(!isDead() && "He's Dead, Jim!"); + BeginOffset = EndOffset = UINT64_MAX; + } + + Partition() : ByteRange(), IsSplittable() {} + Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) + : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} + }; + + /// \brief A particular use of a partition of the alloca. + /// + /// This structure is used to associate uses of a partition with it. They + /// mark the range of bytes which are referenced by a particular instruction, + /// and includes a handle to the user itself and the pointer value in use. + /// The bounds of these uses are determined by intersecting the bounds of the + /// memory use itself with a particular partition. As a consequence there is + /// intentionally overlap between various uses of the same partition. + struct PartitionUse : public ByteRange { + /// \brief The use in question. Provides access to both user and used value. + /// + /// Note that this may be null if the partition use is *dead*, that is, it + /// should be ignored. + Use *U; + + PartitionUse() : ByteRange(), U() {} + PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U) + : ByteRange(BeginOffset, EndOffset), U(U) {} + }; + + /// \brief Construct a partitioning of a particular alloca. + /// + /// Construction does most of the work for partitioning the alloca. This + /// performs the necessary walks of users and builds a partitioning from it. + AllocaPartitioning(const DataLayout &TD, AllocaInst &AI); + + /// \brief Test whether a pointer to the allocation escapes our analysis. + /// + /// If this is true, the partitioning is never fully built and should be + /// ignored. + bool isEscaped() const { return PointerEscapingInstr; } + + /// \brief Support for iterating over the partitions. 
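The operator< defined on ByteRange above orders ranges by ascending begin offset and, within a cluster starting at the same offset, by descending end offset, so a spanning range sorts ahead of the ranges it covers. A small self-contained illustration of that invariant (Range is a stand-in type, not the class from the patch):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative copy of the half-open range and its ordering.
struct Range {
  uint64_t Begin, End;
  bool operator<(const Range &RHS) const {
    if (Begin != RHS.Begin) return Begin < RHS.Begin;
    return End > RHS.End;     // Larger (spanning) ranges come first in a cluster.
  }
};

int main() {
  std::vector<Range> Rs = { {8, 12}, {0, 16}, {0, 4}, {8, 16} };
  std::sort(Rs.begin(), Rs.end());
  // Prints: [0,16) [0,4) [8,16) [8,12)
  for (size_t i = 0; i < Rs.size(); ++i)
    std::printf("[%llu,%llu) ", (unsigned long long)Rs[i].Begin,
                (unsigned long long)Rs[i].End);
  std::printf("\n");
  return 0;
}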
+ /// @{ + typedef SmallVectorImpl<Partition>::iterator iterator; + iterator begin() { return Partitions.begin(); } + iterator end() { return Partitions.end(); } + + typedef SmallVectorImpl<Partition>::const_iterator const_iterator; + const_iterator begin() const { return Partitions.begin(); } + const_iterator end() const { return Partitions.end(); } + /// @} + + /// \brief Support for iterating over and manipulating a particular + /// partition's uses. + /// + /// The iteration support provided for uses is more limited, but also + /// includes some manipulation routines to support rewriting the uses of + /// partitions during SROA. + /// @{ + typedef SmallVectorImpl<PartitionUse>::iterator use_iterator; + use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); } + use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); } + use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); } + use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); } + + typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator; + const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); } + const_use_iterator use_begin(const_iterator I) const { + return Uses[I - begin()].begin(); + } + const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); } + const_use_iterator use_end(const_iterator I) const { + return Uses[I - begin()].end(); + } + + unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); } + unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); } + const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const { + return Uses[PIdx][UIdx]; + } + const PartitionUse &getUse(const_iterator I, unsigned UIdx) const { + return Uses[I - begin()][UIdx]; + } + + void use_push_back(unsigned Idx, const PartitionUse &PU) { + Uses[Idx].push_back(PU); + } + void use_push_back(const_iterator I, const PartitionUse &PU) { + Uses[I - begin()].push_back(PU); + } + /// @} + + /// \brief Allow iterating the dead users for this alloca. + /// + /// These are instructions which will never actually use the alloca as they + /// are outside the allocated range. They are safe to replace with undef and + /// delete. + /// @{ + typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator; + dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); } + dead_user_iterator dead_user_end() const { return DeadUsers.end(); } + /// @} + + /// \brief Allow iterating the dead expressions referring to this alloca. + /// + /// These are operands which have cannot actually be used to refer to the + /// alloca as they are outside its range and the user doesn't correct for + /// that. These mostly consist of PHI node inputs and the like which we just + /// need to replace with undef. + /// @{ + typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator; + dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); } + dead_op_iterator dead_op_end() const { return DeadOperands.end(); } + /// @} + + /// \brief MemTransferInst auxiliary data. + /// This struct provides some auxiliary data about memory transfer + /// intrinsics such as memcpy and memmove. These intrinsics can use two + /// different ranges within the same alloca, and provide other challenges to + /// correctly represent. We stash extra data to help us untangle this + /// after the partitioning is complete. 
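As a concrete reading of the memory-transfer bookkeeping described above (values invented for the example): a memcpy with a constant 8-byte length whose destination is offset 8 of a 16-byte alloca, and whose source lies outside the alloca, records only the destination range and remains splittable because the length is constant. A sketch using a mirror of the struct declared just below, not the patch's own type:

#include <cstdint>

// Illustrative mirror of the auxiliary data for memcpy/memmove.
struct TransferOffsets {
  uint64_t DestBegin, DestEnd;      // Zero DestEnd means "dest not in this alloca".
  uint64_t SourceBegin, SourceEnd;  // Zero SourceEnd means "source not in this alloca".
  bool IsSplittable;                // Only constant-length transfers are splittable.
};

// memcpy(alloca + 8, <pointer outside this alloca>, 8) with a constant length:
static const TransferOffsets Example = {
  /*DestBegin=*/8,   /*DestEnd=*/16,
  /*SourceBegin=*/0, /*SourceEnd=*/0,   // Source is not within this alloca.
  /*IsSplittable=*/true                 // Constant length, only one end seen.
};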
+ struct MemTransferOffsets { + /// The destination begin and end offsets when the destination is within + /// this alloca. If the end offset is zero the destination is not within + /// this alloca. + uint64_t DestBegin, DestEnd; + + /// The source begin and end offsets when the source is within this alloca. + /// If the end offset is zero, the source is not within this alloca. + uint64_t SourceBegin, SourceEnd; + + /// Flag for whether an alloca is splittable. + bool IsSplittable; + }; + MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const { + return MemTransferInstData.lookup(&II); + } + + /// \brief Map from a PHI or select operand back to a partition. + /// + /// When manipulating PHI nodes or selects, they can use more than one + /// partition of an alloca. We store a special mapping to allow finding the + /// partition referenced by each of these operands, if any. + iterator findPartitionForPHIOrSelectOperand(Use *U) { + SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt + = PHIOrSelectOpMap.find(U); + if (MapIt == PHIOrSelectOpMap.end()) + return end(); + + return begin() + MapIt->second.first; + } + + /// \brief Map from a PHI or select operand back to the specific use of + /// a partition. + /// + /// Similar to mapping these operands back to the partitions, this maps + /// directly to the use structure of that partition. + use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) { + SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt + = PHIOrSelectOpMap.find(U); + assert(MapIt != PHIOrSelectOpMap.end()); + return Uses[MapIt->second.first].begin() + MapIt->second.second; + } + + /// \brief Compute a common type among the uses of a particular partition. + /// + /// This routines walks all of the uses of a particular partition and tries + /// to find a common type between them. Untyped operations such as memset and + /// memcpy are ignored. + Type *getCommonType(iterator I) const; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; + void printUsers(raw_ostream &OS, const_iterator I, + StringRef Indent = " ") const; + void print(raw_ostream &OS) const; + void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const; + void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const; +#endif + +private: + template <typename DerivedT, typename RetT = void> class BuilderBase; + class PartitionBuilder; + friend class AllocaPartitioning::PartitionBuilder; + class UseBuilder; + friend class AllocaPartitioning::UseBuilder; + +#ifndef NDEBUG + /// \brief Handle to alloca instruction to simplify method interfaces. + AllocaInst &AI; +#endif + + /// \brief The instruction responsible for this alloca having no partitioning. + /// + /// When an instruction (potentially) escapes the pointer to the alloca, we + /// store a pointer to that here and abort trying to partition the alloca. + /// This will be null if the alloca is partitioned successfully. + Instruction *PointerEscapingInstr; + + /// \brief The partitions of the alloca. + /// + /// We store a vector of the partitions over the alloca here. This vector is + /// sorted by increasing begin offset, and then by decreasing end offset. See + /// the Partition inner class for more details. 
Initially (during + /// construction) there are overlaps, but we form a disjoint sequence of + /// partitions while finishing construction and a fully constructed object is + /// expected to always have this as a disjoint space. + SmallVector<Partition, 8> Partitions; + + /// \brief The uses of the partitions. + /// + /// This is essentially a mapping from each partition to a list of uses of + /// that partition. The mapping is done with a Uses vector that has the exact + /// same number of entries as the partition vector. Each entry is itself + /// a vector of the uses. + SmallVector<SmallVector<PartitionUse, 2>, 8> Uses; + + /// \brief Instructions which will become dead if we rewrite the alloca. + /// + /// Note that these are not separated by partition. This is because we expect + /// a partitioned alloca to be completely rewritten or not rewritten at all. + /// If rewritten, all these instructions can simply be removed and replaced + /// with undef as they come from outside of the allocated space. + SmallVector<Instruction *, 8> DeadUsers; + + /// \brief Operands which will become dead if we rewrite the alloca. + /// + /// These are operands that in their particular use can be replaced with + /// undef when we rewrite the alloca. These show up in out-of-bounds inputs + /// to PHI nodes and the like. They aren't entirely dead (there might be + /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we + /// want to swap this particular input for undef to simplify the use lists of + /// the alloca. + SmallVector<Use *, 8> DeadOperands; + + /// \brief The underlying storage for auxiliary memcpy and memset info. + SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData; + + /// \brief A side datastructure used when building up the partitions and uses. + /// + /// This mapping is only really used during the initial building of the + /// partitioning so that we can retain information about PHI and select nodes + /// processed. + SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes; + + /// \brief Auxiliary information for particular PHI or select operands. + SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap; + + /// \brief A utility routine called from the constructor. + /// + /// This does what it says on the tin. It is the key of the alloca partition + /// splitting and merging. After it is called we have the desired disjoint + /// collection of partitions. + void splitAndMergePartitions(); +}; +} + +template <typename DerivedT, typename RetT> +class AllocaPartitioning::BuilderBase + : public InstVisitor<DerivedT, RetT> { +public: + BuilderBase(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) + : TD(TD), + AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())), + P(P) { + enqueueUsers(AI, 0); + } + +protected: + const DataLayout &TD; + const uint64_t AllocSize; + AllocaPartitioning &P; + + SmallPtrSet<Use *, 8> VisitedUses; + + struct OffsetUse { + Use *U; + int64_t Offset; + }; + SmallVector<OffsetUse, 8> Queue; + + // The active offset and use while visiting. 
+ Use *U; + int64_t Offset; + + void enqueueUsers(Instruction &I, int64_t UserOffset) { + for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); + UI != UE; ++UI) { + if (VisitedUses.insert(&UI.getUse())) { + OffsetUse OU = { &UI.getUse(), UserOffset }; + Queue.push_back(OU); + } + } + } + + bool computeConstantGEPOffset(GetElementPtrInst &GEPI, int64_t &GEPOffset) { + GEPOffset = Offset; + for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI); + GTI != GTE; ++GTI) { + ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); + if (!OpC) + return false; + if (OpC->isZero()) + continue; + + // Handle a struct index, which adds its field offset to the pointer. + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + unsigned ElementIdx = OpC->getZExtValue(); + const StructLayout *SL = TD.getStructLayout(STy); + uint64_t ElementOffset = SL->getElementOffset(ElementIdx); + // Check that we can continue to model this GEP in a signed 64-bit offset. + if (ElementOffset > INT64_MAX || + (GEPOffset >= 0 && + ((uint64_t)GEPOffset + ElementOffset) > INT64_MAX)) { + DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding " + << "what can be represented in an int64_t!\n" + << " alloca: " << P.AI << "\n"); + return false; + } + if (GEPOffset < 0) + GEPOffset = ElementOffset + (uint64_t)-GEPOffset; + else + GEPOffset += ElementOffset; + continue; + } + + APInt Index = OpC->getValue().sextOrTrunc(TD.getPointerSizeInBits()); + Index *= APInt(Index.getBitWidth(), + TD.getTypeAllocSize(GTI.getIndexedType())); + Index += APInt(Index.getBitWidth(), (uint64_t)GEPOffset, + /*isSigned*/true); + // Check if the result can be stored in our int64_t offset. + if (!Index.isSignedIntN(sizeof(GEPOffset) * 8)) { + DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding " + << "what can be represented in an int64_t!\n" + << " alloca: " << P.AI << "\n"); + return false; + } + + GEPOffset = Index.getSExtValue(); + } + return true; + } + + Value *foldSelectInst(SelectInst &SI) { + // If the condition being selected on is a constant or the same value is + // being selected between, fold the select. Yes this does (rarely) happen + // early on. + if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) + return SI.getOperand(1+CI->isZero()); + if (SI.getOperand(1) == SI.getOperand(2)) { + assert(*U == SI.getOperand(1)); + return SI.getOperand(1); + } + return 0; + } +}; + +/// \brief Builder for the alloca partitioning. +/// +/// This class builds an alloca partitioning by recursively visiting the uses +/// of an alloca and splitting the partitions for each load and store at each +/// offset. +class AllocaPartitioning::PartitionBuilder + : public BuilderBase<PartitionBuilder, bool> { + friend class InstVisitor<PartitionBuilder, bool>; + + SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap; + +public: + PartitionBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) + : BuilderBase<PartitionBuilder, bool>(TD, AI, P) {} + + /// \brief Run the builder over the allocation. + bool operator()() { + // Note that we have to re-evaluate size on each trip through the loop as + // the queue grows at the tail. 
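The comment above describes the traversal pattern both builders use: the queue is appended to while it is being walked, so the loop indexes into the vector and re-reads size() each iteration rather than caching an end iterator that push_back could invalidate. A minimal standalone version of the pattern (names are illustrative):

#include <cstdio>
#include <vector>

// Walk items breadth-first while the visit step may append new work. Using an
// index and re-reading size() keeps the loop valid across push_back, which can
// reallocate the vector's storage.
static void drainWorklist(std::vector<int> &Queue) {
  for (size_t Idx = 0; Idx < Queue.size(); ++Idx) {
    int Item = Queue[Idx];
    std::printf("visiting %d\n", Item);
    if (Item > 0)
      Queue.push_back(Item - 1);   // Enqueue follow-on work discovered here.
  }
}

int main() {
  std::vector<int> Queue;
  Queue.push_back(3);
  drainWorklist(Queue);            // Visits 3, 2, 1, 0.
  return 0;
}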
+ for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) { + U = Queue[Idx].U; + Offset = Queue[Idx].Offset; + if (!visit(cast<Instruction>(U->getUser()))) + return false; + } + return true; + } + +private: + bool markAsEscaping(Instruction &I) { + P.PointerEscapingInstr = &I; + return false; + } + + void insertUse(Instruction &I, int64_t Offset, uint64_t Size, + bool IsSplittable = false) { + // Completely skip uses which have a zero size or don't overlap the + // allocation. + if (Size == 0 || + (Offset >= 0 && (uint64_t)Offset >= AllocSize) || + (Offset < 0 && (uint64_t)-Offset >= Size)) { + DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset + << " which starts past the end of the " << AllocSize + << " byte alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << I << "\n"); + return; + } + + // Clamp the start to the beginning of the allocation. + if (Offset < 0) { + DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset + << " to start at the beginning of the alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << I << "\n"); + Size -= (uint64_t)-Offset; + Offset = 0; + } + + uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size; + + // Clamp the end offset to the end of the allocation. Note that this is + // formulated to handle even the case where "BeginOffset + Size" overflows. + // NOTE! This may appear superficially to be something we could ignore + // entirely, but that is not so! There may be PHI-node uses where some + // instructions are dead but not others. We can't completely ignore the + // PHI node, and so have to record at least the information here. + assert(AllocSize >= BeginOffset); // Established above. + if (Size > AllocSize - BeginOffset) { + DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset + << " to remain within the " << AllocSize << " byte alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << I << "\n"); + EndOffset = AllocSize; + } + + Partition New(BeginOffset, EndOffset, IsSplittable); + P.Partitions.push_back(New); + } + + bool handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset, + bool IsVolatile) { + uint64_t Size = TD.getTypeStoreSize(Ty); + + // If this memory access can be shown to *statically* extend outside the + // bounds of of the allocation, it's behavior is undefined, so simply + // ignore it. Note that this is more strict than the generic clamping + // behavior of insertUse. We also try to handle cases which might run the + // risk of overflow. + // FIXME: We should instead consider the pointer to have escaped if this + // function is being instrumented for addressing bugs or race conditions. + if (Offset < 0 || (uint64_t)Offset >= AllocSize || + Size > (AllocSize - (uint64_t)Offset)) { + DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte " + << (isa<LoadInst>(I) ? "load" : "store") << " @" << Offset + << " which extends past the end of the " << AllocSize + << " byte alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << I << "\n"); + return true; + } + + // We allow splitting of loads and stores where the type is an integer type + // and which cover the entire alloca. Such integer loads and stores + // often require decomposition into fine grained loads and stores. 
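For reference, the clamping rules of insertUse above, restated as a tiny self-contained helper with two worked cases for a hypothetical 16-byte alloca: a use at offset -4 of size 8 becomes [0,4), and a use at offset 12 of size 8 becomes [12,16).

#include <cstdint>

// Sketch of the clamping rules (AllocSize, Offset, Size as in the builder);
// returns false when the use should be ignored entirely.
static bool clampUse(uint64_t AllocSize, int64_t Offset, uint64_t Size,
                     uint64_t &Begin, uint64_t &End) {
  if (Size == 0 ||
      (Offset >= 0 && (uint64_t)Offset >= AllocSize) ||
      (Offset < 0 && (uint64_t)-Offset >= Size))
    return false;                    // Entirely outside the allocation.
  if (Offset < 0) {                  // Clamp the start to byte zero.
    Size -= (uint64_t)-Offset;
    Offset = 0;
  }
  Begin = (uint64_t)Offset;
  End = Begin + Size;
  if (Size > AllocSize - Begin)      // Clamp the end to the allocation size.
    End = AllocSize;
  return true;
}
// clampUse(16, -4, 8, B, E) -> B = 0,  E = 4
// clampUse(16, 12, 8, B, E) -> B = 12, E = 16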
+ bool IsSplittable = false; + if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) + IsSplittable = !IsVolatile && ITy->getBitWidth() == AllocSize*8; + + insertUse(I, Offset, Size, IsSplittable); + return true; + } + + bool visitBitCastInst(BitCastInst &BC) { + enqueueUsers(BC, Offset); + return true; + } + + bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { + int64_t GEPOffset; + if (!computeConstantGEPOffset(GEPI, GEPOffset)) + return markAsEscaping(GEPI); + + enqueueUsers(GEPI, GEPOffset); + return true; + } + + bool visitLoadInst(LoadInst &LI) { + assert((!LI.isSimple() || LI.getType()->isSingleValueType()) && + "All simple FCA loads should have been pre-split"); + return handleLoadOrStore(LI.getType(), LI, Offset, LI.isVolatile()); + } + + bool visitStoreInst(StoreInst &SI) { + Value *ValOp = SI.getValueOperand(); + if (ValOp == *U) + return markAsEscaping(SI); + + assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && + "All simple FCA stores should have been pre-split"); + return handleLoadOrStore(ValOp->getType(), SI, Offset, SI.isVolatile()); + } + + + bool visitMemSetInst(MemSetInst &II) { + assert(II.getRawDest() == *U && "Pointer use is not the destination?"); + ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); + uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset; + insertUse(II, Offset, Size, Length); + return true; + } + + bool visitMemTransferInst(MemTransferInst &II) { + ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); + uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset; + if (!Size) + // Zero-length mem transfer intrinsics can be ignored entirely. + return true; + + MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; + + // Only intrinsics with a constant length can be split. + Offsets.IsSplittable = Length; + + if (*U == II.getRawDest()) { + Offsets.DestBegin = Offset; + Offsets.DestEnd = Offset + Size; + } + if (*U == II.getRawSource()) { + Offsets.SourceBegin = Offset; + Offsets.SourceEnd = Offset + Size; + } + + // If we have set up end offsets for both the source and the destination, + // we have found both sides of this transfer pointing at the same alloca. + bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd; + if (SeenBothEnds && II.getRawDest() != II.getRawSource()) { + unsigned PrevIdx = MemTransferPartitionMap[&II]; + + // Check if the begin offsets match and this is a non-volatile transfer. + // In that case, we can completely elide the transfer. + if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) { + P.Partitions[PrevIdx].kill(); + return true; + } + + // Otherwise we have an offset transfer within the same alloca. We can't + // split those. + P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false; + } else if (SeenBothEnds) { + // Handle the case where this exact use provides both ends of the + // operation. + assert(II.getRawDest() == II.getRawSource()); + + // For non-volatile transfers this is a no-op. + if (!II.isVolatile()) + return true; + + // Otherwise just suppress splitting. + Offsets.IsSplittable = false; + } + + + // Insert the use now that we've fixed up the splittable nature. + insertUse(II, Offset, Size, Offsets.IsSplittable); + + // Setup the mapping from intrinsic to partition of we've not seen both + // ends of this transfer. 
+ if (!SeenBothEnds) { + unsigned NewIdx = P.Partitions.size() - 1; + bool Inserted + = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second; + assert(Inserted && + "Already have intrinsic in map but haven't seen both ends"); + (void)Inserted; + } + + return true; + } + + // Disable SRoA for any intrinsics except for lifetime invariants. + // FIXME: What about debug instrinsics? This matches old behavior, but + // doesn't make sense. + bool visitIntrinsicInst(IntrinsicInst &II) { + if (II.getIntrinsicID() == Intrinsic::lifetime_start || + II.getIntrinsicID() == Intrinsic::lifetime_end) { + ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); + uint64_t Size = std::min(AllocSize - Offset, Length->getLimitedValue()); + insertUse(II, Offset, Size, true); + return true; + } + + return markAsEscaping(II); + } + + Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { + // We consider any PHI or select that results in a direct load or store of + // the same offset to be a viable use for partitioning purposes. These uses + // are considered unsplittable and the size is the maximum loaded or stored + // size. + SmallPtrSet<Instruction *, 4> Visited; + SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses; + Visited.insert(Root); + Uses.push_back(std::make_pair(cast<Instruction>(*U), Root)); + // If there are no loads or stores, the access is dead. We mark that as + // a size zero access. + Size = 0; + do { + Instruction *I, *UsedI; + llvm::tie(UsedI, I) = Uses.pop_back_val(); + + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + Size = std::max(Size, TD.getTypeStoreSize(LI->getType())); + continue; + } + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + Value *Op = SI->getOperand(0); + if (Op == UsedI) + return SI; + Size = std::max(Size, TD.getTypeStoreSize(Op->getType())); + continue; + } + + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { + if (!GEP->hasAllZeroIndices()) + return GEP; + } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) && + !isa<SelectInst>(I)) { + return I; + } + + for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE; + ++UI) + if (Visited.insert(cast<Instruction>(*UI))) + Uses.push_back(std::make_pair(I, cast<Instruction>(*UI))); + } while (!Uses.empty()); + + return 0; + } + + bool visitPHINode(PHINode &PN) { + // See if we already have computed info on this node. + std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN]; + if (PHIInfo.first) { + PHIInfo.second = true; + insertUse(PN, Offset, PHIInfo.first); + return true; + } + + // Check for an unsafe use of the PHI node. + if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first)) + return markAsEscaping(*EscapingI); + + insertUse(PN, Offset, PHIInfo.first); + return true; + } + + bool visitSelectInst(SelectInst &SI) { + if (Value *Result = foldSelectInst(SI)) { + if (Result == *U) + // If the result of the constant fold will be the pointer, recurse + // through the select as if we had RAUW'ed it. + enqueueUsers(SI, Offset); + + return true; + } + + // See if we already have computed info on this node. + std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI]; + if (SelectInfo.first) { + SelectInfo.second = true; + insertUse(SI, Offset, SelectInfo.first); + return true; + } + + // Check for an unsafe use of the PHI node. 
+ if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first)) + return markAsEscaping(*EscapingI); + + insertUse(SI, Offset, SelectInfo.first); + return true; + } + + /// \brief Disable SROA entirely if there are unhandled users of the alloca. + bool visitInstruction(Instruction &I) { return markAsEscaping(I); } +}; + + +/// \brief Use adder for the alloca partitioning. +/// +/// This class adds the uses of an alloca to all of the partitions which they +/// use. For splittable partitions, this can end up doing essentially a linear +/// walk of the partitions, but the number of steps remains bounded by the +/// total result instruction size: +/// - The number of partitions is a result of the number unsplittable +/// instructions using the alloca. +/// - The number of users of each partition is at worst the total number of +/// splittable instructions using the alloca. +/// Thus we will produce N * M instructions in the end, where N are the number +/// of unsplittable uses and M are the number of splittable. This visitor does +/// the exact same number of updates to the partitioning. +/// +/// In the more common case, this visitor will leverage the fact that the +/// partition space is pre-sorted, and do a logarithmic search for the +/// partition needed, making the total visit a classical ((N + M) * log(N)) +/// complexity operation. +class AllocaPartitioning::UseBuilder : public BuilderBase<UseBuilder> { + friend class InstVisitor<UseBuilder>; + + /// \brief Set to de-duplicate dead instructions found in the use walk. + SmallPtrSet<Instruction *, 4> VisitedDeadInsts; + +public: + UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P) + : BuilderBase<UseBuilder>(TD, AI, P) {} + + /// \brief Run the builder over the allocation. + void operator()() { + // Note that we have to re-evaluate size on each trip through the loop as + // the queue grows at the tail. + for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) { + U = Queue[Idx].U; + Offset = Queue[Idx].Offset; + this->visit(cast<Instruction>(U->getUser())); + } + } + +private: + void markAsDead(Instruction &I) { + if (VisitedDeadInsts.insert(&I)) + P.DeadUsers.push_back(&I); + } + + void insertUse(Instruction &User, int64_t Offset, uint64_t Size) { + // If the use has a zero size or extends outside of the allocation, record + // it as a dead use for elimination later. + if (Size == 0 || (uint64_t)Offset >= AllocSize || + (Offset < 0 && (uint64_t)-Offset >= Size)) + return markAsDead(User); + + // Clamp the start to the beginning of the allocation. + if (Offset < 0) { + Size -= (uint64_t)-Offset; + Offset = 0; + } + + uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size; + + // Clamp the end offset to the end of the allocation. Note that this is + // formulated to handle even the case where "BeginOffset + Size" overflows. + assert(AllocSize >= BeginOffset); // Established above. + if (Size > AllocSize - BeginOffset) + EndOffset = AllocSize; + + // NB: This only works if we have zero overlapping partitions. 
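The partition lookup that follows relies on the heterogeneous comparisons defined on ByteRange earlier: std::lower_bound compares partitions against a bare begin offset, and the code then steps back one slot in case the previous partition still covers that offset. A standalone sketch of the same search over disjoint half-open ranges (Range and findFirstOverlap are illustrative names):

#include <algorithm>
#include <cstdint>
#include <vector>

struct Range { uint64_t Begin, End; };   // Half-open, sorted, disjoint.

// Needed by std::lower_bound to compare a Range against a bare offset.
static bool operator<(const Range &LHS, uint64_t RHSOffset) {
  return LHS.Begin < RHSOffset;
}

// Find the first range that might contain Offset.
static std::vector<Range>::const_iterator
findFirstOverlap(const std::vector<Range> &Rs, uint64_t Offset) {
  std::vector<Range>::const_iterator I =
      std::lower_bound(Rs.begin(), Rs.end(), Offset);
  // lower_bound lands on the first range whose Begin >= Offset; the previous
  // range may still cover Offset, so step back when it does.
  if (I != Rs.begin() && (I - 1)->End > Offset)
    --I;
  return I;
}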
+ iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset); + if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset) + B = llvm::prior(B); + for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset; + ++I) { + PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset), + std::min(I->EndOffset, EndOffset), U); + P.use_push_back(I, NewPU); + if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser())) + P.PHIOrSelectOpMap[U] + = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1); + } + } + + void handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) { + uint64_t Size = TD.getTypeStoreSize(Ty); + + // If this memory access can be shown to *statically* extend outside the + // bounds of of the allocation, it's behavior is undefined, so simply + // ignore it. Note that this is more strict than the generic clamping + // behavior of insertUse. + if (Offset < 0 || (uint64_t)Offset >= AllocSize || + Size > (AllocSize - (uint64_t)Offset)) + return markAsDead(I); + + insertUse(I, Offset, Size); + } + + void visitBitCastInst(BitCastInst &BC) { + if (BC.use_empty()) + return markAsDead(BC); + + enqueueUsers(BC, Offset); + } + + void visitGetElementPtrInst(GetElementPtrInst &GEPI) { + if (GEPI.use_empty()) + return markAsDead(GEPI); + + int64_t GEPOffset; + if (!computeConstantGEPOffset(GEPI, GEPOffset)) + llvm_unreachable("Unable to compute constant offset for use"); + + enqueueUsers(GEPI, GEPOffset); + } + + void visitLoadInst(LoadInst &LI) { + handleLoadOrStore(LI.getType(), LI, Offset); + } + + void visitStoreInst(StoreInst &SI) { + handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset); + } + + void visitMemSetInst(MemSetInst &II) { + ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); + uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset; + insertUse(II, Offset, Size); + } + + void visitMemTransferInst(MemTransferInst &II) { + ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); + uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset; + if (!Size) + return markAsDead(II); + + MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; + if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd && + Offsets.DestBegin == Offsets.SourceBegin) + return markAsDead(II); // Skip identity transfers without side-effects. + + insertUse(II, Offset, Size); + } + + void visitIntrinsicInst(IntrinsicInst &II) { + assert(II.getIntrinsicID() == Intrinsic::lifetime_start || + II.getIntrinsicID() == Intrinsic::lifetime_end); + + ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); + insertUse(II, Offset, + std::min(AllocSize - Offset, Length->getLimitedValue())); + } + + void insertPHIOrSelect(Instruction &User, uint64_t Offset) { + uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first; + + // For PHI and select operands outside the alloca, we can't nuke the entire + // phi or select -- the other side might still be relevant, so we special + // case them here and use a separate structure to track the operands + // themselves which should be replaced with undef. 
+ if (Offset >= AllocSize) { + P.DeadOperands.push_back(U); + return; + } + + insertUse(User, Offset, Size); + } + void visitPHINode(PHINode &PN) { + if (PN.use_empty()) + return markAsDead(PN); + + insertPHIOrSelect(PN, Offset); + } + void visitSelectInst(SelectInst &SI) { + if (SI.use_empty()) + return markAsDead(SI); + + if (Value *Result = foldSelectInst(SI)) { + if (Result == *U) + // If the result of the constant fold will be the pointer, recurse + // through the select as if we had RAUW'ed it. + enqueueUsers(SI, Offset); + else + // Otherwise the operand to the select is dead, and we can replace it + // with undef. + P.DeadOperands.push_back(U); + + return; + } + + insertPHIOrSelect(SI, Offset); + } + + /// \brief Unreachable, we've already visited the alloca once. + void visitInstruction(Instruction &I) { + llvm_unreachable("Unhandled instruction in use builder."); + } +}; + +void AllocaPartitioning::splitAndMergePartitions() { + size_t NumDeadPartitions = 0; + + // Track the range of splittable partitions that we pass when accumulating + // overlapping unsplittable partitions. + uint64_t SplitEndOffset = 0ull; + + Partition New(0ull, 0ull, false); + + for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) { + ++j; + + if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) { + assert(New.BeginOffset == New.EndOffset); + New = Partitions[i]; + } else { + assert(New.IsSplittable); + New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset); + } + assert(New.BeginOffset != New.EndOffset); + + // Scan the overlapping partitions. + while (j != e && New.EndOffset > Partitions[j].BeginOffset) { + // If the new partition we are forming is splittable, stop at the first + // unsplittable partition. + if (New.IsSplittable && !Partitions[j].IsSplittable) + break; + + // Grow the new partition to include any equally splittable range. 'j' is + // always equally splittable when New is splittable, but when New is not + // splittable, we may subsume some (or part of some) splitable partition + // without growing the new one. + if (New.IsSplittable == Partitions[j].IsSplittable) { + New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset); + } else { + assert(!New.IsSplittable); + assert(Partitions[j].IsSplittable); + SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset); + } + + Partitions[j].kill(); + ++NumDeadPartitions; + ++j; + } + + // If the new partition is splittable, chop off the end as soon as the + // unsplittable subsequent partition starts and ensure we eventually cover + // the splittable area. + if (j != e && New.IsSplittable) { + SplitEndOffset = std::max(SplitEndOffset, New.EndOffset); + New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); + } + + // Add the new partition if it differs from the original one and is + // non-empty. We can end up with an empty partition here if it was + // splittable but there is an unsplittable one that starts at the same + // offset. + if (New != Partitions[i]) { + if (New.BeginOffset != New.EndOffset) + Partitions.push_back(New); + // Mark the old one for removal. 
+ Partitions[i].kill(); + ++NumDeadPartitions; + } + + New.BeginOffset = New.EndOffset; + if (!New.IsSplittable) { + New.EndOffset = std::max(New.EndOffset, SplitEndOffset); + if (j != e && !Partitions[j].IsSplittable) + New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset); + New.IsSplittable = true; + // If there is a trailing splittable partition which won't be fused into + // the next splittable partition go ahead and add it onto the partitions + // list. + if (New.BeginOffset < New.EndOffset && + (j == e || !Partitions[j].IsSplittable || + New.EndOffset < Partitions[j].BeginOffset)) { + Partitions.push_back(New); + New.BeginOffset = New.EndOffset = 0ull; + } + } + } + + // Re-sort the partitions now that they have been split and merged into + // disjoint set of partitions. Also remove any of the dead partitions we've + // replaced in the process. + std::sort(Partitions.begin(), Partitions.end()); + if (NumDeadPartitions) { + assert(Partitions.back().isDead()); + assert((ptrdiff_t)NumDeadPartitions == + std::count(Partitions.begin(), Partitions.end(), Partitions.back())); + } + Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end()); +} + +AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI) + : +#ifndef NDEBUG + AI(AI), +#endif + PointerEscapingInstr(0) { + PartitionBuilder PB(TD, AI, *this); + if (!PB()) + return; + + // Sort the uses. This arranges for the offsets to be in ascending order, + // and the sizes to be in descending order. + std::sort(Partitions.begin(), Partitions.end()); + + // Remove any partitions from the back which are marked as dead. + while (!Partitions.empty() && Partitions.back().isDead()) + Partitions.pop_back(); + + if (Partitions.size() > 1) { + // Intersect splittability for all partitions with equal offsets and sizes. + // Then remove all but the first so that we have a sequence of non-equal but + // potentially overlapping partitions. + for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E; + I = J) { + ++J; + while (J != E && *I == *J) { + I->IsSplittable &= J->IsSplittable; + ++J; + } + } + Partitions.erase(std::unique(Partitions.begin(), Partitions.end()), + Partitions.end()); + + // Split splittable and merge unsplittable partitions into a disjoint set + // of partitions over the used space of the allocation. + splitAndMergePartitions(); + } + + // Now build up the user lists for each of these disjoint partitions by + // re-walking the recursive users of the alloca. + Uses.resize(Partitions.size()); + UseBuilder UB(TD, AI, *this); + UB(); +} + +Type *AllocaPartitioning::getCommonType(iterator I) const { + Type *Ty = 0; + for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { + if (!UI->U) + continue; // Skip dead uses. + if (isa<IntrinsicInst>(*UI->U->getUser())) + continue; + if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset) + continue; + + Type *UserTy = 0; + if (LoadInst *LI = dyn_cast<LoadInst>(UI->U->getUser())) { + UserTy = LI->getType(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(UI->U->getUser())) { + UserTy = SI->getValueOperand()->getType(); + } else { + return 0; // Bail if we have weird uses. + } + + if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { + // If the type is larger than the partition, skip it. We only encounter + // this for split integer operations where we want to use the type of the + // entity causing the split. 
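To make splitAndMergePartitions above concrete (offsets invented for the example): a splittable [0,16) partition from a memset overlapped by an unsplittable [4,8) partition from a store merges into the disjoint cover [0,4) splittable, [4,8) unsplittable, [8,16) splittable. Expressed as data, with P as an illustrative stand-in type:

#include <cstdint>

struct P { uint64_t Begin, End; bool Splittable; };

// Builder output (overlapping): a memset over the whole 16-byte alloca plus a
// 4-byte store at offset 4.
static const P Before[] = { {0, 16, true}, {4, 8, false} };

// After splitAndMergePartitions: a disjoint cover of the used space.
static const P After[]  = { {0, 4, true}, {4, 8, false}, {8, 16, true} };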
+ if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8) + continue; + + // If we have found an integer type use covering the alloca, use that + // regardless of the other types, as integers are often used for a "bucket + // of bits" type. + return ITy; + } + + if (Ty && Ty != UserTy) + return 0; + + Ty = UserTy; + } + return Ty; +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + +void AllocaPartitioning::print(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + OS << Indent << "partition #" << (I - begin()) + << " [" << I->BeginOffset << "," << I->EndOffset << ")" + << (I->IsSplittable ? " (splittable)" : "") + << (Uses[I - begin()].empty() ? " (zero uses)" : "") + << "\n"; +} + +void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, + StringRef Indent) const { + for (const_use_iterator UI = use_begin(I), UE = use_end(I); + UI != UE; ++UI) { + if (!UI->U) + continue; // Skip dead uses. + OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " + << "used by: " << *UI->U->getUser() << "\n"; + if (MemTransferInst *II = dyn_cast<MemTransferInst>(UI->U->getUser())) { + const MemTransferOffsets &MTO = MemTransferInstData.lookup(II); + bool IsDest; + if (!MTO.IsSplittable) + IsDest = UI->BeginOffset == MTO.DestBegin; + else + IsDest = MTO.DestBegin != 0u; + OS << Indent << " (original " << (IsDest ? "dest" : "source") << ": " + << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin) + << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n"; + } + } +} + +void AllocaPartitioning::print(raw_ostream &OS) const { + if (PointerEscapingInstr) { + OS << "No partitioning for alloca: " << AI << "\n" + << " A pointer to this alloca escaped by:\n" + << " " << *PointerEscapingInstr << "\n"; + return; + } + + OS << "Partitioning of alloca: " << AI << "\n"; + unsigned Num = 0; + for (const_iterator I = begin(), E = end(); I != E; ++I, ++Num) { + print(OS, I); + printUsers(OS, I); + } +} + +void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); } +void AllocaPartitioning::dump() const { print(dbgs()); } + +#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + + +namespace { +/// \brief Implementation of LoadAndStorePromoter for promoting allocas. +/// +/// This subclass of LoadAndStorePromoter adds overrides to handle promoting +/// the loads and stores of an alloca instruction, as well as updating its +/// debug information. This is used when a domtree is unavailable and thus +/// mem2reg in its full form can't be used to handle promotion of allocas to +/// scalar values. +class AllocaPromoter : public LoadAndStorePromoter { + AllocaInst &AI; + DIBuilder &DIB; + + SmallVector<DbgDeclareInst *, 4> DDIs; + SmallVector<DbgValueInst *, 4> DVIs; + +public: + AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, + AllocaInst &AI, DIBuilder &DIB) + : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} + + void run(const SmallVectorImpl<Instruction*> &Insts) { + // Remember which alloca we're promoting (for isInstInList). 
+ if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { + for (Value::use_iterator UI = DebugNode->use_begin(), + UE = DebugNode->use_end(); + UI != UE; ++UI) + if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI)) + DDIs.push_back(DDI); + else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI)) + DVIs.push_back(DVI); + } + + LoadAndStorePromoter::run(Insts); + AI.eraseFromParent(); + while (!DDIs.empty()) + DDIs.pop_back_val()->eraseFromParent(); + while (!DVIs.empty()) + DVIs.pop_back_val()->eraseFromParent(); + } + + virtual bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &Insts) const { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->getOperand(0) == &AI; + return cast<StoreInst>(I)->getPointerOperand() == &AI; + } + + virtual void updateDebugInfo(Instruction *Inst) const { + for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), + E = DDIs.end(); I != E; ++I) { + DbgDeclareInst *DDI = *I; + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + ConvertDebugDeclareToDebugValue(DDI, SI, DIB); + else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) + ConvertDebugDeclareToDebugValue(DDI, LI, DIB); + } + for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), + E = DVIs.end(); I != E; ++I) { + DbgValueInst *DVI = *I; + Value *Arg = NULL; + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + // If an argument is zero extended then use argument directly. The ZExt + // may be zapped by an optimization pass in future. + if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) + Arg = dyn_cast<Argument>(ZExt->getOperand(0)); + if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) + Arg = dyn_cast<Argument>(SExt->getOperand(0)); + if (!Arg) + Arg = SI->getOperand(0); + } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + Arg = LI->getOperand(0); + } else { + continue; + } + Instruction *DbgVal = + DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), + Inst); + DbgVal->setDebugLoc(DVI->getDebugLoc()); + } + } +}; +} // end anon namespace + + +namespace { +/// \brief An optimization pass providing Scalar Replacement of Aggregates. +/// +/// This pass takes allocations which can be completely analyzed (that is, they +/// don't escape) and tries to turn them into scalar SSA values. There are +/// a few steps to this process. +/// +/// 1) It takes allocations of aggregates and analyzes the ways in which they +/// are used to try to split them into smaller allocations, ideally of +/// a single scalar data type. It will split up memcpy and memset accesses +/// as necessary and try to isolate invidual scalar accesses. +/// 2) It will transform accesses into forms which are suitable for SSA value +/// promotion. This can be replacing a memset with a scalar store of an +/// integer value, or it can involve speculating operations on a PHI or +/// select to be a PHI or select of the results. +/// 3) Finally, this will try to detect a pattern of accesses which map cleanly +/// onto insert and extract operations on a vector value, and convert them to +/// this form. By doing so, it will enable promotion of vector aggregates to +/// SSA vector values. +class SROA : public FunctionPass { + const bool RequiresDomTree; + + LLVMContext *C; + const DataLayout *TD; + DominatorTree *DT; + + /// \brief Worklist of alloca instructions to simplify. + /// + /// Each alloca in the function is added to this. 
Each new alloca formed gets + /// added to it as well to recursively simplify unless that alloca can be + /// directly promoted. Finally, each time we rewrite a use of an alloca other + /// the one being actively rewritten, we add it back onto the list if not + /// already present to ensure it is re-visited. + SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist; + + /// \brief A collection of instructions to delete. + /// We try to batch deletions to simplify code and make things a bit more + /// efficient. + SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts; + + /// \brief Post-promotion worklist. + /// + /// Sometimes we discover an alloca which has a high probability of becoming + /// viable for SROA after a round of promotion takes place. In those cases, + /// the alloca is enqueued here for re-processing. + /// + /// Note that we have to be very careful to clear allocas out of this list in + /// the event they are deleted. + SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist; + + /// \brief A collection of alloca instructions we can directly promote. + std::vector<AllocaInst *> PromotableAllocas; + +public: + SROA(bool RequiresDomTree = true) + : FunctionPass(ID), RequiresDomTree(RequiresDomTree), + C(0), TD(0), DT(0) { + initializeSROAPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F); + void getAnalysisUsage(AnalysisUsage &AU) const; + + const char *getPassName() const { return "SROA"; } + static char ID; + +private: + friend class PHIOrSelectSpeculator; + friend class AllocaPartitionRewriter; + friend class AllocaPartitionVectorRewriter; + + bool rewriteAllocaPartition(AllocaInst &AI, + AllocaPartitioning &P, + AllocaPartitioning::iterator PI); + bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P); + bool runOnAlloca(AllocaInst &AI); + void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas); + bool promoteAllocas(Function &F); +}; +} + +char SROA::ID = 0; + +FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { + return new SROA(RequiresDomTree); +} + +INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", + false, false) + +namespace { +/// \brief Visitor to speculate PHIs and Selects where possible. +class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> { + // Befriend the base class so it can delegate to private visit methods. + friend class llvm::InstVisitor<PHIOrSelectSpeculator>; + + const DataLayout &TD; + AllocaPartitioning &P; + SROA &Pass; + +public: + PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass) + : TD(TD), P(P), Pass(Pass) {} + + /// \brief Visit the users of an alloca partition and rewrite them. + void visitUsers(AllocaPartitioning::const_iterator PI) { + // Note that we need to use an index here as the underlying vector of uses + // may be grown during speculation. However, we never need to re-visit the + // new uses, and so we can use the initial size bound. + for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) { + const AllocaPartitioning::PartitionUse &PU = P.getUse(PI, Idx); + if (!PU.U) + continue; // Skip dead use. + + visit(cast<Instruction>(PU.U->getUser())); + } + } + +private: + // By default, skip this instruction. 
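Since the patch exposes the pass through createSROAPass above, a client schedules it like any other function pass. A minimal sketch against the LLVM 3.2-era legacy pass manager (module creation and error handling elided; runSROAOnModule is an illustrative helper, not part of the patch):

#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Transforms/Scalar.h"

// Run the new SROA pass over every defined function in a module.
static void runSROAOnModule(llvm::Module &M) {
  llvm::FunctionPassManager FPM(&M);
  FPM.add(llvm::createSROAPass(/*RequiresDomTree=*/true));
  FPM.doInitialization();
  for (llvm::Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
    if (!F->isDeclaration())
      FPM.run(*F);
  FPM.doFinalization();
}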
+ void visitInstruction(Instruction &I) {} + + /// PHI instructions that use an alloca and are subsequently loaded can be + /// rewritten to load both input pointers in the pred blocks and then PHI the + /// results, allowing the load of the alloca to be promoted. + /// From this: + /// %P2 = phi [i32* %Alloca, i32* %Other] + /// %V = load i32* %P2 + /// to: + /// %V1 = load i32* %Alloca -> will be mem2reg'd + /// ... + /// %V2 = load i32* %Other + /// ... + /// %V = phi [i32 %V1, i32 %V2] + /// + /// We can do this to a select if its only uses are loads and if the operands + /// to the select can be loaded unconditionally. + /// + /// FIXME: This should be hoisted into a generic utility, likely in + /// Transforms/Util/Local.h + bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) { + // For now, we can only do this promotion if the load is in the same block + // as the PHI, and if there are no stores between the phi and load. + // TODO: Allow recursive phi users. + // TODO: Allow stores. + BasicBlock *BB = PN.getParent(); + unsigned MaxAlign = 0; + for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); + UI != UE; ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || !LI->isSimple()) return false; + + // For now we only allow loads in the same block as the PHI. This is + // a common case that happens when instcombine merges two loads through + // a PHI. + if (LI->getParent() != BB) return false; + + // Ensure that there are no instructions between the PHI and the load that + // could store. + for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) + if (BBI->mayWriteToMemory()) + return false; + + MaxAlign = std::max(MaxAlign, LI->getAlignment()); + Loads.push_back(LI); + } + + // We can only transform this if it is safe to push the loads into the + // predecessor blocks. The only thing to watch out for is that we can't put + // a possibly trapping load in the predecessor if it is a critical edge. + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; + ++Idx) { + TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); + Value *InVal = PN.getIncomingValue(Idx); + + // If the value is produced by the terminator of the predecessor (an + // invoke) or it has side-effects, there is no valid place to put a load + // in the predecessor. + if (TI == InVal || TI->mayHaveSideEffects()) + return false; + + // If the predecessor has a single successor, then the edge isn't + // critical. + if (TI->getNumSuccessors() == 1) + continue; + + // If this pointer is always safe to load, or if we can prove that there + // is already a load in the block, then we can move the load to the pred + // block. + if (InVal->isDereferenceablePointer() || + isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD)) + continue; + + return false; + } + + return true; + } + + void visitPHINode(PHINode &PN) { + DEBUG(dbgs() << " original: " << PN << "\n"); + + SmallVector<LoadInst *, 4> Loads; + if (!isSafePHIToSpeculate(PN, Loads)) + return; + + assert(!Loads.empty()); + + Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); + IRBuilder<> PHIBuilder(&PN); + PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), + PN.getName() + ".sroa.speculated"); + + // Get the TBAA tag and alignment to use from one of the loads. It doesn't + // matter which one we get and if any differ, it doesn't matter. 
+ LoadInst *SomeLoad = cast<LoadInst>(Loads.back()); + MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); + unsigned Align = SomeLoad->getAlignment(); + + // Rewrite all loads of the PN to use the new PHI. + do { + LoadInst *LI = Loads.pop_back_val(); + LI->replaceAllUsesWith(NewPN); + Pass.DeadInsts.insert(LI); + } while (!Loads.empty()); + + // Inject loads into all of the pred blocks. + for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { + BasicBlock *Pred = PN.getIncomingBlock(Idx); + TerminatorInst *TI = Pred->getTerminator(); + Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx)); + Value *InVal = PN.getIncomingValue(Idx); + IRBuilder<> PredBuilder(TI); + + LoadInst *Load + = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." + + Pred->getName())); + ++NumLoadsSpeculated; + Load->setAlignment(Align); + if (TBAATag) + Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); + NewPN->addIncoming(Load, Pred); + + Instruction *Ptr = dyn_cast<Instruction>(InVal); + if (!Ptr) + // No uses to rewrite. + continue; + + // Try to lookup and rewrite any partition uses corresponding to this phi + // input. + AllocaPartitioning::iterator PI + = P.findPartitionForPHIOrSelectOperand(InUse); + if (PI == P.end()) + continue; + + // Replace the Use in the PartitionUse for this operand with the Use + // inside the load. + AllocaPartitioning::use_iterator UI + = P.findPartitionUseForPHIOrSelectOperand(InUse); + assert(isa<PHINode>(*UI->U->getUser())); + UI->U = &Load->getOperandUse(Load->getPointerOperandIndex()); + } + DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); + } + + /// Select instructions that use an alloca and are subsequently loaded can be + /// rewritten to load both input pointers and then select between the result, + /// allowing the load of the alloca to be promoted. + /// From this: + /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other + /// %V = load i32* %P2 + /// to: + /// %V1 = load i32* %Alloca -> will be mem2reg'd + /// %V2 = load i32* %Other + /// %V = select i1 %cond, i32 %V1, i32 %V2 + /// + /// We can do this to a select if its only uses are loads and if the operand + /// to the select can be loaded unconditionally. + bool isSafeSelectToSpeculate(SelectInst &SI, + SmallVectorImpl<LoadInst *> &Loads) { + Value *TValue = SI.getTrueValue(); + Value *FValue = SI.getFalseValue(); + bool TDerefable = TValue->isDereferenceablePointer(); + bool FDerefable = FValue->isDereferenceablePointer(); + + for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); + UI != UE; ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || !LI->isSimple()) return false; + + // Both operands to the select need to be dereferencable, either + // absolutely (e.g. allocas) or at this point because we can see other + // accesses to it. + if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI, + LI->getAlignment(), &TD)) + return false; + if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI, + LI->getAlignment(), &TD)) + return false; + Loads.push_back(LI); + } + + return true; + } + + void visitSelectInst(SelectInst &SI) { + DEBUG(dbgs() << " original: " << SI << "\n"); + IRBuilder<> IRB(&SI); + + // If the select isn't safe to speculate, just use simple logic to emit it. 
+ SmallVector<LoadInst *, 4> Loads; + if (!isSafeSelectToSpeculate(SI, Loads)) + return; + + Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) }; + AllocaPartitioning::iterator PIs[2]; + AllocaPartitioning::PartitionUse PUs[2]; + for (unsigned i = 0, e = 2; i != e; ++i) { + PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]); + if (PIs[i] != P.end()) { + // If the pointer is within the partitioning, remove the select from + // its uses. We'll add in the new loads below. + AllocaPartitioning::use_iterator UI + = P.findPartitionUseForPHIOrSelectOperand(Ops[i]); + PUs[i] = *UI; + // Clear out the use here so that the offsets into the use list remain + // stable but this use is ignored when rewriting. + UI->U = 0; + } + } + + Value *TV = SI.getTrueValue(); + Value *FV = SI.getFalseValue(); + // Replace the loads of the select with a select of two loads. + while (!Loads.empty()) { + LoadInst *LI = Loads.pop_back_val(); + + IRB.SetInsertPoint(LI); + LoadInst *TL = + IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); + LoadInst *FL = + IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); + NumLoadsSpeculated += 2; + + // Transfer alignment and TBAA info if present. + TL->setAlignment(LI->getAlignment()); + FL->setAlignment(LI->getAlignment()); + if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { + TL->setMetadata(LLVMContext::MD_tbaa, Tag); + FL->setMetadata(LLVMContext::MD_tbaa, Tag); + } + + Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, + LI->getName() + ".sroa.speculated"); + + LoadInst *Loads[2] = { TL, FL }; + for (unsigned i = 0, e = 2; i != e; ++i) { + if (PIs[i] != P.end()) { + Use *LoadUse = &Loads[i]->getOperandUse(0); + assert(PUs[i].U->get() == LoadUse->get()); + PUs[i].U = LoadUse; + P.use_push_back(PIs[i], PUs[i]); + } + } + + DEBUG(dbgs() << " speculated to: " << *V << "\n"); + LI->replaceAllUsesWith(V); + Pass.DeadInsts.insert(LI); + } + } +}; +} + +/// \brief Accumulate the constant offsets in a GEP into a single APInt offset. +/// +/// If the provided GEP is all-constant, the total byte offset formed by the +/// GEP is computed and Offset is set to it. If the GEP has any non-constant +/// operands, the function returns false and the value of Offset is unmodified. +static bool accumulateGEPOffsets(const DataLayout &TD, GEPOperator &GEP, + APInt &Offset) { + APInt GEPOffset(Offset.getBitWidth(), 0); + for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); + GTI != GTE; ++GTI) { + ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); + if (!OpC) + return false; + if (OpC->isZero()) continue; + + // Handle a struct index, which adds its field offset to the pointer. + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + unsigned ElementIdx = OpC->getZExtValue(); + const StructLayout *SL = TD.getStructLayout(STy); + GEPOffset += APInt(Offset.getBitWidth(), + SL->getElementOffset(ElementIdx)); + continue; + } + + APInt TypeSize(Offset.getBitWidth(), + TD.getTypeAllocSize(GTI.getIndexedType())); + if (VectorType *VTy = dyn_cast<VectorType>(*GTI)) { + assert((VTy->getScalarSizeInBits() % 8) == 0 && + "vector element size is not a multiple of 8, cannot GEP over it"); + TypeSize = VTy->getScalarSizeInBits() / 8; + } + + GEPOffset += OpC->getValue().sextOrTrunc(Offset.getBitWidth()) * TypeSize; + } + Offset = GEPOffset; + return true; +} + +/// \brief Build a GEP out of a base pointer and indices. 
+/// +/// This will return the BasePtr if that is valid, or build a new GEP +/// instruction using the IRBuilder if GEP-ing is needed. +static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr, + SmallVectorImpl<Value *> &Indices, + const Twine &Prefix) { + if (Indices.empty()) + return BasePtr; + + // A single zero index is a no-op, so check for this and avoid building a GEP + // in that case. + if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) + return BasePtr; + + return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx"); +} + +/// \brief Get a natural GEP off of the BasePtr walking through Ty toward +/// TargetTy without changing the offset of the pointer. +/// +/// This routine assumes we've already established a properly offset GEP with +/// Indices, and arrived at the Ty type. The goal is to continue to GEP with +/// zero-indices down through type layers until we find one the same as +/// TargetTy. If we can't find one with the same type, we at least try to use +/// one with the same size. If none of that works, we just produce the GEP as +/// indicated by Indices to have the correct offset. +static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD, + Value *BasePtr, Type *Ty, Type *TargetTy, + SmallVectorImpl<Value *> &Indices, + const Twine &Prefix) { + if (Ty == TargetTy) + return buildGEP(IRB, BasePtr, Indices, Prefix); + + // See if we can descend into a struct and locate a field with the correct + // type. + unsigned NumLayers = 0; + Type *ElementTy = Ty; + do { + if (ElementTy->isPointerTy()) + break; + if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) { + ElementTy = SeqTy->getElementType(); + // Note that we use the default address space as this index is over an + // array or a vector, not a pointer. + Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0))); + } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { + if (STy->element_begin() == STy->element_end()) + break; // Nothing left to descend into. + ElementTy = *STy->element_begin(); + Indices.push_back(IRB.getInt32(0)); + } else { + break; + } + ++NumLayers; + } while (ElementTy != TargetTy); + if (ElementTy != TargetTy) + Indices.erase(Indices.end() - NumLayers, Indices.end()); + + return buildGEP(IRB, BasePtr, Indices, Prefix); +} + +/// \brief Recursively compute indices for a natural GEP. +/// +/// This is the recursive step for getNaturalGEPWithOffset that walks down the +/// element types adding appropriate indices for the GEP. +static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, + Value *Ptr, Type *Ty, APInt &Offset, + Type *TargetTy, + SmallVectorImpl<Value *> &Indices, + const Twine &Prefix) { + if (Offset == 0) + return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix); + + // We can't recurse through pointer types. + if (Ty->isPointerTy()) + return 0; + + // We try to analyze GEPs over vectors here, but note that these GEPs are + // extremely poorly defined currently. The long-term goal is to remove GEPing + // over a vector from the IR completely. + if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { + unsigned ElementSizeInBits = VecTy->getScalarSizeInBits(); + if (ElementSizeInBits % 8) + return 0; // GEPs over non-multiple of 8 size vector elements are invalid. 
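// The recursion below (and getNaturalGEPWithOffset further down) repeatedly
// peels whole elements off a byte offset. A standalone sketch of that
// arithmetic with made-up sizes, not the APInt-based code in the patch:
#include <cassert>
#include <cstdint>

struct SplitOffset {
  uint64_t Index; // how many whole elements the offset skips
  uint64_t Rest;  // byte offset remaining inside the indexed element
};

static SplitOffset splitByteOffset(uint64_t Offset, uint64_t ElementSize) {
  SplitOffset S;
  S.Index = Offset / ElementSize; // NumSkippedElements in the code above
  S.Rest = Offset - S.Index * ElementSize;
  return S;
}

int main() {
  // A byte offset of 10 over 4-byte elements selects element 2 and leaves a
  // 2-byte offset for the recursion to resolve inside that element.
  SplitOffset S = splitByteOffset(10, 4);
  assert(S.Index == 2 && S.Rest == 2);
  return 0;
}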
+ APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); + APInt NumSkippedElements = Offset.sdiv(ElementSize); + if (NumSkippedElements.ugt(VecTy->getNumElements())) + return 0; + Offset -= NumSkippedElements * ElementSize; + Indices.push_back(IRB.getInt(NumSkippedElements)); + return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), + Offset, TargetTy, Indices, Prefix); + } + + if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { + Type *ElementTy = ArrTy->getElementType(); + APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); + APInt NumSkippedElements = Offset.sdiv(ElementSize); + if (NumSkippedElements.ugt(ArrTy->getNumElements())) + return 0; + + Offset -= NumSkippedElements * ElementSize; + Indices.push_back(IRB.getInt(NumSkippedElements)); + return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + Indices, Prefix); + } + + StructType *STy = dyn_cast<StructType>(Ty); + if (!STy) + return 0; + + const StructLayout *SL = TD.getStructLayout(STy); + uint64_t StructOffset = Offset.getZExtValue(); + if (StructOffset >= SL->getSizeInBytes()) + return 0; + unsigned Index = SL->getElementContainingOffset(StructOffset); + Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); + Type *ElementTy = STy->getElementType(Index); + if (Offset.uge(TD.getTypeAllocSize(ElementTy))) + return 0; // The offset points into alignment padding. + + Indices.push_back(IRB.getInt32(Index)); + return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + Indices, Prefix); +} + +/// \brief Get a natural GEP from a base pointer to a particular offset and +/// resulting in a particular type. +/// +/// The goal is to produce a "natural" looking GEP that works with the existing +/// composite types to arrive at the appropriate offset and element type for +/// a pointer. TargetTy is the element type the returned GEP should point-to if +/// possible. We recurse by decreasing Offset, adding the appropriate index to +/// Indices, and setting Ty to the result subtype. +/// +/// If no natural GEP can be constructed, this function returns null. +static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, + Value *Ptr, APInt Offset, Type *TargetTy, + SmallVectorImpl<Value *> &Indices, + const Twine &Prefix) { + PointerType *Ty = cast<PointerType>(Ptr->getType()); + + // Don't consider any GEPs through an i8* as natural unless the TargetTy is + // an i8. + if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8)) + return 0; + + Type *ElementTy = Ty->getElementType(); + if (!ElementTy->isSized()) + return 0; // We can't GEP through an unsized element. + APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy)); + if (ElementSize == 0) + return 0; // Zero-length arrays can't help us build a natural GEP. + APInt NumSkippedElements = Offset.sdiv(ElementSize); + + Offset -= NumSkippedElements * ElementSize; + Indices.push_back(IRB.getInt(NumSkippedElements)); + return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, + Indices, Prefix); +} + +/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the +/// resulting pointer has PointerTy. +/// +/// This tries very hard to compute a "natural" GEP which arrives at the offset +/// and produces the pointer type desired. Where it cannot, it will try to use +/// the natural GEP to arrive at the offset and bitcast to the type. 
Where that
+/// fails, it will try to use an existing i8* and GEP to the byte offset and
+/// bitcast to the type.
+///
+/// The strategy for finding the more natural GEPs is to peel off layers of the
+/// pointer, walking back through bit casts and GEPs, searching for a base
+/// pointer from which we can compute a natural GEP with the desired
+/// properties. The algorithm tries to fold as many constant indices into
+/// a single GEP as possible, thus making each GEP more independent of the
+/// surrounding code.
+static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD,
+ Value *Ptr, APInt Offset, Type *PointerTy,
+ const Twine &Prefix) {
+ // Even though we don't look through PHI nodes, we could be called on an
+ // instruction in an unreachable block, which may be on a cycle.
+ SmallPtrSet<Value *, 4> Visited;
+ Visited.insert(Ptr);
+ SmallVector<Value *, 4> Indices;
+
+ // We may end up computing an offset pointer that has the wrong type. If we
+ // never are able to compute one directly that has the correct type, we'll
+ // fall back to it, so keep it around here.
+ Value *OffsetPtr = 0;
+
+ // Remember any i8 pointer we come across to re-use if we need to do a raw
+ // byte offset.
+ Value *Int8Ptr = 0;
+ APInt Int8PtrOffset(Offset.getBitWidth(), 0);
+
+ Type *TargetTy = PointerTy->getPointerElementType();
+
+ do {
+ // First fold any existing GEPs into the offset.
+ while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
+ APInt GEPOffset(Offset.getBitWidth(), 0);
+ if (!accumulateGEPOffsets(TD, *GEP, GEPOffset))
+ break;
+ Offset += GEPOffset;
+ Ptr = GEP->getPointerOperand();
+ if (!Visited.insert(Ptr))
+ break;
+ }
+
+ // See if we can perform a natural GEP here.
+ Indices.clear();
+ if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy,
+ Indices, Prefix)) {
+ if (P->getType() == PointerTy) {
+ // Zap any offset pointer that we ended up computing in previous rounds.
+ if (OffsetPtr && OffsetPtr->use_empty())
+ if (Instruction *I = dyn_cast<Instruction>(OffsetPtr))
+ I->eraseFromParent();
+ return P;
+ }
+ if (!OffsetPtr) {
+ OffsetPtr = P;
+ }
+ }
+
+ // Stash this pointer if we've found an i8*.
+ if (Ptr->getType()->isIntegerTy(8)) {
+ Int8Ptr = Ptr;
+ Int8PtrOffset = Offset;
+ }
+
+ // Peel off a layer of the pointer and update the offset appropriately.
+ if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
+ Ptr = cast<Operator>(Ptr)->getOperand(0);
+ } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
+ if (GA->mayBeOverridden())
+ break;
+ Ptr = GA->getAliasee();
+ } else {
+ break;
+ }
+ assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
+ } while (Visited.insert(Ptr));
+
+ if (!OffsetPtr) {
+ if (!Int8Ptr) {
+ Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(),
+ Prefix + ".raw_cast");
+ Int8PtrOffset = Offset;
+ }
+
+ OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
+ IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
+ Prefix + ".raw_idx");
+ }
+ Ptr = OffsetPtr;
+
+ // On the off chance we were targeting i8*, guard the bitcast here.
+ if (Ptr->getType() != PointerTy)
+ Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast");
+
+ return Ptr;
+}
+
+/// \brief Test whether we can convert a value from the old to the new type.
+///
+/// This predicate should be used to guard calls to convertValue in order to
+/// ensure that we only try to convert viable values. The strategy is that we
+/// will peel off single element struct and array wrappings to get to an
+/// underlying value, and convert that value.
+static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { + if (OldTy == NewTy) + return true; + if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) + return false; + if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) + return false; + + if (NewTy->isPointerTy() || OldTy->isPointerTy()) { + if (NewTy->isPointerTy() && OldTy->isPointerTy()) + return true; + if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) + return true; + return false; + } + + return true; +} + +/// \brief Generic routine to convert an SSA value to a value of a different +/// type. +/// +/// This will try various different casting techniques, such as bitcasts, +/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test +/// two types for viability with this routine. +static Value *convertValue(const DataLayout &DL, IRBuilder<> &IRB, Value *V, + Type *Ty) { + assert(canConvertValue(DL, V->getType(), Ty) && + "Value not convertable to type"); + if (V->getType() == Ty) + return V; + if (V->getType()->isIntegerTy() && Ty->isPointerTy()) + return IRB.CreateIntToPtr(V, Ty); + if (V->getType()->isPointerTy() && Ty->isIntegerTy()) + return IRB.CreatePtrToInt(V, Ty); + + return IRB.CreateBitCast(V, Ty); +} + +/// \brief Test whether the given alloca partition can be promoted to a vector. +/// +/// This is a quick test to check whether we can rewrite a particular alloca +/// partition (and its newly formed alloca) into a vector alloca with only +/// whole-vector loads and stores such that it could be promoted to a vector +/// SSA value. We only can ensure this for a limited set of operations, and we +/// don't want to do the rewrites unless we are confident that the result will +/// be promotable, so we have an early test here. +static bool isVectorPromotionViable(const DataLayout &TD, + Type *AllocaTy, + AllocaPartitioning &P, + uint64_t PartitionBeginOffset, + uint64_t PartitionEndOffset, + AllocaPartitioning::const_use_iterator I, + AllocaPartitioning::const_use_iterator E) { + VectorType *Ty = dyn_cast<VectorType>(AllocaTy); + if (!Ty) + return false; + + uint64_t VecSize = TD.getTypeSizeInBits(Ty); + uint64_t ElementSize = Ty->getScalarSizeInBits(); + + // While the definition of LLVM vectors is bitpacked, we don't support sizes + // that aren't byte sized. + if (ElementSize % 8) + return false; + assert((VecSize % 8) == 0 && "vector size not a multiple of element size?"); + VecSize /= 8; + ElementSize /= 8; + + for (; I != E; ++I) { + if (!I->U) + continue; // Skip dead use. + + uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset; + uint64_t BeginIndex = BeginOffset / ElementSize; + if (BeginIndex * ElementSize != BeginOffset || + BeginIndex >= Ty->getNumElements()) + return false; + uint64_t EndOffset = I->EndOffset - PartitionBeginOffset; + uint64_t EndIndex = EndOffset / ElementSize; + if (EndIndex * ElementSize != EndOffset || + EndIndex > Ty->getNumElements()) + return false; + + // FIXME: We should build shuffle vector instructions to handle + // non-element-sized accesses. 
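// A standalone sketch (illustrative only, not the pass code) of the
// element-boundary test used in this loop: a partition use is vector-friendly
// only when both of its end points land exactly on element boundaries.
#include <cassert>
#include <cstdint>

static bool onElementBoundaries(uint64_t BeginOffset, uint64_t EndOffset,
                                uint64_t ElementSize, uint64_t NumElements) {
  uint64_t BeginIndex = BeginOffset / ElementSize;
  if (BeginIndex * ElementSize != BeginOffset || BeginIndex >= NumElements)
    return false;
  uint64_t EndIndex = EndOffset / ElementSize;
  if (EndIndex * ElementSize != EndOffset || EndIndex > NumElements)
    return false;
  return true;
}

int main() {
  // For a <4 x i32> (4-byte elements): bytes [4,8) are fine, [3,8) are not.
  assert(onElementBoundaries(4, 8, 4, 4));
  assert(!onElementBoundaries(3, 8, 4, 4));
  return 0;
}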
+ if ((EndOffset - BeginOffset) != ElementSize &&
+ (EndOffset - BeginOffset) != VecSize)
+ return false;
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
+ if (MI->isVolatile())
+ return false;
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
+ const AllocaPartitioning::MemTransferOffsets &MTO
+ = P.getMemTransferOffsets(*MTI);
+ if (!MTO.IsSplittable)
+ return false;
+ }
+ } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) {
+ // Disable vector promotion when there are loads or stores of an FCA.
+ return false;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
+ if (SI->isVolatile())
+ return false;
+ } else {
+ return false;
+ }
+ }
+ return true;
+}
+
+/// \brief Test whether the given alloca partition's integer operations can be
+/// widened to promotable ones.
+///
+/// This is a quick test to check whether we can rewrite the integer loads and
+/// stores to a particular alloca into wider loads and stores and be able to
+/// promote the resulting alloca.
+static bool isIntegerWideningViable(const DataLayout &TD,
+ Type *AllocaTy,
+ uint64_t AllocBeginOffset,
+ AllocaPartitioning &P,
+ AllocaPartitioning::const_use_iterator I,
+ AllocaPartitioning::const_use_iterator E) {
+ uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy);
+
+ // Don't try to handle allocas with bit-padding.
+ if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy))
+ return false;
+
+ // We need to ensure that an integer type with the appropriate bitwidth can
+ // be converted to the alloca type, whatever that is. We don't want to force
+ // the alloca itself to have an integer type if there is a more suitable one.
+ Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
+ if (!canConvertValue(TD, AllocaTy, IntTy) ||
+ !canConvertValue(TD, IntTy, AllocaTy))
+ return false;
+
+ uint64_t Size = TD.getTypeStoreSize(AllocaTy);
+
+ // Check the uses to ensure they are (likely) promotable integer uses.
+ // Also ensure that the alloca has a covering load or store. We don't want
+ // to widen the integer operations only to fail to promote due to some other
+ // unsplittable entry (which we may make splittable later).
+ bool WholeAllocaOp = false;
+ for (; I != E; ++I) {
+ if (!I->U)
+ continue; // Skip dead use.
+
+ uint64_t RelBegin = I->BeginOffset - AllocBeginOffset;
+ uint64_t RelEnd = I->EndOffset - AllocBeginOffset;
+
+ // We can't reasonably handle cases where the load or store extends past
+ // the end of the alloca's type and into its padding.
+ if (RelEnd > Size)
+ return false;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ if (RelBegin == 0 && RelEnd == Size)
+ WholeAllocaOp = true;
+ if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
+ if (ITy->getBitWidth() < TD.getTypeStoreSize(ITy))
+ return false;
+ continue;
+ }
+ // Non-integer loads need to be convertible from the alloca type so that
+ // they are promotable.
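// A simplified standalone sketch (not the real pass logic) of the shape of
// this check: each use reduces to a relative byte range inside the alloca,
// every range must stay in bounds, and at least one load or store must cover
// the whole alloca before widening is considered worthwhile.
#include <cassert>
#include <cstdint>
#include <vector>

struct UseRange { uint64_t RelBegin, RelEnd; };

static bool wideningLooksViable(const std::vector<UseRange> &Uses,
                                uint64_t AllocaSize) {
  bool WholeAllocaOp = false;
  for (const UseRange &U : Uses) {
    if (U.RelEnd > AllocaSize)
      return false;                            // extends into padding
    if (U.RelBegin == 0 && U.RelEnd == AllocaSize)
      WholeAllocaOp = true;                    // a covering load or store
  }
  return WholeAllocaOp;
}

int main() {
  // An 8-byte alloca with a store of bytes [4,8) and a covering 8-byte load
  // passes; without the covering access it does not.
  assert(wideningLooksViable({{4, 8}, {0, 8}}, 8));
  assert(!wideningLooksViable({{4, 8}}, 8));
  return 0;
}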
+ if (RelBegin != 0 || RelEnd != Size || + !canConvertValue(TD, AllocaTy, LI->getType())) + return false; + } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) { + Type *ValueTy = SI->getValueOperand()->getType(); + if (SI->isVolatile()) + return false; + if (RelBegin == 0 && RelEnd == Size) + WholeAllocaOp = true; + if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { + if (ITy->getBitWidth() < TD.getTypeStoreSize(ITy)) + return false; + continue; + } + // Non-integer stores need to be convertible to the alloca type so that + // they are promotable. + if (RelBegin != 0 || RelEnd != Size || + !canConvertValue(TD, ValueTy, AllocaTy)) + return false; + } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) { + if (MI->isVolatile()) + return false; + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) { + const AllocaPartitioning::MemTransferOffsets &MTO + = P.getMemTransferOffsets(*MTI); + if (!MTO.IsSplittable) + return false; + } + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->U->getUser())) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) + return false; + } else { + return false; + } + } + return WholeAllocaOp; +} + +static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V, + IntegerType *Ty, uint64_t Offset, + const Twine &Name) { + DEBUG(dbgs() << " start: " << *V << "\n"); + IntegerType *IntTy = cast<IntegerType>(V->getType()); + assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && + "Element extends past full value"); + uint64_t ShAmt = 8*Offset; + if (DL.isBigEndian()) + ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + if (ShAmt) { + V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); + DEBUG(dbgs() << " shifted: " << *V << "\n"); + } + assert(Ty->getBitWidth() <= IntTy->getBitWidth() && + "Cannot extract to a larger integer!"); + if (Ty != IntTy) { + V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); + DEBUG(dbgs() << " trunced: " << *V << "\n"); + } + return V; +} + +static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old, + Value *V, uint64_t Offset, const Twine &Name) { + IntegerType *IntTy = cast<IntegerType>(Old->getType()); + IntegerType *Ty = cast<IntegerType>(V->getType()); + assert(Ty->getBitWidth() <= IntTy->getBitWidth() && + "Cannot insert a larger integer!"); + DEBUG(dbgs() << " start: " << *V << "\n"); + if (Ty != IntTy) { + V = IRB.CreateZExt(V, IntTy, Name + ".ext"); + DEBUG(dbgs() << " extended: " << *V << "\n"); + } + assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && + "Element store outside of alloca store"); + uint64_t ShAmt = 8*Offset; + if (DL.isBigEndian()) + ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); + if (ShAmt) { + V = IRB.CreateShl(V, ShAmt, Name + ".shift"); + DEBUG(dbgs() << " shifted: " << *V << "\n"); + } + + if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { + APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); + Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); + DEBUG(dbgs() << " masked: " << *Old << "\n"); + V = IRB.CreateOr(Old, V, Name + ".insert"); + DEBUG(dbgs() << " inserted: " << *V << "\n"); + } + return V; +} + +namespace { +/// \brief Visitor to rewrite instructions using a partition of an alloca to +/// use a new alloca. +/// +/// Also implements the rewriting to vector-based accesses when the partition +/// passes the isVectorPromotionViable predicate. 
Most of the rewriting logic +/// lives here. +class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, + bool> { + // Befriend the base class so it can delegate to private visit methods. + friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>; + + const DataLayout &TD; + AllocaPartitioning &P; + SROA &Pass; + AllocaInst &OldAI, &NewAI; + const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; + Type *NewAllocaTy; + + // If we are rewriting an alloca partition which can be written as pure + // vector operations, we stash extra information here. When VecTy is + // non-null, we have some strict guarantees about the rewriten alloca: + // - The new alloca is exactly the size of the vector type here. + // - The accesses all either map to the entire vector or to a single + // element. + // - The set of accessing instructions is only one of those handled above + // in isVectorPromotionViable. Generally these are the same access kinds + // which are promotable via mem2reg. + VectorType *VecTy; + Type *ElementTy; + uint64_t ElementSize; + + // This is a convenience and flag variable that will be null unless the new + // alloca's integer operations should be widened to this integer type due to + // passing isIntegerWideningViable above. If it is non-null, the desired + // integer type will be stored here for easy access during rewriting. + IntegerType *IntTy; + + // The offset of the partition user currently being rewritten. + uint64_t BeginOffset, EndOffset; + Use *OldUse; + Instruction *OldPtr; + + // The name prefix to use when rewriting instructions for this alloca. + std::string NamePrefix; + +public: + AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P, + AllocaPartitioning::iterator PI, + SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI, + uint64_t NewBeginOffset, uint64_t NewEndOffset) + : TD(TD), P(P), Pass(Pass), + OldAI(OldAI), NewAI(NewAI), + NewAllocaBeginOffset(NewBeginOffset), + NewAllocaEndOffset(NewEndOffset), + NewAllocaTy(NewAI.getAllocatedType()), + VecTy(), ElementTy(), ElementSize(), IntTy(), + BeginOffset(), EndOffset() { + } + + /// \brief Visit the users of the alloca partition and rewrite them. + bool visitUsers(AllocaPartitioning::const_use_iterator I, + AllocaPartitioning::const_use_iterator E) { + if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P, + NewAllocaBeginOffset, NewAllocaEndOffset, + I, E)) { + ++NumVectorized; + VecTy = cast<VectorType>(NewAI.getAllocatedType()); + ElementTy = VecTy->getElementType(); + assert((VecTy->getScalarSizeInBits() % 8) == 0 && + "Only multiple-of-8 sized vector elements are viable"); + ElementSize = VecTy->getScalarSizeInBits() / 8; + } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(), + NewAllocaBeginOffset, P, I, E)) { + IntTy = Type::getIntNTy(NewAI.getContext(), + TD.getTypeSizeInBits(NewAI.getAllocatedType())); + } + bool CanSROA = true; + for (; I != E; ++I) { + if (!I->U) + continue; // Skip dead uses. + BeginOffset = I->BeginOffset; + EndOffset = I->EndOffset; + OldUse = I->U; + OldPtr = cast<Instruction>(I->U->get()); + NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str(); + CanSROA &= visit(cast<Instruction>(I->U->getUser())); + } + if (VecTy) { + assert(CanSROA); + VecTy = 0; + ElementTy = 0; + ElementSize = 0; + } + if (IntTy) { + assert(CanSROA); + IntTy = 0; + } + return CanSROA; + } + +private: + // Every instruction which can end up as a user must have a rewrite rule. + bool visitInstruction(Instruction &I) { + DEBUG(dbgs() << " !!!! 
Cannot rewrite: " << I << "\n"); + llvm_unreachable("No rewrite rule for this instruction!"); + } + + Twine getName(const Twine &Suffix) { + return NamePrefix + Suffix; + } + + Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) { + assert(BeginOffset >= NewAllocaBeginOffset); + APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset); + return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName("")); + } + + /// \brief Compute suitable alignment to access an offset into the new alloca. + unsigned getOffsetAlign(uint64_t Offset) { + unsigned NewAIAlign = NewAI.getAlignment(); + if (!NewAIAlign) + NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType()); + return MinAlign(NewAIAlign, Offset); + } + + /// \brief Compute suitable alignment to access this partition of the new + /// alloca. + unsigned getPartitionAlign() { + return getOffsetAlign(BeginOffset - NewAllocaBeginOffset); + } + + /// \brief Compute suitable alignment to access a type at an offset of the + /// new alloca. + /// + /// \returns zero if the type's ABI alignment is a suitable alignment, + /// otherwise returns the maximal suitable alignment. + unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) { + unsigned Align = getOffsetAlign(Offset); + return Align == TD.getABITypeAlignment(Ty) ? 0 : Align; + } + + /// \brief Compute suitable alignment to access a type at the beginning of + /// this partition of the new alloca. + /// + /// See \c getOffsetTypeAlign for details; this routine delegates to it. + unsigned getPartitionTypeAlign(Type *Ty) { + return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset); + } + + ConstantInt *getIndex(IRBuilder<> &IRB, uint64_t Offset) { + assert(VecTy && "Can only call getIndex when rewriting a vector"); + uint64_t RelOffset = Offset - NewAllocaBeginOffset; + assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); + uint32_t Index = RelOffset / ElementSize; + assert(Index * ElementSize == RelOffset); + return IRB.getInt32(Index); + } + + void deleteIfTriviallyDead(Value *V) { + Instruction *I = cast<Instruction>(V); + if (isInstructionTriviallyDead(I)) + Pass.DeadInsts.insert(I); + } + + Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) { + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + if (LI.getType() == VecTy->getElementType() || + BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) { + V = IRB.CreateExtractElement(V, getIndex(IRB, BeginOffset), + getName(".extract")); + } + return V; + } + + Value *rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) { + assert(IntTy && "We cannot insert an integer to the alloca"); + assert(!LI.isVolatile()); + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + V = convertValue(TD, IRB, V, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + if (Offset > 0 || EndOffset < NewAllocaEndOffset) + V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset, + getName(".extract")); + return V; + } + + bool visitLoadInst(LoadInst &LI) { + DEBUG(dbgs() << " original: " << LI << "\n"); + Value *OldOp = LI.getOperand(0); + assert(OldOp == OldPtr); + IRBuilder<> IRB(&LI); + + uint64_t Size = EndOffset - BeginOffset; + bool IsSplitIntLoad = Size < TD.getTypeStoreSize(LI.getType()); + + // If this memory access can be shown to *statically* extend outside the + // bounds of the original 
allocation, its behavior is undefined. Rather
+ // than trying to transform it, just replace it with undef.
+ // FIXME: We should do something more clever for functions being
+ // instrumented by asan.
+ // FIXME: Eventually, once ASan and friends can flush out bugs here, this
+ // should be transformed to a load of null making it unreachable.
+ uint64_t OldAllocSize = TD.getTypeAllocSize(OldAI.getAllocatedType());
+ if (TD.getTypeStoreSize(LI.getType()) > OldAllocSize) {
+ LI.replaceAllUsesWith(UndefValue::get(LI.getType()));
+ Pass.DeadInsts.insert(&LI);
+ deleteIfTriviallyDead(OldOp);
+ DEBUG(dbgs() << " to: undef!!\n");
+ return true;
+ }
+
+ Type *TargetTy = IsSplitIntLoad ? Type::getIntNTy(LI.getContext(), Size * 8)
+ : LI.getType();
+ bool IsPtrAdjusted = false;
+ Value *V;
+ if (VecTy) {
+ V = rewriteVectorizedLoadInst(IRB, LI, OldOp);
+ } else if (IntTy && LI.getType()->isIntegerTy()) {
+ V = rewriteIntegerLoad(IRB, LI);
+ } else if (BeginOffset == NewAllocaBeginOffset &&
+ canConvertValue(TD, NewAllocaTy, LI.getType())) {
+ V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ LI.isVolatile(), getName(".load"));
+ } else {
+ Type *LTy = TargetTy->getPointerTo();
+ V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy),
+ getPartitionTypeAlign(TargetTy),
+ LI.isVolatile(), getName(".load"));
+ IsPtrAdjusted = true;
+ }
+ V = convertValue(TD, IRB, V, TargetTy);
+
+ if (IsSplitIntLoad) {
+ assert(!LI.isVolatile());
+ assert(LI.getType()->isIntegerTy() &&
+ "Only integer type loads and stores are split");
+ assert(LI.getType()->getIntegerBitWidth() ==
+ TD.getTypeStoreSizeInBits(LI.getType()) &&
+ "Non-byte-multiple bit width");
+ assert(LI.getType()->getIntegerBitWidth() ==
+ TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) &&
+ "Only alloca-wide loads can be split and recomposed");
+ // Move the insertion point just past the load so that we can refer to it.
+ IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI)));
+ // Create a placeholder value with the same type as LI to use as the
+ // basis for the new value. This allows us to replace the uses of LI with
+ // the computed value, and then replace the placeholder with LI, leaving
+ // LI only used for this computation.
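// The recomposition below leans on the insertInteger/extractInteger helpers
// defined earlier, which are plain shift-and-mask arithmetic. A fixed-width,
// little-endian standalone sketch of that arithmetic (the real helpers also
// handle big-endian layouts and arbitrary bit widths); not part of the patch:
#include <cassert>
#include <cstdint>

// Extract Bytes bytes starting at byte Offset of a 32-bit value (Bytes <= 4).
static uint32_t extractBytes(uint32_t Whole, unsigned Offset, unsigned Bytes) {
  uint32_t V = Whole >> (8 * Offset);            // shift the field down
  if (Bytes < 4)
    V &= (UINT32_C(1) << (8 * Bytes)) - 1;       // truncate to its width
  return V;
}

// Insert Bytes bytes of V at byte Offset of Old (here Bytes must be 1..3).
static uint32_t insertBytes(uint32_t Old, uint32_t V, unsigned Offset,
                            unsigned Bytes) {
  uint32_t Mask = ((UINT32_C(1) << (8 * Bytes)) - 1) << (8 * Offset);
  return (Old & ~Mask) | ((V << (8 * Offset)) & Mask); // mask, shift, merge
}

int main() {
  uint32_t Whole = 0xAABBCCDD;
  assert(extractBytes(Whole, 1, 2) == 0xBBCC);            // bytes 1 and 2
  assert(insertBytes(Whole, 0x1122, 1, 2) == 0xAA1122DD);
  return 0;
}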
+ Value *Placeholder + = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); + V = insertInteger(TD, IRB, Placeholder, V, BeginOffset, + getName(".insert")); + LI.replaceAllUsesWith(V); + Placeholder->replaceAllUsesWith(&LI); + delete Placeholder; + } else { + LI.replaceAllUsesWith(V); + } + + Pass.DeadInsts.insert(&LI); + deleteIfTriviallyDead(OldOp); + DEBUG(dbgs() << " to: " << *V << "\n"); + return !LI.isVolatile() && !IsPtrAdjusted; + } + + bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, Value *V, + StoreInst &SI, Value *OldOp) { + if (V->getType() == ElementTy || + BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) { + if (V->getType() != ElementTy) + V = convertValue(TD, IRB, V, ElementTy); + LoadInst *LI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + V = IRB.CreateInsertElement(LI, V, getIndex(IRB, BeginOffset), + getName(".insert")); + } else if (V->getType() != VecTy) { + V = convertValue(TD, IRB, V, VecTy); + } + StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); + Pass.DeadInsts.insert(&SI); + + (void)Store; + DEBUG(dbgs() << " to: " << *Store << "\n"); + return true; + } + + bool rewriteIntegerStore(IRBuilder<> &IRB, Value *V, StoreInst &SI) { + assert(IntTy && "We cannot extract an integer from the alloca"); + assert(!SI.isVolatile()); + if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset, + getName(".insert")); + } + V = convertValue(TD, IRB, V, NewAllocaTy); + StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); + Pass.DeadInsts.insert(&SI); + (void)Store; + DEBUG(dbgs() << " to: " << *Store << "\n"); + return true; + } + + bool visitStoreInst(StoreInst &SI) { + DEBUG(dbgs() << " original: " << SI << "\n"); + Value *OldOp = SI.getOperand(1); + assert(OldOp == OldPtr); + IRBuilder<> IRB(&SI); + + Value *V = SI.getValueOperand(); + + // Strip all inbounds GEPs and pointer casts to try to dig out any root + // alloca that should be re-examined after promoting this alloca. 
+ if (V->getType()->isPointerTy()) + if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) + Pass.PostPromotionWorklist.insert(AI); + + uint64_t Size = EndOffset - BeginOffset; + if (Size < TD.getTypeStoreSize(V->getType())) { + assert(!SI.isVolatile()); + assert(V->getType()->isIntegerTy() && + "Only integer type loads and stores are split"); + assert(V->getType()->getIntegerBitWidth() == + TD.getTypeStoreSizeInBits(V->getType()) && + "Non-byte-multiple bit width"); + assert(V->getType()->getIntegerBitWidth() == + TD.getTypeSizeInBits(OldAI.getAllocatedType()) && + "Only alloca-wide stores can be split and recomposed"); + IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8); + V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset, + getName(".extract")); + } + + if (VecTy) + return rewriteVectorizedStoreInst(IRB, V, SI, OldOp); + if (IntTy && V->getType()->isIntegerTy()) + return rewriteIntegerStore(IRB, V, SI); + + StoreInst *NewSI; + if (BeginOffset == NewAllocaBeginOffset && + canConvertValue(TD, V->getType(), NewAllocaTy)) { + V = convertValue(TD, IRB, V, NewAllocaTy); + NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), + SI.isVolatile()); + } else { + Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo()); + NewSI = IRB.CreateAlignedStore(V, NewPtr, + getPartitionTypeAlign(V->getType()), + SI.isVolatile()); + } + (void)NewSI; + Pass.DeadInsts.insert(&SI); + deleteIfTriviallyDead(OldOp); + + DEBUG(dbgs() << " to: " << *NewSI << "\n"); + return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile(); + } + + bool visitMemSetInst(MemSetInst &II) { + DEBUG(dbgs() << " original: " << II << "\n"); + IRBuilder<> IRB(&II); + assert(II.getRawDest() == OldPtr); + + // If the memset has a variable size, it cannot be split, just adjust the + // pointer to the new alloca. + if (!isa<Constant>(II.getLength())) { + II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); + Type *CstTy = II.getAlignmentCst()->getType(); + II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign())); + + deleteIfTriviallyDead(OldPtr); + return false; + } + + // Record this instruction for deletion. + Pass.DeadInsts.insert(&II); + + Type *AllocaTy = NewAI.getAllocatedType(); + Type *ScalarTy = AllocaTy->getScalarType(); + + // If this doesn't map cleanly onto the alloca type, and that type isn't + // a single value type, just emit a memset. + if (!VecTy && !IntTy && + (BeginOffset != NewAllocaBeginOffset || + EndOffset != NewAllocaEndOffset || + !AllocaTy->isSingleValueType() || + !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) { + Type *SizeTy = II.getLength()->getType(); + Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); + CallInst *New + = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB, + II.getRawDest()->getType()), + II.getValue(), Size, getPartitionAlign(), + II.isVolatile()); + (void)New; + DEBUG(dbgs() << " to: " << *New << "\n"); + return false; + } + + // If we can represent this as a simple value, we have to build the actual + // value to store, which requires expanding the byte present in memset to + // a sensible representation for the alloca type. This is essentially + // splatting the byte to a sufficiently wide integer, bitcasting to the + // desired scalar type, and splatting it across any desired vector type. 
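// The splat computed just below multiplies the memset byte by an all-ones
// value divided by 0xFF (i.e. 0x0101...01), which replicates the byte across
// the wider integer. A fixed 32-bit standalone sketch of that arithmetic,
// not part of the patch itself:
#include <cassert>
#include <cstdint>

static uint32_t splatByte(uint8_t Byte) {
  uint32_t Splat = UINT32_MAX / 0xFF;            // 0x01010101
  return static_cast<uint32_t>(Byte) * Splat;    // replicate into every byte
}

int main() {
  assert(splatByte(0xAB) == 0xABABABABu);
  assert(splatByte(0x00) == 0x00000000u);
  return 0;
}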
+ uint64_t Size = EndOffset - BeginOffset; + Value *V = II.getValue(); + IntegerType *VTy = cast<IntegerType>(V->getType()); + Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); + if (Size*8 > VTy->getBitWidth()) + V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")), + ConstantExpr::getUDiv( + Constant::getAllOnesValue(SplatIntTy), + ConstantExpr::getZExt( + Constant::getAllOnesValue(V->getType()), + SplatIntTy)), + getName(".isplat")); + + // If this is an element-wide memset of a vectorizable alloca, insert it. + if (VecTy && (BeginOffset > NewAllocaBeginOffset || + EndOffset < NewAllocaEndOffset)) { + if (V->getType() != ScalarTy) + V = convertValue(TD, IRB, V, ScalarTy); + StoreInst *Store = IRB.CreateAlignedStore( + IRB.CreateInsertElement(IRB.CreateAlignedLoad(&NewAI, + NewAI.getAlignment(), + getName(".load")), + V, getIndex(IRB, BeginOffset), + getName(".insert")), + &NewAI, NewAI.getAlignment()); + (void)Store; + DEBUG(dbgs() << " to: " << *Store << "\n"); + return true; + } + + // If this is a memset on an alloca where we can widen stores, insert the + // set integer. + if (IntTy && (BeginOffset > NewAllocaBeginOffset || + EndOffset < NewAllocaEndOffset)) { + assert(!II.isVolatile()); + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert")); + } + + if (V->getType() != AllocaTy) + V = convertValue(TD, IRB, V, AllocaTy); + + Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), + II.isVolatile()); + (void)New; + DEBUG(dbgs() << " to: " << *New << "\n"); + return !II.isVolatile(); + } + + bool visitMemTransferInst(MemTransferInst &II) { + // Rewriting of memory transfer instructions can be a bit tricky. We break + // them into two categories: split intrinsics and unsplit intrinsics. + + DEBUG(dbgs() << " original: " << II << "\n"); + IRBuilder<> IRB(&II); + + assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr); + bool IsDest = II.getRawDest() == OldPtr; + + const AllocaPartitioning::MemTransferOffsets &MTO + = P.getMemTransferOffsets(II); + + // Compute the relative offset within the transfer. + unsigned IntPtrWidth = TD.getPointerSizeInBits(); + APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin + : MTO.SourceBegin)); + + unsigned Align = II.getAlignment(); + if (Align > 1) + Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(), + MinAlign(II.getAlignment(), getPartitionAlign())); + + // For unsplit intrinsics, we simply modify the source and destination + // pointers in place. This isn't just an optimization, it is a matter of + // correctness. With unsplit intrinsics we may be dealing with transfers + // within a single alloca before SROA ran, or with transfers that have + // a variable length. We may also be dealing with memmove instead of + // memcpy, and so simply updating the pointers is the necessary for us to + // update both source and dest of a single call. + if (!MTO.IsSplittable) { + Value *OldOp = IsDest ? 
II.getRawDest() : II.getRawSource(); + if (IsDest) + II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType())); + else + II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType())); + + Type *CstTy = II.getAlignmentCst()->getType(); + II.setAlignment(ConstantInt::get(CstTy, Align)); + + DEBUG(dbgs() << " to: " << II << "\n"); + deleteIfTriviallyDead(OldOp); + return false; + } + // For split transfer intrinsics we have an incredibly useful assurance: + // the source and destination do not reside within the same alloca, and at + // least one of them does not escape. This means that we can replace + // memmove with memcpy, and we don't need to worry about all manner of + // downsides to splitting and transforming the operations. + + // If this doesn't map cleanly onto the alloca type, and that type isn't + // a single value type, just emit a memcpy. + bool EmitMemCpy + = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset || + EndOffset != NewAllocaEndOffset || + !NewAI.getAllocatedType()->isSingleValueType()); + + // If we're just going to emit a memcpy, the alloca hasn't changed, and the + // size hasn't been shrunk based on analysis of the viable range, this is + // a no-op. + if (EmitMemCpy && &OldAI == &NewAI) { + uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin; + uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd; + // Ensure the start lines up. + assert(BeginOffset == OrigBegin); + (void)OrigBegin; + + // Rewrite the size as needed. + if (EndOffset != OrigEnd) + II.setLength(ConstantInt::get(II.getLength()->getType(), + EndOffset - BeginOffset)); + return false; + } + // Record this instruction for deletion. + Pass.DeadInsts.insert(&II); + + bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && + EndOffset == NewAllocaEndOffset; + bool IsVectorElement = VecTy && !IsWholeAlloca; + uint64_t Size = EndOffset - BeginOffset; + IntegerType *SubIntTy + = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; + + Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() + : II.getRawDest()->getType(); + if (!EmitMemCpy) { + if (IsVectorElement) + OtherPtrTy = VecTy->getElementType()->getPointerTo(); + else if (IntTy && !IsWholeAlloca) + OtherPtrTy = SubIntTy->getPointerTo(); + else + OtherPtrTy = NewAI.getType(); + } + + // Compute the other pointer, folding as much as possible to produce + // a single, simple GEP in most cases. + Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); + OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, + getName("." + OtherPtr->getName())); + + // Strip all inbounds GEPs and pointer casts to try to dig out any root + // alloca that should be re-examined after rewriting this instruction. + if (AllocaInst *AI + = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) + Pass.Worklist.insert(AI); + + if (EmitMemCpy) { + Value *OurPtr + = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() + : II.getRawSource()->getType()); + Type *SizeTy = II.getLength()->getType(); + Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); + + CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr, + IsDest ? OtherPtr : OurPtr, + Size, Align, II.isVolatile()); + (void)New; + DEBUG(dbgs() << " to: " << *New << "\n"); + return false; + } + + // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy + // is equivalent to 1, but that isn't true if we end up rewriting this as + // a load or store. 
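// The alignment computed earlier in this function is the largest power of two
// dividing both the original alignment and the relative byte offset of this
// partition within the transfer. A standalone sketch of that MinAlign-style
// computation with plain integers (illustrative only):
#include <cassert>
#include <cstdint>

static uint64_t minAlign(uint64_t A, uint64_t B) {
  uint64_t Bits = A | B;
  return Bits & (~Bits + 1); // lowest set bit: divides both A and B
}

int main() {
  // A 16-byte aligned memcpy rewritten to start 4 bytes into the transfer
  // can only promise 4-byte alignment for the adjusted pointers.
  assert(minAlign(16, 4) == 4);
  assert(minAlign(8, 8) == 8);
  return 0;
}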
+ if (!Align) + Align = 1; + + Value *SrcPtr = OtherPtr; + Value *DstPtr = &NewAI; + if (!IsDest) + std::swap(SrcPtr, DstPtr); + + Value *Src; + if (IsVectorElement && !IsDest) { + // We have to extract rather than load. + Src = IRB.CreateExtractElement( + IRB.CreateAlignedLoad(SrcPtr, Align, getName(".copyload")), + getIndex(IRB, BeginOffset), + getName(".copyextract")); + } else if (IntTy && !IsWholeAlloca && !IsDest) { + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + Src = convertValue(TD, IRB, Src, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract")); + } else { + Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), + getName(".copyload")); + } + + if (IntTy && !IsWholeAlloca && IsDest) { + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert")); + Src = convertValue(TD, IRB, Src, NewAllocaTy); + } + + if (IsVectorElement && IsDest) { + // We have to insert into a loaded copy before storing. + Src = IRB.CreateInsertElement( + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")), + Src, getIndex(IRB, BeginOffset), + getName(".insert")); + } + + StoreInst *Store = cast<StoreInst>( + IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile())); + (void)Store; + DEBUG(dbgs() << " to: " << *Store << "\n"); + return !II.isVolatile(); + } + + bool visitIntrinsicInst(IntrinsicInst &II) { + assert(II.getIntrinsicID() == Intrinsic::lifetime_start || + II.getIntrinsicID() == Intrinsic::lifetime_end); + DEBUG(dbgs() << " original: " << II << "\n"); + IRBuilder<> IRB(&II); + assert(II.getArgOperand(1) == OldPtr); + + // Record this instruction for deletion. + Pass.DeadInsts.insert(&II); + + ConstantInt *Size + = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), + EndOffset - BeginOffset); + Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType()); + Value *New; + if (II.getIntrinsicID() == Intrinsic::lifetime_start) + New = IRB.CreateLifetimeStart(Ptr, Size); + else + New = IRB.CreateLifetimeEnd(Ptr, Size); + + DEBUG(dbgs() << " to: " << *New << "\n"); + return true; + } + + bool visitPHINode(PHINode &PN) { + DEBUG(dbgs() << " original: " << PN << "\n"); + + // We would like to compute a new pointer in only one place, but have it be + // as local as possible to the PHI. To do that, we re-use the location of + // the old pointer, which necessarily must be in the right position to + // dominate the PHI. + IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr)); + + Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); + // Replace the operands which were using the old pointer. + std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); + + DEBUG(dbgs() << " to: " << PN << "\n"); + deleteIfTriviallyDead(OldPtr); + return false; + } + + bool visitSelectInst(SelectInst &SI) { + DEBUG(dbgs() << " original: " << SI << "\n"); + IRBuilder<> IRB(&SI); + + // Find the operand we need to rewrite here. 
+ bool IsTrueVal = SI.getTrueValue() == OldPtr; + if (IsTrueVal) + assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!"); + else + assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!"); + + Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType()); + SI.setOperand(IsTrueVal ? 1 : 2, NewPtr); + DEBUG(dbgs() << " to: " << SI << "\n"); + deleteIfTriviallyDead(OldPtr); + return false; + } + +}; +} + +namespace { +/// \brief Visitor to rewrite aggregate loads and stores as scalar. +/// +/// This pass aggressively rewrites all aggregate loads and stores on +/// a particular pointer (or any pointer derived from it which we can identify) +/// with scalar loads and stores. +class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { + // Befriend the base class so it can delegate to private visit methods. + friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; + + const DataLayout &TD; + + /// Queue of pointer uses to analyze and potentially rewrite. + SmallVector<Use *, 8> Queue; + + /// Set to prevent us from cycling with phi nodes and loops. + SmallPtrSet<User *, 8> Visited; + + /// The current pointer use being rewritten. This is used to dig up the used + /// value (as opposed to the user). + Use *U; + +public: + AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {} + + /// Rewrite loads and stores through a pointer and all pointers derived from + /// it. + bool rewrite(Instruction &I) { + DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); + enqueueUsers(I); + bool Changed = false; + while (!Queue.empty()) { + U = Queue.pop_back_val(); + Changed |= visit(cast<Instruction>(U->getUser())); + } + return Changed; + } + +private: + /// Enqueue all the users of the given instruction for further processing. + /// This uses a set to de-duplicate users. + void enqueueUsers(Instruction &I) { + for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; + ++UI) + if (Visited.insert(*UI)) + Queue.push_back(&UI.getUse()); + } + + // Conservative default is to not rewrite anything. + bool visitInstruction(Instruction &I) { return false; } + + /// \brief Generic recursive split emission class. + template <typename Derived> + class OpSplitter { + protected: + /// The builder used to form new instructions. + IRBuilder<> IRB; + /// The indices which to be used with insert- or extractvalue to select the + /// appropriate value within the aggregate. + SmallVector<unsigned, 4> Indices; + /// The indices to a GEP instruction which will move Ptr to the correct slot + /// within the aggregate. + SmallVector<Value *, 4> GEPIndices; + /// The base pointer of the original op, used as a base for GEPing the + /// split operations. + Value *Ptr; + + /// Initialize the splitter with an insertion point, Ptr and start with a + /// single zero GEP index. + OpSplitter(Instruction *InsertionPoint, Value *Ptr) + : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {} + + public: + /// \brief Generic recursive split emission routine. + /// + /// This method recursively splits an aggregate op (load or store) into + /// scalar or vector ops. It splits recursively until it hits a single value + /// and emits that single value operation via the template argument. + /// + /// The logic of this routine relies on GEPs and insertvalue and + /// extractvalue all operating with the same fundamental index list, merely + /// formatted differently (GEPs need actual values). + /// + /// \param Ty The type being split recursively into smaller ops. 
+ /// \param Agg The aggregate value being built up or stored, depending on + /// whether this is splitting a load or a store respectively. + void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) { + if (Ty->isSingleValueType()) + return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name); + + if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { + unsigned OldSize = Indices.size(); + (void)OldSize; + for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size; + ++Idx) { + assert(Indices.size() == OldSize && "Did not return to the old size"); + Indices.push_back(Idx); + GEPIndices.push_back(IRB.getInt32(Idx)); + emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx)); + GEPIndices.pop_back(); + Indices.pop_back(); + } + return; + } + + if (StructType *STy = dyn_cast<StructType>(Ty)) { + unsigned OldSize = Indices.size(); + (void)OldSize; + for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size; + ++Idx) { + assert(Indices.size() == OldSize && "Did not return to the old size"); + Indices.push_back(Idx); + GEPIndices.push_back(IRB.getInt32(Idx)); + emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx)); + GEPIndices.pop_back(); + Indices.pop_back(); + } + return; + } + + llvm_unreachable("Only arrays and structs are aggregate loadable types"); + } + }; + + struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { + LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr) + : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} + + /// Emit a leaf load of a single value. This is called at the leaves of the + /// recursive emission to actually load values. + void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { + assert(Ty->isSingleValueType()); + // Load the single value and insert it using the indices. + Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices, + Name + ".gep"), + Name + ".load"); + Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); + DEBUG(dbgs() << " to: " << *Load << "\n"); + } + }; + + bool visitLoadInst(LoadInst &LI) { + assert(LI.getPointerOperand() == *U); + if (!LI.isSimple() || LI.getType()->isSingleValueType()) + return false; + + // We have an aggregate being loaded, split it apart. + DEBUG(dbgs() << " original: " << LI << "\n"); + LoadOpSplitter Splitter(&LI, *U); + Value *V = UndefValue::get(LI.getType()); + Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); + LI.replaceAllUsesWith(V); + LI.eraseFromParent(); + return true; + } + + struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { + StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) + : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} + + /// Emit a leaf store of a single value. This is called at the leaves of the + /// recursive emission to actually produce stores. + void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { + assert(Ty->isSingleValueType()); + // Extract the single value and store it using the indices. + Value *Store = IRB.CreateStore( + IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), + IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); + (void)Store; + DEBUG(dbgs() << " to: " << *Store << "\n"); + } + }; + + bool visitStoreInst(StoreInst &SI) { + if (!SI.isSimple() || SI.getPointerOperand() != *U) + return false; + Value *V = SI.getValueOperand(); + if (V->getType()->isSingleValueType()) + return false; + + // We have an aggregate being stored, split it apart. 
+ DEBUG(dbgs() << " original: " << SI << "\n"); + StoreOpSplitter Splitter(&SI, *U); + Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); + SI.eraseFromParent(); + return true; + } + + bool visitBitCastInst(BitCastInst &BC) { + enqueueUsers(BC); + return false; + } + + bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { + enqueueUsers(GEPI); + return false; + } + + bool visitPHINode(PHINode &PN) { + enqueueUsers(PN); + return false; + } + + bool visitSelectInst(SelectInst &SI) { + enqueueUsers(SI); + return false; + } +}; +} + +/// \brief Strip aggregate type wrapping. +/// +/// This removes no-op aggregate types wrapping an underlying type. It will +/// strip as many layers of types as it can without changing either the type +/// size or the allocated size. +static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { + if (Ty->isSingleValueType()) + return Ty; + + uint64_t AllocSize = DL.getTypeAllocSize(Ty); + uint64_t TypeSize = DL.getTypeSizeInBits(Ty); + + Type *InnerTy; + if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { + InnerTy = ArrTy->getElementType(); + } else if (StructType *STy = dyn_cast<StructType>(Ty)) { + const StructLayout *SL = DL.getStructLayout(STy); + unsigned Index = SL->getElementContainingOffset(0); + InnerTy = STy->getElementType(Index); + } else { + return Ty; + } + + if (AllocSize > DL.getTypeAllocSize(InnerTy) || + TypeSize > DL.getTypeSizeInBits(InnerTy)) + return Ty; + + return stripAggregateTypeWrapping(DL, InnerTy); +} + +/// \brief Try to find a partition of the aggregate type passed in for a given +/// offset and size. +/// +/// This recurses through the aggregate type and tries to compute a subtype +/// based on the offset and size. When the offset and size span a sub-section +/// of an array, it will even compute a new array type for that sub-section, +/// and the same for structs. +/// +/// Note that this routine is very strict and tries to find a partition of the +/// type which produces the *exact* right offset and size. It is not forgiving +/// when the size or offset cause either end of type-based partition to be off. +/// Also, this is a best-effort routine. It is reasonable to give up and not +/// return a type if necessary. +static Type *getTypePartition(const DataLayout &TD, Type *Ty, + uint64_t Offset, uint64_t Size) { + if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size) + return stripAggregateTypeWrapping(TD, Ty); + if (Offset > TD.getTypeAllocSize(Ty) || + (TD.getTypeAllocSize(Ty) - Offset) < Size) + return 0; + + if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { + // We can't partition pointers... + if (SeqTy->isPointerTy()) + return 0; + + Type *ElementTy = SeqTy->getElementType(); + uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); + uint64_t NumSkippedElements = Offset / ElementSize; + if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) + if (NumSkippedElements >= ArrTy->getNumElements()) + return 0; + if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) + if (NumSkippedElements >= VecTy->getNumElements()) + return 0; + Offset -= NumSkippedElements * ElementSize; + + // First check if we need to recurse. + if (Offset > 0 || Size < ElementSize) { + // Bail if the partition ends in a different array element. + if ((Offset + Size) > ElementSize) + return 0; + // Recurse through the element type trying to peel off offset bytes. 
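// A standalone sketch (made-up sizes, not the pass code) of the array case
// handled by the recursion just below: an array sub-type is only produced
// when the partition's offset and size line up exactly with whole elements;
// otherwise the routine gives up.
#include <cassert>
#include <cstdint>

// Returns the element count of the sub-array type, or 0 when the byte range
// does not cover whole elements exactly.
static uint64_t subArrayNumElements(uint64_t Offset, uint64_t Size,
                                    uint64_t ElementSize) {
  if (Offset % ElementSize != 0 || Size % ElementSize != 0)
    return 0;
  return Size / ElementSize;
}

int main() {
  // Bytes [8, 24) of a [10 x i32] can become a [4 x i32] partition type;
  // bytes [6, 24) cannot.
  assert(subArrayNumElements(8, 16, 4) == 4);
  assert(subArrayNumElements(6, 18, 4) == 0);
  return 0;
}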
+ return getTypePartition(TD, ElementTy, Offset, Size); + } + assert(Offset == 0); + + if (Size == ElementSize) + return stripAggregateTypeWrapping(TD, ElementTy); + assert(Size > ElementSize); + uint64_t NumElements = Size / ElementSize; + if (NumElements * ElementSize != Size) + return 0; + return ArrayType::get(ElementTy, NumElements); + } + + StructType *STy = dyn_cast<StructType>(Ty); + if (!STy) + return 0; + + const StructLayout *SL = TD.getStructLayout(STy); + if (Offset >= SL->getSizeInBytes()) + return 0; + uint64_t EndOffset = Offset + Size; + if (EndOffset > SL->getSizeInBytes()) + return 0; + + unsigned Index = SL->getElementContainingOffset(Offset); + Offset -= SL->getElementOffset(Index); + + Type *ElementTy = STy->getElementType(Index); + uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); + if (Offset >= ElementSize) + return 0; // The offset points into alignment padding. + + // See if any partition must be contained by the element. + if (Offset > 0 || Size < ElementSize) { + if ((Offset + Size) > ElementSize) + return 0; + return getTypePartition(TD, ElementTy, Offset, Size); + } + assert(Offset == 0); + + if (Size == ElementSize) + return stripAggregateTypeWrapping(TD, ElementTy); + + StructType::element_iterator EI = STy->element_begin() + Index, + EE = STy->element_end(); + if (EndOffset < SL->getSizeInBytes()) { + unsigned EndIndex = SL->getElementContainingOffset(EndOffset); + if (Index == EndIndex) + return 0; // Within a single element and its padding. + + // Don't try to form "natural" types if the elements don't line up with the + // expected size. + // FIXME: We could potentially recurse down through the last element in the + // sub-struct to find a natural end point. + if (SL->getElementOffset(EndIndex) != EndOffset) + return 0; + + assert(Index < EndIndex); + EE = STy->element_begin() + EndIndex; + } + + // Try to build up a sub-structure. + StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), + STy->isPacked()); + const StructLayout *SubSL = TD.getStructLayout(SubTy); + if (Size != SubSL->getSizeInBytes()) + return 0; // The sub-struct doesn't have quite the size needed. + + return SubTy; +} + +/// \brief Rewrite an alloca partition's users. +/// +/// This routine drives both of the rewriting goals of the SROA pass. It tries +/// to rewrite uses of an alloca partition to be conducive for SSA value +/// promotion. If the partition needs a new, more refined alloca, this will +/// build that new alloca, preserving as much type information as possible, and +/// rewrite the uses of the old alloca to point at the new one and have the +/// appropriate new offsets. It also evaluates how successful the rewrite was +/// at enabling promotion and if it was successful queues the alloca to be +/// promoted. +bool SROA::rewriteAllocaPartition(AllocaInst &AI, + AllocaPartitioning &P, + AllocaPartitioning::iterator PI) { + uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset; + bool IsLive = false; + for (AllocaPartitioning::use_iterator UI = P.use_begin(PI), + UE = P.use_end(PI); + UI != UE && !IsLive; ++UI) + if (UI->U) + IsLive = true; + if (!IsLive) + return false; // No live uses left of this partition. 
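  // Illustrative example (not part of the patch): for
  //
  //   %a = alloca { i32, i32, i64 }
  //
  // where the i64 member is only ever accessed on its own, the partitioning
  // typically yields [0,8) and [8,16); the code below may then rewrite the
  // first partition onto a new { i32, i32 } alloca (via getTypePartition)
  // and the second onto an i64 alloca, each of which can be promoted
  // independently. This assumes a DataLayout with no padding between the
  // two i32 members.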
+ + DEBUG(dbgs() << "Speculating PHIs and selects in partition " + << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n"); + + PHIOrSelectSpeculator Speculator(*TD, P, *this); + DEBUG(dbgs() << " speculating "); + DEBUG(P.print(dbgs(), PI, "")); + Speculator.visitUsers(PI); + + // Try to compute a friendly type for this partition of the alloca. This + // won't always succeed, in which case we fall back to a legal integer type + // or an i8 array of an appropriate size. + Type *AllocaTy = 0; + if (Type *PartitionTy = P.getCommonType(PI)) + if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize) + AllocaTy = PartitionTy; + if (!AllocaTy) + if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(), + PI->BeginOffset, AllocaSize)) + AllocaTy = PartitionTy; + if ((!AllocaTy || + (AllocaTy->isArrayTy() && + AllocaTy->getArrayElementType()->isIntegerTy())) && + TD->isLegalInteger(AllocaSize * 8)) + AllocaTy = Type::getIntNTy(*C, AllocaSize * 8); + if (!AllocaTy) + AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize); + assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize); + + // Check for the case where we're going to rewrite to a new alloca of the + // exact same type as the original, and with the same access offsets. In that + // case, re-use the existing alloca, but still run through the rewriter to + // performe phi and select speculation. + AllocaInst *NewAI; + if (AllocaTy == AI.getAllocatedType()) { + assert(PI->BeginOffset == 0 && + "Non-zero begin offset but same alloca type"); + assert(PI == P.begin() && "Begin offset is zero on later partition"); + NewAI = &AI; + } else { + unsigned Alignment = AI.getAlignment(); + if (!Alignment) { + // The minimum alignment which users can rely on when the explicit + // alignment is omitted or zero is that required by the ABI for this + // type. + Alignment = TD->getABITypeAlignment(AI.getAllocatedType()); + } + Alignment = MinAlign(Alignment, PI->BeginOffset); + // If we will get at least this much alignment from the type alone, leave + // the alloca's alignment unconstrained. + if (Alignment <= TD->getABITypeAlignment(AllocaTy)) + Alignment = 0; + NewAI = new AllocaInst(AllocaTy, 0, Alignment, + AI.getName() + ".sroa." + Twine(PI - P.begin()), + &AI); + ++NumNewAllocas; + } + + DEBUG(dbgs() << "Rewriting alloca partition " + << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: " + << *NewAI << "\n"); + + // Track the high watermark of the post-promotion worklist. We will reset it + // to this point if the alloca is not in fact scheduled for promotion. + unsigned PPWOldSize = PostPromotionWorklist.size(); + + AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI, + PI->BeginOffset, PI->EndOffset); + DEBUG(dbgs() << " rewriting "); + DEBUG(P.print(dbgs(), PI, "")); + bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI)); + if (Promotable) { + DEBUG(dbgs() << " and queuing for promotion\n"); + PromotableAllocas.push_back(NewAI); + } else if (NewAI != &AI) { + // If we can't promote the alloca, iterate on it to check for new + // refinements exposed by splitting the current alloca. Don't iterate on an + // alloca which didn't actually change and didn't get promoted. + Worklist.insert(NewAI); + } + + // Drop any post-promotion work items if promotion didn't happen. + if (!Promotable) + while (PostPromotionWorklist.size() > PPWOldSize) + PostPromotionWorklist.pop_back(); + + return true; +} + +/// \brief Walks the partitioning of an alloca rewriting uses of each partition. 
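///
/// Returns true if any partition was rewritten. The per-partition work is
/// done by rewriteAllocaPartition above; this is just the driver loop over
/// the partitioning.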
+bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) { + bool Changed = false; + for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE; + ++PI) + Changed |= rewriteAllocaPartition(AI, P, PI); + + return Changed; +} + +/// \brief Analyze an alloca for SROA. +/// +/// This analyzes the alloca to ensure we can reason about it, builds +/// a partitioning of the alloca, and then hands it off to be split and +/// rewritten as needed. +bool SROA::runOnAlloca(AllocaInst &AI) { + DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); + ++NumAllocasAnalyzed; + + // Special case dead allocas, as they're trivial. + if (AI.use_empty()) { + AI.eraseFromParent(); + return true; + } + + // Skip alloca forms that this analysis can't handle. + if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || + TD->getTypeAllocSize(AI.getAllocatedType()) == 0) + return false; + + bool Changed = false; + + // First, split any FCA loads and stores touching this alloca to promote + // better splitting and promotion opportunities. + AggLoadStoreRewriter AggRewriter(*TD); + Changed |= AggRewriter.rewrite(AI); + + // Build the partition set using a recursive instruction-visiting builder. + AllocaPartitioning P(*TD, AI); + DEBUG(P.print(dbgs())); + if (P.isEscaped()) + return Changed; + + // Delete all the dead users of this alloca before splitting and rewriting it. + for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(), + DE = P.dead_user_end(); + DI != DE; ++DI) { + Changed = true; + (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType())); + DeadInsts.insert(*DI); + } + for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(), + DE = P.dead_op_end(); + DO != DE; ++DO) { + Value *OldV = **DO; + // Clobber the use with an undef value. + **DO = UndefValue::get(OldV->getType()); + if (Instruction *OldI = dyn_cast<Instruction>(OldV)) + if (isInstructionTriviallyDead(OldI)) { + Changed = true; + DeadInsts.insert(OldI); + } + } + + // No partitions to split. Leave the dead alloca for a later pass to clean up. + if (P.begin() == P.end()) + return Changed; + + return splitAlloca(AI, P) || Changed; +} + +/// \brief Delete the dead instructions accumulated in this run. +/// +/// Recursively deletes the dead instructions we've accumulated. This is done +/// at the very end to maximize locality of the recursive delete and to +/// minimize the problems of invalidated instruction pointers as such pointers +/// are used heavily in the intermediate stages of the algorithm. +/// +/// We also record the alloca instructions deleted here so that they aren't +/// subsequently handed to mem2reg to promote. +void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { + while (!DeadInsts.empty()) { + Instruction *I = DeadInsts.pop_back_val(); + DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); + + I->replaceAllUsesWith(UndefValue::get(I->getType())); + + for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) + if (Instruction *U = dyn_cast<Instruction>(*OI)) { + // Zero out the operand and see if it becomes trivially dead. + *OI = 0; + if (isInstructionTriviallyDead(U)) + DeadInsts.insert(U); + } + + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) + DeletedAllocas.insert(AI); + + ++NumDeleted; + I->eraseFromParent(); + } +} + +/// \brief Promote the allocas, using the best available technique. +/// +/// This attempts to promote whatever allocas have been identified as viable in +/// the PromotableAllocas list. 
If that list is empty, there is nothing to do. +/// If there is a domtree available, we attempt to promote using the full power +/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is +/// based on the SSAUpdater utilities. This function returns whether any +/// promotion occured. +bool SROA::promoteAllocas(Function &F) { + if (PromotableAllocas.empty()) + return false; + + NumPromoted += PromotableAllocas.size(); + + if (DT && !ForceSSAUpdater) { + DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); + PromoteMemToReg(PromotableAllocas, *DT); + PromotableAllocas.clear(); + return true; + } + + DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); + SSAUpdater SSA; + DIBuilder DIB(*F.getParent()); + SmallVector<Instruction*, 64> Insts; + + for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { + AllocaInst *AI = PromotableAllocas[Idx]; + for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); + UI != UE;) { + Instruction *I = cast<Instruction>(*UI++); + // FIXME: Currently the SSAUpdater infrastructure doesn't reason about + // lifetime intrinsics and so we strip them (and the bitcasts+GEPs + // leading to them) here. Eventually it should use them to optimize the + // scalar values produced. + if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) { + assert(onlyUsedByLifetimeMarkers(I) && + "Found a bitcast used outside of a lifetime marker."); + while (!I->use_empty()) + cast<Instruction>(*I->use_begin())->eraseFromParent(); + I->eraseFromParent(); + continue; + } + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + assert(II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end); + II->eraseFromParent(); + continue; + } + + Insts.push_back(I); + } + AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); + Insts.clear(); + } + + PromotableAllocas.clear(); + return true; +} + +namespace { + /// \brief A predicate to test whether an alloca belongs to a set. + class IsAllocaInSet { + typedef SmallPtrSet<AllocaInst *, 4> SetType; + const SetType &Set; + + public: + typedef AllocaInst *argument_type; + + IsAllocaInSet(const SetType &Set) : Set(Set) {} + bool operator()(AllocaInst *AI) const { return Set.count(AI); } + }; +} + +bool SROA::runOnFunction(Function &F) { + DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); + C = &F.getContext(); + TD = getAnalysisIfAvailable<DataLayout>(); + if (!TD) { + DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); + return false; + } + DT = getAnalysisIfAvailable<DominatorTree>(); + + BasicBlock &EntryBB = F.getEntryBlock(); + for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end()); + I != E; ++I) + if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) + Worklist.insert(AI); + + bool Changed = false; + // A set of deleted alloca instruction pointers which should be removed from + // the list of promotable allocas. + SmallPtrSet<AllocaInst *, 4> DeletedAllocas; + + do { + while (!Worklist.empty()) { + Changed |= runOnAlloca(*Worklist.pop_back_val()); + deleteDeadInstructions(DeletedAllocas); + + // Remove the deleted allocas from various lists so that we don't try to + // continue processing them. 
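      // (Deleted allocas are filtered out of both worklists with the
      // IsAllocaInSet predicate defined above, and removed from
      // PromotableAllocas with the usual erase/remove_if idiom.)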
+ if (!DeletedAllocas.empty()) { + Worklist.remove_if(IsAllocaInSet(DeletedAllocas)); + PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas)); + PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(), + PromotableAllocas.end(), + IsAllocaInSet(DeletedAllocas)), + PromotableAllocas.end()); + DeletedAllocas.clear(); + } + } + + Changed |= promoteAllocas(F); + + Worklist = PostPromotionWorklist; + PostPromotionWorklist.clear(); + } while (!Worklist.empty()); + + return Changed; +} + +void SROA::getAnalysisUsage(AnalysisUsage &AU) const { + if (RequiresDomTree) + AU.addRequired<DominatorTree>(); + AU.setPreservesCFG(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index 48318c8..39630fd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -19,7 +19,7 @@ #include "llvm/PassManager.h" #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/Verifier.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Transforms/Scalar.h" using namespace llvm; @@ -59,6 +59,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeRegToMemPass(Registry); initializeSCCPPass(Registry); initializeIPSCCPPass(Registry); + initializeSROAPass(Registry); initializeSROA_DTPass(Registry); initializeSROA_SSAUpPass(Registry); initializeCFGSimplifyPassPass(Registry); diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 6637126..a46d09c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -46,7 +46,7 @@ #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -56,7 +56,6 @@ STATISTIC(NumReplaced, "Number of allocas broken up"); STATISTIC(NumPromoted, "Number of allocas promoted"); STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion"); STATISTIC(NumConverted, "Number of aggregates converted to scalar"); -STATISTIC(NumGlobals, "Number of allocas copied from constant global"); namespace { struct SROA : public FunctionPass { @@ -88,7 +87,7 @@ namespace { private: bool HasDomTree; - TargetData *TD; + DataLayout *TD; /// DeadInsts - Keep track of instructions we have made dead, so that /// we can remove them after we are done working. @@ -183,9 +182,6 @@ namespace { void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts); bool ShouldAttemptScalarRepl(AllocaInst *AI); - - static MemTransferInst *isOnlyCopiedFromConstantGlobal( - AllocaInst *AI, SmallVector<Instruction*, 4> &ToDelete); }; // SROA_DT - SROA that uses DominatorTree. @@ -262,7 +258,7 @@ namespace { class ConvertToScalarInfo { /// AllocaSize - The size of the alloca being considered in bytes. 
unsigned AllocaSize; - const TargetData &TD; + const DataLayout &TD; unsigned ScalarLoadThreshold; /// IsNotTrivial - This is set to true if there is some access to the object @@ -305,7 +301,7 @@ class ConvertToScalarInfo { bool HadDynamicAccess; public: - explicit ConvertToScalarInfo(unsigned Size, const TargetData &td, + explicit ConvertToScalarInfo(unsigned Size, const DataLayout &td, unsigned SLT) : AllocaSize(Size), TD(td), ScalarLoadThreshold(SLT), IsNotTrivial(false), ScalarKind(Unknown), VectorTy(0), HadNonMemTransferAccess(false), @@ -1024,11 +1020,11 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, bool SROA::runOnFunction(Function &F) { - TD = getAnalysisIfAvailable<TargetData>(); + TD = getAnalysisIfAvailable<DataLayout>(); bool Changed = performPromotion(F); - // FIXME: ScalarRepl currently depends on TargetData more than it + // FIXME: ScalarRepl currently depends on DataLayout more than it // theoretically needs to. It should be refactored in order to support // target-independent IR. Until this is done, just skip the actual // scalar-replacement portion of this pass. @@ -1138,7 +1134,7 @@ public: /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. -static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { +static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *TD) { bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(); bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(); @@ -1176,7 +1172,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { /// /// We can do this to a select if its only uses are loads and if the operand to /// the select can be loaded unconditionally. -static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { +static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *TD) { // For now, we can only do this promotion if the load is in the same block as // the PHI, and if there are no stores between the phi and load. // TODO: Allow recursive phi users. @@ -1240,7 +1236,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { /// direct (non-volatile) loads and stores to it. If the alloca is close but /// not quite there, this will transform the code to allow promotion. As such, /// it is a non-pure predicate. -static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { +static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) { SetVector<Instruction*, SmallVector<Instruction*, 4>, SmallPtrSet<Instruction*, 4> > InstsToRewrite; @@ -1465,26 +1461,6 @@ bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) { return false; } -/// getPointeeAlignment - Compute the minimum alignment of the value pointed -/// to by the given pointer. 
-static unsigned getPointeeAlignment(Value *V, const TargetData &TD) { - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) - if (CE->getOpcode() == Instruction::BitCast || - (CE->getOpcode() == Instruction::GetElementPtr && - cast<GEPOperator>(CE)->hasAllZeroIndices())) - return getPointeeAlignment(CE->getOperand(0), TD); - - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) - if (!GV->isDeclaration()) - return TD.getPreferredAlignment(GV); - - if (PointerType *PT = dyn_cast<PointerType>(V->getType())) - return TD.getABITypeAlignment(PT->getElementType()); - - return 0; -} - - // performScalarRepl - This algorithm is a simple worklist driven algorithm, // which runs on all of the alloca instructions in the function, removing them // if they are only used by getelementptr instructions. @@ -1516,29 +1492,6 @@ bool SROA::performScalarRepl(Function &F) { if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized()) continue; - // Check to see if this allocation is only modified by a memcpy/memmove from - // a constant global whose alignment is equal to or exceeds that of the - // allocation. If this is the case, we can change all users to use - // the constant global instead. This is commonly produced by the CFE by - // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' - // is only subsequently read. - SmallVector<Instruction *, 4> ToDelete; - if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(AI, ToDelete)) { - if (AI->getAlignment() <= getPointeeAlignment(Copy->getSource(), *TD)) { - DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n'); - DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); - for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) - ToDelete[i]->eraseFromParent(); - Constant *TheSrc = cast<Constant>(Copy->getSource()); - AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType())); - Copy->eraseFromParent(); // Don't mutate the global. - AI->eraseFromParent(); - ++NumGlobals; - Changed = true; - continue; - } - } - // Check to see if we can perform the core SROA transformation. We cannot // transform the allocation instruction if it is an array allocation // (allocations OF arrays are ok though), and an allocation of a scalar @@ -2584,7 +2537,7 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, /// HasPadding - Return true if the specified type has any structure or /// alignment padding in between the elements that would be split apart /// by SROA; return false otherwise. -static bool HasPadding(Type *Ty, const TargetData &TD) { +static bool HasPadding(Type *Ty, const DataLayout &TD) { if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { Ty = ATy->getElementType(); return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty); @@ -2656,134 +2609,3 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { return true; } - - - -/// PointsToConstantGlobal - Return true if V (possibly indirectly) points to -/// some part of a constant global variable. This intentionally only accepts -/// constant expressions because we don't can't rewrite arbitrary instructions. 
-static bool PointsToConstantGlobal(Value *V) { - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) - return GV->isConstant(); - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) - if (CE->getOpcode() == Instruction::BitCast || - CE->getOpcode() == Instruction::GetElementPtr) - return PointsToConstantGlobal(CE->getOperand(0)); - return false; -} - -/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived) -/// pointer to an alloca. Ignore any reads of the pointer, return false if we -/// see any stores or other unknown uses. If we see pointer arithmetic, keep -/// track of whether it moves the pointer (with isOffset) but otherwise traverse -/// the uses. If we see a memcpy/memmove that targets an unoffseted pointer to -/// the alloca, and if the source pointer is a pointer to a constant global, we -/// can optimize this. -static bool -isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, - bool isOffset, - SmallVector<Instruction *, 4> &LifetimeMarkers) { - // We track lifetime intrinsics as we encounter them. If we decide to go - // ahead and replace the value with the global, this lets the caller quickly - // eliminate the markers. - - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { - User *U = cast<Instruction>(*UI); - - if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - // Ignore non-volatile loads, they are always ok. - if (!LI->isSimple()) return false; - continue; - } - - if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { - // If uses of the bitcast are ok, we are ok. - if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, isOffset, - LifetimeMarkers)) - return false; - continue; - } - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) { - // If the GEP has all zero indices, it doesn't offset the pointer. If it - // doesn't, it does. - if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy, - isOffset || !GEP->hasAllZeroIndices(), - LifetimeMarkers)) - return false; - continue; - } - - if (CallSite CS = U) { - // If this is the function being called then we treat it like a load and - // ignore it. - if (CS.isCallee(UI)) - continue; - - // If this is a readonly/readnone call site, then we know it is just a - // load (but one that potentially returns the value itself), so we can - // ignore it if we know that the value isn't captured. - unsigned ArgNo = CS.getArgumentNo(UI); - if (CS.onlyReadsMemory() && - (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo))) - continue; - - // If this is being passed as a byval argument, the caller is making a - // copy, so it is only a read of the alloca. - if (CS.isByValArgument(ArgNo)) - continue; - } - - // Lifetime intrinsics can be handled by the caller. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) { - if (II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end) { - assert(II->use_empty() && "Lifetime markers have no result to use!"); - LifetimeMarkers.push_back(II); - continue; - } - } - - // If this is isn't our memcpy/memmove, reject it as something we can't - // handle. - MemTransferInst *MI = dyn_cast<MemTransferInst>(U); - if (MI == 0) - return false; - - // If the transfer is using the alloca as a source of the transfer, then - // ignore it since it is a load (unless the transfer is volatile). - if (UI.getOperandNo() == 1) { - if (MI->isVolatile()) return false; - continue; - } - - // If we already have seen a copy, reject the second one. 
- if (TheCopy) return false; - - // If the pointer has been offset from the start of the alloca, we can't - // safely handle this. - if (isOffset) return false; - - // If the memintrinsic isn't using the alloca as the dest, reject it. - if (UI.getOperandNo() != 0) return false; - - // If the source of the memcpy/move is not a constant global, reject it. - if (!PointsToConstantGlobal(MI->getSource())) - return false; - - // Otherwise, the transform is safe. Remember the copy instruction. - TheCopy = MI; - } - return true; -} - -/// isOnlyCopiedFromConstantGlobal - Return true if the specified alloca is only -/// modified by a copy from a constant global. If we can prove this, we can -/// replace any uses of the alloca with uses of the global directly. -MemTransferInst * -SROA::isOnlyCopiedFromConstantGlobal(AllocaInst *AI, - SmallVector<Instruction*, 4> &ToDelete) { - MemTransferInst *TheCopy = 0; - if (::isOnlyCopiedFromConstantGlobal(AI, TheCopy, false, ToDelete)) - return TheCopy; - return 0; -} diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index d13e4ab..9f24bb6 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -31,10 +31,11 @@ #include "llvm/Attributes.h" #include "llvm/Support/CFG.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/TargetTransformInfo.h" using namespace llvm; STATISTIC(NumSimpl, "Number of blocks simplified"); @@ -59,9 +60,9 @@ FunctionPass *llvm::createCFGSimplificationPass() { return new CFGSimplifyPass(); } -/// ChangeToUnreachable - Insert an unreachable instruction before the specified +/// changeToUnreachable - Insert an unreachable instruction before the specified /// instruction, making it and the rest of the code in the block dead. -static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { +static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) { BasicBlock *BB = I->getParent(); // Loop over all of the successors, removing BB's entry from any PHI // nodes. @@ -87,8 +88,8 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { } } -/// ChangeToCall - Convert the specified invoke into a normal call. -static void ChangeToCall(InvokeInst *II) { +/// changeToCall - Convert the specified invoke into a normal call. +static void changeToCall(InvokeInst *II) { SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); NewCall->takeName(II); @@ -105,7 +106,7 @@ static void ChangeToCall(InvokeInst *II) { II->eraseFromParent(); } -static bool MarkAliveBlocks(BasicBlock *BB, +static bool markAliveBlocks(BasicBlock *BB, SmallPtrSet<BasicBlock*, 128> &Reachable) { SmallVector<BasicBlock*, 128> Worklist; @@ -129,7 +130,7 @@ static bool MarkAliveBlocks(BasicBlock *BB, ++BBI; if (!isa<UnreachableInst>(BBI)) { // Don't insert a call to llvm.trap right before the unreachable. 
- ChangeToUnreachable(BBI, false); + changeToUnreachable(BBI, false); Changed = true; } break; @@ -148,7 +149,7 @@ static bool MarkAliveBlocks(BasicBlock *BB, if (isa<UndefValue>(Ptr) || (isa<ConstantPointerNull>(Ptr) && SI->getPointerAddressSpace() == 0)) { - ChangeToUnreachable(SI, true); + changeToUnreachable(SI, true); Changed = true; break; } @@ -159,7 +160,7 @@ static bool MarkAliveBlocks(BasicBlock *BB, if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { Value *Callee = II->getCalledValue(); if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { - ChangeToUnreachable(II, true); + changeToUnreachable(II, true); Changed = true; } else if (II->doesNotThrow()) { if (II->use_empty() && II->onlyReadsMemory()) { @@ -168,7 +169,7 @@ static bool MarkAliveBlocks(BasicBlock *BB, II->getUnwindDest()->removePredecessor(II->getParent()); II->eraseFromParent(); } else - ChangeToCall(II); + changeToCall(II); Changed = true; } } @@ -180,12 +181,12 @@ static bool MarkAliveBlocks(BasicBlock *BB, return Changed; } -/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even +/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even /// if they are in a dead cycle. Return true if a change was made, false /// otherwise. -static bool RemoveUnreachableBlocksFromFn(Function &F) { +static bool removeUnreachableBlocksFromFn(Function &F) { SmallPtrSet<BasicBlock*, 128> Reachable; - bool Changed = MarkAliveBlocks(F.begin(), Reachable); + bool Changed = markAliveBlocks(F.begin(), Reachable); // If there are unreachable blocks in the CFG... if (Reachable.size() == F.size()) @@ -215,9 +216,9 @@ static bool RemoveUnreachableBlocksFromFn(Function &F) { return true; } -/// MergeEmptyReturnBlocks - If we have more than one empty (other than phi +/// mergeEmptyReturnBlocks - If we have more than one empty (other than phi /// node) return blocks, merge them together to promote recursive block merging. -static bool MergeEmptyReturnBlocks(Function &F) { +static bool mergeEmptyReturnBlocks(Function &F) { bool Changed = false; BasicBlock *RetBlock = 0; @@ -291,9 +292,10 @@ static bool MergeEmptyReturnBlocks(Function &F) { return Changed; } -/// IterativeSimplifyCFG - Call SimplifyCFG on all the blocks in the function, +/// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. -static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) { +static bool iterativelySimplifyCFG(Function &F, const DataLayout *TD, + const TargetTransformInfo *TTI) { bool Changed = false; bool LocalChange = true; while (LocalChange) { @@ -302,7 +304,7 @@ static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) { // Loop over all of the basic blocks and remove them if they are unneeded... // for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(BBIt++, TD)) { + if (SimplifyCFG(BBIt++, TD, TTI)) { LocalChange = true; ++NumSimpl; } @@ -316,25 +318,27 @@ static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) { // simplify the CFG. 
// bool CFGSimplifyPass::runOnFunction(Function &F) { - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); - bool EverChanged = RemoveUnreachableBlocksFromFn(F); - EverChanged |= MergeEmptyReturnBlocks(F); - EverChanged |= IterativeSimplifyCFG(F, TD); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); + const TargetTransformInfo *TTI = + getAnalysisIfAvailable<TargetTransformInfo>(); + bool EverChanged = removeUnreachableBlocksFromFn(F); + EverChanged |= mergeEmptyReturnBlocks(F); + EverChanged |= iterativelySimplifyCFG(F, TD, TTI); // If neither pass changed anything, we're done. if (!EverChanged) return false; - // IterativeSimplifyCFG can (rarely) make some loops dead. If this happens, - // RemoveUnreachableBlocksFromFn is needed to nuke them, which means we should + // iterativelySimplifyCFG can (rarely) make some loops dead. If this happens, + // removeUnreachableBlocksFromFn is needed to nuke them, which means we should // iterate between the two optimizations. We structure the code like this to - // avoid reruning IterativeSimplifyCFG if the second pass of - // RemoveUnreachableBlocksFromFn doesn't do anything. - if (!RemoveUnreachableBlocksFromFn(F)) + // avoid reruning iterativelySimplifyCFG if the second pass of + // removeUnreachableBlocksFromFn doesn't do anything. + if (!removeUnreachableBlocksFromFn(F)) return true; do { - EverChanged = IterativeSimplifyCFG(F, TD); - EverChanged |= RemoveUnreachableBlocksFromFn(F); + EverChanged = iterativelySimplifyCFG(F, TD, TTI); + EverChanged |= removeUnreachableBlocksFromFn(F); } while (EverChanged); return true; diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp index f110320..17d07cd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -28,9 +28,10 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetData.h" +#include "llvm/DataLayout.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Config/config.h" // FIXME: Shouldn't depend on host! 
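// Usage sketch (illustrative, not part of the patch): the
// -enable-double-float-shrink flag added below is off by default and can be
// enabled when running this optimizer, e.g.
//
//   opt -simplify-libcalls -enable-double-float-shrink -S input.ll
//
// When enabled, a double-precision call such as cos whose argument is an
// fpext from float and whose result is only used via fptrunc back to float
// may be rewritten in terms of the float routine (e.g. cosf), following the
// floor((double)f) -> (double)floorf(f) pattern handled by UnaryDoubleFPOpt
// below. The opt invocation above assumes the conventional
// -simplify-libcalls pass name; it is not introduced by this patch.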
using namespace llvm; @@ -38,6 +39,10 @@ using namespace llvm; STATISTIC(NumSimplified, "Number of library calls simplified"); STATISTIC(NumAnnotated, "Number of attributes added to library functions"); +static cl::opt<bool> UnsafeFPShrink("enable-double-float-shrink", cl::Hidden, + cl::init(false), + cl::desc("Enable unsafe double to float " + "shrinking for math lib calls")); //===----------------------------------------------------------------------===// // Optimizer Base Class //===----------------------------------------------------------------------===// @@ -48,7 +53,7 @@ namespace { class LibCallOptimization { protected: Function *Caller; - const TargetData *TD; + const DataLayout *TD; const TargetLibraryInfo *TLI; LLVMContext* Context; public: @@ -63,7 +68,7 @@ public: virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) =0; - Value *OptimizeCall(CallInst *CI, const TargetData *TD, + Value *OptimizeCall(CallInst *CI, const DataLayout *TD, const TargetLibraryInfo *TLI, IRBuilder<> &B) { Caller = CI->getParent()->getParent(); this->TD = TD; @@ -85,22 +90,6 @@ public: // Helper Functions //===----------------------------------------------------------------------===// -/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the -/// value is equal or not-equal to zero. -static bool IsOnlyUsedInZeroEqualityComparison(Value *V) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); - UI != E; ++UI) { - if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) - if (IC->isEquality()) - if (Constant *C = dyn_cast<Constant>(IC->getOperand(1))) - if (C->isNullValue()) - continue; - // Unknown instruction. - return false; - } - return true; -} - static bool CallHasFloatingPointArgument(const CallInst *CI) { for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end(); it != e; ++it) { @@ -110,799 +99,62 @@ static bool CallHasFloatingPointArgument(const CallInst *CI) { return false; } -/// IsOnlyUsedInEqualityComparison - Return true if it is only used in equality -/// comparisons with With. -static bool IsOnlyUsedInEqualityComparison(Value *V, Value *With) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); - UI != E; ++UI) { - if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI)) - if (IC->isEquality() && IC->getOperand(1) == With) - continue; - // Unknown instruction. - return false; - } - return true; -} - +namespace { //===----------------------------------------------------------------------===// -// String and Memory LibCall Optimizations +// Math Library Optimizations //===----------------------------------------------------------------------===// //===---------------------------------------===// -// 'strcat' Optimizations -namespace { -struct StrCatOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strcat" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - FT->getParamType(1) != FT->getReturnType()) - return 0; - - // Extract some information from the instruction - Value *Dst = CI->getArgOperand(0); - Value *Src = CI->getArgOperand(1); - - // See if we can get the length of the input string. - uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; - --Len; // Unbias length. 
- - // Handle the simple, do-nothing case: strcat(x, "") -> x - if (Len == 0) - return Dst; - - // These optimizations require TargetData. - if (!TD) return 0; - - return EmitStrLenMemCpy(Src, Dst, Len, B); - } - - Value *EmitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B) { - // We need to find the end of the destination string. That's where the - // memory is to be moved to. We just generate a call to strlen. - Value *DstLen = EmitStrLen(Dst, B, TD, TLI); - if (!DstLen) - return 0; - - // Now that we have the destination's length, we must index into the - // destination's pointer to get the actual memcpy destination (end of - // the string .. we're concatenating). - Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr"); - - // We have enough information to now generate the memcpy call to do the - // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - B.CreateMemCpy(CpyDst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1); - return Dst; - } -}; - -//===---------------------------------------===// -// 'strncat' Optimizations - -struct StrNCatOpt : public StrCatOpt { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strncat" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || - FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - FT->getParamType(1) != FT->getReturnType() || - !FT->getParamType(2)->isIntegerTy()) - return 0; - - // Extract some information from the instruction - Value *Dst = CI->getArgOperand(0); - Value *Src = CI->getArgOperand(1); - uint64_t Len; - - // We don't do anything if length is not constant - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) - Len = LengthArg->getZExtValue(); - else - return 0; - - // See if we can get the length of the input string. - uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) return 0; - --SrcLen; // Unbias length. - - // Handle the simple, do-nothing cases: - // strncat(x, "", c) -> x - // strncat(x, c, 0) -> x - if (SrcLen == 0 || Len == 0) return Dst; - - // These optimizations require TargetData. - if (!TD) return 0; - - // We don't optimize this case - if (Len < SrcLen) return 0; - - // strncat(x, s, c) -> strcat(x, s) - // s is constant so the strcat can be optimized further - return EmitStrLenMemCpy(Src, Dst, SrcLen, B); - } -}; - -//===---------------------------------------===// -// 'strchr' Optimizations - -struct StrChrOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strchr" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - !FT->getParamType(1)->isIntegerTy(32)) - return 0; - - Value *SrcStr = CI->getArgOperand(0); - - // If the second operand is non-constant, see if we can compute the length - // of the input string and turn this into memchr. - ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); - if (CharC == 0) { - // These optimizations require TargetData. - if (!TD) return 0; - - uint64_t Len = GetStringLength(SrcStr); - if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32. - return 0; - - return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. 
- ConstantInt::get(TD->getIntPtrType(*Context), Len), - B, TD, TLI); - } - - // Otherwise, the character is a constant, see if the first argument is - // a string literal. If so, we can constant fold. - StringRef Str; - if (!getConstantStringInfo(SrcStr, Str)) - return 0; - - // Compute the offset, make sure to handle the case when we're searching for - // zero (a weird way to spell strlen). - size_t I = CharC->getSExtValue() == 0 ? - Str.size() : Str.find(CharC->getSExtValue()); - if (I == StringRef::npos) // Didn't find the char. strchr returns null. - return Constant::getNullValue(CI->getType()); - - // strchr(s+n,c) -> gep(s+n+i,c) - return B.CreateGEP(SrcStr, B.getInt64(I), "strchr"); - } -}; - -//===---------------------------------------===// -// 'strrchr' Optimizations - -struct StrRChrOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strrchr" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getReturnType() != B.getInt8PtrTy() || - FT->getParamType(0) != FT->getReturnType() || - !FT->getParamType(1)->isIntegerTy(32)) - return 0; - - Value *SrcStr = CI->getArgOperand(0); - ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); - - // Cannot fold anything if we're not looking for a constant. - if (!CharC) - return 0; - - StringRef Str; - if (!getConstantStringInfo(SrcStr, Str)) { - // strrchr(s, 0) -> strchr(s, 0) - if (TD && CharC->isZero()) - return EmitStrChr(SrcStr, '\0', B, TD, TLI); - return 0; - } - - // Compute the offset. - size_t I = CharC->getSExtValue() == 0 ? - Str.size() : Str.rfind(CharC->getSExtValue()); - if (I == StringRef::npos) // Didn't find the char. Return null. - return Constant::getNullValue(CI->getType()); - - // strrchr(s+n,c) -> gep(s+n+i,c) - return B.CreateGEP(SrcStr, B.getInt64(I), "strrchr"); - } -}; - -//===---------------------------------------===// -// 'strcmp' Optimizations - -struct StrCmpOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strcmp" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - !FT->getReturnType()->isIntegerTy(32) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy()) - return 0; - - Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); - if (Str1P == Str2P) // strcmp(x,x) -> 0 - return ConstantInt::get(CI->getType(), 0); - - StringRef Str1, Str2; - bool HasStr1 = getConstantStringInfo(Str1P, Str1); - bool HasStr2 = getConstantStringInfo(Str2P, Str2); - - // strcmp(x, y) -> cnst (if both x and y are constant strings) - if (HasStr1 && HasStr2) - return ConstantInt::get(CI->getType(), Str1.compare(Str2)); - - if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x - return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), - CI->getType())); - - if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x - return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - - // strcmp(P, "x") -> memcmp(P, "x", 2) - uint64_t Len1 = GetStringLength(Str1P); - uint64_t Len2 = GetStringLength(Str2P); - if (Len1 && Len2) { - // These optimizations require TargetData. 
- if (!TD) return 0; - - return EmitMemCmp(Str1P, Str2P, - ConstantInt::get(TD->getIntPtrType(*Context), - std::min(Len1, Len2)), B, TD, TLI); - } - - return 0; - } -}; - -//===---------------------------------------===// -// 'strncmp' Optimizations - -struct StrNCmpOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strncmp" function prototype. - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || - !FT->getReturnType()->isIntegerTy(32) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy() || - !FT->getParamType(2)->isIntegerTy()) - return 0; - - Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); - if (Str1P == Str2P) // strncmp(x,x,n) -> 0 - return ConstantInt::get(CI->getType(), 0); - - // Get the length argument if it is constant. - uint64_t Length; - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) - Length = LengthArg->getZExtValue(); - else - return 0; - - if (Length == 0) // strncmp(x,y,0) -> 0 - return ConstantInt::get(CI->getType(), 0); - - if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) - return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD, TLI); - - StringRef Str1, Str2; - bool HasStr1 = getConstantStringInfo(Str1P, Str1); - bool HasStr2 = getConstantStringInfo(Str2P, Str2); - - // strncmp(x, y) -> cnst (if both x and y are constant strings) - if (HasStr1 && HasStr2) { - StringRef SubStr1 = Str1.substr(0, Length); - StringRef SubStr2 = Str2.substr(0, Length); - return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2)); - } - - if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x - return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), - CI->getType())); - - if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x - return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - - return 0; - } -}; - - -//===---------------------------------------===// -// 'strcpy' Optimizations - -struct StrCpyOpt : public LibCallOptimization { - bool OptChkCall; // True if it's optimizing a __strcpy_chk libcall. - - StrCpyOpt(bool c) : OptChkCall(c) {} - - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "strcpy" function prototype. - unsigned NumParams = OptChkCall ? 3 : 2; - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != NumParams || - FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy()) - return 0; - - Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); - if (Dst == Src) // strcpy(x,x) -> x - return Src; - - // These optimizations require TargetData. - if (!TD) return 0; - - // See if we can get the length of the input string. - uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; - - // We have enough information to now generate the memcpy call to do the - // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - if (!OptChkCall || - !EmitMemCpyChk(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), - CI->getArgOperand(2), B, TD, TLI)) - B.CreateMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); - return Dst; - } -}; - -//===---------------------------------------===// -// 'stpcpy' Optimizations - -struct StpCpyOpt: public LibCallOptimization { - bool OptChkCall; // True if it's optimizing a __stpcpy_chk libcall. 
- - StpCpyOpt(bool c) : OptChkCall(c) {} - - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - // Verify the "stpcpy" function prototype. - unsigned NumParams = OptChkCall ? 3 : 2; - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != NumParams || - FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy()) - return 0; - - // These optimizations require TargetData. - if (!TD) return 0; - - Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); - if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) - Value *StrLen = EmitStrLen(Src, B, TD, TLI); - return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0; - } - - // See if we can get the length of the input string. - uint64_t Len = GetStringLength(Src); - if (Len == 0) return 0; - - Value *LenV = ConstantInt::get(TD->getIntPtrType(*Context), Len); - Value *DstEnd = B.CreateGEP(Dst, - ConstantInt::get(TD->getIntPtrType(*Context), - Len - 1)); - - // We have enough information to now generate the memcpy call to do the - // copy for us. Make a memcpy to copy the nul byte with align = 1. - if (!OptChkCall || !EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, - TD, TLI)) - B.CreateMemCpy(Dst, Src, LenV, 1); - return DstEnd; - } -}; - -//===---------------------------------------===// -// 'strncpy' Optimizations - -struct StrNCpyOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || - FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != B.getInt8PtrTy() || - !FT->getParamType(2)->isIntegerTy()) - return 0; - - Value *Dst = CI->getArgOperand(0); - Value *Src = CI->getArgOperand(1); - Value *LenOp = CI->getArgOperand(2); - - // See if we can get the length of the input string. - uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) return 0; - --SrcLen; - - if (SrcLen == 0) { - // strncpy(x, "", y) -> memset(x, '\0', y, 1) - B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1); - return Dst; - } - - uint64_t Len; - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp)) - Len = LengthArg->getZExtValue(); - else - return 0; - - if (Len == 0) return Dst; // strncpy(x, y, 0) -> x - - // These optimizations require TargetData. 
- if (!TD) return 0; - - // Let strncpy handle the zero padding - if (Len > SrcLen+1) return 0; - - // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] - B.CreateMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); - - return Dst; - } -}; - -//===---------------------------------------===// -// 'strlen' Optimizations - -struct StrLenOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 1 || - FT->getParamType(0) != B.getInt8PtrTy() || - !FT->getReturnType()->isIntegerTy()) - return 0; - - Value *Src = CI->getArgOperand(0); - - // Constant folding: strlen("xyz") -> 3 - if (uint64_t Len = GetStringLength(Src)) - return ConstantInt::get(CI->getType(), Len-1); - - // strlen(x) != 0 --> *x != 0 - // strlen(x) == 0 --> *x == 0 - if (IsOnlyUsedInZeroEqualityComparison(CI)) - return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType()); - return 0; - } -}; - - -//===---------------------------------------===// -// 'strpbrk' Optimizations - -struct StrPBrkOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getParamType(0) != B.getInt8PtrTy() || - FT->getParamType(1) != FT->getParamType(0) || - FT->getReturnType() != FT->getParamType(0)) - return 0; - - StringRef S1, S2; - bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); - bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); - - // strpbrk(s, "") -> NULL - // strpbrk("", s) -> NULL - if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) - return Constant::getNullValue(CI->getType()); - - // Constant folding. - if (HasS1 && HasS2) { - size_t I = S1.find_first_of(S2); - if (I == std::string::npos) // No match. - return Constant::getNullValue(CI->getType()); - - return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk"); - } - - // strpbrk(s, "a") -> strchr(s, 'a') - if (TD && HasS2 && S2.size() == 1) - return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD, TLI); - - return 0; - } -}; - -//===---------------------------------------===// -// 'strto*' Optimizations. This handles strtol, strtod, strtof, strtoul, etc. - -struct StrToOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy()) - return 0; - - Value *EndPtr = CI->getArgOperand(1); - if (isa<ConstantPointerNull>(EndPtr)) { - // With a null EndPtr, this function won't capture the main argument. - // It would be readonly too, except that it still may write to errno. 
- CI->addAttribute(1, Attribute::NoCapture); - } - - return 0; - } -}; - -//===---------------------------------------===// -// 'strspn' Optimizations - -struct StrSpnOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getParamType(0) != B.getInt8PtrTy() || - FT->getParamType(1) != FT->getParamType(0) || - !FT->getReturnType()->isIntegerTy()) - return 0; - - StringRef S1, S2; - bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); - bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); - - // strspn(s, "") -> 0 - // strspn("", s) -> 0 - if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) - return Constant::getNullValue(CI->getType()); - - // Constant folding. - if (HasS1 && HasS2) { - size_t Pos = S1.find_first_not_of(S2); - if (Pos == StringRef::npos) Pos = S1.size(); - return ConstantInt::get(CI->getType(), Pos); - } - - return 0; - } -}; - -//===---------------------------------------===// -// 'strcspn' Optimizations - -struct StrCSpnOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - FT->getParamType(0) != B.getInt8PtrTy() || - FT->getParamType(1) != FT->getParamType(0) || - !FT->getReturnType()->isIntegerTy()) - return 0; - - StringRef S1, S2; - bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1); - bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2); - - // strcspn("", s) -> 0 - if (HasS1 && S1.empty()) - return Constant::getNullValue(CI->getType()); - - // Constant folding. - if (HasS1 && HasS2) { - size_t Pos = S1.find_first_of(S2); - if (Pos == StringRef::npos) Pos = S1.size(); - return ConstantInt::get(CI->getType(), Pos); - } - - // strcspn(s, "") -> strlen(s) - if (TD && HasS2 && S2.empty()) - return EmitStrLen(CI->getArgOperand(0), B, TD, TLI); - - return 0; - } -}; - -//===---------------------------------------===// -// 'strstr' Optimizations +// Double -> Float Shrinking Optimizations for Unary Functions like 'floor' -struct StrStrOpt : public LibCallOptimization { +struct UnaryDoubleFPOpt : public LibCallOptimization { + bool CheckRetType; + UnaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {} virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 2 || - !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !FT->getReturnType()->isPointerTy()) + if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || + !FT->getParamType(0)->isDoubleTy()) return 0; - // fold strstr(x, x) -> x. 
- if (CI->getArgOperand(0) == CI->getArgOperand(1)) - return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); - - // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0 - if (TD && IsOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { - Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI); - if (!StrLen) - return 0; - Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1), - StrLen, B, TD, TLI); - if (!StrNCmp) - return 0; - for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end(); - UI != UE; ) { - ICmpInst *Old = cast<ICmpInst>(*UI++); - Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp, - ConstantInt::getNullValue(StrNCmp->getType()), - "cmp"); - Old->replaceAllUsesWith(Cmp); - Old->eraseFromParent(); + if (CheckRetType) { + // Check if all the uses for function like 'sin' are converted to float. + for (Value::use_iterator UseI = CI->use_begin(); UseI != CI->use_end(); + ++UseI) { + FPTruncInst *Cast = dyn_cast<FPTruncInst>(*UseI); + if (Cast == 0 || !Cast->getType()->isFloatTy()) + return 0; } - return CI; - } - - // See if either input string is a constant string. - StringRef SearchStr, ToFindStr; - bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr); - bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr); - - // fold strstr(x, "") -> x. - if (HasStr2 && ToFindStr.empty()) - return B.CreateBitCast(CI->getArgOperand(0), CI->getType()); - - // If both strings are known, constant fold it. - if (HasStr1 && HasStr2) { - std::string::size_type Offset = SearchStr.find(ToFindStr); - - if (Offset == StringRef::npos) // strstr("foo", "bar") -> null - return Constant::getNullValue(CI->getType()); - - // strstr("abcd", "bc") -> gep((char*)"abcd", 1) - Value *Result = CastToCStr(CI->getArgOperand(0), B); - Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr"); - return B.CreateBitCast(Result, CI->getType()); - } - - // fold strstr(x, "y") -> strchr(x, 'y'). - if (HasStr2 && ToFindStr.size() == 1) { - Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI); - return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0; } - return 0; - } -}; - - -//===---------------------------------------===// -// 'memcmp' Optimizations - -struct MemCmpOpt : public LibCallOptimization { - virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - FunctionType *FT = Callee->getFunctionType(); - if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() || - !FT->getParamType(1)->isPointerTy() || - !FT->getReturnType()->isIntegerTy(32)) - return 0; - Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1); - - if (LHS == RHS) // memcmp(s,s,x) -> 0 - return Constant::getNullValue(CI->getType()); - - // Make sure we have a constant length. 
-    ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
-    if (!LenC) return 0;
-    uint64_t Len = LenC->getZExtValue();
-
-    if (Len == 0) // memcmp(s1,s2,0) -> 0
-      return Constant::getNullValue(CI->getType());
-
-    // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
-    if (Len == 1) {
-      Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"),
-                                 CI->getType(), "lhsv");
-      Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"),
-                                 CI->getType(), "rhsv");
-      return B.CreateSub(LHSV, RHSV, "chardiff");
-    }
-
-    // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)
-    StringRef LHSStr, RHSStr;
-    if (getConstantStringInfo(LHS, LHSStr) &&
-        getConstantStringInfo(RHS, RHSStr)) {
-      // Make sure we're not reading out-of-bounds memory.
-      if (Len > LHSStr.size() || Len > RHSStr.size())
-        return 0;
-      uint64_t Ret = memcmp(LHSStr.data(), RHSStr.data(), Len);
-      return ConstantInt::get(CI->getType(), Ret);
-    }
-
-    return 0;
-  }
-};
-
-//===---------------------------------------===//
-// 'memcpy' Optimizations
-
-struct MemCpyOpt : public LibCallOptimization {
-  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
-    // These optimizations require TargetData.
-    if (!TD) return 0;
-
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        FT->getParamType(2) != TD->getIntPtrType(*Context))
-      return 0;
-
-    // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
-    B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
-                   CI->getArgOperand(2), 1);
-    return CI->getArgOperand(0);
-  }
-};
-
-//===---------------------------------------===//
-// 'memmove' Optimizations
-
-struct MemMoveOpt : public LibCallOptimization {
-  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
-    // These optimizations require TargetData.
-    if (!TD) return 0;
-
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        FT->getParamType(2) != TD->getIntPtrType(*Context))
-      return 0;
-
-    // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
-    B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
-                    CI->getArgOperand(2), 1);
-    return CI->getArgOperand(0);
-  }
-};
-
-//===---------------------------------------===//
-// 'memset' Optimizations
-
-struct MemSetOpt : public LibCallOptimization {
-  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
-    // These optimizations require TargetData.
-    if (!TD) return 0;
-
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
-        !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isIntegerTy() ||
-        FT->getParamType(2) != TD->getIntPtrType(*Context))
+    // If this is something like 'floor((double)floatval)', convert to floorf.
+    FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
+    if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy())
       return 0;
 
-    // memset(p, v, n) -> llvm.memset(p, v, n, 1)
-    Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
-    B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
-    return CI->getArgOperand(0);
+    // floor((double)floatval) -> (double)floorf(floatval)
+    Value *V = Cast->getOperand(0);
+    V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
+    return B.CreateFPExt(V, B.getDoubleTy());
   }
 };
 
-//===----------------------------------------------------------------------===//
-// Math Library Optimizations
-//===----------------------------------------------------------------------===//
-
 //===---------------------------------------===//
 // 'cos*' Optimizations
-
 struct CosOpt : public LibCallOptimization {
   virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    Value *Ret = NULL;
+    if (UnsafeFPShrink && Callee->getName() == "cos" &&
+        TLI->has(LibFunc::cosf)) {
+      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+      Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
+    }
+
     FunctionType *FT = Callee->getFunctionType();
     // Just make sure this has 1 argument of FP type, which matches the
     // result type.
     if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
         !FT->getParamType(0)->isFloatingPointTy())
-      return 0;
+      return Ret;
 
     // cos(-x) -> cos(x)
     Value *Op1 = CI->getArgOperand(0);
@@ -910,7 +162,7 @@ struct CosOpt : public LibCallOptimization {
       BinaryOperator *BinExpr = cast<BinaryOperator>(Op1);
       return B.CreateCall(Callee, BinExpr->getOperand(1), "cos");
     }
-    return 0;
+    return Ret;
   }
 };
 
@@ -919,13 +171,20 @@ struct CosOpt : public LibCallOptimization {
 
 struct PowOpt : public LibCallOptimization {
   virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    Value *Ret = NULL;
+    if (UnsafeFPShrink && Callee->getName() == "pow" &&
+        TLI->has(LibFunc::powf)) {
+      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+      Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
+    }
+
     FunctionType *FT = Callee->getFunctionType();
     // Just make sure this has 2 arguments of the same FP type, which match the
     // result type.
     if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
         FT->getParamType(0) != FT->getParamType(1) ||
         !FT->getParamType(0)->isFloatingPointTy())
-      return 0;
+      return Ret;
 
     Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
     if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
@@ -936,7 +195,7 @@ struct PowOpt : public LibCallOptimization {
     }
 
     ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
-    if (Op2C == 0) return 0;
+    if (Op2C == 0) return Ret;
 
     if (Op2C->getValueAPF().isZero())  // pow(x, 0.0) -> 1.0
       return ConstantFP::get(CI->getType(), 1.0);
@@ -974,12 +233,19 @@ struct PowOpt : public LibCallOptimization {
 
 struct Exp2Opt : public LibCallOptimization {
   virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    Value *Ret = NULL;
+    if (UnsafeFPShrink && Callee->getName() == "exp2" &&
+        TLI->has(LibFunc::exp2)) {
+      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+      Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
+    }
+
     FunctionType *FT = Callee->getFunctionType();
     // Just make sure this has 1 argument of FP type, which matches the
    // result type.
     if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
         !FT->getParamType(0)->isFloatingPointTy())
-      return 0;
+      return Ret;
 
     Value *Op = CI->getArgOperand(0);
     // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x))  if sizeof(x) <= 32
@@ -1016,29 +282,7 @@ struct Exp2Opt : public LibCallOptimization {
       return CI;
     }
 
-    return 0;
-  }
-};
-
-//===---------------------------------------===//
-// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
-
-struct UnaryDoubleFPOpt : public LibCallOptimization {
-  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
-        !FT->getParamType(0)->isDoubleTy())
-      return 0;
-
-    // If this is something like 'floor((double)floatval)', convert to floorf.
-    FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
-    if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy())
-      return 0;
-
-    // floor((double)floatval) -> (double)floorf(floatval)
-    Value *V = Cast->getOperand(0);
-    V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
-    return B.CreateFPExt(V, B.getDoubleTy());
+    return Ret;
   }
 };
 
@@ -1063,8 +307,8 @@ struct FFSOpt : public LibCallOptimization {
 
     // Constant fold.
     if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
-      if (CI->getValue() == 0)  // ffs(0) -> 0.
-        return Constant::getNullValue(CI->getType());
+      if (CI->isZero())  // ffs(0) -> 0.
+        return B.getInt32(0);
       // ffs(c) -> cttz(c)+1
       return B.getInt32(CI->getValue().countTrailingZeros() + 1);
     }
@@ -1267,7 +511,7 @@ struct SPrintFOpt : public LibCallOptimization {
       if (FormatStr[i] == '%')
        return 0; // we found a format specifier, bail out.
 
-    // These optimizations require TargetData.
+    // These optimizations require DataLayout.
     if (!TD) return 0;
 
     // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
@@ -1297,7 +541,7 @@ struct SPrintFOpt : public LibCallOptimization {
     }
 
     if (FormatStr[1] == 's') {
-      // These optimizations require TargetData.
+      // These optimizations require DataLayout.
       if (!TD) return 0;
 
       // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
@@ -1385,7 +629,7 @@ struct FWriteOpt : public LibCallOptimization {
 
 struct FPutsOpt : public LibCallOptimization {
   virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
-    // These optimizations require TargetData.
+    // These optimizations require DataLayout.
    if (!TD) return 0;
 
     // Require two pointers.  Also, we can't optimize if return value is used.
@@ -1422,7 +666,7 @@ struct FPrintFOpt : public LibCallOptimization {
       if (FormatStr[i] == '%')  // Could handle %% -> % if we cared.
        return 0; // We found a format specifier.
 
-    // These optimizations require TargetData.
+    // These optimizations require DataLayout.
     if (!TD) return 0;
 
     Value *NewCI = EmitFWrite(CI->getArgOperand(1),
@@ -1524,17 +768,9 @@ namespace {
     TargetLibraryInfo *TLI;
 
     StringMap<LibCallOptimization*> Optimizations;
-    // String and Memory LibCall Optimizations
-    StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrRChrOpt StrRChr;
-    StrCmpOpt StrCmp; StrNCmpOpt StrNCmp;
-    StrCpyOpt StrCpy; StrCpyOpt StrCpyChk;
-    StpCpyOpt StpCpy; StpCpyOpt StpCpyChk;
-    StrNCpyOpt StrNCpy;
-    StrLenOpt StrLen; StrPBrkOpt StrPBrk;
-    StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr;
-    MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet;
     // Math Library Optimizations
-    CosOpt Cos; PowOpt Pow; Exp2Opt Exp2; UnaryDoubleFPOpt UnaryDoubleFP;
+    CosOpt Cos; PowOpt Pow; Exp2Opt Exp2;
+    UnaryDoubleFPOpt UnaryDoubleFP, UnsafeUnaryDoubleFP;
     // Integer Optimizations
     FFSOpt FFS; AbsOpt Abs; IsDigitOpt IsDigit; IsAsciiOpt IsAscii;
     ToAsciiOpt ToAscii;
@@ -1546,11 +782,13 @@ namespace {
     bool Modified;  // This is only used by doInitialization.
   public:
     static char ID; // Pass identification
-    SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true),
-                         StpCpy(false), StpCpyChk(true) {
+    SimplifyLibCalls() : FunctionPass(ID), UnaryDoubleFP(false),
+                         UnsafeUnaryDoubleFP(true) {
       initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
     }
     void AddOpt(LibFunc::Func F, LibCallOptimization* Opt);
+    void AddOpt(LibFunc::Func F1, LibFunc::Func F2, LibCallOptimization* Opt);
+
     void InitOptimizations();
 
     bool runOnFunction(Function &F);
@@ -1586,40 +824,15 @@ void SimplifyLibCalls::AddOpt(LibFunc::Func F, LibCallOptimization* Opt) {
     Optimizations[TLI->getName(F)] = Opt;
 }
 
+void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2,
+                              LibCallOptimization* Opt) {
+  if (TLI->has(F1) && TLI->has(F2))
+    Optimizations[TLI->getName(F1)] = Opt;
+}
+
 /// Optimizations - Populate the Optimizations map with all the optimizations
 /// we know.
 void SimplifyLibCalls::InitOptimizations() {
-  // String and Memory LibCall Optimizations
-  Optimizations["strcat"] = &StrCat;
-  Optimizations["strncat"] = &StrNCat;
-  Optimizations["strchr"] = &StrChr;
-  Optimizations["strrchr"] = &StrRChr;
-  Optimizations["strcmp"] = &StrCmp;
-  Optimizations["strncmp"] = &StrNCmp;
-  Optimizations["strcpy"] = &StrCpy;
-  Optimizations["strncpy"] = &StrNCpy;
-  Optimizations["stpcpy"] = &StpCpy;
-  Optimizations["strlen"] = &StrLen;
-  Optimizations["strpbrk"] = &StrPBrk;
-  Optimizations["strtol"] = &StrTo;
-  Optimizations["strtod"] = &StrTo;
-  Optimizations["strtof"] = &StrTo;
-  Optimizations["strtoul"] = &StrTo;
-  Optimizations["strtoll"] = &StrTo;
-  Optimizations["strtold"] = &StrTo;
-  Optimizations["strtoull"] = &StrTo;
-  Optimizations["strspn"] = &StrSpn;
-  Optimizations["strcspn"] = &StrCSpn;
-  Optimizations["strstr"] = &StrStr;
-  Optimizations["memcmp"] = &MemCmp;
-  AddOpt(LibFunc::memcpy, &MemCpy);
-  Optimizations["memmove"] = &MemMove;
-  AddOpt(LibFunc::memset, &MemSet);
-
-  // _chk variants of String and Memory LibCall Optimizations.
- Optimizations["__strcpy_chk"] = &StrCpyChk; - Optimizations["__stpcpy_chk"] = &StpCpyChk; - // Math Library Optimizations Optimizations["cosf"] = &Cos; Optimizations["cos"] = &Cos; @@ -1641,16 +854,37 @@ void SimplifyLibCalls::InitOptimizations() { Optimizations["llvm.exp2.f64"] = &Exp2; Optimizations["llvm.exp2.f32"] = &Exp2; - if (TLI->has(LibFunc::floor) && TLI->has(LibFunc::floorf)) - Optimizations["floor"] = &UnaryDoubleFP; - if (TLI->has(LibFunc::ceil) && TLI->has(LibFunc::ceilf)) - Optimizations["ceil"] = &UnaryDoubleFP; - if (TLI->has(LibFunc::round) && TLI->has(LibFunc::roundf)) - Optimizations["round"] = &UnaryDoubleFP; - if (TLI->has(LibFunc::rint) && TLI->has(LibFunc::rintf)) - Optimizations["rint"] = &UnaryDoubleFP; - if (TLI->has(LibFunc::nearbyint) && TLI->has(LibFunc::nearbyintf)) - Optimizations["nearbyint"] = &UnaryDoubleFP; + AddOpt(LibFunc::ceil, LibFunc::ceilf, &UnaryDoubleFP); + AddOpt(LibFunc::fabs, LibFunc::fabsf, &UnaryDoubleFP); + AddOpt(LibFunc::floor, LibFunc::floorf, &UnaryDoubleFP); + AddOpt(LibFunc::rint, LibFunc::rintf, &UnaryDoubleFP); + AddOpt(LibFunc::round, LibFunc::roundf, &UnaryDoubleFP); + AddOpt(LibFunc::nearbyint, LibFunc::nearbyintf, &UnaryDoubleFP); + AddOpt(LibFunc::trunc, LibFunc::truncf, &UnaryDoubleFP); + + if(UnsafeFPShrink) { + AddOpt(LibFunc::acos, LibFunc::acosf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::acosh, LibFunc::acoshf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::asin, LibFunc::asinf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::asinh, LibFunc::asinhf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::atan, LibFunc::atanf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::atanh, LibFunc::atanhf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::cbrt, LibFunc::cbrtf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::cosh, LibFunc::coshf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::exp, LibFunc::expf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::exp10, LibFunc::exp10f, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::expm1, LibFunc::expm1f, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::log, LibFunc::logf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::log10, LibFunc::log10f, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::log1p, LibFunc::log1pf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::log2, LibFunc::log2f, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::logb, LibFunc::logbf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::sin, LibFunc::sinf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::sinh, LibFunc::sinhf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::sqrt, LibFunc::sqrtf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::tan, LibFunc::tanf, &UnsafeUnaryDoubleFP); + AddOpt(LibFunc::tanh, LibFunc::tanhf, &UnsafeUnaryDoubleFP); + } // Integer Optimizations Optimizations["ffs"] = &FFS; @@ -1681,7 +915,7 @@ bool SimplifyLibCalls::runOnFunction(Function &F) { if (Optimizations.empty()) InitOptimizations(); - const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + const DataLayout *TD = getAnalysisIfAvailable<DataLayout>(); IRBuilder<> Builder(F.getContext()); |