diff options
author | rdivacky <rdivacky@FreeBSD.org> | 2010-05-04 16:11:02 +0000 |
---|---|---|
committer | rdivacky <rdivacky@FreeBSD.org> | 2010-05-04 16:11:02 +0000 |
commit | 750ce4d809c7e2a298a389a512a17652ff5be3f2 (patch) | |
tree | 70fbd90da02177c8e6ef82adba9fa8ace285a5e3 /lib/Transforms/Scalar | |
parent | 5f970ec96e421f64db6b1c6509a902ea73d98cc7 (diff) | |
download | FreeBSD-src-750ce4d809c7e2a298a389a512a17652ff5be3f2.zip FreeBSD-src-750ce4d809c7e2a298a389a512a17652ff5be3f2.tar.gz |
Update LLVM to r103004.
Diffstat (limited to 'lib/Transforms/Scalar')
-rw-r--r-- | lib/Transforms/Scalar/CMakeLists.txt | 1 | ||||
-rw-r--r-- | lib/Transforms/Scalar/GVN.cpp | 8 | ||||
-rw-r--r-- | lib/Transforms/Scalar/IndVarSimplify.cpp | 187 | ||||
-rw-r--r-- | lib/Transforms/Scalar/JumpThreading.cpp | 4 | ||||
-rw-r--r-- | lib/Transforms/Scalar/LICM.cpp | 8 | ||||
-rw-r--r-- | lib/Transforms/Scalar/LoopIndexSplit.cpp | 19 | ||||
-rw-r--r-- | lib/Transforms/Scalar/LoopStrengthReduce.cpp | 349 | ||||
-rw-r--r-- | lib/Transforms/Scalar/LoopUnswitch.cpp | 72 | ||||
-rw-r--r-- | lib/Transforms/Scalar/MemCpyOptimizer.cpp | 2 | ||||
-rw-r--r-- | lib/Transforms/Scalar/Reg2Mem.cpp | 7 | ||||
-rw-r--r-- | lib/Transforms/Scalar/SCCP.cpp | 15 | ||||
-rw-r--r-- | lib/Transforms/Scalar/SCCVN.cpp | 716 | ||||
-rw-r--r-- | lib/Transforms/Scalar/ScalarReplAggregates.cpp | 1183 | ||||
-rw-r--r-- | lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp | 3 | ||||
-rw-r--r-- | lib/Transforms/Scalar/TailRecursionElimination.cpp | 30 |
15 files changed, 1136 insertions, 1468 deletions
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 683c1c2..5778864 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -21,7 +21,6 @@ add_llvm_library(LLVMScalarOpts Reassociate.cpp Reg2Mem.cpp SCCP.cpp - SCCVN.cpp Scalar.cpp ScalarReplAggregates.cpp SimplifyCFGPass.cpp diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 642d59d..321def7 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -1217,7 +1217,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, return ConstantFoldLoadFromConstPtr(Src, &TD); } - +namespace { struct AvailableValueInBlock { /// BB - The basic block in question. @@ -1291,6 +1291,8 @@ struct AvailableValueInBlock { } }; +} + /// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate LI. This returns the value /// that should be used at LI's definition site. @@ -1333,8 +1335,8 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, return V; } -static bool isLifetimeStart(Instruction *Inst) { - if (IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst)) +static bool isLifetimeStart(const Instruction *Inst) { + if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst)) return II->getIntrinsicID() == Intrinsic::lifetime_start; return false; } diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 6605666..36bea67 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -97,6 +97,8 @@ namespace { private: + void EliminateIVComparisons(); + void EliminateIVRemainders(); void RewriteNonIntegerIVs(Loop *L); ICmpInst *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, @@ -133,6 +135,24 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, BasicBlock *ExitingBlock, BranchInst *BI, SCEVExpander &Rewriter) { + // Special case: If the backedge-taken count is a UDiv, it's very likely a + // UDiv that ScalarEvolution produced in order to compute a precise + // expression, rather than a UDiv from the user's code. If we can't find a + // UDiv in the code with some simple searching, assume the former and forego + // rewriting the loop. + if (isa<SCEVUDivExpr>(BackedgeTakenCount)) { + ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition()); + if (!OrigCond) return 0; + const SCEV *R = SE->getSCEV(OrigCond->getOperand(1)); + R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1)); + if (R != BackedgeTakenCount) { + const SCEV *L = SE->getSCEV(OrigCond->getOperand(0)); + L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1)); + if (L != BackedgeTakenCount) + return 0; + } + } + // If the exiting block is not the same as the backedge block, we must compare // against the preincremented value, otherwise we prefer to compare against // the post-incremented value. @@ -142,12 +162,12 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, // Add one to the "backedge-taken" count to get the trip count. // If this addition may overflow, we have to be more pessimistic and // cast the induction variable before doing the add. - const SCEV *Zero = SE->getIntegerSCEV(0, BackedgeTakenCount->getType()); + const SCEV *Zero = SE->getConstant(BackedgeTakenCount->getType(), 0); const SCEV *N = SE->getAddExpr(BackedgeTakenCount, - SE->getIntegerSCEV(1, BackedgeTakenCount->getType())); + SE->getConstant(BackedgeTakenCount->getType(), 1)); if ((isa<SCEVConstant>(N) && !N->isZero()) || - SE->isLoopGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) { + SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) { // No overflow. Cast the sum. RHS = SE->getTruncateOrZeroExtend(N, IndVar->getType()); } else { @@ -155,7 +175,7 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount, IndVar->getType()); RHS = SE->getAddExpr(RHS, - SE->getIntegerSCEV(1, IndVar->getType())); + SE->getConstant(IndVar->getType(), 1)); } // The BackedgeTaken expression contains the number of times that the @@ -336,6 +356,116 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { SE->forgetLoop(L); } +void IndVarSimplify::EliminateIVComparisons() { + SmallVector<WeakVH, 16> DeadInsts; + + // Look for ICmp users. + for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E; ++I) { + IVStrideUse &UI = *I; + ICmpInst *ICmp = dyn_cast<ICmpInst>(UI.getUser()); + if (!ICmp) continue; + + bool Swapped = UI.getOperandValToReplace() == ICmp->getOperand(1); + ICmpInst::Predicate Pred = ICmp->getPredicate(); + if (Swapped) Pred = ICmpInst::getSwappedPredicate(Pred); + + // Get the SCEVs for the ICmp operands. + const SCEV *S = IU->getReplacementExpr(UI); + const SCEV *X = SE->getSCEV(ICmp->getOperand(!Swapped)); + + // Simplify unnecessary loops away. + const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent()); + S = SE->getSCEVAtScope(S, ICmpLoop); + X = SE->getSCEVAtScope(X, ICmpLoop); + + // If the condition is always true or always false, replace it with + // a constant value. + if (SE->isKnownPredicate(Pred, S, X)) + ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); + else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) + ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); + else + continue; + + DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); + DeadInsts.push_back(ICmp); + } + + // Now that we're done iterating through lists, clean up any instructions + // which are now dead. + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); +} + +void IndVarSimplify::EliminateIVRemainders() { + SmallVector<WeakVH, 16> DeadInsts; + + // Look for SRem and URem users. + for (IVUsers::iterator I = IU->begin(), E = IU->end(); I != E; ++I) { + IVStrideUse &UI = *I; + BinaryOperator *Rem = dyn_cast<BinaryOperator>(UI.getUser()); + if (!Rem) continue; + + bool isSigned = Rem->getOpcode() == Instruction::SRem; + if (!isSigned && Rem->getOpcode() != Instruction::URem) + continue; + + // We're only interested in the case where we know something about + // the numerator. + if (UI.getOperandValToReplace() != Rem->getOperand(0)) + continue; + + // Get the SCEVs for the ICmp operands. + const SCEV *S = SE->getSCEV(Rem->getOperand(0)); + const SCEV *X = SE->getSCEV(Rem->getOperand(1)); + + // Simplify unnecessary loops away. + const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent()); + S = SE->getSCEVAtScope(S, ICmpLoop); + X = SE->getSCEVAtScope(X, ICmpLoop); + + // i % n --> i if i is in [0,n). + if ((!isSigned || SE->isKnownNonNegative(S)) && + SE->isKnownPredicate(isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, + S, X)) + Rem->replaceAllUsesWith(Rem->getOperand(0)); + else { + // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n). + const SCEV *LessOne = + SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); + if ((!isSigned || SE->isKnownNonNegative(LessOne)) && + SE->isKnownPredicate(isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, + LessOne, X)) { + ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, + Rem->getOperand(0), Rem->getOperand(1), + "tmp"); + SelectInst *Sel = + SelectInst::Create(ICmp, + ConstantInt::get(Rem->getType(), 0), + Rem->getOperand(0), "tmp", Rem); + Rem->replaceAllUsesWith(Sel); + } else + continue; + } + + // Inform IVUsers about the new users. + if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) + IU->AddUsersIfInteresting(I); + + DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); + DeadInsts.push_back(Rem); + } + + // Now that we're done iterating through lists, clean up any instructions + // which are now dead. + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); +} + bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { IU = &getAnalysis<IVUsers>(); LI = &getAnalysis<LoopInfo>(); @@ -362,6 +492,12 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) RewriteLoopExitValues(L, Rewriter); + // Simplify ICmp IV users. + EliminateIVComparisons(); + + // Simplify SRem and URem IV users. + EliminateIVRemainders(); + // Compute the type of the largest recurrence expression, and decide whether // a canonical induction variable should be inserted. const Type *LargestType = 0; @@ -454,6 +590,46 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { return Changed; } +// FIXME: It is an extremely bad idea to indvar substitute anything more +// complex than affine induction variables. Doing so will put expensive +// polynomial evaluations inside of the loop, and the str reduction pass +// currently can only reduce affine polynomials. For now just disable +// indvar subst on anything more complex than an affine addrec, unless +// it can be expanded to a trivial value. +static bool isSafe(const SCEV *S, const Loop *L) { + // Loop-invariant values are safe. + if (S->isLoopInvariant(L)) return true; + + // Affine addrecs are safe. Non-affine are not, because LSR doesn't know how + // to transform them into efficient code. + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) + return AR->isAffine(); + + // An add is safe it all its operands are safe. + if (const SCEVCommutativeExpr *Commutative = dyn_cast<SCEVCommutativeExpr>(S)) { + for (SCEVCommutativeExpr::op_iterator I = Commutative->op_begin(), + E = Commutative->op_end(); I != E; ++I) + if (!isSafe(*I, L)) return false; + return true; + } + + // A cast is safe if its operand is. + if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) + return isSafe(C->getOperand(), L); + + // A udiv is safe if its operands are. + if (const SCEVUDivExpr *UD = dyn_cast<SCEVUDivExpr>(S)) + return isSafe(UD->getLHS(), L) && + isSafe(UD->getRHS(), L); + + // SCEVUnknown is always safe. + if (isa<SCEVUnknown>(S)) + return true; + + // Nothing else is safe. + return false; +} + void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { SmallVector<WeakVH, 16> DeadInsts; @@ -465,7 +641,6 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { // the need for the code evaluation methods to insert induction variables // of different sizes. for (IVUsers::iterator UI = IU->begin(), E = IU->end(); UI != E; ++UI) { - const SCEV *Stride = UI->getStride(); Value *Op = UI->getOperandValToReplace(); const Type *UseTy = Op->getType(); Instruction *User = UI->getUser(); @@ -486,7 +661,7 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { // currently can only reduce affine polynomials. For now just disable // indvar subst on anything more complex than an affine addrec, unless // it can be expanded to a trivial value. - if (!AR->isLoopInvariant(L) && !Stride->isLoopInvariant(L)) + if (!isSafe(AR, L)) continue; // Determine the insertion point for this user. By default, insert diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index a6489ec..df05b71 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -670,8 +670,10 @@ bool JumpThreading::ProcessBranchOnDuplicateCond(BasicBlock *PredBB, Value *OldCond = DestBI->getCondition(); DestBI->setCondition(ConstantInt::get(Type::getInt1Ty(BB->getContext()), BranchDir)); - ConstantFoldTerminator(BB); + // Delete dead instructions before we fold the branch. Folding the branch + // can eliminate edges from the CFG which can end up deleting OldCond. RecursivelyDeleteTriviallyDeadInstructions(OldCond); + ConstantFoldTerminator(BB); return true; } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index d7ace342..7347395 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -683,16 +683,18 @@ void LICM::PromoteValuesInLoop() { // to LI as we are loading or storing. Since we know that the value is // stored in this loop, this will always succeed. for (Value::use_iterator UI = Ptr->use_begin(), E = Ptr->use_end(); - UI != E; ++UI) - if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) { + UI != E; ++UI) { + User *U = *UI; + if (LoadInst *LI = dyn_cast<LoadInst>(U)) { LoadValue = LI; break; - } else if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) { + } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) { if (SI->getOperand(1) == Ptr) { LoadValue = SI->getOperand(0); break; } } + } assert(LoadValue && "No store through the pointer found!"); PointerValueNumbers.push_back(LoadValue); // Remember this for later. } diff --git a/lib/Transforms/Scalar/LoopIndexSplit.cpp b/lib/Transforms/Scalar/LoopIndexSplit.cpp index 16d3f2f..101ff5b 100644 --- a/lib/Transforms/Scalar/LoopIndexSplit.cpp +++ b/lib/Transforms/Scalar/LoopIndexSplit.cpp @@ -948,6 +948,25 @@ bool LoopIndexSplit::splitLoop() { if (!IVBasedValues.count(SplitCondition->getOperand(!SVOpNum))) return false; + // Check for side effects. + for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); + I != E; ++I) { + BasicBlock *BB = *I; + + assert(DT->dominates(Header, BB)); + if (DT->properlyDominates(SplitCondition->getParent(), BB)) + continue; + + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); + BI != BE; ++BI) { + Instruction *Inst = BI; + + if (!Inst->isSafeToSpeculativelyExecute() && !isa<PHINode>(Inst) + && !isa<BranchInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst)) + return false; + } + } + // Normalize loop conditions so that it is easier to calculate new loop // bounds. if (IVisGT(*ExitCondition) || IVisGE(*ExitCondition)) { diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 625a75d..cf3d16f 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -221,7 +221,7 @@ static void DoInitialMatch(const SCEV *S, Loop *L, if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) if (!AR->getStart()->isZero()) { DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT); - DoInitialMatch(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()), + DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), AR->getStepRecurrence(SE), AR->getLoop()), L, Good, Bad, SE, DT); @@ -262,11 +262,15 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, SmallVector<const SCEV *, 4> Bad; DoInitialMatch(S, L, Good, Bad, SE, DT); if (!Good.empty()) { - BaseRegs.push_back(SE.getAddExpr(Good)); + const SCEV *Sum = SE.getAddExpr(Good); + if (!Sum->isZero()) + BaseRegs.push_back(Sum); AM.HasBaseReg = true; } if (!Bad.empty()) { - BaseRegs.push_back(SE.getAddExpr(Bad)); + const SCEV *Sum = SE.getAddExpr(Bad); + if (!Sum->isZero()) + BaseRegs.push_back(Sum); AM.HasBaseReg = true; } } @@ -375,7 +379,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, bool IgnoreSignificantBits = false) { // Handle the trivial case, which works for any SCEV type. if (LHS == RHS) - return SE.getIntegerSCEV(1, LHS->getType()); + return SE.getConstant(LHS->getType(), 1); // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do some // folding. @@ -450,7 +454,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) { if (C->getValue()->getValue().getMinSignedBits() <= 64) { - S = SE.getIntegerSCEV(0, C->getType()); + S = SE.getConstant(C->getType(), 0); return C->getValue()->getSExtValue(); } } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { @@ -473,7 +477,7 @@ static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) { - S = SE.getIntegerSCEV(0, GV->getType()); + S = SE.getConstant(GV->getType(), 0); return GV; } } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { @@ -781,10 +785,10 @@ struct LSRFixup { /// will be replaced. Value *OperandValToReplace; - /// PostIncLoop - If this user is to use the post-incremented value of an + /// PostIncLoops - If this user is to use the post-incremented value of an /// induction variable, this variable is non-null and holds the loop /// associated with the induction variable. - const Loop *PostIncLoop; + PostIncLoopSet PostIncLoops; /// LUIdx - The index of the LSRUse describing the expression which /// this fixup needs, minus an offset (below). @@ -795,6 +799,8 @@ struct LSRFixup { /// offsets, for example in an unrolled loop. int64_t Offset; + bool isUseFullyOutsideLoop(const Loop *L) const; + LSRFixup(); void print(raw_ostream &OS) const; @@ -804,9 +810,24 @@ struct LSRFixup { } LSRFixup::LSRFixup() - : UserInst(0), OperandValToReplace(0), PostIncLoop(0), + : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {} +/// isUseFullyOutsideLoop - Test whether this fixup always uses its +/// value outside of the given loop. +bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { + // PHI nodes use their value in their incoming blocks. + if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) + if (PN->getIncomingValue(i) == OperandValToReplace && + L->contains(PN->getIncomingBlock(i))) + return false; + return true; + } + + return !L->contains(UserInst); +} + void LSRFixup::print(raw_ostream &OS) const { OS << "UserInst="; // Store is common and interesting enough to be worth special-casing. @@ -821,9 +842,10 @@ void LSRFixup::print(raw_ostream &OS) const { OS << ", OperandValToReplace="; WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false); - if (PostIncLoop) { + for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(), + E = PostIncLoops.end(); I != E; ++I) { OS << ", PostIncLoop="; - WriteAsOperand(OS, PostIncLoop->getHeader(), /*PrintType=*/false); + WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false); } if (LUIdx != ~size_t(0)) @@ -1135,6 +1157,7 @@ class LSRInstance { IVUsers &IU; ScalarEvolution &SE; DominatorTree &DT; + LoopInfo &LI; const TargetLowering *const TLI; Loop *const L; bool Changed; @@ -1214,6 +1237,13 @@ public: DenseSet<const SCEV *> &VisitedRegs) const; void Solve(SmallVectorImpl<const Formula *> &Solution) const; + BasicBlock::iterator + HoistInsertPosition(BasicBlock::iterator IP, + const SmallVectorImpl<Instruction *> &Inputs) const; + BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP, + const LSRFixup &LF, + const LSRUse &LU) const; + Value *Expand(const LSRFixup &LF, const Formula &F, BasicBlock::iterator IP, @@ -1427,16 +1457,30 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) return Cond; - const SCEV *One = SE.getIntegerSCEV(1, BackedgeTakenCount->getType()); + const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1); // Add one to the backedge-taken count to get the trip count. const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One); - - // Check for a max calculation that matches the pattern. - if (!isa<SCEVSMaxExpr>(IterationCount) && !isa<SCEVUMaxExpr>(IterationCount)) + if (IterationCount != SE.getSCEV(Sel)) return Cond; + + // Check for a max calculation that matches the pattern. There's no check + // for ICMP_ULE here because the comparison would be with zero, which + // isn't interesting. + CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; + const SCEVNAryExpr *Max = 0; + if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) { + Pred = ICmpInst::ICMP_SLE; + Max = S; + } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) { + Pred = ICmpInst::ICMP_SLT; + Max = S; + } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) { + Pred = ICmpInst::ICMP_ULT; + Max = U; + } else { + // No match; bail. return Cond; - const SCEVNAryExpr *Max = cast<SCEVNAryExpr>(IterationCount); - if (Max != SE.getSCEV(Sel)) return Cond; + } // To handle a max with more than two operands, this optimization would // require additional checking and setup. @@ -1445,7 +1489,13 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { const SCEV *MaxLHS = Max->getOperand(0); const SCEV *MaxRHS = Max->getOperand(1); - if (!MaxLHS || MaxLHS != One) return Cond; + + // ScalarEvolution canonicalizes constants to the left. For < and >, look + // for a comparison with 1. For <= and >=, a comparison with zero. + if (!MaxLHS || + (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One))) + return Cond; + // Check the relevant induction variable for conformance to // the pattern. const SCEV *IV = SE.getSCEV(Cond->getOperand(0)); @@ -1461,16 +1511,29 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { // Check the right operand of the select, and remember it, as it will // be used in the new comparison instruction. Value *NewRHS = 0; - if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS) + if (ICmpInst::isTrueWhenEqual(Pred)) { + // Look for n+1, and grab n. + if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1))) + if (isa<ConstantInt>(BO->getOperand(1)) && + cast<ConstantInt>(BO->getOperand(1))->isOne() && + SE.getSCEV(BO->getOperand(0)) == MaxRHS) + NewRHS = BO->getOperand(0); + if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2))) + if (isa<ConstantInt>(BO->getOperand(1)) && + cast<ConstantInt>(BO->getOperand(1))->isOne() && + SE.getSCEV(BO->getOperand(0)) == MaxRHS) + NewRHS = BO->getOperand(0); + if (!NewRHS) + return Cond; + } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS) NewRHS = Sel->getOperand(1); else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS) NewRHS = Sel->getOperand(2); - if (!NewRHS) return Cond; + else + llvm_unreachable("Max doesn't match expected pattern!"); // Determine the new comparison opcode. It may be signed or unsigned, // and the original comparison may be either equality or inequality. - CmpInst::Predicate Pred = - isa<SCEVSMaxExpr>(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT; if (Cond->getPredicate() == CmpInst::ICMP_EQ) Pred = CmpInst::getInversePredicate(Pred); @@ -1545,8 +1608,9 @@ LSRInstance::OptimizeLoopTermCond() { !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) { // Conservatively assume there may be reuse if the quotient of their // strides could be a legal scale. - const SCEV *A = CondUse->getStride(); - const SCEV *B = UI->getStride(); + const SCEV *A = IU.getStride(*CondUse, L); + const SCEV *B = IU.getStride(*UI, L); + if (!A || !B) continue; if (SE.getTypeSizeInBits(A->getType()) != SE.getTypeSizeInBits(B->getType())) { if (SE.getTypeSizeInBits(A->getType()) > @@ -1598,8 +1662,7 @@ LSRInstance::OptimizeLoopTermCond() { ExitingBlock->getInstList().insert(TermBr, Cond); // Clone the IVUse, as the old use still exists! - CondUse = &IU.AddUser(CondUse->getStride(), CondUse->getOffset(), - Cond, CondUse->getOperandValToReplace()); + CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace()); TermBr->replaceUsesOfWith(OldCond, Cond); } } @@ -1607,9 +1670,7 @@ LSRInstance::OptimizeLoopTermCond() { // If we get to here, we know that we can transform the setcc instruction to // use the post-incremented version of the IV, allowing us to coalesce the // live ranges for the IV correctly. - CondUse->setOffset(SE.getMinusSCEV(CondUse->getOffset(), - CondUse->getStride())); - CondUse->setIsUseOfPostIncrementedValue(true); + CondUse->transformToPostInc(L); Changed = true; PostIncs.insert(Cond); @@ -1717,19 +1778,24 @@ void LSRInstance::CollectInterestingTypesAndFactors() { SmallSetVector<const SCEV *, 4> Strides; // Collect interesting types and strides. + SmallVector<const SCEV *, 4> Worklist; for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { - const SCEV *Stride = UI->getStride(); + const SCEV *Expr = IU.getExpr(*UI); // Collect interesting types. - Types.insert(SE.getEffectiveSCEVType(Stride->getType())); - - // Add the stride for this loop. - Strides.insert(Stride); - - // Add strides for other mentioned loops. - for (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(UI->getOffset()); - AR; AR = dyn_cast<SCEVAddRecExpr>(AR->getStart())) - Strides.insert(AR->getStepRecurrence(SE)); + Types.insert(SE.getEffectiveSCEVType(Expr->getType())); + + // Add strides for mentioned loops. + Worklist.push_back(Expr); + do { + const SCEV *S = Worklist.pop_back_val(); + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + Strides.insert(AR->getStepRecurrence(SE)); + Worklist.push_back(AR->getStart()); + } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + Worklist.insert(Worklist.end(), Add->op_begin(), Add->op_end()); + } + } while (!Worklist.empty()); } // Compute interesting factors from the set of interesting strides. @@ -1776,8 +1842,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LSRFixup &LF = getNewFixup(); LF.UserInst = UI->getUser(); LF.OperandValToReplace = UI->getOperandValToReplace(); - if (UI->isUseOfPostIncrementedValue()) - LF.PostIncLoop = L; + LF.PostIncLoops = UI->getPostIncLoops(); LSRUse::KindType Kind = LSRUse::Basic; const Type *AccessTy = 0; @@ -1786,7 +1851,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { AccessTy = getAccessType(LF.UserInst); } - const SCEV *S = IU.getCanonicalExpr(*UI); + const SCEV *S = IU.getExpr(*UI); // Equality (== and !=) ICmps are special. We can rewrite (i == N) as // (N - i == 0), and this allows (N - i) to be the expression that we work @@ -1824,7 +1889,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.LUIdx = P.first; LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; - LU.AllFixupsOutsideLoop &= !L->contains(LF.UserInst); + LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); // If this is the first use of this LSRUse, give it a formula. if (LU.Formulae.empty()) { @@ -1918,9 +1983,17 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { continue; // Ignore uses which are part of other SCEV expressions, to avoid // analyzing them multiple times. - if (SE.isSCEVable(UserInst->getType()) && - !isa<SCEVUnknown>(SE.getSCEV(const_cast<Instruction *>(UserInst)))) - continue; + if (SE.isSCEVable(UserInst->getType())) { + const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst)); + // If the user is a no-op, look through to its uses. + if (!isa<SCEVUnknown>(UserS)) + continue; + if (UserS == U) { + Worklist.push_back( + SE.getUnknown(const_cast<Instruction *>(UserInst))); + continue; + } + } // Ignore icmp instructions which are already being analyzed. if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) { unsigned OtherIdx = !UI.getOperandNo(); @@ -1936,7 +2009,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LF.LUIdx = P.first; LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; - LU.AllFixupsOutsideLoop &= L->contains(LF.UserInst); + LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); InsertSupplementalFormula(U, LU, LF.LUIdx); CountRegisters(LU.Formulae.back(), Uses.size() - 1); break; @@ -1959,7 +2032,7 @@ static void CollectSubexprs(const SCEV *S, const SCEVConstant *C, } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { // Split a non-zero base out of an addrec. if (!AR->getStart()->isZero()) { - CollectSubexprs(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()), + CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), AR->getStepRecurrence(SE), AR->getLoop()), C, Ops, SE); CollectSubexprs(AR->getStart(), C, Ops, SE); @@ -2020,8 +2093,11 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, LU.Kind, LU.AccessTy, TLI, SE)) continue; + const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); + if (InnerSum->isZero()) + continue; Formula F = Base; - F.BaseRegs[i] = SE.getAddExpr(InnerAddOps); + F.BaseRegs[i] = InnerSum; F.BaseRegs.push_back(*J); if (InsertFormula(LU, LUIdx, F)) // If that formula hadn't been seen before, recurse to find more like @@ -2102,7 +2178,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I; if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, LU.AccessTy, TLI)) { - F.BaseRegs[i] = SE.getAddExpr(G, SE.getIntegerSCEV(*I, G->getType())); + F.BaseRegs[i] = SE.getAddExpr(G, SE.getConstant(G->getType(), *I)); (void)InsertFormula(LU, LUIdx, F); } @@ -2165,7 +2241,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, // Compensate for the use having MinOffset built into it. F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset; - const SCEV *FactorS = SE.getIntegerSCEV(Factor, IntTy); + const SCEV *FactorS = SE.getConstant(IntTy, Factor); // Check that multiplying with each base register doesn't overflow. for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) { @@ -2227,7 +2303,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) { - const SCEV *FactorS = SE.getIntegerSCEV(Factor, IntTy); + const SCEV *FactorS = SE.getConstant(IntTy, Factor); if (FactorS->isZero()) continue; // Divide out the factor, ignoring high bits, since we'll be @@ -2426,7 +2502,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { if (C->getValue()->getValue().isNegative() != (NewF.AM.BaseOffs < 0) && (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale)) - .ule(APInt(BitWidth, NewF.AM.BaseOffs).abs())) + .ule(abs64(NewF.AM.BaseOffs))) continue; // OK, looks good. @@ -2454,7 +2530,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { if (C->getValue()->getValue().isNegative() != (NewF.AM.BaseOffs < 0) && C->getValue()->getValue().abs() - .ule(APInt(BitWidth, NewF.AM.BaseOffs).abs())) + .ule(abs64(NewF.AM.BaseOffs))) goto skip_formula; // Ok, looks good. @@ -2776,37 +2852,33 @@ static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) { return Node->getBlock(); } -Value *LSRInstance::Expand(const LSRFixup &LF, - const Formula &F, - BasicBlock::iterator IP, - SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts) const { - const LSRUse &LU = Uses[LF.LUIdx]; - - // Then, collect some instructions which we will remain dominated by when - // expanding the replacement. These must be dominated by any operands that - // will be required in the expansion. - SmallVector<Instruction *, 4> Inputs; - if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace)) - Inputs.push_back(I); - if (LU.Kind == LSRUse::ICmpZero) - if (Instruction *I = - dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1))) - Inputs.push_back(I); - if (LF.PostIncLoop) { - if (!L->contains(LF.UserInst)) - Inputs.push_back(L->getLoopLatch()->getTerminator()); - else - Inputs.push_back(IVIncInsertPos); - } - - // Then, climb up the immediate dominator tree as far as we can go while - // still being dominated by the input positions. +/// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up +/// the dominator tree far as we can go while still being dominated by the +/// input positions. This helps canonicalize the insert position, which +/// encourages sharing. +BasicBlock::iterator +LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, + const SmallVectorImpl<Instruction *> &Inputs) + const { for (;;) { + const Loop *IPLoop = LI.getLoopFor(IP->getParent()); + unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0; + + BasicBlock *IDom; + for (BasicBlock *Rung = IP->getParent(); ; Rung = IDom) { + IDom = getImmediateDominator(Rung, DT); + if (!IDom) return IP; + + // Don't climb into a loop though. + const Loop *IDomLoop = LI.getLoopFor(IDom); + unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0; + if (IDomDepth <= IPLoopDepth && + (IDomDepth != IPLoopDepth || IDomLoop == IPLoop)) + break; + } + bool AllDominate = true; Instruction *BetterPos = 0; - BasicBlock *IDom = getImmediateDominator(IP->getParent(), DT); - if (!IDom) break; Instruction *Tentative = IDom->getTerminator(); for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(), E = Inputs.end(); I != E; ++I) { @@ -2815,6 +2887,8 @@ Value *LSRInstance::Expand(const LSRFixup &LF, AllDominate = false; break; } + // Attempt to find an insert position in the middle of the block, + // instead of at the end, so that it can be used for other expansions. if (IDom == Inst->getParent() && (!BetterPos || DT.dominates(BetterPos, Inst))) BetterPos = next(BasicBlock::iterator(Inst)); @@ -2826,12 +2900,77 @@ Value *LSRInstance::Expand(const LSRFixup &LF, else IP = Tentative; } + + return IP; +} + +/// AdjustInsertPositionForExpand - Determine an input position which will be +/// dominated by the operands and which will dominate the result. +BasicBlock::iterator +LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP, + const LSRFixup &LF, + const LSRUse &LU) const { + // Collect some instructions which must be dominated by the + // expanding replacement. These must be dominated by any operands that + // will be required in the expansion. + SmallVector<Instruction *, 4> Inputs; + if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace)) + Inputs.push_back(I); + if (LU.Kind == LSRUse::ICmpZero) + if (Instruction *I = + dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1))) + Inputs.push_back(I); + if (LF.PostIncLoops.count(L)) { + if (LF.isUseFullyOutsideLoop(L)) + Inputs.push_back(L->getLoopLatch()->getTerminator()); + else + Inputs.push_back(IVIncInsertPos); + } + // The expansion must also be dominated by the increment positions of any + // loops it for which it is using post-inc mode. + for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(), + E = LF.PostIncLoops.end(); I != E; ++I) { + const Loop *PIL = *I; + if (PIL == L) continue; + + // Be dominated by the loop exit. + SmallVector<BasicBlock *, 4> ExitingBlocks; + PIL->getExitingBlocks(ExitingBlocks); + if (!ExitingBlocks.empty()) { + BasicBlock *BB = ExitingBlocks[0]; + for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i) + BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]); + Inputs.push_back(BB->getTerminator()); + } + } + + // Then, climb up the immediate dominator tree as far as we can go while + // still being dominated by the input positions. + IP = HoistInsertPosition(IP, Inputs); + + // Don't insert instructions before PHI nodes. while (isa<PHINode>(IP)) ++IP; + + // Ignore debug intrinsics. while (isa<DbgInfoIntrinsic>(IP)) ++IP; + return IP; +} + +Value *LSRInstance::Expand(const LSRFixup &LF, + const Formula &F, + BasicBlock::iterator IP, + SCEVExpander &Rewriter, + SmallVectorImpl<WeakVH> &DeadInsts) const { + const LSRUse &LU = Uses[LF.LUIdx]; + + // Determine an input position which will be dominated by the operands and + // which will dominate the result. + IP = AdjustInsertPositionForExpand(IP, LF, LU); + // Inform the Rewriter if we have a post-increment use, so that it can // perform an advantageous expansion. - Rewriter.setPostInc(LF.PostIncLoop); + Rewriter.setPostInc(LF.PostIncLoops); // This is the type that the user actually needs. const Type *OpTy = LF.OperandValToReplace->getType(); @@ -2855,24 +2994,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, const SCEV *Reg = *I; assert(!Reg->isZero() && "Zero allocated in a base register!"); - // If we're expanding for a post-inc user for the add-rec's loop, make the - // post-inc adjustment. - const SCEV *Start = Reg; - while (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Start)) { - if (AR->getLoop() == LF.PostIncLoop) { - Reg = SE.getAddExpr(Reg, AR->getStepRecurrence(SE)); - // If the user is inside the loop, insert the code after the increment - // so that it is dominated by its operand. If the original insert point - // was already dominated by the increment, keep it, because there may - // be loop-variant operands that need to be respected also. - if (L->contains(LF.UserInst) && !DT.dominates(IVIncInsertPos, IP)) { - IP = IVIncInsertPos; - while (isa<DbgInfoIntrinsic>(IP)) ++IP; - } - break; - } - Start = AR->getStart(); - } + // If we're expanding for a post-inc user, make the post-inc adjustment. + PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); + Reg = TransformForPostIncUse(Denormalize, Reg, + LF.UserInst, LF.OperandValToReplace, + Loops, SE, DT); Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP))); } @@ -2889,11 +3015,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, if (F.AM.Scale != 0) { const SCEV *ScaledS = F.ScaledReg; - // If we're expanding for a post-inc user for the add-rec's loop, make the - // post-inc adjustment. - if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ScaledS)) - if (AR->getLoop() == LF.PostIncLoop) - ScaledS = SE.getAddExpr(ScaledS, AR->getStepRecurrence(SE)); + // If we're expanding for a post-inc user, make the post-inc adjustment. + PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); + ScaledS = TransformForPostIncUse(Denormalize, ScaledS, + LF.UserInst, LF.OperandValToReplace, + Loops, SE, DT); if (LU.Kind == LSRUse::ICmpZero) { // An interesting way of "folding" with an icmp is to use a negated @@ -2907,8 +3033,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // which is expected to be matched as part of the address. ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); ScaledS = SE.getMulExpr(ScaledS, - SE.getIntegerSCEV(F.AM.Scale, - ScaledS->getType())); + SE.getConstant(ScaledS->getType(), F.AM.Scale)); Ops.push_back(ScaledS); // Flush the operand list to suppress SCEVExpander hoisting. @@ -2949,12 +3074,12 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Emit instructions summing all the operands. const SCEV *FullS = Ops.empty() ? - SE.getIntegerSCEV(0, IntTy) : + SE.getConstant(IntTy, 0) : SE.getAddExpr(Ops); Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP); // We're done expanding now, so reset the rewriter. - Rewriter.setPostInc(0); + Rewriter.clearPostInc(); // An ICmpZero Formula represents an ICmp which we're handling as a // comparison against zero. Now that we've expanded an expression for that @@ -3118,6 +3243,7 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), DT(P->getAnalysis<DominatorTree>()), + LI(P->getAnalysis<LoopInfo>()), TLI(tli), L(l), Changed(false), IVIncInsertPos(0) { // If LoopSimplify form is not available, stay out of trouble. @@ -3274,9 +3400,10 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { // We split critical edges, so we change the CFG. However, we do update // many analyses if they are around. AU.addPreservedID(LoopSimplifyID); - AU.addPreserved<LoopInfo>(); AU.addPreserved("domfrontier"); + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); AU.addRequired<DominatorTree>(); AU.addPreserved<DominatorTree>(); diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 3918738..ae7bf40 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -34,6 +34,7 @@ #include "llvm/Instructions.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/Dominators.h" @@ -677,15 +678,22 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, LoopProcessWorklist.push_back(NewLoop); redoLoop = true; + // Keep a WeakVH holding onto LIC. If the first call to RewriteLoopBody + // deletes the instruction (for example by simplifying a PHI that feeds into + // the condition that we're unswitching on), we don't rewrite the second + // iteration. + WeakVH LICHandle(LIC); + // Now we rewrite the original code to know that the condition is true and the // new code to know that the condition is false. RewriteLoopBodyWithConditionConstant(L, LIC, Val, false); - - // It's possible that simplifying one loop could cause the other to be - // deleted. If so, don't simplify it. - if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop) - RewriteLoopBodyWithConditionConstant(NewLoop, LIC, Val, true); + // It's possible that simplifying one loop could cause the other to be + // changed to another value or a constant. If its a constant, don't simplify + // it. + if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop && + LICHandle && !isa<Constant>(LICHandle)) + RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true); } /// RemoveFromWorklist - Remove all instances of I from the worklist vector @@ -981,45 +989,16 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { continue; } + // See if instruction simplification can hack this up. This is common for + // things like "select false, X, Y" after unswitching made the condition be + // 'false'. + if (Value *V = SimplifyInstruction(I)) { + ReplaceUsesOfWith(I, V, Worklist, L, LPM); + continue; + } + // Special case hacks that appear commonly in unswitched code. - switch (I->getOpcode()) { - case Instruction::Select: - if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(0))) { - ReplaceUsesOfWith(I, I->getOperand(!CB->getZExtValue()+1), Worklist, L, - LPM); - continue; - } - break; - case Instruction::And: - if (isa<ConstantInt>(I->getOperand(0)) && - // constant -> RHS - I->getOperand(0)->getType()->isIntegerTy(1)) - cast<BinaryOperator>(I)->swapOperands(); - if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(1))) - if (CB->getType()->isIntegerTy(1)) { - if (CB->isOne()) // X & 1 -> X - ReplaceUsesOfWith(I, I->getOperand(0), Worklist, L, LPM); - else // X & 0 -> 0 - ReplaceUsesOfWith(I, I->getOperand(1), Worklist, L, LPM); - continue; - } - break; - case Instruction::Or: - if (isa<ConstantInt>(I->getOperand(0)) && - // constant -> RHS - I->getOperand(0)->getType()->isIntegerTy(1)) - cast<BinaryOperator>(I)->swapOperands(); - if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(1))) - if (CB->getType()->isIntegerTy(1)) { - if (CB->isOne()) // X | 1 -> 1 - ReplaceUsesOfWith(I, I->getOperand(1), Worklist, L, LPM); - else // X | 0 -> X - ReplaceUsesOfWith(I, I->getOperand(0), Worklist, L, LPM); - continue; - } - break; - case Instruction::Br: { - BranchInst *BI = cast<BranchInst>(I); + if (BranchInst *BI = dyn_cast<BranchInst>(I)) { if (BI->isUnconditional()) { // If BI's parent is the only pred of the successor, fold the two blocks // together. @@ -1052,13 +1031,13 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { LPM->deleteSimpleAnalysisValue(Succ, L); Succ->eraseFromParent(); ++NumSimplify; - break; + continue; } if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){ // Conditional branch. Turn it into an unconditional branch, then // remove dead blocks. - break; // FIXME: Enable. + continue; // FIXME: Enable. DEBUG(dbgs() << "Folded branch: " << *BI); BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue()); @@ -1072,8 +1051,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { RemoveBlockIfDead(DeadSucc, Worklist, L); } - break; - } + continue; } } } diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 3b305ae..3611b8e 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -744,7 +744,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { const Type *ArgTys[3] = { M->getRawDest()->getType(), M->getRawSource()->getType(), M->getLength()->getType() }; - M->setOperand(0,Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, ArgTys, 3)); + M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, ArgTys, 3)); // MemDep may have over conservative information about this instruction, just // conservatively flush it from the cache. diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index 7a6eec3..13222ac 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -46,10 +46,11 @@ namespace { bool valueEscapes(const Instruction *Inst) const { const BasicBlock *BB = Inst->getParent(); for (Value::const_use_iterator UI = Inst->use_begin(),E = Inst->use_end(); - UI != E; ++UI) - if (cast<Instruction>(*UI)->getParent() != BB || - isa<PHINode>(*UI)) + UI != E; ++UI) { + const Instruction *I = cast<Instruction>(*UI); + if (I->getParent() != BB || isa<PHINode>(I)) return true; + } return false; } diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 4f09bee..907ece8 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -317,7 +317,10 @@ private: void markConstant(LatticeVal &IV, Value *V, Constant *C) { if (!IV.markConstant(C)) return; DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n'); - InstWorkList.push_back(V); + if (IV.isOverdefined()) + OverdefinedInstWorkList.push_back(V); + else + InstWorkList.push_back(V); } void markConstant(Value *V, Constant *C) { @@ -327,9 +330,13 @@ private: void markForcedConstant(Value *V, Constant *C) { assert(!V->getType()->isStructTy() && "Should use other method"); - ValueState[V].markForcedConstant(C); + LatticeVal &IV = ValueState[V]; + IV.markForcedConstant(C); DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n'); - InstWorkList.push_back(V); + if (IV.isOverdefined()) + OverdefinedInstWorkList.push_back(V); + else + InstWorkList.push_back(V); } @@ -1445,6 +1452,8 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // After a zero extend, we know the top part is zero. SExt doesn't have // to be handled here, because we don't know whether the top part is 1's // or 0's. + case Instruction::SIToFP: // some FP values are not possible, just use 0. + case Instruction::UIToFP: // some FP values are not possible, just use 0. markForcedConstant(I, Constant::getNullValue(ITy)); return true; case Instruction::Mul: diff --git a/lib/Transforms/Scalar/SCCVN.cpp b/lib/Transforms/Scalar/SCCVN.cpp deleted file mode 100644 index 9685a29..0000000 --- a/lib/Transforms/Scalar/SCCVN.cpp +++ /dev/null @@ -1,716 +0,0 @@ -//===- SCCVN.cpp - Eliminate redundant values -----------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass performs global value numbering to eliminate fully redundant -// instructions. This is based on the paper "SCC-based Value Numbering" -// by Cooper, et al. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "sccvn" -#include "llvm/Transforms/Scalar.h" -#include "llvm/BasicBlock.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/Operator.h" -#include "llvm/Value.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/SparseBitVector.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" -using namespace llvm; - -STATISTIC(NumSCCVNInstr, "Number of instructions deleted by SCCVN"); -STATISTIC(NumSCCVNPhi, "Number of phis deleted by SCCVN"); - -//===----------------------------------------------------------------------===// -// ValueTable Class -//===----------------------------------------------------------------------===// - -/// This class holds the mapping between values and value numbers. It is used -/// as an efficient mechanism to determine the expression-wise equivalence of -/// two values. -namespace { - struct Expression { - enum ExpressionOpcode { ADD, FADD, SUB, FSUB, MUL, FMUL, - UDIV, SDIV, FDIV, UREM, SREM, - FREM, SHL, LSHR, ASHR, AND, OR, XOR, ICMPEQ, - ICMPNE, ICMPUGT, ICMPUGE, ICMPULT, ICMPULE, - ICMPSGT, ICMPSGE, ICMPSLT, ICMPSLE, FCMPOEQ, - FCMPOGT, FCMPOGE, FCMPOLT, FCMPOLE, FCMPONE, - FCMPORD, FCMPUNO, FCMPUEQ, FCMPUGT, FCMPUGE, - FCMPULT, FCMPULE, FCMPUNE, EXTRACT, INSERT, - SHUFFLE, SELECT, TRUNC, ZEXT, SEXT, FPTOUI, - FPTOSI, UITOFP, SITOFP, FPTRUNC, FPEXT, - PTRTOINT, INTTOPTR, BITCAST, GEP, CALL, CONSTANT, - INSERTVALUE, EXTRACTVALUE, EMPTY, TOMBSTONE }; - - ExpressionOpcode opcode; - const Type* type; - SmallVector<uint32_t, 4> varargs; - - Expression() { } - Expression(ExpressionOpcode o) : opcode(o) { } - - bool operator==(const Expression &other) const { - if (opcode != other.opcode) - return false; - else if (opcode == EMPTY || opcode == TOMBSTONE) - return true; - else if (type != other.type) - return false; - else { - if (varargs.size() != other.varargs.size()) - return false; - - for (size_t i = 0; i < varargs.size(); ++i) - if (varargs[i] != other.varargs[i]) - return false; - - return true; - } - } - - bool operator!=(const Expression &other) const { - return !(*this == other); - } - }; - - class ValueTable { - private: - DenseMap<Value*, uint32_t> valueNumbering; - DenseMap<Expression, uint32_t> expressionNumbering; - DenseMap<Value*, uint32_t> constantsNumbering; - - uint32_t nextValueNumber; - - Expression::ExpressionOpcode getOpcode(BinaryOperator* BO); - Expression::ExpressionOpcode getOpcode(CmpInst* C); - Expression::ExpressionOpcode getOpcode(CastInst* C); - Expression create_expression(BinaryOperator* BO); - Expression create_expression(CmpInst* C); - Expression create_expression(ShuffleVectorInst* V); - Expression create_expression(ExtractElementInst* C); - Expression create_expression(InsertElementInst* V); - Expression create_expression(SelectInst* V); - Expression create_expression(CastInst* C); - Expression create_expression(GetElementPtrInst* G); - Expression create_expression(CallInst* C); - Expression create_expression(Constant* C); - Expression create_expression(ExtractValueInst* C); - Expression create_expression(InsertValueInst* C); - public: - ValueTable() : nextValueNumber(1) { } - uint32_t computeNumber(Value *V); - uint32_t lookup(Value *V); - void add(Value *V, uint32_t num); - void clear(); - void clearExpressions(); - void erase(Value *v); - unsigned size(); - void verifyRemoved(const Value *) const; - }; -} - -namespace llvm { -template <> struct DenseMapInfo<Expression> { - static inline Expression getEmptyKey() { - return Expression(Expression::EMPTY); - } - - static inline Expression getTombstoneKey() { - return Expression(Expression::TOMBSTONE); - } - - static unsigned getHashValue(const Expression e) { - unsigned hash = e.opcode; - - hash = ((unsigned)((uintptr_t)e.type >> 4) ^ - (unsigned)((uintptr_t)e.type >> 9)); - - for (SmallVector<uint32_t, 4>::const_iterator I = e.varargs.begin(), - E = e.varargs.end(); I != E; ++I) - hash = *I + hash * 37; - - return hash; - } - static bool isEqual(const Expression &LHS, const Expression &RHS) { - return LHS == RHS; - } -}; -template <> -struct isPodLike<Expression> { static const bool value = true; }; - -} - -//===----------------------------------------------------------------------===// -// ValueTable Internal Functions -//===----------------------------------------------------------------------===// -Expression::ExpressionOpcode ValueTable::getOpcode(BinaryOperator* BO) { - switch(BO->getOpcode()) { - default: // THIS SHOULD NEVER HAPPEN - llvm_unreachable("Binary operator with unknown opcode?"); - case Instruction::Add: return Expression::ADD; - case Instruction::FAdd: return Expression::FADD; - case Instruction::Sub: return Expression::SUB; - case Instruction::FSub: return Expression::FSUB; - case Instruction::Mul: return Expression::MUL; - case Instruction::FMul: return Expression::FMUL; - case Instruction::UDiv: return Expression::UDIV; - case Instruction::SDiv: return Expression::SDIV; - case Instruction::FDiv: return Expression::FDIV; - case Instruction::URem: return Expression::UREM; - case Instruction::SRem: return Expression::SREM; - case Instruction::FRem: return Expression::FREM; - case Instruction::Shl: return Expression::SHL; - case Instruction::LShr: return Expression::LSHR; - case Instruction::AShr: return Expression::ASHR; - case Instruction::And: return Expression::AND; - case Instruction::Or: return Expression::OR; - case Instruction::Xor: return Expression::XOR; - } -} - -Expression::ExpressionOpcode ValueTable::getOpcode(CmpInst* C) { - if (isa<ICmpInst>(C)) { - switch (C->getPredicate()) { - default: // THIS SHOULD NEVER HAPPEN - llvm_unreachable("Comparison with unknown predicate?"); - case ICmpInst::ICMP_EQ: return Expression::ICMPEQ; - case ICmpInst::ICMP_NE: return Expression::ICMPNE; - case ICmpInst::ICMP_UGT: return Expression::ICMPUGT; - case ICmpInst::ICMP_UGE: return Expression::ICMPUGE; - case ICmpInst::ICMP_ULT: return Expression::ICMPULT; - case ICmpInst::ICMP_ULE: return Expression::ICMPULE; - case ICmpInst::ICMP_SGT: return Expression::ICMPSGT; - case ICmpInst::ICMP_SGE: return Expression::ICMPSGE; - case ICmpInst::ICMP_SLT: return Expression::ICMPSLT; - case ICmpInst::ICMP_SLE: return Expression::ICMPSLE; - } - } else { - switch (C->getPredicate()) { - default: // THIS SHOULD NEVER HAPPEN - llvm_unreachable("Comparison with unknown predicate?"); - case FCmpInst::FCMP_OEQ: return Expression::FCMPOEQ; - case FCmpInst::FCMP_OGT: return Expression::FCMPOGT; - case FCmpInst::FCMP_OGE: return Expression::FCMPOGE; - case FCmpInst::FCMP_OLT: return Expression::FCMPOLT; - case FCmpInst::FCMP_OLE: return Expression::FCMPOLE; - case FCmpInst::FCMP_ONE: return Expression::FCMPONE; - case FCmpInst::FCMP_ORD: return Expression::FCMPORD; - case FCmpInst::FCMP_UNO: return Expression::FCMPUNO; - case FCmpInst::FCMP_UEQ: return Expression::FCMPUEQ; - case FCmpInst::FCMP_UGT: return Expression::FCMPUGT; - case FCmpInst::FCMP_UGE: return Expression::FCMPUGE; - case FCmpInst::FCMP_ULT: return Expression::FCMPULT; - case FCmpInst::FCMP_ULE: return Expression::FCMPULE; - case FCmpInst::FCMP_UNE: return Expression::FCMPUNE; - } - } -} - -Expression::ExpressionOpcode ValueTable::getOpcode(CastInst* C) { - switch(C->getOpcode()) { - default: // THIS SHOULD NEVER HAPPEN - llvm_unreachable("Cast operator with unknown opcode?"); - case Instruction::Trunc: return Expression::TRUNC; - case Instruction::ZExt: return Expression::ZEXT; - case Instruction::SExt: return Expression::SEXT; - case Instruction::FPToUI: return Expression::FPTOUI; - case Instruction::FPToSI: return Expression::FPTOSI; - case Instruction::UIToFP: return Expression::UITOFP; - case Instruction::SIToFP: return Expression::SITOFP; - case Instruction::FPTrunc: return Expression::FPTRUNC; - case Instruction::FPExt: return Expression::FPEXT; - case Instruction::PtrToInt: return Expression::PTRTOINT; - case Instruction::IntToPtr: return Expression::INTTOPTR; - case Instruction::BitCast: return Expression::BITCAST; - } -} - -Expression ValueTable::create_expression(CallInst* C) { - Expression e; - - e.type = C->getType(); - e.opcode = Expression::CALL; - - e.varargs.push_back(lookup(C->getCalledFunction())); - for (CallInst::op_iterator I = C->op_begin()+1, E = C->op_end(); - I != E; ++I) - e.varargs.push_back(lookup(*I)); - - return e; -} - -Expression ValueTable::create_expression(BinaryOperator* BO) { - Expression e; - e.varargs.push_back(lookup(BO->getOperand(0))); - e.varargs.push_back(lookup(BO->getOperand(1))); - e.type = BO->getType(); - e.opcode = getOpcode(BO); - - return e; -} - -Expression ValueTable::create_expression(CmpInst* C) { - Expression e; - - e.varargs.push_back(lookup(C->getOperand(0))); - e.varargs.push_back(lookup(C->getOperand(1))); - e.type = C->getType(); - e.opcode = getOpcode(C); - - return e; -} - -Expression ValueTable::create_expression(CastInst* C) { - Expression e; - - e.varargs.push_back(lookup(C->getOperand(0))); - e.type = C->getType(); - e.opcode = getOpcode(C); - - return e; -} - -Expression ValueTable::create_expression(ShuffleVectorInst* S) { - Expression e; - - e.varargs.push_back(lookup(S->getOperand(0))); - e.varargs.push_back(lookup(S->getOperand(1))); - e.varargs.push_back(lookup(S->getOperand(2))); - e.type = S->getType(); - e.opcode = Expression::SHUFFLE; - - return e; -} - -Expression ValueTable::create_expression(ExtractElementInst* E) { - Expression e; - - e.varargs.push_back(lookup(E->getOperand(0))); - e.varargs.push_back(lookup(E->getOperand(1))); - e.type = E->getType(); - e.opcode = Expression::EXTRACT; - - return e; -} - -Expression ValueTable::create_expression(InsertElementInst* I) { - Expression e; - - e.varargs.push_back(lookup(I->getOperand(0))); - e.varargs.push_back(lookup(I->getOperand(1))); - e.varargs.push_back(lookup(I->getOperand(2))); - e.type = I->getType(); - e.opcode = Expression::INSERT; - - return e; -} - -Expression ValueTable::create_expression(SelectInst* I) { - Expression e; - - e.varargs.push_back(lookup(I->getCondition())); - e.varargs.push_back(lookup(I->getTrueValue())); - e.varargs.push_back(lookup(I->getFalseValue())); - e.type = I->getType(); - e.opcode = Expression::SELECT; - - return e; -} - -Expression ValueTable::create_expression(GetElementPtrInst* G) { - Expression e; - - e.varargs.push_back(lookup(G->getPointerOperand())); - e.type = G->getType(); - e.opcode = Expression::GEP; - - for (GetElementPtrInst::op_iterator I = G->idx_begin(), E = G->idx_end(); - I != E; ++I) - e.varargs.push_back(lookup(*I)); - - return e; -} - -Expression ValueTable::create_expression(ExtractValueInst* E) { - Expression e; - - e.varargs.push_back(lookup(E->getAggregateOperand())); - for (ExtractValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); - II != IE; ++II) - e.varargs.push_back(*II); - e.type = E->getType(); - e.opcode = Expression::EXTRACTVALUE; - - return e; -} - -Expression ValueTable::create_expression(InsertValueInst* E) { - Expression e; - - e.varargs.push_back(lookup(E->getAggregateOperand())); - e.varargs.push_back(lookup(E->getInsertedValueOperand())); - for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); - II != IE; ++II) - e.varargs.push_back(*II); - e.type = E->getType(); - e.opcode = Expression::INSERTVALUE; - - return e; -} - -//===----------------------------------------------------------------------===// -// ValueTable External Functions -//===----------------------------------------------------------------------===// - -/// add - Insert a value into the table with a specified value number. -void ValueTable::add(Value *V, uint32_t num) { - valueNumbering[V] = num; -} - -/// computeNumber - Returns the value number for the specified value, assigning -/// it a new number if it did not have one before. -uint32_t ValueTable::computeNumber(Value *V) { - if (uint32_t v = valueNumbering[V]) - return v; - else if (uint32_t v= constantsNumbering[V]) - return v; - - if (!isa<Instruction>(V)) { - constantsNumbering[V] = nextValueNumber; - return nextValueNumber++; - } - - Instruction* I = cast<Instruction>(V); - Expression exp; - switch (I->getOpcode()) { - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or : - case Instruction::Xor: - exp = create_expression(cast<BinaryOperator>(I)); - break; - case Instruction::ICmp: - case Instruction::FCmp: - exp = create_expression(cast<CmpInst>(I)); - break; - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::UIToFP: - case Instruction::SIToFP: - case Instruction::FPTrunc: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::BitCast: - exp = create_expression(cast<CastInst>(I)); - break; - case Instruction::Select: - exp = create_expression(cast<SelectInst>(I)); - break; - case Instruction::ExtractElement: - exp = create_expression(cast<ExtractElementInst>(I)); - break; - case Instruction::InsertElement: - exp = create_expression(cast<InsertElementInst>(I)); - break; - case Instruction::ShuffleVector: - exp = create_expression(cast<ShuffleVectorInst>(I)); - break; - case Instruction::ExtractValue: - exp = create_expression(cast<ExtractValueInst>(I)); - break; - case Instruction::InsertValue: - exp = create_expression(cast<InsertValueInst>(I)); - break; - case Instruction::GetElementPtr: - exp = create_expression(cast<GetElementPtrInst>(I)); - break; - default: - valueNumbering[V] = nextValueNumber; - return nextValueNumber++; - } - - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; - valueNumbering[V] = e; - - return e; -} - -/// lookup - Returns the value number of the specified value. Returns 0 if -/// the value has not yet been numbered. -uint32_t ValueTable::lookup(Value *V) { - if (!isa<Instruction>(V)) { - if (!constantsNumbering.count(V)) - constantsNumbering[V] = nextValueNumber++; - return constantsNumbering[V]; - } - - return valueNumbering[V]; -} - -/// clear - Remove all entries from the ValueTable -void ValueTable::clear() { - valueNumbering.clear(); - expressionNumbering.clear(); - constantsNumbering.clear(); - nextValueNumber = 1; -} - -void ValueTable::clearExpressions() { - expressionNumbering.clear(); - constantsNumbering.clear(); - nextValueNumber = 1; -} - -/// erase - Remove a value from the value numbering -void ValueTable::erase(Value *V) { - valueNumbering.erase(V); -} - -/// verifyRemoved - Verify that the value is removed from all internal data -/// structures. -void ValueTable::verifyRemoved(const Value *V) const { - for (DenseMap<Value*, uint32_t>::const_iterator - I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) { - assert(I->first != V && "Inst still occurs in value numbering map!"); - } -} - -//===----------------------------------------------------------------------===// -// SCCVN Pass -//===----------------------------------------------------------------------===// - -namespace { - - struct ValueNumberScope { - ValueNumberScope* parent; - DenseMap<uint32_t, Value*> table; - SparseBitVector<128> availIn; - SparseBitVector<128> availOut; - - ValueNumberScope(ValueNumberScope* p) : parent(p) { } - }; - - class SCCVN : public FunctionPass { - bool runOnFunction(Function &F); - public: - static char ID; // Pass identification, replacement for typeid - SCCVN() : FunctionPass(&ID) { } - - private: - ValueTable VT; - DenseMap<BasicBlock*, ValueNumberScope*> BBMap; - - // This transformation requires dominator postdominator info - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTree>(); - - AU.addPreserved<DominatorTree>(); - AU.setPreservesCFG(); - } - }; - - char SCCVN::ID = 0; -} - -// createSCCVNPass - The public interface to this file... -FunctionPass *llvm::createSCCVNPass() { return new SCCVN(); } - -static RegisterPass<SCCVN> X("sccvn", - "SCC Value Numbering"); - -static Value *lookupNumber(ValueNumberScope *Locals, uint32_t num) { - while (Locals) { - DenseMap<uint32_t, Value*>::iterator I = Locals->table.find(num); - if (I != Locals->table.end()) - return I->second; - Locals = Locals->parent; - } - - return 0; -} - -bool SCCVN::runOnFunction(Function& F) { - // Implement the RPO version of the SCCVN algorithm. Conceptually, - // we optimisitically assume that all instructions with the same opcode have - // the same VN. Then we deepen our comparison by one level, to all - // instructions whose operands have the same opcodes get the same VN. We - // iterate this process until the partitioning stops changing, at which - // point we have computed a full numbering. - ReversePostOrderTraversal<Function*> RPOT(&F); - bool done = false; - while (!done) { - done = true; - VT.clearExpressions(); - for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(), - E = RPOT.end(); I != E; ++I) { - BasicBlock* BB = *I; - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); - BI != BE; ++BI) { - uint32_t origVN = VT.lookup(BI); - uint32_t newVN = VT.computeNumber(BI); - if (origVN != newVN) - done = false; - } - } - } - - // Now, do a dominator walk, eliminating simple, dominated redundancies as we - // go. Also, build the ValueNumberScope structure that will be used for - // computing full availability. - DominatorTree& DT = getAnalysis<DominatorTree>(); - bool changed = false; - for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()), - DE = df_end(DT.getRootNode()); DI != DE; ++DI) { - BasicBlock* BB = DI->getBlock(); - if (DI->getIDom()) - BBMap[BB] = new ValueNumberScope(BBMap[DI->getIDom()->getBlock()]); - else - BBMap[BB] = new ValueNumberScope(0); - - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - uint32_t num = VT.lookup(I); - Value* repl = lookupNumber(BBMap[BB], num); - - if (repl) { - if (isa<PHINode>(I)) - ++NumSCCVNPhi; - else - ++NumSCCVNInstr; - I->replaceAllUsesWith(repl); - Instruction* OldInst = I; - ++I; - BBMap[BB]->table[num] = repl; - OldInst->eraseFromParent(); - changed = true; - } else { - BBMap[BB]->table[num] = I; - BBMap[BB]->availOut.set(num); - - ++I; - } - } - } - - // Perform a forward data-flow to compute availability at all points on - // the CFG. - do { - changed = false; - for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(), - E = RPOT.end(); I != E; ++I) { - BasicBlock* BB = *I; - ValueNumberScope *VNS = BBMap[BB]; - - SparseBitVector<128> preds; - bool first = true; - for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); - PI != PE; ++PI) { - if (first) { - preds = BBMap[*PI]->availOut; - first = false; - } else { - preds &= BBMap[*PI]->availOut; - } - } - - changed |= (VNS->availIn |= preds); - changed |= (VNS->availOut |= preds); - } - } while (changed); - - // Use full availability information to perform non-dominated replacements. - SSAUpdater SSU; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { - if (!BBMap.count(FI)) continue; - for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); - BI != BE; ) { - uint32_t num = VT.lookup(BI); - if (!BBMap[FI]->availIn.test(num)) { - ++BI; - continue; - } - - SSU.Initialize(BI); - - SmallPtrSet<BasicBlock*, 8> visited; - SmallVector<BasicBlock*, 8> stack; - visited.insert(FI); - for (pred_iterator PI = pred_begin(FI), PE = pred_end(FI); - PI != PE; ++PI) - if (!visited.count(*PI)) - stack.push_back(*PI); - - while (!stack.empty()) { - BasicBlock* CurrBB = stack.pop_back_val(); - visited.insert(CurrBB); - - ValueNumberScope* S = BBMap[CurrBB]; - if (S->table.count(num)) { - SSU.AddAvailableValue(CurrBB, S->table[num]); - } else { - for (pred_iterator PI = pred_begin(CurrBB), PE = pred_end(CurrBB); - PI != PE; ++PI) - if (!visited.count(*PI)) - stack.push_back(*PI); - } - } - - Value* repl = SSU.GetValueInMiddleOfBlock(FI); - BI->replaceAllUsesWith(repl); - Instruction* CurInst = BI; - ++BI; - BBMap[FI]->table[num] = repl; - if (isa<PHINode>(CurInst)) - ++NumSCCVNPhi; - else - ++NumSCCVNInstr; - - CurInst->eraseFromParent(); - } - } - - VT.clear(); - for (DenseMap<BasicBlock*, ValueNumberScope*>::iterator - I = BBMap.begin(), E = BBMap.end(); I != E; ++I) - delete I->second; - BBMap.clear(); - - return changed; -} diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 6211beb..5ca9ce3 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -130,14 +130,7 @@ namespace { void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts); - bool CanConvertToScalar(Value *V, bool &IsNotTrivial, const Type *&VecTy, - bool &SawVec, uint64_t Offset, unsigned AllocaSize); - void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset); - Value *ConvertScalar_ExtractValue(Value *NV, const Type *ToType, - uint64_t Offset, IRBuilder<> &Builder); - Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal, - uint64_t Offset, IRBuilder<> &Builder); - static Instruction *isOnlyCopiedFromConstantGlobal(AllocaInst *AI); + static MemTransferInst *isOnlyCopiedFromConstantGlobal(AllocaInst *AI); }; } @@ -150,6 +143,596 @@ FunctionPass *llvm::createScalarReplAggregatesPass(signed int Threshold) { } +//===----------------------------------------------------------------------===// +// Convert To Scalar Optimization. +//===----------------------------------------------------------------------===// + +namespace { +/// ConvertToScalarInfo - This class implements the "Convert To Scalar" +/// optimization, which scans the uses of an alloca and determines if it can +/// rewrite it in terms of a single new alloca that can be mem2reg'd. +class ConvertToScalarInfo { + /// AllocaSize - The size of the alloca being considered. + unsigned AllocaSize; + const TargetData &TD; + + /// IsNotTrivial - This is set to true if there is some access to the object + /// which means that mem2reg can't promote it. + bool IsNotTrivial; + + /// VectorTy - This tracks the type that we should promote the vector to if + /// it is possible to turn it into a vector. This starts out null, and if it + /// isn't possible to turn into a vector type, it gets set to VoidTy. + const Type *VectorTy; + + /// HadAVector - True if there is at least one vector access to the alloca. + /// We don't want to turn random arrays into vectors and use vector element + /// insert/extract, but if there are element accesses to something that is + /// also declared as a vector, we do want to promote to a vector. + bool HadAVector; + +public: + explicit ConvertToScalarInfo(unsigned Size, const TargetData &td) + : AllocaSize(Size), TD(td) { + IsNotTrivial = false; + VectorTy = 0; + HadAVector = false; + } + + AllocaInst *TryConvert(AllocaInst *AI); + +private: + bool CanConvertToScalar(Value *V, uint64_t Offset); + void MergeInType(const Type *In, uint64_t Offset); + void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset); + + Value *ConvertScalar_ExtractValue(Value *NV, const Type *ToType, + uint64_t Offset, IRBuilder<> &Builder); + Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal, + uint64_t Offset, IRBuilder<> &Builder); +}; +} // end anonymous namespace. + +/// TryConvert - Analyze the specified alloca, and if it is safe to do so, +/// rewrite it to be a new alloca which is mem2reg'able. This returns the new +/// alloca if possible or null if not. +AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { + // If we can't convert this scalar, or if mem2reg can trivially do it, bail + // out. + if (!CanConvertToScalar(AI, 0) || !IsNotTrivial) + return 0; + + // If we were able to find a vector type that can handle this with + // insert/extract elements, and if there was at least one use that had + // a vector type, promote this to a vector. We don't want to promote + // random stuff that doesn't use vectors (e.g. <9 x double>) because then + // we just get a lot of insert/extracts. If at least one vector is + // involved, then we probably really do have a union of vector/array. + const Type *NewTy; + if (VectorTy && VectorTy->isVectorTy() && HadAVector) { + DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = " + << *VectorTy << '\n'); + NewTy = VectorTy; // Use the vector type. + } else { + DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n"); + // Create and insert the integer alloca. + NewTy = IntegerType::get(AI->getContext(), AllocaSize*8); + } + AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin()); + ConvertUsesToScalar(AI, NewAI, 0); + return NewAI; +} + +/// MergeInType - Add the 'In' type to the accumulated vector type (VectorTy) +/// so far at the offset specified by Offset (which is specified in bytes). +/// +/// There are two cases we handle here: +/// 1) A union of vector types of the same size and potentially its elements. +/// Here we turn element accesses into insert/extract element operations. +/// This promotes a <4 x float> with a store of float to the third element +/// into a <4 x float> that uses insert element. +/// 2) A fully general blob of memory, which we turn into some (potentially +/// large) integer type with extract and insert operations where the loads +/// and stores would mutate the memory. We mark this by setting VectorTy +/// to VoidTy. +void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset) { + // If we already decided to turn this into a blob of integer memory, there is + // nothing to be done. + if (VectorTy && VectorTy->isVoidTy()) + return; + + // If this could be contributing to a vector, analyze it. + + // If the In type is a vector that is the same size as the alloca, see if it + // matches the existing VecTy. + if (const VectorType *VInTy = dyn_cast<VectorType>(In)) { + // Remember if we saw a vector type. + HadAVector = true; + + if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) { + // If we're storing/loading a vector of the right size, allow it as a + // vector. If this the first vector we see, remember the type so that + // we know the element size. If this is a subsequent access, ignore it + // even if it is a differing type but the same size. Worst case we can + // bitcast the resultant vectors. + if (VectorTy == 0) + VectorTy = VInTy; + return; + } + } else if (In->isFloatTy() || In->isDoubleTy() || + (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 && + isPowerOf2_32(In->getPrimitiveSizeInBits()))) { + // If we're accessing something that could be an element of a vector, see + // if the implied vector agrees with what we already have and if Offset is + // compatible with it. + unsigned EltSize = In->getPrimitiveSizeInBits()/8; + if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 && + (VectorTy == 0 || + cast<VectorType>(VectorTy)->getElementType() + ->getPrimitiveSizeInBits()/8 == EltSize)) { + if (VectorTy == 0) + VectorTy = VectorType::get(In, AllocaSize/EltSize); + return; + } + } + + // Otherwise, we have a case that we can't handle with an optimized vector + // form. We can still turn this into a large integer. + VectorTy = Type::getVoidTy(In->getContext()); +} + +/// CanConvertToScalar - V is a pointer. If we can convert the pointee and all +/// its accesses to a single vector type, return true and set VecTy to +/// the new type. If we could convert the alloca into a single promotable +/// integer, return true but set VecTy to VoidTy. Further, if the use is not a +/// completely trivial use that mem2reg could promote, set IsNotTrivial. Offset +/// is the current offset from the base of the alloca being analyzed. +/// +/// If we see at least one access to the value that is as a vector type, set the +/// SawVec flag. +bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { + for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { + Instruction *User = cast<Instruction>(*UI); + + if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + // Don't break volatile loads. + if (LI->isVolatile()) + return false; + MergeInType(LI->getType(), Offset); + continue; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + // Storing the pointer, not into the value? + if (SI->getOperand(0) == V || SI->isVolatile()) return false; + MergeInType(SI->getOperand(0)->getType(), Offset); + continue; + } + + if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { + IsNotTrivial = true; // Can't be mem2reg'd. + if (!CanConvertToScalar(BCI, Offset)) + return false; + continue; + } + + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { + // If this is a GEP with a variable indices, we can't handle it. + if (!GEP->hasAllConstantIndices()) + return false; + + // Compute the offset that this GEP adds to the pointer. + SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); + uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), + &Indices[0], Indices.size()); + // See if all uses can be converted. + if (!CanConvertToScalar(GEP, Offset+GEPOffset)) + return false; + IsNotTrivial = true; // Can't be mem2reg'd. + continue; + } + + // If this is a constant sized memset of a constant value (e.g. 0) we can + // handle it. + if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { + // Store of constant value and constant size. + if (!isa<ConstantInt>(MSI->getValue()) || + !isa<ConstantInt>(MSI->getLength())) + return false; + IsNotTrivial = true; // Can't be mem2reg'd. + continue; + } + + // If this is a memcpy or memmove into or out of the whole allocation, we + // can handle it like a load or store of the scalar type. + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { + ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength()); + if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0) + return false; + + IsNotTrivial = true; // Can't be mem2reg'd. + continue; + } + + // Otherwise, we cannot handle this! + return false; + } + + return true; +} + +/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca +/// directly. This happens when we are converting an "integer union" to a +/// single integer scalar, or when we are converting a "vector union" to a +/// vector with insert/extractelement instructions. +/// +/// Offset is an offset from the original alloca, in bits that need to be +/// shifted to the right. By the end of this, there should be no uses of Ptr. +void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, + uint64_t Offset) { + while (!Ptr->use_empty()) { + Instruction *User = cast<Instruction>(Ptr->use_back()); + + if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) { + ConvertUsesToScalar(CI, NewAI, Offset); + CI->eraseFromParent(); + continue; + } + + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { + // Compute the offset that this GEP adds to the pointer. + SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); + uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), + &Indices[0], Indices.size()); + ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8); + GEP->eraseFromParent(); + continue; + } + + IRBuilder<> Builder(User->getParent(), User); + + if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + // The load is a bit extract from NewAI shifted right by Offset bits. + Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp"); + Value *NewLoadVal + = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder); + LI->replaceAllUsesWith(NewLoadVal); + LI->eraseFromParent(); + continue; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + assert(SI->getOperand(0) != Ptr && "Consistency error!"); + Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); + Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset, + Builder); + Builder.CreateStore(New, NewAI); + SI->eraseFromParent(); + + // If the load we just inserted is now dead, then the inserted store + // overwrote the entire thing. + if (Old->use_empty()) + Old->eraseFromParent(); + continue; + } + + // If this is a constant sized memset of a constant value (e.g. 0) we can + // transform it into a store of the expanded constant value. + if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { + assert(MSI->getRawDest() == Ptr && "Consistency error!"); + unsigned NumBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue(); + if (NumBytes != 0) { + unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue(); + + // Compute the value replicated the right number of times. + APInt APVal(NumBytes*8, Val); + + // Splat the value if non-zero. + if (Val) + for (unsigned i = 1; i != NumBytes; ++i) + APVal |= APVal << 8; + + Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); + Value *New = ConvertScalar_InsertValue( + ConstantInt::get(User->getContext(), APVal), + Old, Offset, Builder); + Builder.CreateStore(New, NewAI); + + // If the load we just inserted is now dead, then the memset overwrote + // the entire thing. + if (Old->use_empty()) + Old->eraseFromParent(); + } + MSI->eraseFromParent(); + continue; + } + + // If this is a memcpy or memmove into or out of the whole allocation, we + // can handle it like a load or store of the scalar type. + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { + assert(Offset == 0 && "must be store to start of alloca"); + + // If the source and destination are both to the same alloca, then this is + // a noop copy-to-self, just delete it. Otherwise, emit a load and store + // as appropriate. + AllocaInst *OrigAI = cast<AllocaInst>(Ptr->getUnderlyingObject(0)); + + if (MTI->getSource()->getUnderlyingObject(0) != OrigAI) { + // Dest must be OrigAI, change this to be a load from the original + // pointer (bitcasted), then a store to our new alloca. + assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?"); + Value *SrcPtr = MTI->getSource(); + SrcPtr = Builder.CreateBitCast(SrcPtr, NewAI->getType()); + + LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval"); + SrcVal->setAlignment(MTI->getAlignment()); + Builder.CreateStore(SrcVal, NewAI); + } else if (MTI->getDest()->getUnderlyingObject(0) != OrigAI) { + // Src must be OrigAI, change this to be a load from NewAI then a store + // through the original dest pointer (bitcasted). + assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?"); + LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval"); + + Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), NewAI->getType()); + StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr); + NewStore->setAlignment(MTI->getAlignment()); + } else { + // Noop transfer. Src == Dst + } + + MTI->eraseFromParent(); + continue; + } + + llvm_unreachable("Unsupported operation!"); + } +} + +/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer +/// or vector value FromVal, extracting the bits from the offset specified by +/// Offset. This returns the value, which is of type ToType. +/// +/// This happens when we are converting an "integer union" to a single +/// integer scalar, or when we are converting a "vector union" to a vector with +/// insert/extractelement instructions. +/// +/// Offset is an offset from the original alloca, in bits that need to be +/// shifted to the right. +Value *ConvertToScalarInfo:: +ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, + uint64_t Offset, IRBuilder<> &Builder) { + // If the load is of the whole new alloca, no conversion is needed. + if (FromVal->getType() == ToType && Offset == 0) + return FromVal; + + // If the result alloca is a vector type, this is either an element + // access or a bitcast to another vector type of the same size. + if (const VectorType *VTy = dyn_cast<VectorType>(FromVal->getType())) { + if (ToType->isVectorTy()) + return Builder.CreateBitCast(FromVal, ToType, "tmp"); + + // Otherwise it must be an element access. + unsigned Elt = 0; + if (Offset) { + unsigned EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType()); + Elt = Offset/EltSize; + assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); + } + // Return the element extracted out of it. + Value *V = Builder.CreateExtractElement(FromVal, ConstantInt::get( + Type::getInt32Ty(FromVal->getContext()), Elt), "tmp"); + if (V->getType() != ToType) + V = Builder.CreateBitCast(V, ToType, "tmp"); + return V; + } + + // If ToType is a first class aggregate, extract out each of the pieces and + // use insertvalue's to form the FCA. + if (const StructType *ST = dyn_cast<StructType>(ToType)) { + const StructLayout &Layout = *TD.getStructLayout(ST); + Value *Res = UndefValue::get(ST); + for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { + Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), + Offset+Layout.getElementOffsetInBits(i), + Builder); + Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); + } + return Res; + } + + if (const ArrayType *AT = dyn_cast<ArrayType>(ToType)) { + uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); + Value *Res = UndefValue::get(AT); + for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { + Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), + Offset+i*EltSize, Builder); + Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); + } + return Res; + } + + // Otherwise, this must be a union that was converted to an integer value. + const IntegerType *NTy = cast<IntegerType>(FromVal->getType()); + + // If this is a big-endian system and the load is narrower than the + // full alloca type, we need to do a shift to get the right bits. + int ShAmt = 0; + if (TD.isBigEndian()) { + // On big-endian machines, the lowest bit is stored at the bit offset + // from the pointer given by getTypeStoreSizeInBits. This matters for + // integers with a bitwidth that is not a multiple of 8. + ShAmt = TD.getTypeStoreSizeInBits(NTy) - + TD.getTypeStoreSizeInBits(ToType) - Offset; + } else { + ShAmt = Offset; + } + + // Note: we support negative bitwidths (with shl) which are not defined. + // We do this to support (f.e.) loads off the end of a structure where + // only some bits are used. + if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth()) + FromVal = Builder.CreateLShr(FromVal, + ConstantInt::get(FromVal->getType(), + ShAmt), "tmp"); + else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth()) + FromVal = Builder.CreateShl(FromVal, + ConstantInt::get(FromVal->getType(), + -ShAmt), "tmp"); + + // Finally, unconditionally truncate the integer to the right width. + unsigned LIBitWidth = TD.getTypeSizeInBits(ToType); + if (LIBitWidth < NTy->getBitWidth()) + FromVal = + Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), + LIBitWidth), "tmp"); + else if (LIBitWidth > NTy->getBitWidth()) + FromVal = + Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(), + LIBitWidth), "tmp"); + + // If the result is an integer, this is a trunc or bitcast. + if (ToType->isIntegerTy()) { + // Should be done. + } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) { + // Just do a bitcast, we know the sizes match up. + FromVal = Builder.CreateBitCast(FromVal, ToType, "tmp"); + } else { + // Otherwise must be a pointer. + FromVal = Builder.CreateIntToPtr(FromVal, ToType, "tmp"); + } + assert(FromVal->getType() == ToType && "Didn't convert right?"); + return FromVal; +} + +/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer +/// or vector value "Old" at the offset specified by Offset. +/// +/// This happens when we are converting an "integer union" to a +/// single integer scalar, or when we are converting a "vector union" to a +/// vector with insert/extractelement instructions. +/// +/// Offset is an offset from the original alloca, in bits that need to be +/// shifted to the right. +Value *ConvertToScalarInfo:: +ConvertScalar_InsertValue(Value *SV, Value *Old, + uint64_t Offset, IRBuilder<> &Builder) { + // Convert the stored type to the actual type, shift it left to insert + // then 'or' into place. + const Type *AllocaType = Old->getType(); + LLVMContext &Context = Old->getContext(); + + if (const VectorType *VTy = dyn_cast<VectorType>(AllocaType)) { + uint64_t VecSize = TD.getTypeAllocSizeInBits(VTy); + uint64_t ValSize = TD.getTypeAllocSizeInBits(SV->getType()); + + // Changing the whole vector with memset or with an access of a different + // vector type? + if (ValSize == VecSize) + return Builder.CreateBitCast(SV, AllocaType, "tmp"); + + uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType()); + + // Must be an element insertion. + unsigned Elt = Offset/EltSize; + + if (SV->getType() != VTy->getElementType()) + SV = Builder.CreateBitCast(SV, VTy->getElementType(), "tmp"); + + SV = Builder.CreateInsertElement(Old, SV, + ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt), + "tmp"); + return SV; + } + + // If SV is a first-class aggregate value, insert each value recursively. + if (const StructType *ST = dyn_cast<StructType>(SV->getType())) { + const StructLayout &Layout = *TD.getStructLayout(ST); + for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { + Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); + Old = ConvertScalar_InsertValue(Elt, Old, + Offset+Layout.getElementOffsetInBits(i), + Builder); + } + return Old; + } + + if (const ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { + uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); + for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { + Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); + Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder); + } + return Old; + } + + // If SV is a float, convert it to the appropriate integer type. + // If it is a pointer, do the same. + unsigned SrcWidth = TD.getTypeSizeInBits(SV->getType()); + unsigned DestWidth = TD.getTypeSizeInBits(AllocaType); + unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType()); + unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType); + if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) + SV = Builder.CreateBitCast(SV, + IntegerType::get(SV->getContext(),SrcWidth), "tmp"); + else if (SV->getType()->isPointerTy()) + SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()), "tmp"); + + // Zero extend or truncate the value if needed. + if (SV->getType() != AllocaType) { + if (SV->getType()->getPrimitiveSizeInBits() < + AllocaType->getPrimitiveSizeInBits()) + SV = Builder.CreateZExt(SV, AllocaType, "tmp"); + else { + // Truncation may be needed if storing more than the alloca can hold + // (undefined behavior). + SV = Builder.CreateTrunc(SV, AllocaType, "tmp"); + SrcWidth = DestWidth; + SrcStoreWidth = DestStoreWidth; + } + } + + // If this is a big-endian system and the store is narrower than the + // full alloca type, we need to do a shift to get the right bits. + int ShAmt = 0; + if (TD.isBigEndian()) { + // On big-endian machines, the lowest bit is stored at the bit offset + // from the pointer given by getTypeStoreSizeInBits. This matters for + // integers with a bitwidth that is not a multiple of 8. + ShAmt = DestStoreWidth - SrcStoreWidth - Offset; + } else { + ShAmt = Offset; + } + + // Note: we support negative bitwidths (with shr) which are not defined. + // We do this to support (f.e.) stores off the end of a structure where + // only some bits in the structure are set. + APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth)); + if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) { + SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), + ShAmt), "tmp"); + Mask <<= ShAmt; + } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) { + SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), + -ShAmt), "tmp"); + Mask = Mask.lshr(-ShAmt); + } + + // Mask out the bits we are about to insert from the old value, and or + // in the new bits. + if (SrcWidth != DestWidth) { + assert(DestWidth > SrcWidth); + Old = Builder.CreateAnd(Old, ConstantInt::get(Context, ~Mask), "mask"); + SV = Builder.CreateOr(Old, SV, "ins"); + } + return SV; +} + + +//===----------------------------------------------------------------------===// +// SRoA Driver +//===----------------------------------------------------------------------===// + + bool SROA::runOnFunction(Function &F) { TD = getAnalysisIfAvailable<TargetData>(); @@ -202,6 +785,7 @@ bool SROA::performPromotion(Function &F) { return Changed; } + /// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for /// SROA. It must be a struct or array type with a small number of elements. static bool ShouldAttemptScalarRepl(AllocaInst *AI) { @@ -216,6 +800,7 @@ static bool ShouldAttemptScalarRepl(AllocaInst *AI) { return false; } + // performScalarRepl - This algorithm is a simple worklist driven algorithm, // which runs on all of the malloc/alloca instructions in the function, removing // them if they are only used by getelementptr instructions. @@ -223,7 +808,7 @@ static bool ShouldAttemptScalarRepl(AllocaInst *AI) { bool SROA::performScalarRepl(Function &F) { std::vector<AllocaInst*> WorkList; - // Scan the entry basic block, adding any alloca's and mallocs to the worklist + // Scan the entry basic block, adding allocas to the worklist. BasicBlock &BB = F.getEntryBlock(); for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) if (AllocaInst *A = dyn_cast<AllocaInst>(I)) @@ -239,6 +824,7 @@ bool SROA::performScalarRepl(Function &F) { // with unused elements. if (AI->use_empty()) { AI->eraseFromParent(); + Changed = true; continue; } @@ -251,10 +837,10 @@ bool SROA::performScalarRepl(Function &F) { // the constant global instead. This is commonly produced by the CFE by // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' // is only subsequently read. - if (Instruction *TheCopy = isOnlyCopiedFromConstantGlobal(AI)) { + if (MemTransferInst *TheCopy = isOnlyCopiedFromConstantGlobal(AI)) { DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n'); DEBUG(dbgs() << " memcpy = " << *TheCopy << '\n'); - Constant *TheSrc = cast<Constant>(TheCopy->getOperand(2)); + Constant *TheSrc = cast<Constant>(TheCopy->getSource()); AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType())); TheCopy->eraseFromParent(); // Don't mutate the global. AI->eraseFromParent(); @@ -271,7 +857,10 @@ bool SROA::performScalarRepl(Function &F) { // Do not promote [0 x %struct]. if (AllocaSize == 0) continue; - + + // Do not promote any struct whose size is too big. + if (AllocaSize > SRThreshold) continue; + // If the alloca looks like a good candidate for scalar replacement, and if // all its users can be transformed, then split up the aggregate into its // separate elements. @@ -281,48 +870,20 @@ bool SROA::performScalarRepl(Function &F) { continue; } - // Do not promote any struct whose size is too big. - if (AllocaSize > SRThreshold) continue; - // If we can turn this aggregate value (potentially with casts) into a // simple scalar value that can be mem2reg'd into a register value. // IsNotTrivial tracks whether this is something that mem2reg could have // promoted itself. If so, we don't want to transform it needlessly. Note // that we can't just check based on the type: the alloca may be of an i32 // but that has pointer arithmetic to set byte 3 of it or something. - bool IsNotTrivial = false; - const Type *VectorTy = 0; - bool HadAVector = false; - if (CanConvertToScalar(AI, IsNotTrivial, VectorTy, HadAVector, - 0, unsigned(AllocaSize)) && IsNotTrivial) { - AllocaInst *NewAI; - // If we were able to find a vector type that can handle this with - // insert/extract elements, and if there was at least one use that had - // a vector type, promote this to a vector. We don't want to promote - // random stuff that doesn't use vectors (e.g. <9 x double>) because then - // we just get a lot of insert/extracts. If at least one vector is - // involved, then we probably really do have a union of vector/array. - if (VectorTy && VectorTy->isVectorTy() && HadAVector) { - DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = " - << *VectorTy << '\n'); - - // Create and insert the vector alloca. - NewAI = new AllocaInst(VectorTy, 0, "", AI->getParent()->begin()); - ConvertUsesToScalar(AI, NewAI, 0); - } else { - DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n"); - - // Create and insert the integer alloca. - const Type *NewTy = IntegerType::get(AI->getContext(), AllocaSize*8); - NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin()); - ConvertUsesToScalar(AI, NewAI, 0); - } + if (AllocaInst *NewAI = + ConvertToScalarInfo((unsigned)AllocaSize, *TD).TryConvert(AI)) { NewAI->takeName(AI); AI->eraseFromParent(); ++NumConverted; Changed = true; continue; - } + } // Otherwise, couldn't process this alloca. } @@ -698,7 +1259,6 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // that doesn't have anything to do with the alloca that we are promoting. For // memset, this Value* stays null. Value *OtherPtr = 0; - LLVMContext &Context = MI->getContext(); unsigned MemAlignment = MI->getAlignment(); if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcopy if (Inst == MTI->getRawDest()) @@ -756,7 +1316,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, } // Process each element of the aggregate. - Value *TheFn = MI->getOperand(0); + Value *TheFn = MI->getCalledValue(); const Type *BytePtrTy = MI->getRawDest()->getType(); bool SROADest = MI->getRawDest() == Inst; @@ -775,12 +1335,11 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, MI); uint64_t EltOffset; const PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType()); - if (const StructType *ST = - dyn_cast<StructType>(OtherPtrTy->getElementType())) { + const Type *OtherTy = OtherPtrTy->getElementType(); + if (const StructType *ST = dyn_cast<StructType>(OtherTy)) { EltOffset = TD->getStructLayout(ST)->getElementOffset(i); } else { - const Type *EltTy = - cast<SequentialType>(OtherPtr->getType())->getElementType(); + const Type *EltTy = cast<SequentialType>(OtherTy)->getElementType(); EltOffset = TD->getTypeAllocSize(EltTy)*i; } @@ -832,7 +1391,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, } // Convert the integer value to the appropriate type. - StoreVal = ConstantInt::get(Context, TotalVal); + StoreVal = ConstantInt::get(CI->getContext(), TotalVal); if (ValTy->isPointerTy()) StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy); else if (ValTy->isFloatingPointTy()) @@ -1174,509 +1733,6 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { return true; } -/// MergeInType - Add the 'In' type to the accumulated type (Accum) so far at -/// the offset specified by Offset (which is specified in bytes). -/// -/// There are two cases we handle here: -/// 1) A union of vector types of the same size and potentially its elements. -/// Here we turn element accesses into insert/extract element operations. -/// This promotes a <4 x float> with a store of float to the third element -/// into a <4 x float> that uses insert element. -/// 2) A fully general blob of memory, which we turn into some (potentially -/// large) integer type with extract and insert operations where the loads -/// and stores would mutate the memory. -static void MergeInType(const Type *In, uint64_t Offset, const Type *&VecTy, - unsigned AllocaSize, const TargetData &TD, - LLVMContext &Context) { - // If this could be contributing to a vector, analyze it. - if (VecTy != Type::getVoidTy(Context)) { // either null or a vector type. - - // If the In type is a vector that is the same size as the alloca, see if it - // matches the existing VecTy. - if (const VectorType *VInTy = dyn_cast<VectorType>(In)) { - if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) { - // If we're storing/loading a vector of the right size, allow it as a - // vector. If this the first vector we see, remember the type so that - // we know the element size. - if (VecTy == 0) - VecTy = VInTy; - return; - } - } else if (In->isFloatTy() || In->isDoubleTy() || - (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 && - isPowerOf2_32(In->getPrimitiveSizeInBits()))) { - // If we're accessing something that could be an element of a vector, see - // if the implied vector agrees with what we already have and if Offset is - // compatible with it. - unsigned EltSize = In->getPrimitiveSizeInBits()/8; - if (Offset % EltSize == 0 && - AllocaSize % EltSize == 0 && - (VecTy == 0 || - cast<VectorType>(VecTy)->getElementType() - ->getPrimitiveSizeInBits()/8 == EltSize)) { - if (VecTy == 0) - VecTy = VectorType::get(In, AllocaSize/EltSize); - return; - } - } - } - - // Otherwise, we have a case that we can't handle with an optimized vector - // form. We can still turn this into a large integer. - VecTy = Type::getVoidTy(Context); -} - -/// CanConvertToScalar - V is a pointer. If we can convert the pointee and all -/// its accesses to a single vector type, return true and set VecTy to -/// the new type. If we could convert the alloca into a single promotable -/// integer, return true but set VecTy to VoidTy. Further, if the use is not a -/// completely trivial use that mem2reg could promote, set IsNotTrivial. Offset -/// is the current offset from the base of the alloca being analyzed. -/// -/// If we see at least one access to the value that is as a vector type, set the -/// SawVec flag. -bool SROA::CanConvertToScalar(Value *V, bool &IsNotTrivial, const Type *&VecTy, - bool &SawVec, uint64_t Offset, - unsigned AllocaSize) { - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { - Instruction *User = cast<Instruction>(*UI); - - if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - // Don't break volatile loads. - if (LI->isVolatile()) - return false; - MergeInType(LI->getType(), Offset, VecTy, - AllocaSize, *TD, V->getContext()); - SawVec |= LI->getType()->isVectorTy(); - continue; - } - - if (StoreInst *SI = dyn_cast<StoreInst>(User)) { - // Storing the pointer, not into the value? - if (SI->getOperand(0) == V || SI->isVolatile()) return 0; - MergeInType(SI->getOperand(0)->getType(), Offset, - VecTy, AllocaSize, *TD, V->getContext()); - SawVec |= SI->getOperand(0)->getType()->isVectorTy(); - continue; - } - - if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { - if (!CanConvertToScalar(BCI, IsNotTrivial, VecTy, SawVec, Offset, - AllocaSize)) - return false; - IsNotTrivial = true; - continue; - } - - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { - // If this is a GEP with a variable indices, we can't handle it. - if (!GEP->hasAllConstantIndices()) - return false; - - // Compute the offset that this GEP adds to the pointer. - SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - uint64_t GEPOffset = TD->getIndexedOffset(GEP->getPointerOperandType(), - &Indices[0], Indices.size()); - // See if all uses can be converted. - if (!CanConvertToScalar(GEP, IsNotTrivial, VecTy, SawVec,Offset+GEPOffset, - AllocaSize)) - return false; - IsNotTrivial = true; - continue; - } - - // If this is a constant sized memset of a constant value (e.g. 0) we can - // handle it. - if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { - // Store of constant value and constant size. - if (isa<ConstantInt>(MSI->getValue()) && - isa<ConstantInt>(MSI->getLength())) { - IsNotTrivial = true; - continue; - } - } - - // If this is a memcpy or memmove into or out of the whole allocation, we - // can handle it like a load or store of the scalar type. - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { - if (ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength())) - if (Len->getZExtValue() == AllocaSize && Offset == 0) { - IsNotTrivial = true; - continue; - } - } - - // Otherwise, we cannot handle this! - return false; - } - - return true; -} - -/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca -/// directly. This happens when we are converting an "integer union" to a -/// single integer scalar, or when we are converting a "vector union" to a -/// vector with insert/extractelement instructions. -/// -/// Offset is an offset from the original alloca, in bits that need to be -/// shifted to the right. By the end of this, there should be no uses of Ptr. -void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset) { - while (!Ptr->use_empty()) { - Instruction *User = cast<Instruction>(Ptr->use_back()); - - if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) { - ConvertUsesToScalar(CI, NewAI, Offset); - CI->eraseFromParent(); - continue; - } - - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { - // Compute the offset that this GEP adds to the pointer. - SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - uint64_t GEPOffset = TD->getIndexedOffset(GEP->getPointerOperandType(), - &Indices[0], Indices.size()); - ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8); - GEP->eraseFromParent(); - continue; - } - - IRBuilder<> Builder(User->getParent(), User); - - if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - // The load is a bit extract from NewAI shifted right by Offset bits. - Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp"); - Value *NewLoadVal - = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder); - LI->replaceAllUsesWith(NewLoadVal); - LI->eraseFromParent(); - continue; - } - - if (StoreInst *SI = dyn_cast<StoreInst>(User)) { - assert(SI->getOperand(0) != Ptr && "Consistency error!"); - Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); - Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset, - Builder); - Builder.CreateStore(New, NewAI); - SI->eraseFromParent(); - - // If the load we just inserted is now dead, then the inserted store - // overwrote the entire thing. - if (Old->use_empty()) - Old->eraseFromParent(); - continue; - } - - // If this is a constant sized memset of a constant value (e.g. 0) we can - // transform it into a store of the expanded constant value. - if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { - assert(MSI->getRawDest() == Ptr && "Consistency error!"); - unsigned NumBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue(); - if (NumBytes != 0) { - unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue(); - - // Compute the value replicated the right number of times. - APInt APVal(NumBytes*8, Val); - - // Splat the value if non-zero. - if (Val) - for (unsigned i = 1; i != NumBytes; ++i) - APVal |= APVal << 8; - - Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); - Value *New = ConvertScalar_InsertValue( - ConstantInt::get(User->getContext(), APVal), - Old, Offset, Builder); - Builder.CreateStore(New, NewAI); - - // If the load we just inserted is now dead, then the memset overwrote - // the entire thing. - if (Old->use_empty()) - Old->eraseFromParent(); - } - MSI->eraseFromParent(); - continue; - } - - // If this is a memcpy or memmove into or out of the whole allocation, we - // can handle it like a load or store of the scalar type. - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { - assert(Offset == 0 && "must be store to start of alloca"); - - // If the source and destination are both to the same alloca, then this is - // a noop copy-to-self, just delete it. Otherwise, emit a load and store - // as appropriate. - AllocaInst *OrigAI = cast<AllocaInst>(Ptr->getUnderlyingObject(0)); - - if (MTI->getSource()->getUnderlyingObject(0) != OrigAI) { - // Dest must be OrigAI, change this to be a load from the original - // pointer (bitcasted), then a store to our new alloca. - assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?"); - Value *SrcPtr = MTI->getSource(); - SrcPtr = Builder.CreateBitCast(SrcPtr, NewAI->getType()); - - LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval"); - SrcVal->setAlignment(MTI->getAlignment()); - Builder.CreateStore(SrcVal, NewAI); - } else if (MTI->getDest()->getUnderlyingObject(0) != OrigAI) { - // Src must be OrigAI, change this to be a load from NewAI then a store - // through the original dest pointer (bitcasted). - assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?"); - LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval"); - - Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), NewAI->getType()); - StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr); - NewStore->setAlignment(MTI->getAlignment()); - } else { - // Noop transfer. Src == Dst - } - - MTI->eraseFromParent(); - continue; - } - - llvm_unreachable("Unsupported operation!"); - } -} - -/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer -/// or vector value FromVal, extracting the bits from the offset specified by -/// Offset. This returns the value, which is of type ToType. -/// -/// This happens when we are converting an "integer union" to a single -/// integer scalar, or when we are converting a "vector union" to a vector with -/// insert/extractelement instructions. -/// -/// Offset is an offset from the original alloca, in bits that need to be -/// shifted to the right. -Value *SROA::ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, - uint64_t Offset, IRBuilder<> &Builder) { - // If the load is of the whole new alloca, no conversion is needed. - if (FromVal->getType() == ToType && Offset == 0) - return FromVal; - - // If the result alloca is a vector type, this is either an element - // access or a bitcast to another vector type of the same size. - if (const VectorType *VTy = dyn_cast<VectorType>(FromVal->getType())) { - if (ToType->isVectorTy()) - return Builder.CreateBitCast(FromVal, ToType, "tmp"); - - // Otherwise it must be an element access. - unsigned Elt = 0; - if (Offset) { - unsigned EltSize = TD->getTypeAllocSizeInBits(VTy->getElementType()); - Elt = Offset/EltSize; - assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); - } - // Return the element extracted out of it. - Value *V = Builder.CreateExtractElement(FromVal, ConstantInt::get( - Type::getInt32Ty(FromVal->getContext()), Elt), "tmp"); - if (V->getType() != ToType) - V = Builder.CreateBitCast(V, ToType, "tmp"); - return V; - } - - // If ToType is a first class aggregate, extract out each of the pieces and - // use insertvalue's to form the FCA. - if (const StructType *ST = dyn_cast<StructType>(ToType)) { - const StructLayout &Layout = *TD->getStructLayout(ST); - Value *Res = UndefValue::get(ST); - for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { - Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), - Offset+Layout.getElementOffsetInBits(i), - Builder); - Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); - } - return Res; - } - - if (const ArrayType *AT = dyn_cast<ArrayType>(ToType)) { - uint64_t EltSize = TD->getTypeAllocSizeInBits(AT->getElementType()); - Value *Res = UndefValue::get(AT); - for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { - Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), - Offset+i*EltSize, Builder); - Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); - } - return Res; - } - - // Otherwise, this must be a union that was converted to an integer value. - const IntegerType *NTy = cast<IntegerType>(FromVal->getType()); - - // If this is a big-endian system and the load is narrower than the - // full alloca type, we need to do a shift to get the right bits. - int ShAmt = 0; - if (TD->isBigEndian()) { - // On big-endian machines, the lowest bit is stored at the bit offset - // from the pointer given by getTypeStoreSizeInBits. This matters for - // integers with a bitwidth that is not a multiple of 8. - ShAmt = TD->getTypeStoreSizeInBits(NTy) - - TD->getTypeStoreSizeInBits(ToType) - Offset; - } else { - ShAmt = Offset; - } - - // Note: we support negative bitwidths (with shl) which are not defined. - // We do this to support (f.e.) loads off the end of a structure where - // only some bits are used. - if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth()) - FromVal = Builder.CreateLShr(FromVal, - ConstantInt::get(FromVal->getType(), - ShAmt), "tmp"); - else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth()) - FromVal = Builder.CreateShl(FromVal, - ConstantInt::get(FromVal->getType(), - -ShAmt), "tmp"); - - // Finally, unconditionally truncate the integer to the right width. - unsigned LIBitWidth = TD->getTypeSizeInBits(ToType); - if (LIBitWidth < NTy->getBitWidth()) - FromVal = - Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), - LIBitWidth), "tmp"); - else if (LIBitWidth > NTy->getBitWidth()) - FromVal = - Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(), - LIBitWidth), "tmp"); - - // If the result is an integer, this is a trunc or bitcast. - if (ToType->isIntegerTy()) { - // Should be done. - } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) { - // Just do a bitcast, we know the sizes match up. - FromVal = Builder.CreateBitCast(FromVal, ToType, "tmp"); - } else { - // Otherwise must be a pointer. - FromVal = Builder.CreateIntToPtr(FromVal, ToType, "tmp"); - } - assert(FromVal->getType() == ToType && "Didn't convert right?"); - return FromVal; -} - -/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer -/// or vector value "Old" at the offset specified by Offset. -/// -/// This happens when we are converting an "integer union" to a -/// single integer scalar, or when we are converting a "vector union" to a -/// vector with insert/extractelement instructions. -/// -/// Offset is an offset from the original alloca, in bits that need to be -/// shifted to the right. -Value *SROA::ConvertScalar_InsertValue(Value *SV, Value *Old, - uint64_t Offset, IRBuilder<> &Builder) { - - // Convert the stored type to the actual type, shift it left to insert - // then 'or' into place. - const Type *AllocaType = Old->getType(); - LLVMContext &Context = Old->getContext(); - - if (const VectorType *VTy = dyn_cast<VectorType>(AllocaType)) { - uint64_t VecSize = TD->getTypeAllocSizeInBits(VTy); - uint64_t ValSize = TD->getTypeAllocSizeInBits(SV->getType()); - - // Changing the whole vector with memset or with an access of a different - // vector type? - if (ValSize == VecSize) - return Builder.CreateBitCast(SV, AllocaType, "tmp"); - - uint64_t EltSize = TD->getTypeAllocSizeInBits(VTy->getElementType()); - - // Must be an element insertion. - unsigned Elt = Offset/EltSize; - - if (SV->getType() != VTy->getElementType()) - SV = Builder.CreateBitCast(SV, VTy->getElementType(), "tmp"); - - SV = Builder.CreateInsertElement(Old, SV, - ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt), - "tmp"); - return SV; - } - - // If SV is a first-class aggregate value, insert each value recursively. - if (const StructType *ST = dyn_cast<StructType>(SV->getType())) { - const StructLayout &Layout = *TD->getStructLayout(ST); - for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { - Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); - Old = ConvertScalar_InsertValue(Elt, Old, - Offset+Layout.getElementOffsetInBits(i), - Builder); - } - return Old; - } - - if (const ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { - uint64_t EltSize = TD->getTypeAllocSizeInBits(AT->getElementType()); - for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { - Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); - Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder); - } - return Old; - } - - // If SV is a float, convert it to the appropriate integer type. - // If it is a pointer, do the same. - unsigned SrcWidth = TD->getTypeSizeInBits(SV->getType()); - unsigned DestWidth = TD->getTypeSizeInBits(AllocaType); - unsigned SrcStoreWidth = TD->getTypeStoreSizeInBits(SV->getType()); - unsigned DestStoreWidth = TD->getTypeStoreSizeInBits(AllocaType); - if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) - SV = Builder.CreateBitCast(SV, - IntegerType::get(SV->getContext(),SrcWidth), "tmp"); - else if (SV->getType()->isPointerTy()) - SV = Builder.CreatePtrToInt(SV, TD->getIntPtrType(SV->getContext()), "tmp"); - - // Zero extend or truncate the value if needed. - if (SV->getType() != AllocaType) { - if (SV->getType()->getPrimitiveSizeInBits() < - AllocaType->getPrimitiveSizeInBits()) - SV = Builder.CreateZExt(SV, AllocaType, "tmp"); - else { - // Truncation may be needed if storing more than the alloca can hold - // (undefined behavior). - SV = Builder.CreateTrunc(SV, AllocaType, "tmp"); - SrcWidth = DestWidth; - SrcStoreWidth = DestStoreWidth; - } - } - - // If this is a big-endian system and the store is narrower than the - // full alloca type, we need to do a shift to get the right bits. - int ShAmt = 0; - if (TD->isBigEndian()) { - // On big-endian machines, the lowest bit is stored at the bit offset - // from the pointer given by getTypeStoreSizeInBits. This matters for - // integers with a bitwidth that is not a multiple of 8. - ShAmt = DestStoreWidth - SrcStoreWidth - Offset; - } else { - ShAmt = Offset; - } - - // Note: we support negative bitwidths (with shr) which are not defined. - // We do this to support (f.e.) stores off the end of a structure where - // only some bits in the structure are set. - APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth)); - if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) { - SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), - ShAmt), "tmp"); - Mask <<= ShAmt; - } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) { - SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), - -ShAmt), "tmp"); - Mask = Mask.lshr(-ShAmt); - } - - // Mask out the bits we are about to insert from the old value, and or - // in the new bits. - if (SrcWidth != DestWidth) { - assert(DestWidth > SrcWidth); - Old = Builder.CreateAnd(Old, ConstantInt::get(Context, ~Mask), "mask"); - SV = Builder.CreateOr(Old, SV, "ins"); - } - return SV; -} - /// PointsToConstantGlobal - Return true if V (possibly indirectly) points to @@ -1699,21 +1755,23 @@ static bool PointsToConstantGlobal(Value *V) { /// the uses. If we see a memcpy/memmove that targets an unoffseted pointer to /// the alloca, and if the source pointer is a pointer to a constant global, we /// can optimize this. -static bool isOnlyCopiedFromConstantGlobal(Value *V, Instruction *&TheCopy, +static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, bool isOffset) { for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { - if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) + User *U = cast<Instruction>(*UI); + + if (LoadInst *LI = dyn_cast<LoadInst>(U)) // Ignore non-volatile loads, they are always ok. if (!LI->isVolatile()) continue; - if (BitCastInst *BCI = dyn_cast<BitCastInst>(*UI)) { + if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { // If uses of the bitcast are ok, we are ok. if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, isOffset)) return false; continue; } - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(*UI)) { + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) { // If the GEP has all zero indices, it doesn't offset the pointer. If it // doesn't, it does. if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy, @@ -1724,7 +1782,8 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, Instruction *&TheCopy, // If this is isn't our memcpy/memmove, reject it as something we can't // handle. - if (!isa<MemTransferInst>(*UI)) + MemTransferInst *MI = dyn_cast<MemTransferInst>(U); + if (MI == 0) return false; // If we already have seen a copy, reject the second one. @@ -1737,10 +1796,8 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, Instruction *&TheCopy, // If the memintrinsic isn't using the alloca as the dest, reject it. if (UI.getOperandNo() != 1) return false; - MemIntrinsic *MI = cast<MemIntrinsic>(*UI); - // If the source of the memcpy/move is not a constant global, reject it. - if (!PointsToConstantGlobal(MI->getOperand(2))) + if (!PointsToConstantGlobal(MI->getSource())) return false; // Otherwise, the transform is safe. Remember the copy instruction. @@ -1752,8 +1809,8 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, Instruction *&TheCopy, /// isOnlyCopiedFromConstantGlobal - Return true if the specified alloca is only /// modified by a copy from a constant global. If we can prove this, we can /// replace any uses of the alloca with uses of the global directly. -Instruction *SROA::isOnlyCopiedFromConstantGlobal(AllocaInst *AI) { - Instruction *TheCopy = 0; +MemTransferInst *SROA::isOnlyCopiedFromConstantGlobal(AllocaInst *AI) { + MemTransferInst *TheCopy = 0; if (::isOnlyCopiedFromConstantGlobal(AI, TheCopy, false)) return TheCopy; return 0; diff --git a/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp index 4464961..c3408e7 100644 --- a/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp @@ -93,7 +93,8 @@ InlineHalfPowrs(const std::vector<Instruction *> &HalfPowrs, // Inline the call, taking care of what code ends up where. NewBlock = SplitBlock(NextInst->getParent(), NextInst, this); - bool B = InlineFunction(Call, 0, TD); + InlineFunctionInfo IFI(0, TD); + bool B = InlineFunction(Call, IFI); assert(B && "half_powr didn't inline?"); B=B; BasicBlock *NewBody = NewBlock->getSinglePredecessor(); diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 162d902..5ad5de2 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -59,6 +59,8 @@ #include "llvm/Instructions.h" #include "llvm/Pass.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/InlineCost.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/CFG.h" #include "llvm/ADT/Statistic.h" using namespace llvm; @@ -328,15 +330,6 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, if (&BB->front() == Ret) // Make sure there is something before the ret... return false; - // If the return is in the entry block, then making this transformation would - // turn infinite recursion into an infinite loop. This transformation is ok - // in theory, but breaks some code like: - // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call - // disable this xform in this case, because the code generator will lower the - // call to fabs into inline code. - if (BB == &F->getEntryBlock()) - return false; - // Scan backwards from the return, checking to see if there is a tail call in // this block. If so, set CI to it. CallInst *CI; @@ -356,6 +349,25 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail) return false; + // As a special case, detect code like this: + // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call + // and disable this xform in this case, because the code generator will + // lower the call to fabs into inline code. + if (BB == &F->getEntryBlock() && + &BB->front() == CI && &*++BB->begin() == Ret && + callIsSmall(F)) { + // A single-block function with just a call and a return. Check that + // the arguments match. + CallSite::arg_iterator I = CallSite(CI).arg_begin(), + E = CallSite(CI).arg_end(); + Function::arg_iterator FI = F->arg_begin(), + FE = F->arg_end(); + for (; I != E && FI != FE; ++I, ++FI) + if (*I != &*FI) break; + if (I == E && FI == FE) + return false; + } + // If we are introducing accumulator recursion to eliminate associative // operations after the call instruction, this variable contains the initial // value for the accumulator. If this value is set, we actually perform |