Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar')
22 files changed, 2131 insertions, 1946 deletions
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp index a5adb5e..ba214d1 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -57,6 +57,7 @@ bool ADCE::runOnFunction(Function& F) { for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) if (isa<TerminatorInst>(I.getInstructionIterator()) || isa<DbgInfoIntrinsic>(I.getInstructionIterator()) || + isa<LandingPadInst>(I.getInstructionIterator()) || I->mayHaveSideEffects()) { alive.insert(I.getInstructionIterator()); worklist.push_back(I.getInstructionIterator()); @@ -65,7 +66,6 @@ bool ADCE::runOnFunction(Function& F) { // Propagate liveness backwards to operands. while (!worklist.empty()) { Instruction* curr = worklist.pop_back_val(); - for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end(); OI != OE; ++OI) if (Instruction* Inst = dyn_cast<Instruction>(OI)) diff --git a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp index 0af14ed..f8f18b2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -58,6 +58,7 @@ STATISTIC(NumMemoryInsts, "Number of memory instructions whose address " STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); STATISTIC(NumRetsDup, "Number of return instructions duplicated"); +STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); static cl::opt<bool> DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), @@ -104,12 +105,13 @@ namespace { void EliminateMostlyEmptyBlock(BasicBlock *BB); bool OptimizeBlock(BasicBlock &BB); bool OptimizeInst(Instruction *I); - bool OptimizeMemoryInst(Instruction *I, Value *Addr, const Type *AccessTy); + bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy); bool OptimizeInlineAsmInst(CallInst *CS); bool OptimizeCallInst(CallInst *CI); bool MoveExtToFormExtLoad(Instruction *I); bool OptimizeExtUses(Instruction *I); bool DupRetToEnableTailCallOpts(ReturnInst *RI); + bool PlaceDbgValues(Function &F); }; } @@ -132,6 +134,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // unconditional branch. EverMadeChange |= EliminateMostlyEmptyBlocks(F); + // llvm.dbg.value is far away from the value then iSel may not be able + // handle it properly. iSel will drop llvm.dbg.value if it can not + // find a node corresponding to the value. 
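The hunk below moves the llvm.dbg.value handling out of OptimizeCallInst and into a dedicated PlaceDbgValues pass that runs once up front. As a rough, standalone illustration only (a toy record type and hypothetical names, not LLVM's API), the same idea looks like this: walk a block, remember where each value is defined, and splice any debug record so that it sits immediately after the instruction defining the value it describes.

// Toy analogue of the PlaceDbgValues scan added in this hunk; illustrative only.
#include <cstdio>
#include <iterator>
#include <list>

struct Rec {
  int def;        // value id this record defines, or -1 for a debug record
  int describes;  // for debug records: the value id being described
};

int main() {
  // "%1 = ...", "%2 = ...", then a dbg.value for %1 that drifted too far down.
  std::list<Rec> block = {{1, -1}, {2, -1}, {-1, 1}};

  for (auto it = block.begin(); it != block.end();) {
    auto cur = it++;
    if (cur->def != -1)
      continue;                       // not a debug record
    // Find the defining record and move the debug record right after it,
    // unless it is already there.
    for (auto def = block.begin(); def != block.end(); ++def)
      if (def->def == cur->describes && std::next(def) != cur) {
        block.splice(std::next(def), block, cur);
        break;
      }
  }

  for (const Rec &R : block)
    std::printf(R.def != -1 ? "def %%%d\n" : "  dbg.value(%%%d)\n",
                R.def != -1 ? R.def : R.describes);
}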
+ EverMadeChange |= PlaceDbgValues(F); + bool MadeChange = true; while (MadeChange) { MadeChange = false; @@ -410,8 +417,7 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){ CastInst *&InsertedCast = InsertedCasts[UserBB]; if (!InsertedCast) { - BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI(); - + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "", InsertPt); @@ -467,8 +473,7 @@ static bool OptimizeCmpExpression(CmpInst *CI) { CmpInst *&InsertedCmp = InsertedCmps[UserBB]; if (!InsertedCmp) { - BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI(); - + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); InsertedCmp = CmpInst::Create(CI->getOpcode(), CI->getPredicate(), CI->getOperand(0), @@ -528,7 +533,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); if (II && II->getIntrinsicID() == Intrinsic::objectsize) { bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1); - const Type *ReturnTy = CI->getType(); + Type *ReturnTy = CI->getType(); Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); // Substituting this can cause recursive simplifications, which can @@ -551,22 +556,6 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // From here on out we're working with named functions. if (CI->getCalledFunction() == 0) return false; - // llvm.dbg.value is far away from the value then iSel may not be able - // handle it properly. iSel will drop llvm.dbg.value if it can not - // find a node corresponding to the value. - if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(CI)) - if (Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue())) - if (!VI->isTerminator() && - (DVI->getParent() != VI->getParent() || DT->dominates(DVI, VI))) { - DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI); - DVI->removeFromParent(); - if (isa<PHINode>(VI)) - DVI->insertBefore(VI->getParent()->getFirstNonPHI()); - else - DVI->insertAfter(VI); - return true; - } - // We'll need TargetData from here on out. const TargetData *TD = TLI ? TLI->getTargetData() : 0; if (!TD) return false; @@ -724,7 +713,7 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) { /// This method is used to optimize both load/store and inline asms with memory /// operands. bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, - const Type *AccessTy) { + Type *AccessTy) { Value *Repl = Addr; // Try to collapse single-value PHI nodes. This is necessary to undo @@ -746,13 +735,11 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, worklist.pop_back(); // Break use-def graph loops. - if (Visited.count(V)) { + if (!Visited.insert(V)) { Consensus = 0; break; } - Visited.insert(V); - // For a PHI node, push all of its incoming values. if (PHINode *P = dyn_cast<PHINode>(V)) { for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) @@ -763,7 +750,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // For non-PHIs, determine the addressing mode being computed. 
SmallVector<Instruction*, 16> NewAddrModeInsts; ExtAddrMode NewAddrMode = - AddressingModeMatcher::Match(V, AccessTy,MemoryInst, + AddressingModeMatcher::Match(V, AccessTy, MemoryInst, NewAddrModeInsts, *TLI); // This check is broken into two cases with very similar code to avoid using @@ -822,7 +809,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Insert this computation right after this user. Since our caller is // scanning from the top of the BB to the bottom, reuse of the expr are // guaranteed to happen later. - BasicBlock::iterator InsertPt = MemoryInst; + IRBuilder<> Builder(MemoryInst); // Now that we determined the addressing expression we want to use and know // that we have to sink it into this block. Check to see if we have already @@ -833,11 +820,11 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); if (SunkAddr->getType() != Addr->getType()) - SunkAddr = new BitCastInst(SunkAddr, Addr->getType(), "tmp", InsertPt); + SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType()); } else { DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); - const Type *IntPtrTy = + Type *IntPtrTy = TLI->getTargetData()->getIntPtrType(AccessTy->getContext()); Value *Result = 0; @@ -850,10 +837,9 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (AddrMode.BaseReg) { Value *V = AddrMode.BaseReg; if (V->getType()->isPointerTy()) - V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); if (V->getType() != IntPtrTy) - V = CastInst::CreateIntegerCast(V, IntPtrTy, /*isSigned=*/true, - "sunkaddr", InsertPt); + V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr"); Result = V; } @@ -863,29 +849,27 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (V->getType() == IntPtrTy) { // done. } else if (V->getType()->isPointerTy()) { - V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr"); } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() < cast<IntegerType>(V->getType())->getBitWidth()) { - V = new TruncInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr"); } else { - V = new SExtInst(V, IntPtrTy, "sunkaddr", InsertPt); + V = Builder.CreateSExt(V, IntPtrTy, "sunkaddr"); } if (AddrMode.Scale != 1) - V = BinaryOperator::CreateMul(V, ConstantInt::get(IntPtrTy, - AddrMode.Scale), - "sunkaddr", InsertPt); + V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), + "sunkaddr"); if (Result) - Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt); + Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } // Add in the BaseGV if present. 
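The surrounding hunk rewrites the sunk address computation to go through IRBuilder, but the arithmetic it materializes is unchanged: ptrtoint the base and scaled index, multiply by the scale, add the global and constant offsets, then inttoptr the sum back to the pointer type. A small standalone sketch of that sum, with variable names chosen to mirror the AddrMode fields (the buffer and values are made up for the example):

// Plain C++ model of the "sunkaddr" expression; not LLVM code.
#include <cstdint>
#include <cstdio>

int main() {
  int buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};

  uintptr_t base  = reinterpret_cast<uintptr_t>(buf); // AddrMode.BaseReg (PtrToInt)
  uintptr_t index = 3;                                // AddrMode.ScaledReg
  uintptr_t scale = sizeof(int);                      // AddrMode.Scale
  uintptr_t offs  = 2 * sizeof(int);                  // AddrMode.BaseOffs

  uintptr_t sunkaddr = base + index * scale + offs;   // the chained adds
  int *p = reinterpret_cast<int *>(sunkaddr);         // IntToPtr

  std::printf("%d\n", *p);                            // prints buf[5], i.e. 5
}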
if (AddrMode.BaseGV) { - Value *V = new PtrToIntInst(AddrMode.BaseGV, IntPtrTy, "sunkaddr", - InsertPt); + Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr"); if (Result) - Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt); + Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } @@ -894,7 +878,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (AddrMode.BaseOffs) { Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs); if (Result) - Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt); + Result = Builder.CreateAdd(Result, V, "sunkaddr"); else Result = V; } @@ -902,7 +886,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (Result == 0) SunkAddr = Constant::getNullValue(Addr->getType()); else - SunkAddr = new IntToPtrInst(Result, Addr->getType(), "sunkaddr",InsertPt); + SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr"); } MemoryInst->replaceUsesOfWith(Repl, SunkAddr); @@ -1059,8 +1043,7 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { Instruction *&InsertedTrunc = InsertedTruncs[UserBB]; if (!InsertedTrunc) { - BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI(); - + BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt); } @@ -1159,3 +1142,34 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { return MadeChange; } + +// llvm.dbg.value is far away from the value then iSel may not be able +// handle it properly. iSel will drop llvm.dbg.value if it can not +// find a node corresponding to the value. +bool CodeGenPrepare::PlaceDbgValues(Function &F) { + bool MadeChange = false; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + Instruction *PrevNonDbgInst = NULL; + for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { + Instruction *Insn = BI; ++BI; + DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn); + if (!DVI) { + PrevNonDbgInst = Insn; + continue; + } + + Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue()); + if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) { + DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI); + DVI->removeFromParent(); + if (isa<PHINode>(VI)) + DVI->insertBefore(VI->getParent()->getFirstInsertionPt()); + else + DVI->insertAfter(VI); + MadeChange = true; + ++NumDbgValueMoved; + } + } + } + return MadeChange; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index cb9b5be..a593d0f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -52,18 +52,18 @@ namespace { AA = &getAnalysis<AliasAnalysis>(); MD = &getAnalysis<MemoryDependenceAnalysis>(); DominatorTree &DT = getAnalysis<DominatorTree>(); - + bool Changed = false; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) // Only check non-dead blocks. Dead blocks may have strange pointer // cycles that will confuse alias analysis. 
if (DT.isReachableFromEntry(I)) Changed |= runOnBasicBlock(*I); - + AA = 0; MD = 0; return Changed; } - + bool runOnBasicBlock(BasicBlock &BB); bool HandleFree(CallInst *F); bool handleEndBlock(BasicBlock &BB); @@ -105,34 +105,34 @@ static void DeleteDeadInstruction(Instruction *I, MemoryDependenceAnalysis &MD, SmallPtrSet<Value*, 16> *ValueSet = 0) { SmallVector<Instruction*, 32> NowDeadInsts; - + NowDeadInsts.push_back(I); --NumFastOther; - + // Before we touch this instruction, remove it from memdep! do { Instruction *DeadInst = NowDeadInsts.pop_back_val(); ++NumFastOther; - + // This instruction is dead, zap it, in stages. Start by removing it from // MemDep, which needs to know the operands and needs it to be in the // function. MD.removeInstruction(DeadInst); - + for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { Value *Op = DeadInst->getOperand(op); DeadInst->setOperand(op, 0); - + // If this operand just became dead, add it to the NowDeadInsts list. if (!Op->use_empty()) continue; - + if (Instruction *OpI = dyn_cast<Instruction>(Op)) if (isInstructionTriviallyDead(OpI)) NowDeadInsts.push_back(OpI); } - + DeadInst->eraseFromParent(); - + if (ValueSet) ValueSet->erase(DeadInst); } while (!NowDeadInsts.empty()); } @@ -159,11 +159,13 @@ static bool hasMemoryWrite(Instruction *I) { } /// getLocForWrite - Return a Location stored to by the specified instruction. +/// If isRemovable returns true, this function and getLocForRead completely +/// describe the memory operations for this instruction. static AliasAnalysis::Location getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) return AA.getLocation(SI); - + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) { // memcpy/memmove/memset. AliasAnalysis::Location Loc = AA.getLocationForDest(MI); @@ -174,10 +176,10 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { return AliasAnalysis::Location(); return Loc; } - + IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); if (II == 0) return AliasAnalysis::Location(); - + switch (II->getIntrinsicID()) { default: return AliasAnalysis::Location(); // Unhandled intrinsic. case Intrinsic::init_trampoline: @@ -185,7 +187,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { // that we should use the size of the pointee type. This isn't valid for // init.trampoline, which writes more than an i8. if (AA.getTargetData() == 0) return AliasAnalysis::Location(); - + // FIXME: We don't know the size of the trampoline, so we can't really // handle it here. return AliasAnalysis::Location(II->getArgOperand(0)); @@ -198,10 +200,10 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { /// getLocForRead - Return the location read by the specified "hasMemoryWrite" /// instruction if any. -static AliasAnalysis::Location +static AliasAnalysis::Location getLocForRead(Instruction *Inst, AliasAnalysis &AA) { assert(hasMemoryWrite(Inst) && "Unknown instruction case"); - + // The only instructions that both read and write are the mem transfer // instructions (memcpy/memmove). if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) @@ -213,10 +215,10 @@ getLocForRead(Instruction *Inst, AliasAnalysis &AA) { /// isRemovable - If the value of this instruction and the memory it writes to /// is unused, may we delete this instruction? static bool isRemovable(Instruction *I) { - // Don't remove volatile stores. + // Don't remove volatile/atomic stores. 
if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return !SI->isVolatile(); - + return SI->isUnordered(); + IntrinsicInst *II = cast<IntrinsicInst>(I); switch (II->getIntrinsicID()) { default: assert(0 && "doesn't pass 'hasMemoryWrite' predicate"); @@ -227,7 +229,7 @@ static bool isRemovable(Instruction *I) { case Intrinsic::init_trampoline: // Always safe to remove init_trampoline. return true; - + case Intrinsic::memset: case Intrinsic::memmove: case Intrinsic::memcpy: @@ -255,16 +257,16 @@ static uint64_t getPointerSize(Value *V, AliasAnalysis &AA) { const TargetData *TD = AA.getTargetData(); if (TD == 0) return AliasAnalysis::UnknownSize; - + if (AllocaInst *A = dyn_cast<AllocaInst>(V)) { // Get size information for the alloca if (ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize())) return C->getZExtValue() * TD->getTypeAllocSize(A->getAllocatedType()); return AliasAnalysis::UnknownSize; } - + assert(isa<Argument>(V) && "Expected AllocaInst or Argument!"); - const PointerType *PT = cast<PointerType>(V->getType()); + PointerType *PT = cast<PointerType>(V->getType()); return TD->getTypeAllocSize(PT->getElementType()); } @@ -287,7 +289,7 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, AliasAnalysis &AA) { const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); - + // If the start pointers are the same, we just have to compare sizes to see if // the later store was larger than the earlier store. if (P1 == P2) { @@ -302,33 +304,33 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, return Later.Ptr->getType() == Earlier.Ptr->getType(); return false; } - + // Make sure that the Later size is >= the Earlier size. if (Later.Size < Earlier.Size) return false; return true; } - + // Otherwise, we have to have size information, and the later store has to be // larger than the earlier one. if (Later.Size == AliasAnalysis::UnknownSize || Earlier.Size == AliasAnalysis::UnknownSize || Later.Size <= Earlier.Size || AA.getTargetData() == 0) return false; - + // Check to see if the later store is to the entire object (either a global, // an alloca, or a byval argument). If so, then it clearly overwrites any // other store to the same object. const TargetData &TD = *AA.getTargetData(); - + const Value *UO1 = GetUnderlyingObject(P1, &TD), *UO2 = GetUnderlyingObject(P2, &TD); - + // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. if (UO1 != UO2) return false; - + // If the "Later" store is to a recognizable object, get its size. if (isObjectPointerWithTrustworthySize(UO2)) { uint64_t ObjectSize = @@ -336,26 +338,26 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, if (ObjectSize == Later.Size) return true; } - + // Okay, we have stores to two completely different pointers. Try to // decompose the pointer into a "base + constant_offset" form. If the base // pointers are equal, then we can reason about the two stores. int64_t EarlierOff = 0, LaterOff = 0; const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, TD); const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, TD); - + // If the base pointers still differ, we have two completely different stores. if (BP1 != BP2) return false; // The later store completely overlaps the earlier store if: - // + // // 1. Both start at the same offset and the later one's size is greater than // or equal to the earlier one's, or // // |--earlier--| // |-- later --| - // + // // 2. 
The earlier store has an offset greater than the later offset, but which // still lies completely within the later store. // @@ -373,7 +375,7 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, /// isPossibleSelfRead - If 'Inst' might be a self read (i.e. a noop copy of a /// memory region into an identical pointer) then it doesn't actually make its -/// input dead in the traditional sense. Consider this case: +/// input dead in the traditional sense. Consider this case: /// /// memcpy(A <- B) /// memcpy(A <- A) @@ -391,10 +393,10 @@ static bool isPossibleSelfRead(Instruction *Inst, // location read. AliasAnalysis::Location InstReadLoc = getLocForRead(Inst, AA); if (InstReadLoc.Ptr == 0) return false; // Not a reading instruction. - + // If the read and written loc obviously don't alias, it isn't a read. if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false; - + // Okay, 'Inst' may copy over itself. However, we can still remove a the // DepWrite instruction if we can prove that it reads from the same location // as Inst. This handles useful cases like: @@ -404,10 +406,10 @@ static bool isPossibleSelfRead(Instruction *Inst, // aliases, so removing the first memcpy is safe (assuming it writes <= # // bytes as the second one. AliasAnalysis::Location DepReadLoc = getLocForRead(DepWrite, AA); - + if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr)) return false; - + // If DepWrite doesn't read memory or if we can't prove it is a must alias, // then it can't be considered dead. return true; @@ -420,43 +422,43 @@ static bool isPossibleSelfRead(Instruction *Inst, bool DSE::runOnBasicBlock(BasicBlock &BB) { bool MadeChange = false; - + // Do a top-down walk on the BB. for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { Instruction *Inst = BBI++; - + // Handle 'free' calls specially. if (CallInst *F = isFreeCall(Inst)) { MadeChange |= HandleFree(F); continue; } - + // If we find something that writes memory, get its memory dependence. if (!hasMemoryWrite(Inst)) continue; MemDepResult InstDep = MD->getDependency(Inst); - + // Ignore any store where we can't find a local dependence. // FIXME: cross-block DSE would be fun. :) - if (InstDep.isNonLocal() || InstDep.isUnknown()) + if (!InstDep.isDef() && !InstDep.isClobber()) continue; - + // If we're storing the same value back to a pointer that we just // loaded from, then the store can be removed. if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) { if (SI->getPointerOperand() == DepLoad->getPointerOperand() && - SI->getOperand(0) == DepLoad && !SI->isVolatile()) { + SI->getOperand(0) == DepLoad && isRemovable(SI)) { DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n " << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); - + // DeleteDeadInstruction can delete the current instruction. Save BBI // in case we need it. WeakVH NextInst(BBI); - + DeleteDeadInstruction(SI, *MD); - + if (NextInst == 0) // Next instruction deleted. BBI = BB.begin(); else if (BBI != BB.begin()) // Revisit this instruction if possible. @@ -467,15 +469,15 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { } } } - + // Figure out what location is being stored to. AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA); // If we didn't get a useful location, fail. 
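The isCompleteOverwrite logic above reduces to an interval check once both pointers share a base: the earlier store is dead when its byte range is contained in the later store's byte range. A minimal standalone sketch of that check, using an assumed helper name and the same diagram cases:

// Interval containment test mirroring the offset/size reasoning above.
#include <cassert>
#include <cstdint>

static bool laterCompletelyOverwrites(int64_t EarlierOff, uint64_t EarlierSize,
                                      int64_t LaterOff, uint64_t LaterSize) {
  return LaterOff <= EarlierOff &&
         uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize;
}

int main() {
  // |--earlier--|          earlier: [0, 4)
  // |--  later   --|       later:   [0, 8)   -> earlier store is dead
  assert(laterCompletelyOverwrites(0, 4, 0, 8));

  //     |--earlier--|      earlier: [4, 8)
  // |------later------|    later:   [0, 16)  -> earlier store is dead
  assert(laterCompletelyOverwrites(4, 4, 0, 16));

  // Earlier store sticks out past the later one -> must be kept.
  assert(!laterCompletelyOverwrites(4, 16, 0, 8));
  return 0;
}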
if (Loc.Ptr == 0) continue; - - while (!InstDep.isNonLocal() && !InstDep.isUnknown()) { + + while (InstDep.isDef() || InstDep.isClobber()) { // Get the memory clobbered by the instruction we depend on. MemDep will // skip any instructions that 'Loc' clearly doesn't interact with. If we // end up depending on a may- or must-aliased load, then we can't optimize @@ -496,12 +498,12 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); - + // Delete the store and now-dead instructions that feed it. DeleteDeadInstruction(DepWrite, *MD); ++NumFastStores; MadeChange = true; - + // DeleteDeadInstruction can delete the current instruction in loop // cases, reset BBI. BBI = Inst; @@ -509,7 +511,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { --BBI; break; } - + // If this is a may-aliased store that is clobbering the store value, we // can keep searching past it for another must-aliased pointer that stores // to the same location. For example, in: @@ -519,20 +521,20 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // we can remove the first store to P even though we don't know if P and Q // alias. if (DepWrite == &BB.front()) break; - + // Can't look past this instruction if it might read 'Loc'. if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref) break; - + InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB); } } - + // If this block ends in a return, unwind, or unreachable, all allocas are // dead at its end, which means stores to them are also dead. if (BB.getTerminator()->getNumSuccessors() == 0) MadeChange |= handleEndBlock(BB); - + return MadeChange; } @@ -543,18 +545,18 @@ bool DSE::HandleFree(CallInst *F) { MemDepResult Dep = MD->getDependency(F); - while (!Dep.isNonLocal() && !Dep.isUnknown()) { + while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency)) return MadeChange; - + Value *DepPointer = GetUnderlyingObject(getStoredPointerOperand(Dependency)); // Check for aliasing. if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) return MadeChange; - + // DCE instructions only used to calculate that store DeleteDeadInstruction(Dependency, *MD); ++NumFastStores; @@ -567,7 +569,7 @@ bool DSE::HandleFree(CallInst *F) { // free(s); Dep = MD->getDependency(F); }; - + return MadeChange; } @@ -579,28 +581,28 @@ bool DSE::HandleFree(CallInst *F) { /// ret void bool DSE::handleEndBlock(BasicBlock &BB) { bool MadeChange = false; - + // Keep track of all of the stack objects that are dead at the end of the // function. SmallPtrSet<Value*, 16> DeadStackObjects; - + // Find all of the alloca'd pointers in the entry block. BasicBlock *Entry = BB.getParent()->begin(); for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) DeadStackObjects.insert(AI); - + // Treat byval arguments the same, stores to them are dead at the end of the // function. for (Function::arg_iterator AI = BB.getParent()->arg_begin(), AE = BB.getParent()->arg_end(); AI != AE; ++AI) if (AI->hasByValAttr()) DeadStackObjects.insert(AI); - + // Scan the basic block backwards for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){ --BBI; - + // If we find a store, check to see if it points into a dead stack value. 
if (hasMemoryWrite(BBI) && isRemovable(BBI)) { // See through pointer-to-pointer bitcasts @@ -609,10 +611,10 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Stores to stack values are valid candidates for removal. if (DeadStackObjects.count(Pointer)) { Instruction *Dead = BBI++; - + DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " << *Dead << "\n Object: " << *Pointer << '\n'); - + // DCE instructions only used to calculate that store. DeleteDeadInstruction(Dead, *MD, &DeadStackObjects); ++NumFastStores; @@ -620,7 +622,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { continue; } } - + // Remove any dead non-memory-mutating instructions. if (isInstructionTriviallyDead(BBI)) { Instruction *Inst = BBI++; @@ -629,55 +631,61 @@ bool DSE::handleEndBlock(BasicBlock &BB) { MadeChange = true; continue; } - + if (AllocaInst *A = dyn_cast<AllocaInst>(BBI)) { DeadStackObjects.erase(A); continue; } - + if (CallSite CS = cast<Value>(BBI)) { // If this call does not access memory, it can't be loading any of our // pointers. if (AA->doesNotAccessMemory(CS)) continue; - + // If the call might load from any of our allocas, then any store above // the call is live. SmallVector<Value*, 8> LiveAllocas; for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), E = DeadStackObjects.end(); I != E; ++I) { // See if the call site touches it. - AliasAnalysis::ModRefResult A = + AliasAnalysis::ModRefResult A = AA->getModRefInfo(CS, *I, getPointerSize(*I, *AA)); - + if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref) LiveAllocas.push_back(*I); } - + for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), E = LiveAllocas.end(); I != E; ++I) DeadStackObjects.erase(*I); - + // If all of the allocas were clobbered by the call then we're not going // to find anything else to process. if (DeadStackObjects.empty()) return MadeChange; - + continue; } - + AliasAnalysis::Location LoadedLoc; - + // If we encounter a use of the pointer, it is no longer considered dead if (LoadInst *L = dyn_cast<LoadInst>(BBI)) { + if (!L->isUnordered()) // Be conservative with atomic/volatile load + break; LoadedLoc = AA->getLocation(L); } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) { LoadedLoc = AA->getLocation(V); } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) { LoadedLoc = AA->getLocationForSource(MTI); - } else { - // Not a loading instruction. + } else if (!BBI->mayReadFromMemory()) { + // Instruction doesn't read memory. Note that stores that weren't removed + // above will hit this case. continue; + } else { + // Unknown inst; assume it clobbers everything. + break; } // Remove any allocas from the DeadPointer set that are loaded, as this @@ -689,7 +697,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (DeadStackObjects.empty()) break; } - + return MadeChange; } @@ -703,14 +711,14 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, // A constant can't be in the dead pointer set. if (isa<Constant>(UnderlyingPointer)) return; - + // If the kill pointer can be easily reduced to an alloca, don't bother doing // extraneous AA queries. 
if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) { DeadStackObjects.erase(const_cast<Value*>(UnderlyingPointer)); return; } - + SmallVector<Value*, 16> NowLive; for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), E = DeadStackObjects.end(); I != E; ++I) { diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 3d3f17b..c0223d2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -92,7 +92,7 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { // Hash in all of the operands as pointers. unsigned Res = 0; for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) - Res ^= getHash(Inst->getOperand(i)) << i; + Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); if (CastInst *CI = dyn_cast<CastInst>(Inst)) Res ^= getHash(CI->getType()); @@ -185,7 +185,7 @@ unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) { assert(!Inst->getOperand(i)->getType()->isMetadataTy() && "Cannot value number calls with metadata operands"); - Res ^= getHash(Inst->getOperand(i)) << i; + Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); } // Mix in the opcode. @@ -357,7 +357,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If this is a non-volatile load, process it. if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { // Ignore volatile loads. - if (LI->isVolatile()) { + if (!LI->isSimple()) { LastStore = 0; continue; } @@ -437,7 +437,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration)); // Remember that this was the last store we saw for DSE. - if (!SI->isVolatile()) + if (SI->isSimple()) LastStore = SI; } } diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index 87b7317..cbfdbcd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -41,12 +41,16 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/IRBuilder.h" +#include "llvm/Support/PatternMatch.h" using namespace llvm; +using namespace PatternMatch; STATISTIC(NumGVNInstr, "Number of instructions deleted"); STATISTIC(NumGVNLoad, "Number of loads deleted"); STATISTIC(NumGVNPRE, "Number of instructions PRE'd"); STATISTIC(NumGVNBlocks, "Number of blocks merged"); +STATISTIC(NumGVNSimpl, "Number of instructions simplified"); +STATISTIC(NumGVNEqProp, "Number of equalities propagated"); STATISTIC(NumPRELoad, "Number of loads PRE'd"); static cl::opt<bool> EnablePRE("enable-pre", @@ -63,7 +67,7 @@ static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true)); namespace { struct Expression { uint32_t opcode; - const Type *type; + Type *type; SmallVector<uint32_t, 4> varargs; Expression(uint32_t o = ~2U) : opcode(o) { } @@ -548,6 +552,9 @@ namespace { void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); + unsigned replaceAllDominatedUsesWith(Value *From, Value *To, + BasicBlock *Root); + bool propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root); }; char GVN::ID = 0; @@ -655,7 +662,7 @@ SpeculationFailure: /// CanCoerceMustAliasedValueToLoad - Return true if /// CoerceAvailableValueToLoadType will succeed. 
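The EarlyCSE change above clamps the per-operand hash shift to (i & 0xF): shifting a 32-bit accumulator by the raw operand index is undefined behaviour once the index reaches 32, and it discards the operand hashes well before that. A minimal standalone sketch of the clamped mixing, with made-up input values:

// Why the shift amount is masked: illustrative only, not LLVM's hasher.
#include <cstdio>
#include <vector>

static unsigned mixOperandHashes(const std::vector<unsigned> &OperandHashes) {
  unsigned Res = 0;
  for (unsigned i = 0, e = OperandHashes.size(); i != e; ++i)
    Res ^= OperandHashes[i] << (i & 0xF); // clamp shift to [0, 15]
  return Res;
}

int main() {
  // A call with many operands: without the mask, operands 32 and up would be
  // shifted by at least the bit width of 'Res', which is undefined in C++.
  std::vector<unsigned> hashes(40, 0x9e3779b9u);
  std::printf("%08x\n", mixOperandHashes(hashes));
}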
static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, - const Type *LoadTy, + Type *LoadTy, const TargetData &TD) { // If the loaded or stored value is an first class array or struct, don't try // to transform them. We need to be able to bitcast to integer. @@ -680,17 +687,17 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, /// /// If we can't do it, return null. static Value *CoerceAvailableValueToLoadType(Value *StoredVal, - const Type *LoadedTy, + Type *LoadedTy, Instruction *InsertPt, const TargetData &TD) { if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD)) return 0; // If this is already the right type, just return it. - const Type *StoredValTy = StoredVal->getType(); + Type *StoredValTy = StoredVal->getType(); - uint64_t StoreSize = TD.getTypeStoreSizeInBits(StoredValTy); - uint64_t LoadSize = TD.getTypeStoreSizeInBits(LoadedTy); + uint64_t StoreSize = TD.getTypeSizeInBits(StoredValTy); + uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy); // If the store and reload are the same size, we can always reuse it. if (StoreSize == LoadSize) { @@ -704,7 +711,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } - const Type *TypeToCastTo = LoadedTy; + Type *TypeToCastTo = LoadedTy; if (TypeToCastTo->isPointerTy()) TypeToCastTo = TD.getIntPtrType(StoredValTy->getContext()); @@ -743,7 +750,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, } // Truncate the integer to the right size now. - const Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize); + Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize); StoredVal = new TruncInst(StoredVal, NewIntTy, "trunc", InsertPt); if (LoadedTy == NewIntTy) @@ -765,7 +772,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, /// Check this case to see if there is anything more we can do before we give /// up. This returns -1 if we have to give up, or a byte number in the stored /// value of the piece that feeds the load. -static int AnalyzeLoadFromClobberingWrite(const Type *LoadTy, Value *LoadPtr, +static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, Value *WritePtr, uint64_t WriteSizeInBits, const TargetData &TD) { @@ -839,7 +846,7 @@ static int AnalyzeLoadFromClobberingWrite(const Type *LoadTy, Value *LoadPtr, /// AnalyzeLoadFromClobberingStore - This function is called when we have a /// memdep query of a load that ends up being a clobbering store. -static int AnalyzeLoadFromClobberingStore(const Type *LoadTy, Value *LoadPtr, +static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, StoreInst *DepSI, const TargetData &TD) { // Cannot handle reading from store of first-class aggregate yet. @@ -856,7 +863,7 @@ static int AnalyzeLoadFromClobberingStore(const Type *LoadTy, Value *LoadPtr, /// AnalyzeLoadFromClobberingLoad - This function is called when we have a /// memdep query of a load that ends up being clobbered by another load. See if /// the other load can feed into the second load. -static int AnalyzeLoadFromClobberingLoad(const Type *LoadTy, Value *LoadPtr, +static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI, const TargetData &TD){ // Cannot handle reading from store of first-class aggregate yet. 
if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy()) @@ -883,7 +890,7 @@ static int AnalyzeLoadFromClobberingLoad(const Type *LoadTy, Value *LoadPtr, -static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr, +static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, MemIntrinsic *MI, const TargetData &TD) { // If the mem operation is a non-constant size, we can't handle it. @@ -920,7 +927,7 @@ static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr, llvm::Type::getInt8PtrTy(Src->getContext())); Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); - Src = ConstantExpr::getGetElementPtr(Src, &OffsetCst, 1); + Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); if (ConstantFoldLoadFromConstPtr(Src, &TD)) return Offset; @@ -934,7 +941,7 @@ static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr, /// mustalias. Check this case to see if there is anything more we can do /// before we give up. static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, - const Type *LoadTy, + Type *LoadTy, Instruction *InsertPt, const TargetData &TD){ LLVMContext &Ctx = SrcVal->getType()->getContext(); @@ -946,10 +953,9 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, // Compute which bits of the stored value are being used by the load. Convert // to an integer type to start with. if (SrcVal->getType()->isPointerTy()) - SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx), "tmp"); + SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx)); if (!SrcVal->getType()->isIntegerTy()) - SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8), - "tmp"); + SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8)); // Shift the bits to the least significant depending on endianness. unsigned ShiftAmt; @@ -959,11 +965,10 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, ShiftAmt = (StoreSize-LoadSize-Offset)*8; if (ShiftAmt) - SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt, "tmp"); + SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt); if (LoadSize != StoreSize) - SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8), - "tmp"); + SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8)); return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD); } @@ -974,7 +979,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, /// because the pointers don't mustalias. Check this case to see if there is /// anything more we can do before we give up. 
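GetStoreValueForLoad, above, recovers a narrower load from a wider must-aliasing store by shifting the stored bits down to the load's offset and truncating; on big-endian targets the shift amount is (StoreSize - LoadSize - Offset) * 8 instead. A standalone sketch of the little-endian case, with hypothetical names and values:

// Bit extraction analogous to the lshr + trunc sequence built above.
#include <cstdint>
#include <cstdio>

static uint16_t loadI16FromStoredI64(uint64_t StoredVal, unsigned Offset) {
  unsigned ShiftAmt = Offset * 8;                       // little-endian
  return static_cast<uint16_t>(StoredVal >> ShiftAmt);  // lshr, then trunc
}

int main() {
  uint64_t StoredVal = 0x1122334455667788ull;           // the i64 store
  // An i16 load at byte offset 2 of the same address sees bytes 2 and 3.
  std::printf("%04x\n", unsigned(loadI16FromStoredI64(StoredVal, 2))); // 5566
}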
static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, - const Type *LoadTy, Instruction *InsertPt, + Type *LoadTy, Instruction *InsertPt, GVN &gvn) { const TargetData &TD = *gvn.getTargetData(); // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to @@ -982,8 +987,8 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType()); unsigned LoadSize = TD.getTypeStoreSize(LoadTy); if (Offset+LoadSize > SrcValSize) { - assert(!SrcVal->isVolatile() && "Cannot widen volatile load!"); - assert(isa<IntegerType>(SrcVal->getType())&&"Can't widen non-integer load"); + assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!"); + assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load"); // If we have a load/load clobber an DepLI can be widened to cover this // load, then we should widen it to the next power of 2 size big enough! unsigned NewLoadSize = Offset+LoadSize; @@ -996,7 +1001,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, // memdep queries will find the new load. We can't easily remove the old // load completely because it is already in the value numbering table. IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal)); - const Type *DestPTy = + Type *DestPTy = IntegerType::get(LoadTy->getContext(), NewLoadSize*8); DestPTy = PointerType::get(DestPTy, cast<PointerType>(PtrVal->getType())->getAddressSpace()); @@ -1034,7 +1039,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, /// GetMemInstValueForLoad - This function is called when we have a /// memdep query of a load that ends up being a clobbering mem intrinsic. static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, - const Type *LoadTy, Instruction *InsertPt, + Type *LoadTy, Instruction *InsertPt, const TargetData &TD){ LLVMContext &Ctx = LoadTy->getContext(); uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8; @@ -1081,7 +1086,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, llvm::Type::getInt8PtrTy(Src->getContext())); Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); - Src = ConstantExpr::getGetElementPtr(Src, &OffsetCst, 1); + Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); return ConstantFoldLoadFromConstPtr(Src, &TD); } @@ -1154,7 +1159,7 @@ struct AvailableValueInBlock { /// MaterializeAdjustedValue - Emit code into this block to adjust the value /// defined here to the specified type. This handles various coercion cases. - Value *MaterializeAdjustedValue(const Type *LoadTy, GVN &gvn) const { + Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { Value *Res; if (isSimpleValue()) { Res = getSimpleValue(); @@ -1213,7 +1218,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, SSAUpdater SSAUpdate(&NewPHIs); SSAUpdate.Initialize(LI->getType(), LI->getName()); - const Type *LoadTy = LI->getType(); + Type *LoadTy = LI->getType(); for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { const AvailableValueInBlock &AV = ValuesPerBlock[i]; @@ -1274,7 +1279,9 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // If we had a phi translation failure, we'll have a single entry which is a // clobber in the current block. Reject this early. 
- if (Deps.size() == 1 && Deps[0].getResult().isUnknown()) { + if (Deps.size() == 1 + && !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) + { DEBUG( dbgs() << "GVN: non-local load "; WriteAsOperand(dbgs(), LI); @@ -1294,7 +1301,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); - if (DepInfo.isUnknown()) { + if (!DepInfo.isDef() && !DepInfo.isClobber()) { UnavailableBlocks.push_back(DepBB); continue; } @@ -1359,7 +1366,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { continue; } - assert(DepInfo.isDef() && "Expecting def here"); + // DepInfo.isDef() here Instruction *DepInst = DepInfo.getInst(); @@ -1446,8 +1453,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) Blockers.insert(UnavailableBlocks[i]); - // Lets find first basic block with more than one predecessor. Walk backwards - // through predecessors if needed. + // Let's find the first basic block with more than one predecessor. Walk + // backwards through predecessors if needed. BasicBlock *LoadBB = LI->getParent(); BasicBlock *TmpBB = LoadBB; @@ -1519,10 +1526,19 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { << Pred->getName() << "': " << *LI << '\n'); return false; } + + if (LoadBB->isLandingPad()) { + DEBUG(dbgs() + << "COULD NOT PRE LOAD BECAUSE OF LANDING PAD CRITICAL EDGE '" + << Pred->getName() << "': " << *LI << '\n'); + return false; + } + unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB); NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum)); } } + if (!NeedToSplit.empty()) { toSplit.append(NeedToSplit.begin(), NeedToSplit.end()); return false; @@ -1660,7 +1676,7 @@ bool GVN::processLoad(LoadInst *L) { if (!MD) return false; - if (L->isVolatile()) + if (!L->isSimple()) return false; if (L->use_empty()) { @@ -1747,7 +1763,11 @@ bool GVN::processLoad(LoadInst *L) { return false; } - if (Dep.isUnknown()) { + // If it is defined in another block, try harder. + if (Dep.isNonLocal()) + return processNonLocalLoad(L); + + if (!Dep.isDef()) { DEBUG( // fast print dep, using operator<< on instruction is too slow. dbgs() << "GVN: load "; @@ -1757,12 +1777,6 @@ bool GVN::processLoad(LoadInst *L) { return false; } - // If it is defined in another block, try harder. - if (Dep.isNonLocal()) - return processNonLocalLoad(L); - - assert(Dep.isDef() && "Expecting def here"); - Instruction *DepInst = Dep.getInst(); if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { Value *StoredVal = DepSI->getValueOperand(); @@ -1874,6 +1888,133 @@ Value *GVN::findLeader(BasicBlock *BB, uint32_t num) { return Val; } +/// replaceAllDominatedUsesWith - Replace all uses of 'From' with 'To' if the +/// use is dominated by the given basic block. Returns the number of uses that +/// were replaced. +unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, + BasicBlock *Root) { + unsigned Count = 0; + for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); + UI != UE; ) { + Instruction *User = cast<Instruction>(*UI); + unsigned OpNum = UI.getOperandNo(); + ++UI; + + if (DT->dominates(Root, User->getParent())) { + User->setOperand(OpNum, To); + ++Count; + } + } + return Count; +} + +/// propagateEquality - The given values are known to be equal in every block +/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with +/// 'RHS' everywhere in the scope. Returns whether a change was made. 
+bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) { + if (LHS == RHS) return false; + assert(LHS->getType() == RHS->getType() && "Equal but types differ!"); + + // Don't try to propagate equalities between constants. + if (isa<Constant>(LHS) && isa<Constant>(RHS)) + return false; + + // Make sure that any constants are on the right-hand side. In general the + // best results are obtained by placing the longest lived value on the RHS. + if (isa<Constant>(LHS)) + std::swap(LHS, RHS); + + // If neither term is constant then bail out. This is not for correctness, + // it's just that the non-constant case is much less useful: it occurs just + // as often as the constant case but handling it hardly ever results in an + // improvement. + if (!isa<Constant>(RHS)) + return false; + + // If value numbering later deduces that an instruction in the scope is equal + // to 'LHS' then ensure it will be turned into 'RHS'. + addToLeaderTable(VN.lookup_or_add(LHS), RHS, Root); + + // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. + unsigned NumReplacements = replaceAllDominatedUsesWith(LHS, RHS, Root); + bool Changed = NumReplacements > 0; + NumGVNEqProp += NumReplacements; + + // Now try to deduce additional equalities from this one. For example, if the + // known equality was "(A != B)" == "false" then it follows that A and B are + // equal in the scope. Only boolean equalities with an explicit true or false + // RHS are currently supported. + if (!RHS->getType()->isIntegerTy(1)) + // Not a boolean equality - bail out. + return Changed; + ConstantInt *CI = dyn_cast<ConstantInt>(RHS); + if (!CI) + // RHS neither 'true' nor 'false' - bail out. + return Changed; + // Whether RHS equals 'true'. Otherwise it equals 'false'. + bool isKnownTrue = CI->isAllOnesValue(); + bool isKnownFalse = !isKnownTrue; + + // If "A && B" is known true then both A and B are known true. If "A || B" + // is known false then both A and B are known false. + Value *A, *B; + if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) || + (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) { + Changed |= propagateEquality(A, RHS, Root); + Changed |= propagateEquality(B, RHS, Root); + return Changed; + } + + // If we are propagating an equality like "(A == B)" == "true" then also + // propagate the equality A == B. + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) { + // Only equality comparisons are supported. + if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || + (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) { + Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); + Changed |= propagateEquality(Op0, Op1, Root); + } + return Changed; + } + + return Changed; +} + +/// isOnlyReachableViaThisEdge - There is an edge from 'Src' to 'Dst'. Return +/// true if every path from the entry block to 'Dst' passes via this edge. In +/// particular 'Dst' must not be reachable via another edge from 'Src'. +static bool isOnlyReachableViaThisEdge(BasicBlock *Src, BasicBlock *Dst, + DominatorTree *DT) { + // First off, there must not be more than one edge from Src to Dst, there + // should be exactly one. So keep track of the number of times Src occurs + // as a predecessor of Dst and fail if it's more than once. Secondly, any + // other predecessors of Dst should be dominated by Dst (see logic below). 
+ bool SawEdgeFromSrc = false; + for (pred_iterator PI = pred_begin(Dst), PE = pred_end(Dst); PI != PE; ++PI) { + BasicBlock *Pred = *PI; + if (Pred == Src) { + // An edge from Src to Dst. + if (SawEdgeFromSrc) + // There are multiple edges from Src to Dst - fail. + return false; + SawEdgeFromSrc = true; + continue; + } + // If the predecessor is not dominated by Dst, then it must be possible to + // reach it either without passing through Src (and thus not via the edge) + // or by passing through Src but taking a different edge out of Src. Either + // way it is possible to reach Dst without passing via the edge, so fail. + if (!DT->dominates(Dst, *PI)) + return false; + } + assert(SawEdgeFromSrc && "No edge between these basic blocks!"); + + // Every path from the entry block to Dst must at some point pass to Dst from + // a predecessor that is not dominated by Dst. This predecessor can only be + // Src, since all others are dominated by Dst. As there is only one edge from + // Src to Dst, the path passes by this edge. + return true; +} /// processInstruction - When calculating availability, handle an instruction /// by inserting it into the appropriate sets @@ -1891,6 +2032,7 @@ bool GVN::processInstruction(Instruction *I) { if (MD && V->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(I); + ++NumGVNSimpl; return true; } @@ -1903,30 +2045,45 @@ bool GVN::processInstruction(Instruction *I) { return false; } - // For conditions branches, we can perform simple conditional propagation on + // For conditional branches, we can perform simple conditional propagation on // the condition value itself. if (BranchInst *BI = dyn_cast<BranchInst>(I)) { if (!BI->isConditional() || isa<Constant>(BI->getCondition())) return false; - + Value *BranchCond = BI->getCondition(); - uint32_t CondVN = VN.lookup_or_add(BranchCond); - + BasicBlock *TrueSucc = BI->getSuccessor(0); BasicBlock *FalseSucc = BI->getSuccessor(1); - - if (TrueSucc->getSinglePredecessor()) - addToLeaderTable(CondVN, - ConstantInt::getTrue(TrueSucc->getContext()), - TrueSucc); - if (FalseSucc->getSinglePredecessor()) - addToLeaderTable(CondVN, - ConstantInt::getFalse(TrueSucc->getContext()), - FalseSucc); - - return false; + BasicBlock *Parent = BI->getParent(); + bool Changed = false; + + if (isOnlyReachableViaThisEdge(Parent, TrueSucc, DT)) + Changed |= propagateEquality(BranchCond, + ConstantInt::getTrue(TrueSucc->getContext()), + TrueSucc); + + if (isOnlyReachableViaThisEdge(Parent, FalseSucc, DT)) + Changed |= propagateEquality(BranchCond, + ConstantInt::getFalse(FalseSucc->getContext()), + FalseSucc); + + return Changed; } - + + // For switches, propagate the case values into the case destinations. + if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { + Value *SwitchCond = SI->getCondition(); + BasicBlock *Parent = SI->getParent(); + bool Changed = false; + for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) { + BasicBlock *Dst = SI->getSuccessor(i); + if (isOnlyReachableViaThisEdge(Parent, Dst, DT)) + Changed |= propagateEquality(SwitchCond, SI->getCaseValue(i), Dst); + } + return Changed; + } + // Instructions with void type don't return a value, so there's // no point in trying to find redudancies in them. if (I->getType()->isVoidTy()) return false; @@ -2071,6 +2228,9 @@ bool GVN::performPRE(Function &F) { // Nothing to PRE in the entry block. if (CurrentBlock == &F.getEntryBlock()) continue; + // Don't perform PRE on a landing pad. 
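The new propagateEquality/replaceAllDominatedUsesWith pair above exploits branch conditions: once the true edge of "if (x == 42)" is known to dominate a region, every use of x inside that region can be rewritten to the constant. A toy standalone sketch of the replacement step (the block numbering, the dominated set, and the Use type are invented for the example):

// Toy model of replaceAllDominatedUsesWith; illustrative only.
#include <cstdio>
#include <set>
#include <vector>

struct Use { int block; int value; };

int main() {
  // Blocks dominated by the true successor of the branch on (x == 42).
  std::set<int> dominatedByTrueEdge = {2, 3};

  // Uses of the SSA value 'x', tagged with the block each one lives in.
  std::vector<Use> usesOfX = {{1, -1}, {2, -1}, {3, -1}, {4, -1}};

  const int KnownConstant = 42;
  unsigned NumReplacements = 0;
  for (Use &U : usesOfX)
    if (dominatedByTrueEdge.count(U.block)) {
      U.value = KnownConstant;   // like User->setOperand(OpNum, To)
      ++NumReplacements;
    }

  std::printf("replaced %u uses of x with 42\n", NumReplacements);
}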
+ if (CurrentBlock->isLandingPad()) continue; + for (BasicBlock::iterator BI = CurrentBlock->begin(), BE = CurrentBlock->end(); BI != BE; ) { Instruction *CurInst = BI++; diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index dee3d38..75fa011 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -11,17 +11,6 @@ // computations derived from them) into simpler forms suitable for subsequent // analysis and transformation. // -// This transformation makes the following changes to each loop with an -// identifiable induction variable: -// 1. All loops are transformed to have a SINGLE canonical induction variable -// which starts at zero and steps by one. -// 2. The canonical induction variable is guaranteed to be the first PHI node -// in the loop header block. -// 3. The canonical induction variable is guaranteed to be in a wide enough -// type so that IV expressions need not be (directly) zero-extended or -// sign-extended. -// 4. Any pointer arithmetic recurrences are raised to use array subscripts. -// // If the trip count of a loop is computable, this pass also makes the following // changes: // 1. The exit condition for the loop is canonicalized to compare the @@ -33,9 +22,6 @@ // purpose of the loop is to compute the exit value of some derived // expression, this transformation will make the loop dead. // -// This transformation should be followed by strength reduction after all of the -// desired loop transformations have been performed. -// //===----------------------------------------------------------------------===// #define DEBUG_TYPE "indvars" @@ -57,11 +43,11 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SimplifyIndVar.h" #include "llvm/Target/TargetData.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" using namespace llvm; STATISTIC(NumRemoved , "Number of aux indvars removed"); @@ -69,15 +55,21 @@ STATISTIC(NumWidened , "Number of indvars widened"); STATISTIC(NumInserted , "Number of canonical indvars added"); STATISTIC(NumReplaced , "Number of exit values replaced"); STATISTIC(NumLFTR , "Number of loop exit tests replaced"); -STATISTIC(NumElimIdentity, "Number of IV identities eliminated"); STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated"); -STATISTIC(NumElimRem , "Number of IV remainder operations eliminated"); -STATISTIC(NumElimCmp , "Number of IV comparisons eliminated"); STATISTIC(NumElimIV , "Number of congruent IVs eliminated"); -static cl::opt<bool> DisableIVRewrite( - "disable-iv-rewrite", cl::Hidden, - cl::desc("Disable canonical induction variable rewriting")); +namespace llvm { + cl::opt<bool> EnableIVRewrite( + "enable-iv-rewrite", cl::Hidden, + cl::desc("Enable canonical induction variable rewriting")); + + // Trip count verification can be enabled by default under NDEBUG if we + // implement a strong expression equivalence checker in SCEV. Until then, we + // use the verify-indvars flag, which may assert in some cases. 
+ cl::opt<bool> VerifyIndvars( + "verify-indvars", cl::Hidden, + cl::desc("Verify the ScalarEvolution result after running indvars")); +} namespace { class IndVarSimplify : public LoopPass { @@ -105,12 +97,12 @@ namespace { AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - if (!DisableIVRewrite) + if (EnableIVRewrite) AU.addRequired<IVUsers>(); AU.addPreserved<ScalarEvolution>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); - if (!DisableIVRewrite) + if (EnableIVRewrite) AU.addPreserved<IVUsers>(); AU.setPreservesCFG(); } @@ -125,24 +117,14 @@ namespace { void HandleFloatingPointIV(Loop *L, PHINode *PH); void RewriteNonIntegerIVs(Loop *L); - void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); - - void SimplifyIVUsers(SCEVExpander &Rewriter); - void SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter); + void SimplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LPPassManager &LPM); - bool EliminateIVUser(Instruction *UseInst, Instruction *IVOperand); - void EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand); - void EliminateIVRemainder(BinaryOperator *Rem, - Value *IVOperand, - bool IsSigned); - - void SimplifyCongruentIVs(Loop *L); + void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); void RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter); - ICmpInst *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, - PHINode *IndVar, - SCEVExpander &Rewriter); + Value *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, + PHINode *IndVar, SCEVExpander &Rewriter); void SinkUnusedInvariants(Loop *L); }; @@ -211,6 +193,36 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { return true; } +/// Determine the insertion point for this user. By default, insert immediately +/// before the user. SCEVExpander or LICM will hoist loop invariants out of the +/// loop. For PHI nodes, there may be multiple uses, so compute the nearest +/// common dominator for the incoming blocks. +static Instruction *getInsertPointForUses(Instruction *User, Value *Def, + DominatorTree *DT) { + PHINode *PHI = dyn_cast<PHINode>(User); + if (!PHI) + return User; + + Instruction *InsertPt = 0; + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) { + if (PHI->getIncomingValue(i) != Def) + continue; + + BasicBlock *InsertBB = PHI->getIncomingBlock(i); + if (!InsertPt) { + InsertPt = InsertBB->getTerminator(); + continue; + } + InsertBB = DT->findNearestCommonDominator(InsertPt->getParent(), InsertBB); + InsertPt = InsertBB->getTerminator(); + } + assert(InsertPt && "Missing phi operand"); + assert((!isa<Instruction>(Def) || + DT->dominates(cast<Instruction>(Def), InsertPt)) && + "def does not dominate all uses"); + return InsertPt; +} + //===----------------------------------------------------------------------===// // RewriteNonIntegerIVs and helpers. Prefer integer IVs. //===----------------------------------------------------------------------===// @@ -337,14 +349,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // Positive and negative strides have different safety conditions. if (IncValue > 0) { // If we have a positive stride, we require the init to be less than the - // exit value and an equality or less than comparison. - if (InitValue >= ExitValue || - NewPred == CmpInst::ICMP_SGT || NewPred == CmpInst::ICMP_SGE) + // exit value. 
+ if (InitValue >= ExitValue) return; uint32_t Range = uint32_t(ExitValue-InitValue); - if (NewPred == CmpInst::ICMP_SLE) { - // Normalize SLE -> SLT, check for infinite loop. + // Check for infinite loop, either: + // while (i <= Exit) or until (i > Exit) + if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) { if (++Range == 0) return; // Range overflows. } @@ -364,14 +376,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { } else { // If we have a negative stride, we require the init to be greater than the - // exit value and an equality or greater than comparison. - if (InitValue >= ExitValue || - NewPred == CmpInst::ICMP_SLT || NewPred == CmpInst::ICMP_SLE) + // exit value. + if (InitValue <= ExitValue) return; uint32_t Range = uint32_t(InitValue-ExitValue); - if (NewPred == CmpInst::ICMP_SGE) { - // Normalize SGE -> SGT, check for infinite loop. + // Check for infinite loop, either: + // while (i >= Exit) or until (i < Exit) + if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) { if (++Range == 0) return; // Range overflows. } @@ -390,7 +402,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { return; } - const IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext()); + IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext()); // Insert new integer induction variable. PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN); @@ -429,7 +441,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // platforms. if (WeakPH) { Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", - PN->getParent()->getFirstNonPHI()); + PN->getParent()->getFirstInsertionPt()); PN->replaceAllUsesWith(Conv); RecursivelyDeleteTriviallyDeadInstructions(PN); } @@ -437,6 +449,8 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // Add a new IVUsers entry for the newly-created integer PHI. if (IU) IU->AddUsersIfInteresting(NewPHI); + + Changed = true; } void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { @@ -582,45 +596,15 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { //===----------------------------------------------------------------------===// // Rewrite IV users based on a canonical IV. -// To be replaced by -disable-iv-rewrite. +// Only for use with -enable-iv-rewrite. //===----------------------------------------------------------------------===// -/// SimplifyIVUsers - Iteratively perform simplification on IVUsers within this -/// loop. IVUsers is treated as a worklist. Each successive simplification may -/// push more users which may themselves be candidates for simplification. -/// -/// This is the old approach to IV simplification to be replaced by -/// SimplifyIVUsersNoRewrite. -/// -void IndVarSimplify::SimplifyIVUsers(SCEVExpander &Rewriter) { - // Each round of simplification involves a round of eliminating operations - // followed by a round of widening IVs. A single IVUsers worklist is used - // across all rounds. The inner loop advances the user. If widening exposes - // more uses, then another pass through the outer loop is triggered. 
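Editorial note: the HandleFloatingPointIV hunks above drop the old "equality or strict comparison" requirement and instead normalize <=/>= (and the inverted forms) by widening the range by one, with an explicit overflow check. A source-level sketch of the kind of loop this routine rewrites; names and values are illustrative, not taken from the patch.

// Before: the loop counter is a float phi, tested with fcmp and advanced
// with fadd, even though it only ever takes small integral values.
void fill_before(float *a) {
  for (float f = 0.0f; f <= 99.0f; f += 1.0f)
    a[static_cast<int>(f)] = 0.0f;
}

// After (conceptually): an i32 counter drives the loop; a single sitofp would
// recreate the float value if it were still needed (it is not here, so the
// float phi becomes dead). The '<=' form is the case the hunk above handles
// by bumping the range and checking for overflow.
void fill_after(float *a) {
  for (int i = 0; i <= 99; ++i)
    a[i] = 0.0f;
}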
- for (IVUsers::iterator I = IU->begin(); I != IU->end(); ++I) { - Instruction *UseInst = I->getUser(); - Value *IVOperand = I->getOperandValToReplace(); - - if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { - EliminateIVComparison(ICmp, IVOperand); - continue; - } - if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) { - bool IsSigned = Rem->getOpcode() == Instruction::SRem; - if (IsSigned || Rem->getOpcode() == Instruction::URem) { - EliminateIVRemainder(Rem, IVOperand, IsSigned); - continue; - } - } - } -} - -// FIXME: It is an extremely bad idea to indvar substitute anything more -// complex than affine induction variables. Doing so will put expensive -// polynomial evaluations inside of the loop, and the str reduction pass -// currently can only reduce affine polynomials. For now just disable -// indvar subst on anything more complex than an affine addrec, unless -// it can be expanded to a trivial value. +/// FIXME: It is an extremely bad idea to indvar substitute anything more +/// complex than affine induction variables. Doing so will put expensive +/// polynomial evaluations inside of the loop, and the str reduction pass +/// currently can only reduce affine polynomials. For now just disable +/// indvar subst on anything more complex than an affine addrec, unless +/// it can be expanded to a trivial value. static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) { // Loop-invariant values are safe. if (SE->isLoopInvariant(S, L)) return true; @@ -631,7 +615,8 @@ static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) { return AR->isAffine(); // An add is safe it all its operands are safe. - if (const SCEVCommutativeExpr *Commutative = dyn_cast<SCEVCommutativeExpr>(S)) { + if (const SCEVCommutativeExpr *Commutative + = dyn_cast<SCEVCommutativeExpr>(S)) { for (SCEVCommutativeExpr::op_iterator I = Commutative->op_begin(), E = Commutative->op_end(); I != E; ++I) if (!isSafe(*I, L, SE)) return false; @@ -665,7 +650,7 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { // of different sizes. for (IVUsers::iterator UI = IU->begin(), E = IU->end(); UI != E; ++UI) { Value *Op = UI->getOperandValToReplace(); - const Type *UseTy = Op->getType(); + Type *UseTy = Op->getType(); Instruction *User = UI->getUser(); // Compute the final addrec to expand into code. @@ -692,18 +677,7 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { // hoist loop invariants out of the loop. For PHI nodes, there may be // multiple uses, so compute the nearest common dominator for the // incoming blocks. - Instruction *InsertPt = User; - if (PHINode *PHI = dyn_cast<PHINode>(InsertPt)) - for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) - if (PHI->getIncomingValue(i) == Op) { - if (InsertPt == User) - InsertPt = PHI->getIncomingBlock(i)->getTerminator(); - else - InsertPt = - DT->findNearestCommonDominator(InsertPt->getParent(), - PHI->getIncomingBlock(i)) - ->getTerminator(); - } + Instruction *InsertPt = getInsertPointForUses(User, Op, DT); // Now expand it into actual Instructions and patch it into place. Value *NewVal = Rewriter.expandCodeFor(AR, UseTy, InsertPt); @@ -747,19 +721,38 @@ namespace { // extend operations. This information is recorded by CollectExtend and // provides the input to WidenIV. struct WideIVInfo { - const Type *WidestNativeType; // Widest integer type created [sz]ext - bool IsSigned; // Was an sext user seen before a zext? 
+ PHINode *NarrowIV; + Type *WidestNativeType; // Widest integer type created [sz]ext + bool IsSigned; // Was an sext user seen before a zext? - WideIVInfo() : WidestNativeType(0), IsSigned(false) {} + WideIVInfo() : NarrowIV(0), WidestNativeType(0), IsSigned(false) {} + }; + + class WideIVVisitor : public IVVisitor { + ScalarEvolution *SE; + const TargetData *TD; + + public: + WideIVInfo WI; + + WideIVVisitor(PHINode *NarrowIV, ScalarEvolution *SCEV, + const TargetData *TData) : + SE(SCEV), TD(TData) { WI.NarrowIV = NarrowIV; } + + // Implement the interface used by simplifyUsersOfIV. + virtual void visitCast(CastInst *Cast); }; } -/// CollectExtend - Update information about the induction variable that is +/// visitCast - Update information about the induction variable that is /// extended by this sign or zero extend operation. This is used to determine /// the final width of the IV before actually widening it. -static void CollectExtend(CastInst *Cast, bool IsSigned, WideIVInfo &WI, - ScalarEvolution *SE, const TargetData *TD) { - const Type *Ty = Cast->getType(); +void WideIVVisitor::visitCast(CastInst *Cast) { + bool IsSigned = Cast->getOpcode() == Instruction::SExt; + if (!IsSigned && Cast->getOpcode() != Instruction::ZExt) + return; + + Type *Ty = Cast->getType(); uint64_t Width = SE->getTypeSizeInBits(Ty); if (TD && !TD->isLegalInteger(Width)) return; @@ -779,6 +772,21 @@ static void CollectExtend(CastInst *Cast, bool IsSigned, WideIVInfo &WI, } namespace { + +/// NarrowIVDefUse - Record a link in the Narrow IV def-use chain along with the +/// WideIV that computes the same value as the Narrow IV def. This avoids +/// caching Use* pointers. +struct NarrowIVDefUse { + Instruction *NarrowDef; + Instruction *NarrowUse; + Instruction *WideDef; + + NarrowIVDefUse(): NarrowDef(0), NarrowUse(0), WideDef(0) {} + + NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD): + NarrowDef(ND), NarrowUse(NU), WideDef(WD) {} +}; + /// WidenIV - The goal of this transform is to remove sign and zero extends /// without creating any new induction variables. 
To do this, it creates a new /// phi of the wider type and redirects all users, either removing extends or @@ -787,7 +795,7 @@ namespace { class WidenIV { // Parameters PHINode *OrigPhi; - const Type *WideType; + Type *WideType; bool IsSigned; // Context @@ -803,13 +811,13 @@ class WidenIV { SmallVectorImpl<WeakVH> &DeadInsts; SmallPtrSet<Instruction*,16> Widened; - SmallVector<std::pair<Use *, Instruction *>, 8> NarrowIVUsers; + SmallVector<NarrowIVDefUse, 8> NarrowIVUsers; public: - WidenIV(PHINode *PN, const WideIVInfo &WI, LoopInfo *LInfo, + WidenIV(const WideIVInfo &WI, LoopInfo *LInfo, ScalarEvolution *SEv, DominatorTree *DTree, SmallVectorImpl<WeakVH> &DI) : - OrigPhi(PN), + OrigPhi(WI.NarrowIV), WideType(WI.WidestNativeType), IsSigned(WI.IsSigned), LI(LInfo), @@ -826,21 +834,42 @@ public: PHINode *CreateWideIV(SCEVExpander &Rewriter); protected: - Instruction *CloneIVUser(Instruction *NarrowUse, - Instruction *NarrowDef, - Instruction *WideDef); + Value *getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, + Instruction *Use); + + Instruction *CloneIVUser(NarrowIVDefUse DU); const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse); - Instruction *WidenIVUse(Use &NarrowDefUse, Instruction *NarrowDef, - Instruction *WideDef); + const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU); + + Instruction *WidenIVUse(NarrowIVDefUse DU); void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; } // anonymous namespace -static Value *getExtend( Value *NarrowOper, const Type *WideType, - bool IsSigned, IRBuilder<> &Builder) { +/// isLoopInvariant - Perform a quick domtree based check for loop invariance +/// assuming that V is used within the loop. LoopInfo::isLoopInvariant() seems +/// gratuitous for this purpose. +static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) { + Instruction *Inst = dyn_cast<Instruction>(V); + if (!Inst) + return true; + + return DT->properlyDominates(Inst->getParent(), L->getHeader()); +} + +Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, + Instruction *Use) { + // Set the debug location and conservative insertion point. + IRBuilder<> Builder(Use); + // Hoist the insertion point into loop preheaders as far as possible. + for (const Loop *L = LI->getLoopFor(Use->getParent()); + L && L->getLoopPreheader() && isLoopInvariant(NarrowOper, L, DT); + L = L->getParentLoop()) + Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator()); + return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) : Builder.CreateZExt(NarrowOper, WideType); } @@ -848,10 +877,8 @@ static Value *getExtend( Value *NarrowOper, const Type *WideType, /// CloneIVUser - Instantiate a wide operation to replace a narrow /// operation. This only needs to handle operations that can evaluation to /// SCEVAddRec. It can safely return 0 for any operation we decide not to clone. 
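As a rough illustration of what WidenIV is after (an assumed example, not from the patch): every use of the narrow IV below is sign-extended, so the pass builds one wide phi and the per-iteration sext disappears.

#include <cstdint>

// Narrow 32-bit counter whose uses are all sign-extended to 64 bits.
int64_t sum_before(const int64_t *a, int32_t n) {
  int64_t s = 0;
  for (int32_t i = 0; i < n; ++i)
    s += a[static_cast<int64_t>(i)];   // sext i32 -> i64 on every iteration
  return s;
}

// Shape of the code after widening: a single 64-bit IV and no sext in the
// loop body; the extension of the loop bound is hoisted toward the preheader,
// as the hoisting loop in WidenIV::getExtend above does.
int64_t sum_after(const int64_t *a, int32_t n) {
  int64_t s = 0;
  for (int64_t i = 0, e = n; i < e; ++i)
    s += a[i];
  return s;
}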
-Instruction *WidenIV::CloneIVUser(Instruction *NarrowUse, - Instruction *NarrowDef, - Instruction *WideDef) { - unsigned Opcode = NarrowUse->getOpcode(); +Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { + unsigned Opcode = DU.NarrowUse->getOpcode(); switch (Opcode) { default: return 0; @@ -865,24 +892,23 @@ Instruction *WidenIV::CloneIVUser(Instruction *NarrowUse, case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: - DEBUG(dbgs() << "Cloning IVUser: " << *NarrowUse << "\n"); - - IRBuilder<> Builder(NarrowUse); + DEBUG(dbgs() << "Cloning IVUser: " << *DU.NarrowUse << "\n"); // Replace NarrowDef operands with WideDef. Otherwise, we don't know // anything about the narrow operand yet so must insert a [sz]ext. It is // probably loop invariant and will be folded or hoisted. If it actually // comes from a widened IV, it should be removed during a future call to // WidenIVUse. - Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) ? WideDef : - getExtend(NarrowUse->getOperand(0), WideType, IsSigned, Builder); - Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) ? WideDef : - getExtend(NarrowUse->getOperand(1), WideType, IsSigned, Builder); + Value *LHS = (DU.NarrowUse->getOperand(0) == DU.NarrowDef) ? DU.WideDef : + getExtend(DU.NarrowUse->getOperand(0), WideType, IsSigned, DU.NarrowUse); + Value *RHS = (DU.NarrowUse->getOperand(1) == DU.NarrowDef) ? DU.WideDef : + getExtend(DU.NarrowUse->getOperand(1), WideType, IsSigned, DU.NarrowUse); - BinaryOperator *NarrowBO = cast<BinaryOperator>(NarrowUse); + BinaryOperator *NarrowBO = cast<BinaryOperator>(DU.NarrowUse); BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, NarrowBO->getName()); + IRBuilder<> Builder(DU.NarrowUse); Builder.Insert(WideBO); if (const OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(NarrowBO)) { @@ -894,45 +920,46 @@ Instruction *WidenIV::CloneIVUser(Instruction *NarrowUse, llvm_unreachable(0); } -/// HoistStep - Attempt to hoist an IV increment above a potential use. -/// -/// To successfully hoist, two criteria must be met: -/// - IncV operands dominate InsertPos and -/// - InsertPos dominates IncV -/// -/// Meeting the second condition means that we don't need to check all of IncV's -/// existing uses (it's moving up in the domtree). -/// -/// This does not yet recursively hoist the operands, although that would -/// not be difficult. -static bool HoistStep(Instruction *IncV, Instruction *InsertPos, - const DominatorTree *DT) -{ - if (DT->dominates(IncV, InsertPos)) - return true; +/// No-wrap operations can transfer sign extension of their result to their +/// operands. Generate the SCEV value for the widened operation without +/// actually modifying the IR yet. If the expression after extending the +/// operands is an AddRec for this loop, return it. +const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { + // Handle the common case of add<nsw/nuw> + if (DU.NarrowUse->getOpcode() != Instruction::Add) + return 0; - if (!DT->dominates(InsertPos->getParent(), IncV->getParent())) - return false; + // One operand (NarrowDef) has already been extended to WideDef. Now determine + // if extending the other will lead to a recurrence. + unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 
1 : 0; + assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU"); + + const SCEV *ExtendOperExpr = 0; + const OverflowingBinaryOperator *OBO = + cast<OverflowingBinaryOperator>(DU.NarrowUse); + if (IsSigned && OBO->hasNoSignedWrap()) + ExtendOperExpr = SE->getSignExtendExpr( + SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType); + else if(!IsSigned && OBO->hasNoUnsignedWrap()) + ExtendOperExpr = SE->getZeroExtendExpr( + SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType); + else + return 0; - if (IncV->mayHaveSideEffects()) - return false; + const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>( + SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr, + IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW)); - // Attempt to hoist IncV - for (User::op_iterator OI = IncV->op_begin(), OE = IncV->op_end(); - OI != OE; ++OI) { - Instruction *OInst = dyn_cast<Instruction>(OI); - if (OInst && !DT->dominates(OInst, InsertPos)) - return false; - } - IncV->moveBefore(InsertPos); - return true; + if (!AddRec || AddRec->getLoop() != L) + return 0; + return AddRec; } -// GetWideRecurrence - Is this instruction potentially interesting from IVUsers' -// perspective after widening it's type? In other words, can the extend be -// safely hoisted out of the loop with SCEV reducing the value to a recurrence -// on the same loop. If so, return the sign or zero extended -// recurrence. Otherwise return NULL. +/// GetWideRecurrence - Is this instruction potentially interesting from +/// IVUsers' perspective after widening it's type? In other words, can the +/// extend be safely hoisted out of the loop with SCEV reducing the value to a +/// recurrence on the same loop. If so, return the sign or zero extended +/// recurrence. Otherwise return NULL. const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { if (!SE->isSCEVable(NarrowUse->getType())) return 0; @@ -951,47 +978,45 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr); if (!AddRec || AddRec->getLoop() != L) return 0; - return AddRec; } /// WidenIVUse - Determine whether an individual user of the narrow IV can be /// widened. If so, return the wide clone of the user. -Instruction *WidenIV::WidenIVUse(Use &NarrowDefUse, Instruction *NarrowDef, - Instruction *WideDef) { - Instruction *NarrowUse = cast<Instruction>(NarrowDefUse.getUser()); +Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU) { // Stop traversing the def-use chain at inner-loop phis or post-loop phis. - if (isa<PHINode>(NarrowUse) && LI->getLoopFor(NarrowUse->getParent()) != L) + if (isa<PHINode>(DU.NarrowUse) && + LI->getLoopFor(DU.NarrowUse->getParent()) != L) return 0; // Our raison d'etre! Eliminate sign and zero extension. - if (IsSigned ? isa<SExtInst>(NarrowUse) : isa<ZExtInst>(NarrowUse)) { - Value *NewDef = WideDef; - if (NarrowUse->getType() != WideType) { - unsigned CastWidth = SE->getTypeSizeInBits(NarrowUse->getType()); + if (IsSigned ? isa<SExtInst>(DU.NarrowUse) : isa<ZExtInst>(DU.NarrowUse)) { + Value *NewDef = DU.WideDef; + if (DU.NarrowUse->getType() != WideType) { + unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType()); unsigned IVWidth = SE->getTypeSizeInBits(WideType); if (CastWidth < IVWidth) { // The cast isn't as wide as the IV, so insert a Trunc. 
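The no-wrap reasoning in GetExtendedOperandRecurrence above boils down to sign extension distributing over an add that cannot wrap (and the nuw/zext analogue). A minimal standalone check of that identity, with made-up function names:

#include <cassert>
#include <cstdint>

// Sign extension distributes over a non-wrapping add, which is what lets the
// widened IV absorb the extension of the other operand.
int64_t ext_then_add(int32_t a, int32_t b) {
  return static_cast<int64_t>(a) + static_cast<int64_t>(b);
}

int64_t add_then_ext(int32_t a, int32_t b) {
  // Only equivalent to ext_then_add when a + b does not overflow (nsw).
  return static_cast<int64_t>(a + b);
}

int main() {
  assert(ext_then_add(100000, 250000) == add_then_ext(100000, 250000));
  // When the narrow add can wrap, the two differ, which is why the code above
  // requires hasNoSignedWrap() / hasNoUnsignedWrap() before widening.
  return 0;
}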
- IRBuilder<> Builder(NarrowDefUse); - NewDef = Builder.CreateTrunc(WideDef, NarrowUse->getType()); + IRBuilder<> Builder(DU.NarrowUse); + NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType()); } else { // A wider extend was hidden behind a narrower one. This may induce // another round of IV widening in which the intermediate IV becomes // dead. It should be very rare. DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi - << " not wide enough to subsume " << *NarrowUse << "\n"); - NarrowUse->replaceUsesOfWith(NarrowDef, WideDef); - NewDef = NarrowUse; + << " not wide enough to subsume " << *DU.NarrowUse << "\n"); + DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef); + NewDef = DU.NarrowUse; } } - if (NewDef != NarrowUse) { - DEBUG(dbgs() << "INDVARS: eliminating " << *NarrowUse - << " replaced by " << *WideDef << "\n"); + if (NewDef != DU.NarrowUse) { + DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse + << " replaced by " << *DU.WideDef << "\n"); ++NumElimExt; - NarrowUse->replaceAllUsesWith(NewDef); - DeadInsts.push_back(NarrowUse); + DU.NarrowUse->replaceAllUsesWith(NewDef); + DeadInsts.push_back(DU.NarrowUse); } // Now that the extend is gone, we want to expose it's uses for potential // further simplification. We don't need to directly inform SimplifyIVUsers @@ -1004,29 +1029,32 @@ Instruction *WidenIV::WidenIVUse(Use &NarrowDefUse, Instruction *NarrowDef, } // Does this user itself evaluate to a recurrence after widening? - const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(NarrowUse); + const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse); + if (!WideAddRec) { + WideAddRec = GetExtendedOperandRecurrence(DU); + } if (!WideAddRec) { // This user does not evaluate to a recurence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. - IRBuilder<> Builder(NarrowDefUse); - Value *Trunc = Builder.CreateTrunc(WideDef, NarrowDef->getType()); - NarrowUse->replaceUsesOfWith(NarrowDef, Trunc); + IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); + DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); return 0; } - // We assume that block terminators are not SCEVable. We wouldn't want to + // Assume block terminators cannot evaluate to a recurrence. We can't to // insert a Trunc after a terminator if there happens to be a critical edge. - assert(NarrowUse != NarrowUse->getParent()->getTerminator() && + assert(DU.NarrowUse != DU.NarrowUse->getParent()->getTerminator() && "SCEV is not expected to evaluate a block terminator"); // Reuse the IV increment that SCEVExpander created as long as it dominates // NarrowUse. Instruction *WideUse = 0; - if (WideAddRec == WideIncExpr && HoistStep(WideInc, NarrowUse, DT)) { + if (WideAddRec == WideIncExpr + && SCEVExpander::hoistStep(WideInc, DU.NarrowUse, DT)) WideUse = WideInc; - } else { - WideUse = CloneIVUser(NarrowUse, NarrowDef, WideDef); + WideUse = CloneIVUser(DU); if (!WideUse) return 0; } @@ -1051,13 +1079,13 @@ Instruction *WidenIV::WidenIVUse(Use &NarrowDefUse, Instruction *NarrowDef, void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { for (Value::use_iterator UI = NarrowDef->use_begin(), UE = NarrowDef->use_end(); UI != UE; ++UI) { - Use &U = UI.getUse(); + Instruction *NarrowUse = cast<Instruction>(*UI); // Handle data flow merges and bizarre phi cycles. 
- if (!Widened.insert(cast<Instruction>(U.getUser()))) + if (!Widened.insert(NarrowUse)) continue; - NarrowIVUsers.push_back(std::make_pair(&UI.getUse(), WideDef)); + NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUse, WideDef)); } } @@ -1124,23 +1152,19 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { pushNarrowIVUsers(OrigPhi, WidePhi); while (!NarrowIVUsers.empty()) { - Use *UsePtr; - Instruction *WideDef; - tie(UsePtr, WideDef) = NarrowIVUsers.pop_back_val(); - Use &NarrowDefUse = *UsePtr; + NarrowIVDefUse DU = NarrowIVUsers.pop_back_val(); // Process a def-use edge. This may replace the use, so don't hold a // use_iterator across it. - Instruction *NarrowDef = cast<Instruction>(NarrowDefUse.get()); - Instruction *WideUse = WidenIVUse(NarrowDefUse, NarrowDef, WideDef); + Instruction *WideUse = WidenIVUse(DU); // Follow all def-use edges from the previous narrow use. if (WideUse) - pushNarrowIVUsers(cast<Instruction>(NarrowDefUse.getUser()), WideUse); + pushNarrowIVUsers(DU.NarrowUse, WideUse); // WidenIVUse may have removed the def-use edge. - if (NarrowDef->use_empty()) - DeadInsts.push_back(NarrowDef); + if (DU.NarrowDef->use_empty()) + DeadInsts.push_back(DU.NarrowDef); } return WidePhi; } @@ -1149,187 +1173,17 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Simplification of IV users based on SCEV evaluation. //===----------------------------------------------------------------------===// -void IndVarSimplify::EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) { - unsigned IVOperIdx = 0; - ICmpInst::Predicate Pred = ICmp->getPredicate(); - if (IVOperand != ICmp->getOperand(0)) { - // Swapped - assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand"); - IVOperIdx = 1; - Pred = ICmpInst::getSwappedPredicate(Pred); - } - - // Get the SCEVs for the ICmp operands. - const SCEV *S = SE->getSCEV(ICmp->getOperand(IVOperIdx)); - const SCEV *X = SE->getSCEV(ICmp->getOperand(1 - IVOperIdx)); - - // Simplify unnecessary loops away. - const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent()); - S = SE->getSCEVAtScope(S, ICmpLoop); - X = SE->getSCEVAtScope(X, ICmpLoop); - - // If the condition is always true or always false, replace it with - // a constant value. - if (SE->isKnownPredicate(Pred, S, X)) - ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); - else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) - ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); - else - return; - - DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); - ++NumElimCmp; - Changed = true; - DeadInsts.push_back(ICmp); -} - -void IndVarSimplify::EliminateIVRemainder(BinaryOperator *Rem, - Value *IVOperand, - bool IsSigned) { - // We're only interested in the case where we know something about - // the numerator. - if (IVOperand != Rem->getOperand(0)) - return; - - // Get the SCEVs for the ICmp operands. - const SCEV *S = SE->getSCEV(Rem->getOperand(0)); - const SCEV *X = SE->getSCEV(Rem->getOperand(1)); - - // Simplify unnecessary loops away. - const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent()); - S = SE->getSCEVAtScope(S, ICmpLoop); - X = SE->getSCEVAtScope(X, ICmpLoop); - - // i % n --> i if i is in [0,n). - if ((!IsSigned || SE->isKnownNonNegative(S)) && - SE->isKnownPredicate(IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, - S, X)) - Rem->replaceAllUsesWith(Rem->getOperand(0)); - else { - // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n). 
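The remainder identities stated in the comment just above are easy to sanity-check; a small standalone check follows (not part of the patch). Note this elimination code is being removed from IndVarSimplify here; the simplifyUsersOfIV path introduced earlier in the diff appears to take over the job.

#include <cassert>

// Worked check of the identities the removed code relied on: within [0, n),
//   i % n == i
//   (i + 1) % n == ((i + 1) == n ? 0 : i + 1)
// SCEV proves the range facts from the IV; here the loop bounds enforce them.
int main() {
  const int n = 8;
  for (int i = 0; i < n; ++i) {
    assert(i % n == i);
    assert((i + 1) % n == ((i + 1) == n ? 0 : i + 1));
  }
  return 0;
}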
- const SCEV *LessOne = - SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1)); - if (IsSigned && !SE->isKnownNonNegative(LessOne)) - return; - - if (!SE->isKnownPredicate(IsSigned ? - ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, - LessOne, X)) - return; - - ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, - Rem->getOperand(0), Rem->getOperand(1), - "tmp"); - SelectInst *Sel = - SelectInst::Create(ICmp, - ConstantInt::get(Rem->getType(), 0), - Rem->getOperand(0), "tmp", Rem); - Rem->replaceAllUsesWith(Sel); - } - - // Inform IVUsers about the new users. - if (IU) { - if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0))) - IU->AddUsersIfInteresting(I); - } - DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n'); - ++NumElimRem; - Changed = true; - DeadInsts.push_back(Rem); -} - -/// EliminateIVUser - Eliminate an operation that consumes a simple IV and has -/// no observable side-effect given the range of IV values. -bool IndVarSimplify::EliminateIVUser(Instruction *UseInst, - Instruction *IVOperand) { - if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { - EliminateIVComparison(ICmp, IVOperand); - return true; - } - if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) { - bool IsSigned = Rem->getOpcode() == Instruction::SRem; - if (IsSigned || Rem->getOpcode() == Instruction::URem) { - EliminateIVRemainder(Rem, IVOperand, IsSigned); - return true; - } - } - - // Eliminate any operation that SCEV can prove is an identity function. - if (!SE->isSCEVable(UseInst->getType()) || - (UseInst->getType() != IVOperand->getType()) || - (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand))) - return false; - - DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n'); - - UseInst->replaceAllUsesWith(IVOperand); - ++NumElimIdentity; - Changed = true; - DeadInsts.push_back(UseInst); - return true; -} - -/// pushIVUsers - Add all uses of Def to the current IV's worklist. -/// -static void pushIVUsers( - Instruction *Def, - SmallPtrSet<Instruction*,16> &Simplified, - SmallVectorImpl< std::pair<Instruction*,Instruction*> > &SimpleIVUsers) { - - for (Value::use_iterator UI = Def->use_begin(), E = Def->use_end(); - UI != E; ++UI) { - Instruction *User = cast<Instruction>(*UI); - - // Avoid infinite or exponential worklist processing. - // Also ensure unique worklist users. - // If Def is a LoopPhi, it may not be in the Simplified set, so check for - // self edges first. - if (User != Def && Simplified.insert(User)) - SimpleIVUsers.push_back(std::make_pair(User, Def)); - } -} - -/// isSimpleIVUser - Return true if this instruction generates a simple SCEV -/// expression in terms of that IV. -/// -/// This is similar to IVUsers' isInsteresting() but processes each instruction -/// non-recursively when the operand is already known to be a simpleIVUser. -/// -static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) { - if (!SE->isSCEVable(I->getType())) - return false; - - // Get the symbolic expression for this instruction. - const SCEV *S = SE->getSCEV(I); - - // We assume that terminators are not SCEVable. - assert((!S || I != I->getParent()->getTerminator()) && - "can't fold terminators"); - - // Only consider affine recurrences. - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S); - if (AR && AR->getLoop() == L) - return true; - - return false; -} -/// SimplifyIVUsersNoRewrite - Iteratively perform simplification on a worklist -/// of IV users. 
Each successive simplification may push more users which may +/// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV +/// users. Each successive simplification may push more users which may /// themselves be candidates for simplification. /// -/// The "NoRewrite" algorithm does not require IVUsers analysis. Instead, it -/// simplifies instructions in-place during analysis. Rather than rewriting -/// induction variables bottom-up from their users, it transforms a chain of -/// IVUsers top-down, updating the IR only when it encouters a clear -/// optimization opportunitiy. A SCEVExpander "Rewriter" instance is still -/// needed, but only used to generate a new IV (phi) of wider type for sign/zero -/// extend elimination. +/// Sign/Zero extend elimination is interleaved with IV simplification. /// -/// Once DisableIVRewrite is default, LSR will be the only client of IVUsers. -/// -void IndVarSimplify::SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter) { - std::map<PHINode *, WideIVInfo> WideIVMap; +void IndVarSimplify::SimplifyAndExtend(Loop *L, + SCEVExpander &Rewriter, + LPPassManager &LPM) { + SmallVector<WideIVInfo, 8> WideIVs; SmallVector<PHINode*, 8> LoopPhis; for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { @@ -1345,108 +1199,81 @@ void IndVarSimplify::SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter) { // extension. The first time SCEV attempts to normalize sign/zero extension, // the result becomes final. So for the most predictable results, we delay // evaluation of sign/zero extend evaluation until needed, and avoid running - // other SCEV based analysis prior to SimplifyIVUsersNoRewrite. + // other SCEV based analysis prior to SimplifyAndExtend. do { PHINode *CurrIV = LoopPhis.pop_back_val(); // Information about sign/zero extensions of CurrIV. - WideIVInfo WI; - - // Instructions processed by SimplifyIVUsers for CurrIV. - SmallPtrSet<Instruction*,16> Simplified; - - // Use-def pairs if IV users waiting to be processed for CurrIV. - SmallVector<std::pair<Instruction*, Instruction*>, 8> SimpleIVUsers; - - // Push users of the current LoopPhi. In rare cases, pushIVUsers may be - // called multiple times for the same LoopPhi. This is the proper thing to - // do for loop header phis that use each other. - pushIVUsers(CurrIV, Simplified, SimpleIVUsers); + WideIVVisitor WIV(CurrIV, SE, TD); - while (!SimpleIVUsers.empty()) { - Instruction *UseInst, *Operand; - tie(UseInst, Operand) = SimpleIVUsers.pop_back_val(); - // Bypass back edges to avoid extra work. 
- if (UseInst == CurrIV) continue; + Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &WIV); - if (EliminateIVUser(UseInst, Operand)) { - pushIVUsers(Operand, Simplified, SimpleIVUsers); - continue; - } - if (CastInst *Cast = dyn_cast<CastInst>(UseInst)) { - bool IsSigned = Cast->getOpcode() == Instruction::SExt; - if (IsSigned || Cast->getOpcode() == Instruction::ZExt) { - CollectExtend(Cast, IsSigned, WI, SE, TD); - } - continue; - } - if (isSimpleIVUser(UseInst, L, SE)) { - pushIVUsers(UseInst, Simplified, SimpleIVUsers); - } - } - if (WI.WidestNativeType) { - WideIVMap[CurrIV] = WI; + if (WIV.WI.WidestNativeType) { + WideIVs.push_back(WIV.WI); } } while(!LoopPhis.empty()); - for (std::map<PHINode *, WideIVInfo>::const_iterator I = WideIVMap.begin(), - E = WideIVMap.end(); I != E; ++I) { - WidenIV Widener(I->first, I->second, LI, SE, DT, DeadInsts); + for (; !WideIVs.empty(); WideIVs.pop_back()) { + WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts); if (PHINode *WidePhi = Widener.CreateWideIV(Rewriter)) { Changed = true; LoopPhis.push_back(WidePhi); } } - WideIVMap.clear(); } } -/// SimplifyCongruentIVs - Check for congruent phis in this loop header and -/// populate ExprToIVMap for use later. -/// -void IndVarSimplify::SimplifyCongruentIVs(Loop *L) { - DenseMap<const SCEV *, PHINode *> ExprToIVMap; - for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { - PHINode *Phi = cast<PHINode>(I); - if (!SE->isSCEVable(Phi->getType())) - continue; +//===----------------------------------------------------------------------===// +// LinearFunctionTestReplace and its kin. Rewrite the loop exit condition. +//===----------------------------------------------------------------------===// - const SCEV *S = SE->getSCEV(Phi); - DenseMap<const SCEV *, PHINode *>::const_iterator Pos; - bool Inserted; - tie(Pos, Inserted) = ExprToIVMap.insert(std::make_pair(S, Phi)); - if (Inserted) - continue; - PHINode *OrigPhi = Pos->second; - // Replacing the congruent phi is sufficient because acyclic redundancy - // elimination, CSE/GVN, should handle the rest. However, once SCEV proves - // that a phi is congruent, it's almost certain to be the head of an IV - // user cycle that is isomorphic with the original phi. So it's worth - // eagerly cleaning up the common case of a single IV increment. - if (BasicBlock *LatchBlock = L->getLoopLatch()) { - Instruction *OrigInc = - cast<Instruction>(OrigPhi->getIncomingValueForBlock(LatchBlock)); - Instruction *IsomorphicInc = - cast<Instruction>(Phi->getIncomingValueForBlock(LatchBlock)); - if (OrigInc != IsomorphicInc && - SE->getSCEV(OrigInc) == SE->getSCEV(IsomorphicInc) && - HoistStep(OrigInc, IsomorphicInc, DT)) { - DEBUG(dbgs() << "INDVARS: Eliminated congruent iv.inc: " - << *IsomorphicInc << '\n'); - IsomorphicInc->replaceAllUsesWith(OrigInc); - DeadInsts.push_back(IsomorphicInc); - } +/// Check for expressions that ScalarEvolution generates to compute +/// BackedgeTakenInfo. If these expressions have not been reduced, then +/// expanding them may incur additional cost (albeit in the loop preheader). +static bool isHighCostExpansion(const SCEV *S, BranchInst *BI, + ScalarEvolution *SE) { + // If the backedge-taken count is a UDiv, it's very likely a UDiv that + // ScalarEvolution's HowFarToZero or HowManyLessThans produced to compute a + // precise expression, rather than a UDiv from the user's code. If we can't + // find a UDiv in the code with some simple searching, assume the former and + // forego rewriting the loop. 
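A pair of assumed examples to make the UDiv heuristic concrete (not from the patch); the exact SCEV forms are simplified here.

// The divide already appears in the user's code, so the simple search the
// comment above describes has a chance of matching it; expansion is cheap.
unsigned count_explicit_div(unsigned n) {
  unsigned c = 0;
  for (unsigned i = 0; i < n / 4; ++i)
    ++c;
  return c;
}

// Here the backedge-taken count involves a divide (roughly (n + 2) / 3) that
// exists only inside ScalarEvolution; isHighCostExpansion treats it as
// expensive and the exit-test rewrite is skipped.
unsigned count_strided(unsigned n) {
  unsigned c = 0;
  for (unsigned i = 0; i < n; i += 3)
    ++c;
  return c;
}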
+ if (isa<SCEVUDivExpr>(S)) { + ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition()); + if (!OrigCond) return true; + const SCEV *R = SE->getSCEV(OrigCond->getOperand(1)); + R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1)); + if (R != S) { + const SCEV *L = SE->getSCEV(OrigCond->getOperand(0)); + L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1)); + if (L != S) + return true; } - DEBUG(dbgs() << "INDVARS: Eliminated congruent iv: " << *Phi << '\n'); - ++NumElimIV; - Phi->replaceAllUsesWith(OrigPhi); - DeadInsts.push_back(Phi); } -} -//===----------------------------------------------------------------------===// -// LinearFunctionTestReplace and its kin. Rewrite the loop exit condition. -//===----------------------------------------------------------------------===// + if (EnableIVRewrite) + return false; + + // Recurse past add expressions, which commonly occur in the + // BackedgeTakenCount. They may already exist in program code, and if not, + // they are not too expensive rematerialize. + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); + I != E; ++I) { + if (isHighCostExpansion(*I, BI, SE)) + return true; + } + return false; + } + + // HowManyLessThans uses a Max expression whenever the loop is not guarded by + // the exit condition. + if (isa<SCEVSMaxExpr>(S) || isa<SCEVUMaxExpr>(S)) + return true; + + // If we haven't recognized an expensive SCEV patter, assume its an expression + // produced by program code. + return false; +} /// canExpandBackedgeTakenCount - Return true if this loop's backedge taken /// count expression can be safely and cheaply expanded into an instruction @@ -1465,31 +1292,17 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) { if (!BI) return false; - // Special case: If the backedge-taken count is a UDiv, it's very likely a - // UDiv that ScalarEvolution produced in order to compute a precise - // expression, rather than a UDiv from the user's code. If we can't find a - // UDiv in the code with some simple searching, assume the former and forego - // rewriting the loop. - if (isa<SCEVUDivExpr>(BackedgeTakenCount)) { - ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition()); - if (!OrigCond) return false; - const SCEV *R = SE->getSCEV(OrigCond->getOperand(1)); - R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1)); - if (R != BackedgeTakenCount) { - const SCEV *L = SE->getSCEV(OrigCond->getOperand(0)); - L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1)); - if (L != BackedgeTakenCount) - return false; - } - } + if (isHighCostExpansion(BackedgeTakenCount, BI, SE)) + return false; + return true; } /// getBackedgeIVType - Get the widest type used by the loop test after peeking /// through Truncs. /// -/// TODO: Unnecessary if LFTR does not force a canonical IV. -static const Type *getBackedgeIVType(Loop *L) { +/// TODO: Unnecessary when ForceLFTR is removed. +static Type *getBackedgeIVType(Loop *L) { if (!L->getExitingBlock()) return 0; @@ -1502,7 +1315,7 @@ static const Type *getBackedgeIVType(Loop *L) { if (!Cond) return 0; - const Type *Ty = 0; + Type *Ty = 0; for(User::op_iterator OI = Cond->op_begin(), OE = Cond->op_end(); OI != OE; ++OI) { assert((!Ty || Ty == (*OI)->getType()) && "bad icmp operand types"); @@ -1515,12 +1328,187 @@ static const Type *getBackedgeIVType(Loop *L) { return Ty; } +/// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop +/// invariant value to the phi. 
+static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { + Instruction *IncI = dyn_cast<Instruction>(IncV); + if (!IncI) + return 0; + + switch (IncI->getOpcode()) { + case Instruction::Add: + case Instruction::Sub: + break; + case Instruction::GetElementPtr: + // An IV counter must preserve its type. + if (IncI->getNumOperands() == 2) + break; + default: + return 0; + } + + PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0)); + if (Phi && Phi->getParent() == L->getHeader()) { + if (isLoopInvariant(IncI->getOperand(1), L, DT)) + return Phi; + return 0; + } + if (IncI->getOpcode() == Instruction::GetElementPtr) + return 0; + + // Allow add/sub to be commuted. + Phi = dyn_cast<PHINode>(IncI->getOperand(1)); + if (Phi && Phi->getParent() == L->getHeader()) { + if (isLoopInvariant(IncI->getOperand(0), L, DT)) + return Phi; + } + return 0; +} + +/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show +/// that the current exit test is already sufficiently canonical. +static bool needsLFTR(Loop *L, DominatorTree *DT) { + assert(L->getExitingBlock() && "expected loop exit"); + + BasicBlock *LatchBlock = L->getLoopLatch(); + // Don't bother with LFTR if the loop is not properly simplified. + if (!LatchBlock) + return false; + + BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); + assert(BI && "expected exit branch"); + + // Do LFTR to simplify the exit condition to an ICMP. + ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); + if (!Cond) + return true; + + // Do LFTR to simplify the exit ICMP to EQ/NE + ICmpInst::Predicate Pred = Cond->getPredicate(); + if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ) + return true; + + // Look for a loop invariant RHS + Value *LHS = Cond->getOperand(0); + Value *RHS = Cond->getOperand(1); + if (!isLoopInvariant(RHS, L, DT)) { + if (!isLoopInvariant(LHS, L, DT)) + return true; + std::swap(LHS, RHS); + } + // Look for a simple IV counter LHS + PHINode *Phi = dyn_cast<PHINode>(LHS); + if (!Phi) + Phi = getLoopPhiForCounter(LHS, L, DT); + + if (!Phi) + return true; + + // Do LFTR if the exit condition's IV is *not* a simple counter. + Value *IncV = Phi->getIncomingValueForBlock(L->getLoopLatch()); + return Phi != getLoopPhiForCounter(IncV, L, DT); +} + +/// AlmostDeadIV - Return true if this IV has any uses other than the (soon to +/// be rewritten) loop exit test. +static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { + int LatchIdx = Phi->getBasicBlockIndex(LatchBlock); + Value *IncV = Phi->getIncomingValue(LatchIdx); + + for (Value::use_iterator UI = Phi->use_begin(), UE = Phi->use_end(); + UI != UE; ++UI) { + if (*UI != Cond && *UI != IncV) return false; + } + + for (Value::use_iterator UI = IncV->use_begin(), UE = IncV->use_end(); + UI != UE; ++UI) { + if (*UI != Cond && *UI != Phi) return false; + } + return true; +} + +/// FindLoopCounter - Find an affine IV in canonical form. +/// +/// FIXME: Accept -1 stride and set IVLimit = IVInit - BECount +/// +/// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride. +/// This is difficult in general for SCEV because of potential overflow. But we +/// could at least handle constant BECounts. +static PHINode * +FindLoopCounter(Loop *L, const SCEV *BECount, + ScalarEvolution *SE, DominatorTree *DT, const TargetData *TD) { + // I'm not sure how BECount could be a pointer type, but we definitely don't + // want to LFTR that. 
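In source terms, needsLFTR asks whether the exit test already has the shape LFTR would produce anyway. An assumed illustration (function names are made up); the rewrite itself is sketched further below after LinearFunctionTestReplace.

// Already canonical: an eq/ne exit test of a unit-stride counter against a
// loop-invariant limit, so needsLFTR would leave it alone.
void init_canonical(int *a, unsigned n) {
  for (unsigned i = 0; i != n; ++i)
    a[i] = 0;
}

// Not canonical: the exit test is a relational compare, so LFTR rewrites it
// into an equality test against the computed trip count.
void init_needs_lftr(int *a, unsigned n) {
  for (unsigned i = 0; i < n; ++i)
    a[i] = 0;
}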
+ if (BECount->getType()->isPointerTy()) + return 0; + + uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType()); + + Value *Cond = + cast<BranchInst>(L->getExitingBlock()->getTerminator())->getCondition(); + + // Loop over all of the PHI nodes, looking for a simple counter. + PHINode *BestPhi = 0; + const SCEV *BestInit = 0; + BasicBlock *LatchBlock = L->getLoopLatch(); + assert(LatchBlock && "needsLFTR should guarantee a loop latch"); + + for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { + PHINode *Phi = cast<PHINode>(I); + if (!SE->isSCEVable(Phi->getType())) + continue; + + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); + if (!AR || AR->getLoop() != L || !AR->isAffine()) + continue; + + // AR may be a pointer type, while BECount is an integer type. + // AR may be wider than BECount. With eq/ne tests overflow is immaterial. + // AR may not be a narrower type, or we may never exit. + uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType()); + if (PhiWidth < BCWidth || (TD && !TD->isLegalInteger(PhiWidth))) + continue; + + const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)); + if (!Step || !Step->isOne()) + continue; + + int LatchIdx = Phi->getBasicBlockIndex(LatchBlock); + Value *IncV = Phi->getIncomingValue(LatchIdx); + if (getLoopPhiForCounter(IncV, L, DT) != Phi) + continue; + + const SCEV *Init = AR->getStart(); + + if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) { + // Don't force a live loop counter if another IV can be used. + if (AlmostDeadIV(Phi, LatchBlock, Cond)) + continue; + + // Prefer to count-from-zero. This is a more "canonical" counter form. It + // also prefers integer to pointer IVs. + if (BestInit->isZero() != Init->isZero()) { + if (BestInit->isZero()) + continue; + } + // If two IVs both count from zero or both count from nonzero then the + // narrower is likely a dead phi that has been widened. Use the wider phi + // to allow the other to be eliminated. + if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType())) + continue; + } + BestPhi = Phi; + BestInit = Init; + } + return BestPhi; +} + /// LinearFunctionTestReplace - This method rewrites the exit condition of the /// loop to be a canonical != comparison against the incremented loop induction /// variable. This pass is able to rewrite the exit tests of any loop where the /// SCEV analysis can determine a loop-invariant trip count of the loop, which /// is actually a much broader range than just linear tests. -ICmpInst *IndVarSimplify:: +Value *IndVarSimplify:: LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, PHINode *IndVar, @@ -1528,62 +1516,117 @@ LinearFunctionTestReplace(Loop *L, assert(canExpandBackedgeTakenCount(L, SE) && "precondition"); BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator()); + // LFTR can ignore IV overflow and truncate to the width of + // BECount. This avoids materializing the add(zext(add)) expression. + Type *CntTy = !EnableIVRewrite ? + BackedgeTakenCount->getType() : IndVar->getType(); + + const SCEV *IVLimit = BackedgeTakenCount; + // If the exiting block is not the same as the backedge block, we must compare // against the preincremented value, otherwise we prefer to compare against // the post-incremented value. Value *CmpIndVar; - const SCEV *RHS = BackedgeTakenCount; if (L->getExitingBlock() == L->getLoopLatch()) { // Add one to the "backedge-taken" count to get the trip count. 
// If this addition may overflow, we have to be more pessimistic and // cast the induction variable before doing the add. - const SCEV *Zero = SE->getConstant(BackedgeTakenCount->getType(), 0); const SCEV *N = - SE->getAddExpr(BackedgeTakenCount, - SE->getConstant(BackedgeTakenCount->getType(), 1)); - if ((isa<SCEVConstant>(N) && !N->isZero()) || - SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) { - // No overflow. Cast the sum. - RHS = SE->getTruncateOrZeroExtend(N, IndVar->getType()); - } else { - // Potential overflow. Cast before doing the add. - RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount, - IndVar->getType()); - RHS = SE->getAddExpr(RHS, - SE->getConstant(IndVar->getType(), 1)); + SE->getAddExpr(IVLimit, SE->getConstant(IVLimit->getType(), 1)); + if (CntTy == IVLimit->getType()) + IVLimit = N; + else { + const SCEV *Zero = SE->getConstant(IVLimit->getType(), 0); + if ((isa<SCEVConstant>(N) && !N->isZero()) || + SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) { + // No overflow. Cast the sum. + IVLimit = SE->getTruncateOrZeroExtend(N, CntTy); + } else { + // Potential overflow. Cast before doing the add. + IVLimit = SE->getTruncateOrZeroExtend(IVLimit, CntTy); + IVLimit = SE->getAddExpr(IVLimit, SE->getConstant(CntTy, 1)); + } } - // The BackedgeTaken expression contains the number of times that the // backedge branches to the loop header. This is one less than the // number of times the loop executes, so use the incremented indvar. CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock()); } else { // We have to use the preincremented value... - RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount, - IndVar->getType()); + IVLimit = SE->getTruncateOrZeroExtend(IVLimit, CntTy); CmpIndVar = IndVar; } + // For unit stride, IVLimit = Start + BECount with 2's complement overflow. + // So for, non-zero start compute the IVLimit here. + bool isPtrIV = false; + Type *CmpTy = CntTy; + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); + assert(AR && AR->getLoop() == L && AR->isAffine() && "bad loop counter"); + if (!AR->getStart()->isZero()) { + assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride"); + const SCEV *IVInit = AR->getStart(); + + // For pointer types, sign extend BECount in order to materialize a GEP. + // Note that for without EnableIVRewrite, we never run SCEVExpander on a + // pointer type, because we must preserve the existing GEPs. Instead we + // directly generate a GEP later. + if (IVInit->getType()->isPointerTy()) { + isPtrIV = true; + CmpTy = SE->getEffectiveSCEVType(IVInit->getType()); + IVLimit = SE->getTruncateOrSignExtend(IVLimit, CmpTy); + } + // For integer types, truncate the IV before computing IVInit + BECount. + else { + if (SE->getTypeSizeInBits(IVInit->getType()) + > SE->getTypeSizeInBits(CmpTy)) + IVInit = SE->getTruncateExpr(IVInit, CmpTy); + + IVLimit = SE->getAddExpr(IVInit, IVLimit); + } + } // Expand the code for the iteration count. - assert(SE->isLoopInvariant(RHS, L) && + IRBuilder<> Builder(BI); + + assert(SE->isLoopInvariant(IVLimit, L) && "Computed iteration count is not loop invariant!"); - Value *ExitCnt = Rewriter.expandCodeFor(RHS, IndVar->getType(), BI); + Value *ExitCnt = Rewriter.expandCodeFor(IVLimit, CmpTy, BI); + + // Create a gep for IVInit + IVLimit from on an existing pointer base. 
+ assert(isPtrIV == IndVar->getType()->isPointerTy() && + "IndVar type must match IVInit type"); + if (isPtrIV) { + Value *IVStart = IndVar->getIncomingValueForBlock(L->getLoopPreheader()); + assert(AR->getStart() == SE->getSCEV(IVStart) && "bad loop counter"); + assert(SE->getSizeOfExpr( + cast<PointerType>(IVStart->getType())->getElementType())->isOne() + && "unit stride pointer IV must be i8*"); + + Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator()); + ExitCnt = Builder.CreateGEP(IVStart, ExitCnt, "lftr.limit"); + Builder.SetInsertPoint(BI); + } // Insert a new icmp_ne or icmp_eq instruction before the branch. - ICmpInst::Predicate Opcode; + ICmpInst::Predicate P; if (L->contains(BI->getSuccessor(0))) - Opcode = ICmpInst::ICMP_NE; + P = ICmpInst::ICMP_NE; else - Opcode = ICmpInst::ICMP_EQ; + P = ICmpInst::ICMP_EQ; DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n" << " LHS:" << *CmpIndVar << '\n' << " op:\t" - << (Opcode == ICmpInst::ICMP_NE ? "!=" : "==") << "\n" - << " RHS:\t" << *RHS << "\n"); + << (P == ICmpInst::ICMP_NE ? "!=" : "==") << "\n" + << " RHS:\t" << *ExitCnt << "\n" + << " Expr:\t" << *IVLimit << "\n"); + + if (SE->getTypeSizeInBits(CmpIndVar->getType()) + > SE->getTypeSizeInBits(CmpTy)) { + CmpIndVar = Builder.CreateTrunc(CmpIndVar, CmpTy, "lftr.wideiv"); + } - ICmpInst *Cond = new ICmpInst(BI, Opcode, CmpIndVar, ExitCnt, "exitcond"); - Cond->setDebugLoc(BI->getDebugLoc()); + Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond"); Value *OrigCond = BI->getCondition(); // It's tempting to use replaceAllUsesWith here to fully replace the old // comparison, but that's not immediately safe, since users of the old @@ -1612,7 +1655,7 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) return; - Instruction *InsertPt = ExitBlock->getFirstNonPHI(); + Instruction *InsertPt = ExitBlock->getFirstInsertionPt(); BasicBlock::iterator I = Preheader->getTerminator(); while (I != Preheader->begin()) { --I; @@ -1633,6 +1676,10 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { if (isa<DbgInfoIntrinsic>(I)) continue; + // Skip landingpad instructions. + if (isa<LandingPadInst>(I)) + continue; + // Don't sink static AllocaInsts out of the entry block, which would // turn them into dynamic allocas! if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) @@ -1699,7 +1746,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { if (!L->isLoopSimplifyForm()) return false; - if (!DisableIVRewrite) + if (EnableIVRewrite) IU = &getAnalysis<IVUsers>(); LI = &getAnalysis<LoopInfo>(); SE = &getAnalysis<ScalarEvolution>(); @@ -1717,6 +1764,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Create a rewriter object which we'll use to transform the code with. SCEVExpander Rewriter(*SE, "indvars"); +#ifndef NDEBUG + Rewriter.setDebugType(DEBUG_TYPE); +#endif // Eliminate redundant IV users. // @@ -1724,9 +1774,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // attempt to avoid evaluating SCEVs for sign/zero extend operations until // other expressions involving loop IVs have been evaluated. This helps SCEV // set no-wrap flags before normalizing sign/zero extension. - if (DisableIVRewrite) { + if (!EnableIVRewrite) { Rewriter.disableCanonicalMode(); - SimplifyIVUsersNoRewrite(L, Rewriter); + SimplifyAndExtend(L, Rewriter, LPM); } // Check to see if this loop has a computable loop-invariant execution count. 
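Putting LinearFunctionTestReplace together: the relational exit test becomes an equality test of the IV against the expanded limit, truncated when the IV is wider than the count type. A source-level rendering under those assumptions; the pass works on IR, and the guarded limit below is only there to keep the C++ well defined.

// Before LFTR: exit on a signed '<' compare of the induction variable.
long sum_before(const long *a, int n) {
  long s = 0;
  for (int i = 0; i < n; ++i)
    s += a[i];
  return s;
}

// After LFTR (conceptually): the trip count is computed once and the backedge
// test becomes the canonical != form; in the IR the compare actually uses the
// incremented value in the latch block.
long sum_after(const long *a, int n) {
  long s = 0;
  const int limit = n > 0 ? n : 0;   // illustrative guard, not from the pass
  for (int i = 0; i != limit; ++i)
    s += a[i];
  return s;
}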
@@ -1739,25 +1789,25 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { RewriteLoopExitValues(L, Rewriter); // Eliminate redundant IV users. - if (!DisableIVRewrite) - SimplifyIVUsers(Rewriter); + if (EnableIVRewrite) + Changed |= simplifyIVUsers(IU, SE, &LPM, DeadInsts); // Eliminate redundant IV cycles. - if (DisableIVRewrite) - SimplifyCongruentIVs(L); + if (!EnableIVRewrite) + NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); // Compute the type of the largest recurrence expression, and decide whether // a canonical induction variable should be inserted. - const Type *LargestType = 0; + Type *LargestType = 0; bool NeedCannIV = false; bool ExpandBECount = canExpandBackedgeTakenCount(L, SE); - if (ExpandBECount) { + if (EnableIVRewrite && ExpandBECount) { // If we have a known trip count and a single exit block, we'll be // rewriting the loop exit test condition below, which requires a // canonical induction variable. NeedCannIV = true; - const Type *Ty = BackedgeTakenCount->getType(); - if (DisableIVRewrite) { + Type *Ty = BackedgeTakenCount->getType(); + if (!EnableIVRewrite) { // In this mode, SimplifyIVUsers may have already widened the IV used by // the backedge test and inserted a Trunc on the compare's operand. Get // the wider type to avoid creating a redundant narrow IV only used by the @@ -1769,10 +1819,10 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { SE->getTypeSizeInBits(LargestType)) LargestType = SE->getEffectiveSCEVType(Ty); } - if (!DisableIVRewrite) { + if (EnableIVRewrite) { for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) { NeedCannIV = true; - const Type *Ty = + Type *Ty = SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType()); if (!LargestType || SE->getTypeSizeInBits(Ty) > @@ -1811,18 +1861,16 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // the end of the pass. while (!OldCannIVs.empty()) { PHINode *OldCannIV = OldCannIVs.pop_back_val(); - OldCannIV->insertBefore(L->getHeader()->getFirstNonPHI()); + OldCannIV->insertBefore(L->getHeader()->getFirstInsertionPt()); } } - + else if (!EnableIVRewrite && ExpandBECount && needsLFTR(L, DT)) { + IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD); + } // If we have a trip count expression, rewrite the loop's exit condition // using it. We can currently only handle loops with a single exit. - ICmpInst *NewICmp = 0; - if (ExpandBECount) { - assert(canExpandBackedgeTakenCount(L, SE) && - "canonical IV disrupted BackedgeTaken expansion"); - assert(NeedCannIV && - "LinearFunctionTestReplace requires a canonical induction variable"); + Value *NewICmp = 0; + if (ExpandBECount && IndVar) { // Check preconditions for proper SCEVExpander operation. SCEV does not // express SCEVExpander's dependencies, such as LoopSimplify. Instead any // pass that uses the SCEVExpander must do it. This does not work well for @@ -1837,7 +1885,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, Rewriter); } // Rewrite IV-derived expressions. - if (!DisableIVRewrite) + if (EnableIVRewrite) RewriteIVExpressions(L, Rewriter); // Clear the rewriter cache, because values that are in the rewriter's cache @@ -1860,12 +1908,34 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // For completeness, inform IVUsers of the IV use in the newly-created // loop exit test instruction. 
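For readers unfamiliar with the legacy -enable-iv-rewrite path that the code above keeps gating: RewriteIVExpressions re-expresses derived IVs in terms of one canonical counter, roughly the difference between these two hand-written forms (an illustration, not output of the pass).

#include <cstddef>

// Derived pointer IV: the loop "counts" by advancing p.
void zero_ptr_iv(int *a, std::size_t n) {
  for (int *p = a, *e = a + n; p != e; ++p)
    *p = 0;
}

// Canonical form: one zero-based, unit-stride counter, with the pointer value
// recomputed from it; other IV expressions get rewritten the same way.
void zero_canonical_iv(int *a, std::size_t n) {
  for (std::size_t i = 0; i != n; ++i)
    a[i] = 0;
}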
- if (NewICmp && IU) - IU->AddUsersIfInteresting(cast<Instruction>(NewICmp->getOperand(0))); - + if (IU && NewICmp) { + ICmpInst *NewICmpInst = dyn_cast<ICmpInst>(NewICmp); + if (NewICmpInst) + IU->AddUsersIfInteresting(cast<Instruction>(NewICmpInst->getOperand(0))); + } // Clean up dead instructions. Changed |= DeleteDeadPHIs(L->getHeader()); // Check a post-condition. - assert(L->isLCSSAForm(*DT) && "Indvars did not leave the loop in lcssa form!"); + assert(L->isLCSSAForm(*DT) && + "Indvars did not leave the loop in lcssa form!"); + + // Verify that LFTR, and any other change have not interfered with SCEV's + // ability to compute trip count. +#ifndef NDEBUG + if (!EnableIVRewrite && VerifyIndvars && + !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { + SE->forgetLoop(L); + const SCEV *NewBECount = SE->getBackedgeTakenCount(L); + if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) < + SE->getTypeSizeInBits(NewBECount->getType())) + NewBECount = SE->getTruncateOrNoop(NewBECount, + BackedgeTakenCount->getType()); + else + BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, + NewBECount->getType()); + assert(BackedgeTakenCount == NewBECount && "indvars must preserve SCEV"); + } +#endif + return Changed; } diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index b500d5b..f410af3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -811,8 +811,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { /// important optimization that encourages jump threading, and needs to be run /// interlaced with other jump threading tasks. bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { - // Don't hack volatile loads. - if (LI->isVolatile()) return false; + // Don't hack volatile/atomic loads. + if (!LI->isSimple()) return false; // If the load is defined in a block with exactly one predecessor, it can't be // partially redundant. diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index 66add6c..b79bb13 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -151,6 +151,11 @@ namespace { /// bool isSafeToExecuteUnconditionally(Instruction &I); + /// isGuaranteedToExecute - Check that the instruction is guaranteed to + /// execute. + /// + bool isGuaranteedToExecute(Instruction &I); + /// pointerInvalidatedByLoop - Return true if the body of this loop may /// store into the memory location pointed to by V. /// @@ -357,8 +362,8 @@ void LICM::HoistRegion(DomTreeNode *N) { bool LICM::canSinkOrHoistInst(Instruction &I) { // Loads have extra constraints we have to verify before we can hoist them. if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { - if (LI->isVolatile()) - return false; // Don't hoist volatile loads! + if (!LI->isUnordered()) + return false; // Don't hoist volatile/atomic loads! // Loads from constant memory are always safe to move, even if they end up // in the same alias set as something that ends up being modified. @@ -461,7 +466,7 @@ void LICM::sink(Instruction &I) { } else { // Move the instruction to the start of the exit block, after any PHI // nodes in it. - I.moveBefore(ExitBlocks[0]->getFirstNonPHI()); + I.moveBefore(ExitBlocks[0]->getFirstInsertionPt()); // This instruction is no longer in the AST for the current loop, because // we just sunk it out of the loop. 
If we just sunk it into an outer @@ -504,7 +509,7 @@ void LICM::sink(Instruction &I) { continue; // Insert the code after the last PHI node. - BasicBlock::iterator InsertPt = ExitBlock->getFirstNonPHI(); + BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt(); // If this is the first exit block processed, just move the original // instruction, otherwise clone the original instruction and insert @@ -577,6 +582,10 @@ bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { if (Inst.isSafeToSpeculativelyExecute()) return true; + return isGuaranteedToExecute(Inst); +} + +bool LICM::isGuaranteedToExecute(Instruction &Inst) { // Otherwise we have to check to make sure that the instruction dominates all // of the exit blocks. If it doesn't, then there is a path out of the loop // which does not execute this instruction, so we can't hoist it. @@ -635,7 +644,7 @@ namespace { for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) { BasicBlock *ExitBlock = LoopExitBlocks[i]; Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); - Instruction *InsertPos = ExitBlock->getFirstNonPHI(); + Instruction *InsertPos = ExitBlock->getFirstInsertionPt(); StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos); NewSI->setAlignment(Alignment); NewSI->setDebugLoc(DL); @@ -713,34 +722,41 @@ void LICM::PromoteAliasSet(AliasSet &AS) { // If there is an non-load/store instruction in the loop, we can't promote // it. - unsigned InstAlignment; if (LoadInst *load = dyn_cast<LoadInst>(Use)) { - assert(!cast<LoadInst>(Use)->isVolatile() && "AST broken"); - InstAlignment = load->getAlignment(); + assert(!load->isVolatile() && "AST broken"); + if (!load->isSimple()) + return; } else if (StoreInst *store = dyn_cast<StoreInst>(Use)) { // Stores *of* the pointer are not interesting, only stores *to* the // pointer. if (Use->getOperand(1) != ASIV) continue; - InstAlignment = store->getAlignment(); - assert(!cast<StoreInst>(Use)->isVolatile() && "AST broken"); + assert(!store->isVolatile() && "AST broken"); + if (!store->isSimple()) + return; + + // Note that we only check GuaranteedToExecute inside the store case + // so that we do not introduce stores where they did not exist before + // (which would break the LLVM concurrency model). + + // If the alignment of this instruction allows us to specify a more + // restrictive (and performant) alignment and if we are sure this + // instruction will be executed, update the alignment. + // Larger is better, with the exception of 0 being the best alignment. + unsigned InstAlignment = store->getAlignment(); + if ((InstAlignment > Alignment || InstAlignment == 0) + && (Alignment != 0)) + if (isGuaranteedToExecute(*Use)) { + GuaranteedToExecute = true; + Alignment = InstAlignment; + } + + if (!GuaranteedToExecute) + GuaranteedToExecute = isGuaranteedToExecute(*Use); + } else return; // Not a load or store. - // If the alignment of this instruction allows us to specify a more - // restrictive (and performant) alignment and if we are sure this - // instruction will be executed, update the alignment. - // Larger is better, with the exception of 0 being the best alignment. 
- if ((InstAlignment > Alignment || InstAlignment == 0) - && (Alignment != 0)) - if (isSafeToExecuteUnconditionally(*Use)) { - GuaranteedToExecute = true; - Alignment = InstAlignment; - } - - if (!GuaranteedToExecute) - GuaranteedToExecute = isSafeToExecuteUnconditionally(*Use); - LoopUses.push_back(Use); } } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a0e41d9..ad15cbb 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -267,7 +267,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, /// processLoopStore - See if this store can be promoted to a memset or memcpy. bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { - if (SI->isVolatile()) return false; + if (!SI->isSimple()) return false; Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); @@ -314,7 +314,7 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0))); if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() && - StoreEv->getOperand(1) == LoadEv->getOperand(1) && !LI->isVolatile()) + StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple()) if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount)) return true; } @@ -463,7 +463,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, SplatValue = 0; } else { // Otherwise, this isn't an idiom we can transform. For example, we can't - // do anything with a 3-byte store, for example. + // do anything with a 3-byte store. return false; } @@ -498,7 +498,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. - const Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); + Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), @@ -604,7 +604,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. 
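To make the byte-count comment above concrete, here is a minimal sketch of how the stored-byte expression can be formed with ScalarEvolution (not part of the patch; SE, TD, BECount and StoreSize are assumed to be as in the surrounding hunks, and Ctx stands for the relevant LLVMContext). Note also that the isSimple() checks introduced above require a load or store to be neither volatile nor atomic, which is stricter than the old isVolatile() tests.

  Type *IntPtr = TD->getIntPtrType(Ctx);
  // Bring the backedge-taken count to pointer width first.
  const SCEV *Trip = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  // Bytes written by the loop = (backedge-taken count + 1) * store size.
  const SCEV *NumBytes =
    SE->getMulExpr(SE->getAddExpr(Trip, SE->getConstant(IntPtr, 1)),
                   SE->getConstant(IntPtr, StoreSize));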
- const Type *IntPtr = TD->getIntPtrType(SI->getContext()); + Type *IntPtr = TD->getIntPtrType(SI->getContext()); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 509d026..3e122c2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -70,12 +70,27 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" #include <algorithm> using namespace llvm; +namespace llvm { +cl::opt<bool> EnableNested( + "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops")); + +cl::opt<bool> EnableRetry( + "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry")); + +// Temporary flag to cleanup congruent phis after LSR phi expansion. +// It's currently disabled until we can determine whether it's truly useful or +// not. The flag should be removed after the v3.0 release. +cl::opt<bool> EnablePhiElim( + "enable-lsr-phielim", cl::Hidden, cl::desc("Enable LSR phi elimination")); +} + namespace { /// RegSortData - This class holds data which is used to order reuse candidates. @@ -219,7 +234,7 @@ struct Formula { void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); unsigned getNumRegs() const; - const Type *getType() const; + Type *getType() const; void DeleteBaseReg(const SCEV *&S); @@ -319,7 +334,7 @@ unsigned Formula::getNumRegs() const { /// getType - Return the type of this formula, if it has one, or null /// otherwise. This type is meaningless except for the bit size. -const Type *Formula::getType() const { +Type *Formula::getType() const { return !BaseRegs.empty() ? BaseRegs.front()->getType() : ScaledReg ? ScaledReg->getType() : AM.BaseGV ? AM.BaseGV->getType() : @@ -397,7 +412,7 @@ void Formula::dump() const { /// isAddRecSExtable - Return true if the given addrec can be sign-extended /// without changing its value. static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { - const Type *WideTy = + Type *WideTy = IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1); return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy)); } @@ -405,7 +420,7 @@ static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { /// isAddSExtable - Return true if the given add can be sign-extended /// without changing its value. static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) { - const Type *WideTy = + Type *WideTy = IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1); return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy)); } @@ -413,7 +428,7 @@ static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) { /// isMulSExtable - Return true if the given mul can be sign-extended /// without changing its value. 
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) { - const Type *WideTy = + Type *WideTy = IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(M->getType()) * M->getNumOperands()); return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy)); @@ -594,8 +609,8 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { } /// getAccessType - Return the type of the memory being accessed. -static const Type *getAccessType(const Instruction *Inst) { - const Type *AccessTy = Inst->getType(); +static Type *getAccessType(const Instruction *Inst) { + Type *AccessTy = Inst->getType(); if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) AccessTy = SI->getOperand(0)->getType(); else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { @@ -614,7 +629,7 @@ static const Type *getAccessType(const Instruction *Inst) { // All pointers have the same requirements, so canonicalize them to an // arbitrary pointer type to minimize variation. - if (const PointerType *PTy = dyn_cast<PointerType>(AccessTy)) + if (PointerType *PTy = dyn_cast<PointerType>(AccessTy)) AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), PTy->getAddressSpace()); @@ -670,6 +685,21 @@ public: void Loose(); +#ifndef NDEBUG + // Once any of the metrics loses, they must all remain losers. + bool isValid() { + return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds + | ImmCost | SetupCost) != ~0u) + || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds + & ImmCost & SetupCost) == ~0u); + } +#endif + + bool isLoser() { + assert(isValid() && "invalid cost"); + return NumRegs == ~0u; + } + void RateFormula(const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, @@ -702,34 +732,48 @@ void Cost::RateRegister(const SCEV *Reg, if (AR->getLoop() == L) AddRecCost += 1; /// TODO: This should be a function of the stride. - // If this is an addrec for a loop that's already been visited by LSR, - // don't second-guess its addrec phi nodes. LSR isn't currently smart - // enough to reason about more than one loop at a time. Consider these - // registers free and leave them alone. - else if (L->contains(AR->getLoop()) || + // If this is an addrec for another loop, don't second-guess its addrec phi + // nodes. LSR isn't currently smart enough to reason about more than one + // loop at a time. LSR has either already run on inner loops, will not run + // on other loops, and cannot be expected to change sibling loops. If the + // AddRec exists, consider it's register free and leave it alone. Otherwise, + // do not consider this formula at all. + // FIXME: why do we need to generate such fomulae? + else if (!EnableNested || L->contains(AR->getLoop()) || (!AR->getLoop()->contains(L) && DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) { for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); - PHINode *PN = dyn_cast<PHINode>(I); ++I) + PHINode *PN = dyn_cast<PHINode>(I); ++I) { if (SE.isSCEVable(PN->getType()) && (SE.getEffectiveSCEVType(PN->getType()) == SE.getEffectiveSCEVType(AR->getType())) && SE.getSCEV(PN) == AR) return; - + } + if (!EnableNested) { + Loose(); + return; + } // If this isn't one of the addrecs that the loop already has, it // would require a costly new phi and add. TODO: This isn't // precisely modeled right now. ++NumBaseAdds; - if (!Regs.count(AR->getStart())) + if (!Regs.count(AR->getStart())) { RateRegister(AR->getStart(), Regs, L, SE, DT); + if (isLoser()) + return; + } } // Add the step value register, if it needs one. 
// TODO: The non-affine case isn't precisely modeled here. - if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) - if (!Regs.count(AR->getStart())) + if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) { + if (!Regs.count(AR->getOperand(1))) { RateRegister(AR->getOperand(1), Regs, L, SE, DT); + if (isLoser()) + return; + } + } } ++NumRegs; @@ -769,6 +813,8 @@ void Cost::RateFormula(const Formula &F, return; } RatePrimaryRegister(ScaledReg, Regs, L, SE, DT); + if (isLoser()) + return; } for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) { @@ -778,6 +824,8 @@ void Cost::RateFormula(const Formula &F, return; } RatePrimaryRegister(BaseReg, Regs, L, SE, DT); + if (isLoser()) + return; } // Determine how many (unfolded) adds we'll need inside the loop. @@ -795,6 +843,7 @@ void Cost::RateFormula(const Formula &F, else if (Offset != 0) ImmCost += APInt(64, Offset, true).getMinSignedBits(); } + assert(isValid() && "invalid cost"); } /// Loose - Set this cost to a losing value. @@ -980,7 +1029,7 @@ public: }; KindType Kind; - const Type *AccessTy; + Type *AccessTy; SmallVector<int64_t, 8> Offsets; int64_t MinOffset; @@ -995,7 +1044,7 @@ public: /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different /// max fixup widths to be equivalent, because the narrower one may be relying /// on the implicit truncation to truncate away bogus bits. - const Type *WidestFixupType; + Type *WidestFixupType; /// Formulae - A list of ways to build a value that can satisfy this user. /// After the list is populated, one of these is selected heuristically and @@ -1005,7 +1054,7 @@ public: /// Regs - The set of register candidates used by all formulae in this LSRUse. SmallPtrSet<const SCEV *, 4> Regs; - LSRUse(KindType K, const Type *T) : Kind(K), AccessTy(T), + LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T), MinOffset(INT64_MAX), MaxOffset(INT64_MIN), AllFixupsOutsideLoop(true), @@ -1127,7 +1176,7 @@ void LSRUse::dump() const { /// be completely folded into the user instruction at isel time. This includes /// address-mode folding and special icmp tricks. static bool isLegalUse(const TargetLowering::AddrMode &AM, - LSRUse::KindType Kind, const Type *AccessTy, + LSRUse::KindType Kind, Type *AccessTy, const TargetLowering *TLI) { switch (Kind) { case LSRUse::Address: @@ -1156,7 +1205,7 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. if (AM.BaseOffs != 0) { - if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs); + if (TLI) return TLI->isLegalICmpImmediate(-(uint64_t)AM.BaseOffs); return false; } @@ -1176,7 +1225,7 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, static bool isLegalUse(TargetLowering::AddrMode AM, int64_t MinOffset, int64_t MaxOffset, - LSRUse::KindType Kind, const Type *AccessTy, + LSRUse::KindType Kind, Type *AccessTy, const TargetLowering *TLI) { // Check for overflow. if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) != @@ -1198,7 +1247,7 @@ static bool isLegalUse(TargetLowering::AddrMode AM, static bool isAlwaysFoldable(int64_t BaseOffs, GlobalValue *BaseGV, bool HasBaseReg, - LSRUse::KindType Kind, const Type *AccessTy, + LSRUse::KindType Kind, Type *AccessTy, const TargetLowering *TLI) { // Fast-path: zero is always foldable. 
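The isValid()/isLoser() pair added above relies on Loose() saturating every metric at once; a small standalone sketch of that invariant (field names shortened, purely illustrative):

  struct ToyCost {
    unsigned NumRegs, AddRecCost, ImmCost;
    // Losing saturates every metric, so one saturated field implies all are.
    void Loose() { NumRegs = AddRecCost = ImmCost = ~0u; }
    bool isLoser() const { return NumRegs == ~0u; }
    bool isValid() const {
      // Either the metrics cannot OR to all-ones (so none is saturated),
      // or every metric is saturated.
      return ((NumRegs | AddRecCost | ImmCost) != ~0u) ||
             ((NumRegs & AddRecCost & ImmCost) == ~0u);
    }
  };

Checking isLoser() after each register is rated lets RateRegister and RateFormula bail out as soon as a formula is known to be hopeless instead of finishing the full rating.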
if (BaseOffs == 0 && !BaseGV) return true; @@ -1224,7 +1273,7 @@ static bool isAlwaysFoldable(int64_t BaseOffs, static bool isAlwaysFoldable(const SCEV *S, int64_t MinOffset, int64_t MaxOffset, bool HasBaseReg, - LSRUse::KindType Kind, const Type *AccessTy, + LSRUse::KindType Kind, Type *AccessTy, const TargetLowering *TLI, ScalarEvolution &SE) { // Fast-path: zero is always foldable. @@ -1299,7 +1348,7 @@ class LSRInstance { SmallSetVector<int64_t, 8> Factors; /// Types - Interesting use types, to facilitate truncation reuse. - SmallSetVector<const Type *, 4> Types; + SmallSetVector<Type *, 4> Types; /// Fixups - The list of operands which are to be replaced. SmallVector<LSRFixup, 16> Fixups; @@ -1330,11 +1379,11 @@ class LSRInstance { UseMapTy UseMap; bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, - LSRUse::KindType Kind, const Type *AccessTy); + LSRUse::KindType Kind, Type *AccessTy); std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind, - const Type *AccessTy); + Type *AccessTy); void DeleteUse(LSRUse &LU, size_t LUIdx); @@ -1426,7 +1475,8 @@ void LSRInstance::OptimizeShadowIV() { IVUsers::const_iterator CandidateUI = UI; ++UI; Instruction *ShadowUse = CandidateUI->getUser(); - const Type *DestTy = NULL; + Type *DestTy = NULL; + bool IsSigned = false; /* If shadow use is a int->float cast then insert a second IV to eliminate this cast. @@ -1440,10 +1490,14 @@ void LSRInstance::OptimizeShadowIV() { for (unsigned i = 0; i < n; ++i, ++d) foo(d); */ - if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) + if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) { + IsSigned = false; DestTy = UCast->getDestTy(); - else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) + } + else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) { + IsSigned = true; DestTy = SCast->getDestTy(); + } if (!DestTy) continue; if (TLI) { @@ -1457,7 +1511,7 @@ void LSRInstance::OptimizeShadowIV() { if (!PH) continue; if (PH->getNumIncomingValues() != 2) continue; - const Type *SrcTy = PH->getType(); + Type *SrcTy = PH->getType(); int Mantissa = DestTy->getFPMantissaWidth(); if (Mantissa == -1) continue; if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa) @@ -1474,7 +1528,9 @@ void LSRInstance::OptimizeShadowIV() { ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry)); if (!Init) continue; - Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue()); + Constant *NewInit = ConstantFP::get(DestTy, IsSigned ? + (double)Init->getSExtValue() : + (double)Init->getZExtValue()); BinaryOperator *Incr = dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch)); @@ -1776,7 +1832,7 @@ LSRInstance::OptimizeLoopTermCond() { if (!TLI) goto decline_post_inc; // Check for possible scaled-address reuse. - const Type *AccessTy = getAccessType(UI->getUser()); + Type *AccessTy = getAccessType(UI->getUser()); TargetLowering::AddrMode AM; AM.Scale = C->getSExtValue(); if (TLI->isLegalAddressingMode(AM, AccessTy)) @@ -1840,10 +1896,10 @@ LSRInstance::OptimizeLoopTermCond() { /// return true. bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, - LSRUse::KindType Kind, const Type *AccessTy) { + LSRUse::KindType Kind, Type *AccessTy) { int64_t NewMinOffset = LU.MinOffset; int64_t NewMaxOffset = LU.MaxOffset; - const Type *NewAccessTy = AccessTy; + Type *NewAccessTy = AccessTy; // Check for a mismatched kind. 
It's tempting to collapse mismatched kinds to // something conservative, however this can pessimize in the case that one of @@ -1882,7 +1938,7 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, /// Either reuse an existing use or create a new one, as needed. std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr, - LSRUse::KindType Kind, const Type *AccessTy) { + LSRUse::KindType Kind, Type *AccessTy) { const SCEV *Copy = Expr; int64_t Offset = ExtractImmediate(Expr, SE); @@ -2044,7 +2100,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.PostIncLoops = UI->getPostIncLoops(); LSRUse::KindType Kind = LSRUse::Basic; - const Type *AccessTy = 0; + Type *AccessTy = 0; if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { Kind = LSRUse::Address; AccessTy = getAccessType(LF.UserInst); @@ -2464,7 +2520,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, if (LU.Kind != LSRUse::ICmpZero) return; // Determine the integer type for the base formula. - const Type *IntTy = Base.getType(); + Type *IntTy = Base.getType(); if (!IntTy) return; if (SE.getTypeSizeInBits(IntTy) > 64) return; @@ -2538,7 +2594,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, /// scaled-offset address modes, for example. void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // Determine the integer type for the base formula. - const Type *IntTy = Base.getType(); + Type *IntTy = Base.getType(); if (!IntTy) return; // If this Formula already has a scaled register, we can't add another one. @@ -2598,13 +2654,13 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { if (Base.AM.BaseGV) return; // Determine the integer type for the base formula. - const Type *DstTy = Base.getType(); + Type *DstTy = Base.getType(); if (!DstTy) return; DstTy = SE.getEffectiveSCEVType(DstTy); - for (SmallSetVector<const Type *, 4>::const_iterator + for (SmallSetVector<Type *, 4>::const_iterator I = Types.begin(), E = Types.end(); I != E; ++I) { - const Type *SrcTy = *I; + Type *SrcTy = *I; if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) { Formula F = Base; @@ -2741,7 +2797,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { int64_t Imm = WI.Imm; const SCEV *OrigReg = WI.OrigReg; - const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); + Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm)); unsigned BitWidth = SE.getTypeSizeInBits(IntTy); @@ -3275,6 +3331,9 @@ retry: skip:; } + if (!EnableRetry && !AnySatisfiedReqRegs) + return; + // If none of the formulae had all of the required registers, relax the // constraint so that we don't exclude all formulae. if (!AnySatisfiedReqRegs) { @@ -3298,6 +3357,10 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { // SolveRecurse does all the work. SolveRecurse(Solution, SolutionCost, Workspace, CurCost, CurRegs, VisitedRegs); + if (Solution.empty()) { + DEBUG(dbgs() << "\nNo Satisfactory Solution\n"); + return; + } // Ok, we've now made all our decisions. DEBUG(dbgs() << "\n" @@ -3416,6 +3479,9 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP, // Don't insert instructions before PHI nodes. while (isa<PHINode>(IP)) ++IP; + // Ignore landingpad instructions. + while (isa<LandingPadInst>(IP)) ++IP; + // Ignore debug intrinsics. 
while (isa<DbgInfoIntrinsic>(IP)) ++IP; @@ -3440,9 +3506,9 @@ Value *LSRInstance::Expand(const LSRFixup &LF, Rewriter.setPostInc(LF.PostIncLoops); // This is the type that the user actually needs. - const Type *OpTy = LF.OperandValToReplace->getType(); + Type *OpTy = LF.OperandValToReplace->getType(); // This will be the type that we'll initially expand to. - const Type *Ty = F.getType(); + Type *Ty = F.getType(); if (!Ty) // No type known; just expand directly to the ultimate type. Ty = OpTy; @@ -3450,7 +3516,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Expand directly to the ultimate type if it's the right size. Ty = OpTy; // This is the type to do integer arithmetic in. - const Type *IntTy = SE.getEffectiveSCEVType(Ty); + Type *IntTy = SE.getEffectiveSCEVType(Ty); // Build up a list of operands to add together to form the full base. SmallVector<const SCEV *, 8> Ops; @@ -3527,7 +3593,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // The other interesting way of "folding" with an ICmpZero is to use a // negated immediate. if (!ICmpScaledV) - ICmpScaledV = ConstantInt::get(IntTy, -Offset); + ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset); else { Ops.push_back(SE.getUnknown(ICmpScaledV)); ICmpScaledV = ConstantInt::get(IntTy, Offset); @@ -3611,10 +3677,20 @@ void LSRInstance::RewriteForPHI(PHINode *PN, // users. if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 && !isa<IndirectBrInst>(BB->getTerminator())) { - Loop *PNLoop = LI.getLoopFor(PN->getParent()); - if (!PNLoop || PN->getParent() != PNLoop->getHeader()) { + BasicBlock *Parent = PN->getParent(); + Loop *PNLoop = LI.getLoopFor(Parent); + if (!PNLoop || Parent != PNLoop->getHeader()) { // Split the critical edge. - BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P); + BasicBlock *NewBB = 0; + if (!Parent->isLandingPad()) { + NewBB = SplitCriticalEdge(BB, Parent, P, + /*MergeIdenticalEdges=*/true, + /*DontDeleteUselessPhis=*/true); + } else { + SmallVector<BasicBlock*, 2> NewBBs; + SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs); + NewBB = NewBBs[0]; + } // If PN is outside of the loop and BB is in the loop, we want to // move the block to be immediately before the PHI block, not @@ -3637,7 +3713,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN, Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts); // If this is reuse-by-noop-cast, insert the noop cast. - const Type *OpTy = LF.OperandValToReplace->getType(); + Type *OpTy = LF.OperandValToReplace->getType(); if (FullV->getType() != OpTy) FullV = CastInst::Create(CastInst::getCastOpcode(FullV, false, @@ -3667,7 +3743,7 @@ void LSRInstance::Rewrite(const LSRFixup &LF, Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts); // If this is reuse-by-noop-cast, insert the noop cast. - const Type *OpTy = LF.OperandValToReplace->getType(); + Type *OpTy = LF.OperandValToReplace->getType(); if (FullV->getType() != OpTy) { Instruction *Cast = CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false), @@ -3700,6 +3776,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, SCEVExpander Rewriter(SE, "lsr"); Rewriter.disableCanonicalMode(); + Rewriter.enableLSRMode(); Rewriter.setIVIncInsertPos(L, IVIncInsertPos); // Expand the new value definitions and update the users. @@ -3740,6 +3817,23 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) OptimizeShadowIV(); OptimizeLoopTermCond(); + // If loop preparation eliminates all interesting IV users, bail. 
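The insert-position adjustment earlier in this hunk follows a pattern that recurs across the patch; a sketch of the idea, with BB standing for an assumed target block:

  BasicBlock::iterator IP = BB->begin();
  while (isa<PHINode>(IP)) ++IP;          // PHI nodes must stay grouped at the top
  while (isa<LandingPadInst>(IP)) ++IP;   // the landingpad must directly follow the PHIs
  while (isa<DbgInfoIntrinsic>(IP)) ++IP; // don't wedge new code between debug intrinsics and their anchor
  // Expanded code is then emitted before IP.

Skipping PHIs and the landingpad is also the guarantee behind getFirstInsertionPt(), which is why it replaces getFirstNonPHI() in the other hunks of this change.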
+ if (IU.empty()) return; + + // Skip nested loops until we can model them better with formulae. + if (!EnableNested && !L->empty()) { + + if (EnablePhiElim) { + // Remove any extra phis created by processing inner loops. + SmallVector<WeakVH, 16> DeadInsts; + SCEVExpander Rewriter(SE, "lsr"); + Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts); + Changed |= DeleteTriviallyDeadInstructions(DeadInsts); + } + DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n"); + return; + } + // Start collecting data and preparing for the solver. CollectInterestingTypesAndFactors(); CollectFixupsAndInitialFormulae(); @@ -3763,6 +3857,9 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) Types.clear(); RegUses.clear(); + if (Solution.empty()) + return; + #ifndef NDEBUG // Formulae should be legal. for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), @@ -3778,6 +3875,14 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) // Now that we've decided what we want, make it so. ImplementSolution(Solution, P); + + if (EnablePhiElim) { + // Remove any extra phis created by processing inner loops. + SmallVector<WeakVH, 16> DeadInsts; + SCEVExpander Rewriter(SE, "lsr"); + Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts); + Changed |= DeleteTriviallyDeadInstructions(DeadInsts); + } } void LSRInstance::print_factors_and_types(raw_ostream &OS) const { @@ -3793,7 +3898,7 @@ void LSRInstance::print_factors_and_types(raw_ostream &OS) const { OS << '*' << *I; } - for (SmallSetVector<const Type *, 4>::const_iterator + for (SmallSetVector<Type *, 4>::const_iterator I = Types.begin(), E = Types.end(); I != E; ++I) { if (!First) OS << ", "; First = false; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index fef6bc3..91395b2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -22,6 +22,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/UnrollLoop.h" +#include "llvm/Target/TargetData.h" #include <climits> using namespace llvm; @@ -39,6 +40,11 @@ UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden, cl::desc("Allows loops to be partially unrolled until " "-unroll-threshold loop size is reached.")); +// Temporary flag to be removed in 3.0 +static cl::opt<bool> +NoSCEVUnroll("disable-unroll-scev", cl::init(false), cl::Hidden, + cl::desc("Use ScalarEvolution to analyze loop trip counts for unrolling")); + namespace { class LoopUnroll : public LoopPass { public: @@ -49,7 +55,7 @@ namespace { CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P; UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0); - + initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -57,11 +63,11 @@ namespace { /// that the loop unroll should be performed regardless of how much /// code expansion would result. static const unsigned NoThreshold = UINT_MAX; - + // Threshold to use when optsize is specified (and there is no // explicit -unroll-threshold). static const unsigned OptSizeUnrollThreshold = 50; - + unsigned CurrentCount; unsigned CurrentThreshold; bool CurrentAllowPartial; @@ -79,6 +85,7 @@ namespace { AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); + AU.addRequired<ScalarEvolution>(); AU.addPreserved<ScalarEvolution>(); // FIXME: Loop unroll requires LCSSA. 
And LCSSA requires dom info. // If loop unroll does not preserve dom info then LCSSA pass on next @@ -101,45 +108,62 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) { } /// ApproximateLoopSize - Approximate the size of the loop. -static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls) { +static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, + const TargetData *TD) { CodeMetrics Metrics; for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) - Metrics.analyzeBasicBlock(*I); + Metrics.analyzeBasicBlock(*I, TD); NumCalls = Metrics.NumInlineCandidates; - + unsigned LoopSize = Metrics.NumInsts; - + // Don't allow an estimate of size zero. This would allows unrolling of loops // with huge iteration counts, which is a compile time problem even if it's // not a problem for code quality. if (LoopSize == 0) LoopSize = 1; - + return LoopSize; } bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { LoopInfo *LI = &getAnalysis<LoopInfo>(); + ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() << "] Loop %" << Header->getName() << "\n"); (void)Header; - + // Determine the current unrolling threshold. While this is normally set // from UnrollThreshold, it is overridden to a smaller value if the current // function is marked as optimize-for-size, and the unroll threshold was // not user specified. unsigned Threshold = CurrentThreshold; - if (!UserThreshold && + if (!UserThreshold && Header->getParent()->hasFnAttr(Attribute::OptimizeForSize)) Threshold = OptSizeUnrollThreshold; - // Find trip count - unsigned TripCount = L->getSmallConstantTripCount(); - unsigned Count = CurrentCount; - + // Find trip count and trip multiple if count is not available + unsigned TripCount = 0; + unsigned TripMultiple = 1; + if (!NoSCEVUnroll) { + // Find "latch trip count". UnrollLoop assumes that control cannot exit + // via the loop latch on any iteration prior to TripCount. The loop may exit + // early via an earlier branch. + BasicBlock *LatchBlock = L->getLoopLatch(); + if (LatchBlock) { + TripCount = SE->getSmallConstantTripCount(L, LatchBlock); + TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock); + } + } + else { + TripCount = L->getSmallConstantTripCount(); + if (TripCount == 0) + TripMultiple = L->getSmallConstantTripMultiple(); + } // Automatically select an unroll count. + unsigned Count = CurrentCount; if (Count == 0) { // Conservative heuristic: if we know the trip count, see if we can // completely unroll (subject to the threshold, checked below); otherwise @@ -152,8 +176,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Enforce the threshold. if (Threshold != NoThreshold) { + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); unsigned NumInlineCandidates; - unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates); + unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, TD); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); if (NumInlineCandidates != 0) { DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); @@ -182,12 +207,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Unroll the loop. - Function *F = L->getHeader()->getParent(); - if (!UnrollLoop(L, Count, LI, &LPM)) + if (!UnrollLoop(L, Count, TripCount, TripMultiple, LI, &LPM)) return false; - // FIXME: Reconstruct dom info, because it is not preserved properly. 
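A sketch of how the SCEV-based trip-count query above is intended to be used (illustrative only; L and SE as in runOnLoop):

  unsigned TripCount = 0, TripMultiple = 1;
  if (BasicBlock *LatchBlock = L->getLoopLatch()) {
    // "Latch" trip count: control cannot leave through the latch before
    // iteration TripCount, though an earlier exit may still leave the loop.
    TripCount    = SE->getSmallConstantTripCount(L, LatchBlock);
    TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock);
  }
  // A TripCount of 0 means no constant trip count is known; TripMultiple is
  // the largest constant known to divide the trip count (1 when nothing is known).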
- if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) - DT->runOnFunction(*F); return true; } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index 840c4b6..458949c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -492,7 +492,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, Value *BranchVal = LIC; if (!isa<ConstantInt>(Val) || Val->getType() != Type::getInt1Ty(LIC->getContext())) - BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val, "tmp"); + BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val); else if (Val != ConstantInt::getTrue(Val->getContext())) // We want to enter the new loop when the condition is true. std::swap(TrueDest, FalseDest); @@ -561,10 +561,17 @@ void LoopUnswitch::SplitExitEdges(Loop *L, BasicBlock *ExitBlock = ExitBlocks[i]; SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock), pred_end(ExitBlock)); + // Although SplitBlockPredecessors doesn't preserve loop-simplify in // general, if we call it on all predecessors of all exits then it does. - SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(), - ".us-lcssa", this); + if (!ExitBlock->isLandingPad()) { + SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(), + ".us-lcssa", this); + } else { + SmallVector<BasicBlock*, 2> NewBBs; + SplitLandingPadPredecessors(ExitBlock, Preds, ".us-lcssa", ".us-lcssa", + this, NewBBs); + } } } @@ -632,7 +639,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // as well. ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase()); } - + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]); // The new exit block should be in the same loop as the old one. @@ -653,6 +660,19 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, if (It != VMap.end()) V = It->second; PN->addIncoming(V, NewExit); } + + if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { + PN = PHINode::Create(LPad->getType(), 0, "", + ExitSucc->getFirstInsertionPt()); + + for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc); + I != E; ++I) { + BasicBlock *BB = *I; + LandingPadInst *LPI = BB->getLandingPadInst(); + LPI->replaceAllUsesWith(PN); + PN->addIncoming(LPI, BB); + } + } } // Rewrite the code to refer to itself. 
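Both the LSR and LoopUnswitch hunks above special-case landing pads when splitting predecessor edges; the recurring shape, with assumed names (ExitBlock, P, ".split"):

  SmallVector<BasicBlock*, 8> Preds(pred_begin(ExitBlock), pred_end(ExitBlock));
  if (!ExitBlock->isLandingPad()) {
    // Ordinary block: the generic splitting utility is sufficient.
    SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(), ".split", P);
  } else {
    // Landing pads need the dedicated helper so the landingpad instruction
    // and its PHIs are reproduced correctly in the new predecessor blocks.
    SmallVector<BasicBlock*, 2> NewBBs;
    SplitLandingPadPredecessors(ExitBlock, Preds, ".split", ".split", P, NewBBs);
  }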
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp index 9087b46..689bbe9 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp @@ -20,98 +20,88 @@ #include "llvm/Support/IRBuilder.h" using namespace llvm; -static bool LowerAtomicIntrinsic(IntrinsicInst *II) { - IRBuilder<> Builder(II->getParent(), II); - unsigned IID = II->getIntrinsicID(); - switch (IID) { - case Intrinsic::memory_barrier: - break; +static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { + IRBuilder<> Builder(CXI->getParent(), CXI); + Value *Ptr = CXI->getPointerOperand(); + Value *Cmp = CXI->getCompareOperand(); + Value *Val = CXI->getNewValOperand(); + + LoadInst *Orig = Builder.CreateLoad(Ptr); + Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); + Value *Res = Builder.CreateSelect(Equal, Val, Orig); + Builder.CreateStore(Res, Ptr); + + CXI->replaceAllUsesWith(Orig); + CXI->eraseFromParent(); + return true; +} - case Intrinsic::atomic_load_add: - case Intrinsic::atomic_load_sub: - case Intrinsic::atomic_load_and: - case Intrinsic::atomic_load_nand: - case Intrinsic::atomic_load_or: - case Intrinsic::atomic_load_xor: - case Intrinsic::atomic_load_max: - case Intrinsic::atomic_load_min: - case Intrinsic::atomic_load_umax: - case Intrinsic::atomic_load_umin: { - Value *Ptr = II->getArgOperand(0), *Delta = II->getArgOperand(1); +static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) { + IRBuilder<> Builder(RMWI->getParent(), RMWI); + Value *Ptr = RMWI->getPointerOperand(); + Value *Val = RMWI->getValOperand(); - LoadInst *Orig = Builder.CreateLoad(Ptr); - Value *Res = NULL; - switch (IID) { - default: assert(0 && "Unrecognized atomic modify operation"); - case Intrinsic::atomic_load_add: - Res = Builder.CreateAdd(Orig, Delta); - break; - case Intrinsic::atomic_load_sub: - Res = Builder.CreateSub(Orig, Delta); - break; - case Intrinsic::atomic_load_and: - Res = Builder.CreateAnd(Orig, Delta); - break; - case Intrinsic::atomic_load_nand: - Res = Builder.CreateNot(Builder.CreateAnd(Orig, Delta)); - break; - case Intrinsic::atomic_load_or: - Res = Builder.CreateOr(Orig, Delta); - break; - case Intrinsic::atomic_load_xor: - Res = Builder.CreateXor(Orig, Delta); - break; - case Intrinsic::atomic_load_max: - Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), - Delta, Orig); - break; - case Intrinsic::atomic_load_min: - Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), - Orig, Delta); - break; - case Intrinsic::atomic_load_umax: - Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), - Delta, Orig); - break; - case Intrinsic::atomic_load_umin: - Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), - Orig, Delta); - break; - } - Builder.CreateStore(Res, Ptr); + LoadInst *Orig = Builder.CreateLoad(Ptr); + Value *Res = NULL; - II->replaceAllUsesWith(Orig); + switch (RMWI->getOperation()) { + default: llvm_unreachable("Unexpected RMW operation"); + case AtomicRMWInst::Xchg: + Res = Val; break; - } - - case Intrinsic::atomic_swap: { - Value *Ptr = II->getArgOperand(0), *Val = II->getArgOperand(1); - LoadInst *Orig = Builder.CreateLoad(Ptr); - Builder.CreateStore(Val, Ptr); - II->replaceAllUsesWith(Orig); + case AtomicRMWInst::Add: + Res = Builder.CreateAdd(Orig, Val); break; - } - - case Intrinsic::atomic_cmp_swap: { - Value *Ptr = II->getArgOperand(0), *Cmp = II->getArgOperand(1); - Value *Val = II->getArgOperand(2); - - LoadInst *Orig = 
Builder.CreateLoad(Ptr); - Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); - Value *Res = Builder.CreateSelect(Equal, Val, Orig); - Builder.CreateStore(Res, Ptr); - II->replaceAllUsesWith(Orig); + case AtomicRMWInst::Sub: + Res = Builder.CreateSub(Orig, Val); + break; + case AtomicRMWInst::And: + Res = Builder.CreateAnd(Orig, Val); + break; + case AtomicRMWInst::Nand: + Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val)); + break; + case AtomicRMWInst::Or: + Res = Builder.CreateOr(Orig, Val); + break; + case AtomicRMWInst::Xor: + Res = Builder.CreateXor(Orig, Val); + break; + case AtomicRMWInst::Max: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val), + Val, Orig); + break; + case AtomicRMWInst::Min: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val), + Orig, Val); + break; + case AtomicRMWInst::UMax: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val), + Val, Orig); + break; + case AtomicRMWInst::UMin: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val), + Orig, Val); break; } + Builder.CreateStore(Res, Ptr); + RMWI->replaceAllUsesWith(Orig); + RMWI->eraseFromParent(); + return true; +} - default: - return false; - } +static bool LowerFenceInst(FenceInst *FI) { + FI->eraseFromParent(); + return true; +} - assert(II->use_empty() && - "Lowering should have eliminated any uses of the intrinsic call!"); - II->eraseFromParent(); +static bool LowerLoadInst(LoadInst *LI) { + LI->setAtomic(NotAtomic); + return true; +} +static bool LowerStoreInst(StoreInst *SI) { + SI->setAtomic(NotAtomic); return true; } @@ -123,9 +113,22 @@ namespace { } bool runOnBasicBlock(BasicBlock &BB) { bool Changed = false; - for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DI++)) - Changed |= LowerAtomicIntrinsic(II); + for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { + Instruction *Inst = DI++; + if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) + Changed |= LowerFenceInst(FI); + else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst)) + Changed |= LowerAtomicCmpXchgInst(CXI); + else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst)) + Changed |= LowerAtomicRMWInst(RMWI); + else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + if (LI->isAtomic()) + LowerLoadInst(LI); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (SI->isAtomic()) + LowerStoreInst(SI); + } + } return Changed; } }; diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 7ed3db6..eeb8931 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -54,7 +54,7 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, if (OpC->isZero()) continue; // No offset. // Handle struct indices, which add their field offset to the pointer. - if (const StructType *STy = dyn_cast<StructType>(*GTI)) { + if (StructType *STy = dyn_cast<StructType>(*GTI)) { Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); continue; } @@ -384,7 +384,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) { // If this is a store, see if we can merge it in. - if (NextStore->isVolatile()) break; + if (!NextStore->isSimple()) break; // Check to see if this stored value is of the same byte-splattable value. 
if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) @@ -448,7 +448,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // Determine alignment unsigned Alignment = Range.Alignment; if (Alignment == 0) { - const Type *EltType = + Type *EltType = cast<PointerType>(StartPtr->getType())->getElementType(); Alignment = TD->getABITypeAlignment(EltType); } @@ -479,7 +479,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { - if (SI->isVolatile()) return false; + if (!SI->isSimple()) return false; if (TD == 0) return false; @@ -487,7 +487,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { // happen to be using a load-store pair to implement it, rather than // a memcpy. if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { - if (!LI->isVolatile() && LI->hasOneUse() && + if (LI->isSimple() && LI->hasOneUse() && LI->getParent() == SI->getParent()) { MemDepResult ldep = MD->getDependency(LI); CallInst *C = 0; @@ -616,7 +616,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, if (!A->hasStructRetAttr()) return false; - const Type *StructTy = cast<PointerType>(A->getType())->getElementType(); + Type *StructTy = cast<PointerType>(A->getType())->getElementType(); uint64_t destSize = TD->getTypeAllocSize(StructTy); if (destSize < srcSize) @@ -860,7 +860,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // Find out what feeds this byval argument. Value *ByValArg = CS.getArgument(ArgNo); - const Type *ByValTy =cast<PointerType>(ByValArg->getType())->getElementType(); + Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); uint64_t ByValSize = TD->getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize), diff --git a/contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp b/contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp index ee132d3..da74e9c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ObjCARC.cpp @@ -180,7 +180,7 @@ static bool IsPotentialUse(const Value *Op) { Arg->hasStructRetAttr()) return false; // Only consider values with pointer types, and not function pointers. - const PointerType *Ty = dyn_cast<PointerType>(Op->getType()); + PointerType *Ty = dyn_cast<PointerType>(Op->getType()); if (!Ty || isa<FunctionType>(Ty->getElementType())) return false; // Conservatively assume anything else is a potential use. @@ -213,8 +213,8 @@ static InstructionClass GetFunctionClass(const Function *F) { const Argument *A0 = AI++; if (AI == AE) // Argument is a pointer. - if (const PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { - const Type *ETy = PTy->getElementType(); + if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) { + Type *ETy = PTy->getElementType(); // Argument is i8*. if (ETy->isIntegerTy(8)) return StringSwitch<InstructionClass>(F->getName()) @@ -234,7 +234,7 @@ static InstructionClass GetFunctionClass(const Function *F) { .Default(IC_CallOrUser); // Argument is i8** - if (const PointerType *Pte = dyn_cast<PointerType>(ETy)) + if (PointerType *Pte = dyn_cast<PointerType>(ETy)) if (Pte->getElementType()->isIntegerTy(8)) return StringSwitch<InstructionClass>(F->getName()) .Case("objc_loadWeakRetained", IC_LoadWeakRetained) @@ -246,11 +246,11 @@ static InstructionClass GetFunctionClass(const Function *F) { // Two arguments, first is i8**. 
const Argument *A1 = AI++; if (AI == AE) - if (const PointerType *PTy = dyn_cast<PointerType>(A0->getType())) - if (const PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) + if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) + if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType())) if (Pte->getElementType()->isIntegerTy(8)) - if (const PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { - const Type *ETy1 = PTy1->getElementType(); + if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) { + Type *ETy1 = PTy1->getElementType(); // Second argument is i8* if (ETy1->isIntegerTy(8)) return StringSwitch<InstructionClass>(F->getName()) @@ -258,7 +258,7 @@ static InstructionClass GetFunctionClass(const Function *F) { .Case("objc_initWeak", IC_InitWeak) .Default(IC_CallOrUser); // Second argument is i8**. - if (const PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) + if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1)) if (Pte1->getElementType()->isIntegerTy(8)) return StringSwitch<InstructionClass>(F->getName()) .Case("objc_moveWeak", IC_MoveWeak) @@ -344,6 +344,10 @@ static InstructionClass GetInstructionClass(const Value *V) { break; default: // For anything else, check all the operands. + // Note that this includes both operands of a Store: while the first + // operand isn't actually being dereferenced, it is being stored to + // memory where we can no longer track who might read it and dereference + // it, so we have to consider it potentially used. for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) if (IsPotentialUse(*OI)) @@ -421,9 +425,10 @@ static bool IsAlwaysTail(InstructionClass Class) { /// IsNoThrow - Test if the given class represents instructions which are always /// safe to mark with the nounwind attribute.. static bool IsNoThrow(InstructionClass Class) { + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. return Class == IC_Retain || Class == IC_RetainRV || - Class == IC_RetainBlock || Class == IC_Release || Class == IC_Autorelease || Class == IC_AutoreleaseRV || @@ -515,6 +520,10 @@ static bool IsObjCIdentifiedObject(const Value *V) { const Value *Pointer = StripPointerCastsAndObjCCalls(LI->getPointerOperand()); if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) { + // A constant pointer can't be pointing to an object on the heap. It may + // be reference-counted, but it won't be deleted. + if (GV->isConstant()) + return true; StringRef Name = GV->getName(); // These special variables are known to hold values which are not // reference-counted pointers. @@ -738,7 +747,6 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { switch (GetBasicInstructionClass(CS.getInstruction())) { case IC_Retain: case IC_RetainRV: - case IC_RetainBlock: case IC_Autorelease: case IC_AutoreleaseRV: case IC_NoopCast: @@ -746,6 +754,8 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { case IC_FusedRetainAutorelease: case IC_FusedRetainAutoreleaseRV: // These functions don't access any memory visible to the compiler. + // Note that this doesn't include objc_retainBlock, becuase it updates + // pointers when it copies block data. return NoModRef; default: break; @@ -877,7 +887,9 @@ bool ObjCARCExpand::runOnFunction(Function &F) { // usually can't sink them past other calls, which would be the main // case where it would be useful. 
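The classification code above keys off both the callee's name and its prototype; a compressed sketch of the single-i8*-argument case (the class names and helpers are the ones used in these hunks, the specific cases shown are illustrative):

  // Classify a declaration whose only parameter is i8* by its name.
  if (PointerType *PTy = dyn_cast<PointerType>(A0->getType()))
    if (PTy->getElementType()->isIntegerTy(8))
      return StringSwitch<InstructionClass>(F->getName())
               .Case("objc_retain", IC_Retain)
               .Case("objc_release", IC_Release)
               .Default(IC_CallOrUser);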
-/// TODO: The pointer returned from objc_loadWeakRetained is retained. +// TODO: The pointer returned from objc_loadWeakRetained is retained. + +// TODO: Delete release+retain pairs (rare). #include "llvm/GlobalAlias.h" #include "llvm/Constants.h" @@ -1098,16 +1110,16 @@ static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) { if (A == S_None || B == S_None) return S_None; - // Note that we can't merge S_CanRelease and S_Use. if (A > B) std::swap(A, B); if (TopDown) { // Choose the side which is further along in the sequence. - if (A == S_Retain && (B == S_CanRelease || B == S_Use)) + if ((A == S_Retain || A == S_CanRelease) && + (B == S_CanRelease || B == S_Use)) return B; } else { // Choose the side which is further along in the sequence. if ((A == S_Use || A == S_CanRelease) && - (B == S_Release || B == S_Stop || B == S_MovableRelease)) + (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease)) return A; // If both sides are releases, choose the more conservative one. if (A == S_Stop && (B == S_Release || B == S_MovableRelease)) @@ -1124,13 +1136,19 @@ namespace { /// retain-decrement-use-release sequence or release-use-decrement-retain /// reverese sequence. struct RRInfo { - /// KnownIncremented - After an objc_retain, the reference count of the - /// referenced object is known to be positive. Similarly, before an - /// objc_release, the reference count of the referenced object is known to - /// be positive. If there are retain-release pairs in code regions where the - /// retain count is known to be positive, they can be eliminated, regardless - /// of any side effects between them. - bool KnownIncremented; + /// KnownSafe - After an objc_retain, the reference count of the referenced + /// object is known to be positive. Similarly, before an objc_release, the + /// reference count of the referenced object is known to be positive. If + /// there are retain-release pairs in code regions where the retain count + /// is known to be positive, they can be eliminated, regardless of any side + /// effects between them. + /// + /// Also, a retain+release pair nested within another retain+release + /// pair all on the known same pointer value can be eliminated, regardless + /// of any intervening side effects. + /// + /// KnownSafe is true when either of these conditions is satisfied. + bool KnownSafe; /// IsRetainBlock - True if the Calls are objc_retainBlock calls (as /// opposed to objc_retain calls). @@ -1153,7 +1171,7 @@ namespace { SmallPtrSet<Instruction *, 2> ReverseInsertPts; RRInfo() : - KnownIncremented(false), IsRetainBlock(false), IsTailCallRelease(false), + KnownSafe(false), IsRetainBlock(false), IsTailCallRelease(false), ReleaseMetadata(0) {} void clear(); @@ -1161,7 +1179,7 @@ namespace { } void RRInfo::clear() { - KnownIncremented = false; + KnownSafe = false; IsRetainBlock = false; IsTailCallRelease = false; ReleaseMetadata = 0; @@ -1176,6 +1194,9 @@ namespace { /// RefCount - The known minimum number of reference count increments. unsigned RefCount; + /// NestCount - The known minimum level of retain+release nesting. + unsigned NestCount; + /// Seq - The current position in the sequence. Sequence Seq; @@ -1184,7 +1205,11 @@ namespace { /// TODO: Encapsulate this better. 
RRInfo RRI; - PtrState() : RefCount(0), Seq(S_None) {} + PtrState() : RefCount(0), NestCount(0), Seq(S_None) {} + + void SetAtLeastOneRefCount() { + if (RefCount == 0) RefCount = 1; + } void IncrementRefCount() { if (RefCount != UINT_MAX) ++RefCount; @@ -1194,14 +1219,22 @@ namespace { if (RefCount != 0) --RefCount; } - void ClearRefCount() { - RefCount = 0; - } - bool IsKnownIncremented() const { return RefCount > 0; } + void IncrementNestCount() { + if (NestCount != UINT_MAX) ++NestCount; + } + + void DecrementNestCount() { + if (NestCount != 0) --NestCount; + } + + bool IsKnownNested() const { + return NestCount > 0; + } + void SetSeq(Sequence NewSeq) { Seq = NewSeq; } @@ -1233,6 +1266,7 @@ void PtrState::Merge(const PtrState &Other, bool TopDown) { Seq = MergeSeqs(Seq, Other.Seq, TopDown); RefCount = std::min(RefCount, Other.RefCount); + NestCount = std::min(NestCount, Other.NestCount); // We can't merge a plain objc_retain with an objc_retainBlock. if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock) @@ -1245,7 +1279,7 @@ PtrState::Merge(const PtrState &Other, bool TopDown) { if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata) RRI.ReleaseMetadata = 0; - RRI.KnownIncremented = RRI.KnownIncremented && Other.RRI.KnownIncremented; + RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe; RRI.IsTailCallRelease = RRI.IsTailCallRelease && Other.RRI.IsTailCallRelease; RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end()); RRI.ReverseInsertPts.insert(Other.RRI.ReverseInsertPts.begin(), @@ -1316,7 +1350,7 @@ namespace { } void clearBottomUpPointers() { - PerPtrTopDown.clear(); + PerPtrBottomUp.clear(); } void clearTopDownPointers() { @@ -1334,6 +1368,12 @@ namespace { unsigned GetAllPathCount() const { return TopDownPathCount * BottomUpPathCount; } + + /// IsVisitedTopDown - Test whether the block for this BBState has been + /// visited by the top-down portion of the algorithm. + bool isVisitedTopDown() const { + return TopDownPathCount != 0; + } }; } @@ -1364,7 +1404,7 @@ void BBState::MergePred(const BBState &Other) { /*TopDown=*/true); } - // For each entry in our set, if the other set doens't have an entry with the + // For each entry in our set, if the other set doesn't have an entry with the // same key, force it to merge with an empty entry. for (ptr_iterator MI = top_down_ptr_begin(), ME = top_down_ptr_end(); MI != ME; ++MI) @@ -1389,7 +1429,7 @@ void BBState::MergeSucc(const BBState &Other) { /*TopDown=*/false); } - // For each entry in our set, if the other set doens't have an entry + // For each entry in our set, if the other set doesn't have an entry // with the same key, force it to merge with an empty entry. for (ptr_iterator MI = bottom_up_ptr_begin(), ME = bottom_up_ptr_end(); MI != ME; ++MI) @@ -1406,15 +1446,11 @@ namespace { /// Run - A flag indicating whether this optimization pass should run. bool Run; - /// RetainFunc, RelaseFunc - Declarations for objc_retain, - /// objc_retainBlock, and objc_release. - Function *RetainFunc, *RetainBlockFunc, *RetainRVFunc, *ReleaseFunc; - /// RetainRVCallee, etc. - Declarations for ObjC runtime /// functions, for use in creating calls to them. These are initialized /// lazily to avoid cluttering up the Module with unused declarations. Constant *RetainRVCallee, *AutoreleaseRVCallee, *ReleaseCallee, - *RetainCallee, *AutoreleaseCallee; + *RetainCallee, *RetainBlockCallee, *AutoreleaseCallee; /// UsedInThisFunciton - Flags which determine whether each of the /// interesting runtine functions is in fact used in the current function. 
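The "initialized lazily" comment above refers to the usual create-on-first-use pattern for runtime declarations; a generic sketch (the callee name below is hypothetical, not one this pass actually declares):

  // Return a cached declaration, creating it in the module on first use.
  static Constant *getExampleCallee(Module *M, Constant *&Cache) {
    if (!Cache) {
      LLVMContext &C = M->getContext();
      Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
      std::vector<Type *> Params(1, I8X);
      Cache = M->getOrInsertFunction("objc_exampleRuntimeCall", // hypothetical
                                     FunctionType::get(I8X, Params,
                                                       /*isVarArg=*/false));
    }
    return Cache;
  }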
@@ -1428,6 +1464,7 @@ namespace { Constant *getAutoreleaseRVCallee(Module *M); Constant *getReleaseCallee(Module *M); Constant *getRetainCallee(Module *M); + Constant *getRetainBlockCallee(Module *M); Constant *getAutoreleaseCallee(Module *M); void OptimizeRetainCall(Function &F, Instruction *Retain); @@ -1452,11 +1489,13 @@ namespace { void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove, MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases, - SmallVectorImpl<Instruction *> &DeadInsts); + SmallVectorImpl<Instruction *> &DeadInsts, + Module *M); bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases); + DenseMap<Value *, RRInfo> &Releases, + Module *M); void OptimizeWeakCalls(Function &F); @@ -1501,7 +1540,7 @@ Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); std::vector<Type *> Params; Params.push_back(I8X); - const FunctionType *FTy = + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); AttrListPtr Attributes; Attributes.addAttr(~0u, Attribute::NoUnwind); @@ -1518,7 +1557,7 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); std::vector<Type *> Params; Params.push_back(I8X); - const FunctionType *FTy = + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); AttrListPtr Attributes; Attributes.addAttr(~0u, Attribute::NoUnwind); @@ -1561,6 +1600,23 @@ Constant *ObjCARCOpt::getRetainCallee(Module *M) { return RetainCallee; } +Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { + if (!RetainBlockCallee) { + LLVMContext &C = M->getContext(); + std::vector<Type *> Params; + Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); + AttrListPtr Attributes; + // objc_retainBlock is not nounwind because it calls user copy constructors + // which could theoretically throw. + RetainBlockCallee = + M->getOrInsertFunction( + "objc_retainBlock", + FunctionType::get(Params[0], Params, /*isVarArg=*/false), + Attributes); + } + return RetainBlockCallee; +} + Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { if (!AutoreleaseCallee) { LLVMContext &C = M->getContext(); @@ -1904,12 +1960,19 @@ void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV) { // Check for a return of the pointer value. 
const Value *Ptr = GetObjCArg(AutoreleaseRV); - for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); - UI != UE; ++UI) { - const User *I = *UI; - if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV) - return; - } + SmallVector<const Value *, 2> Users; + Users.push_back(Ptr); + do { + Ptr = Users.pop_back_val(); + for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end(); + UI != UE; ++UI) { + const User *I = *UI; + if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV) + return; + if (isa<BitCastInst>(I)) + Users.push_back(I); + } + } while (!Users.empty()); Changed = true; ++NumPeeps; @@ -1953,7 +2016,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_DestroyWeak: { CallInst *CI = cast<CallInst>(Inst); if (isNullOrUndef(CI->getArgOperand(0))) { - const Type *Ty = CI->getArgOperand(0)->getType(); + Type *Ty = CI->getArgOperand(0)->getType(); new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), Constant::getNullValue(Ty), CI); @@ -1968,7 +2031,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { CallInst *CI = cast<CallInst>(Inst); if (isNullOrUndef(CI->getArgOperand(0)) || isNullOrUndef(CI->getArgOperand(1))) { - const Type *Ty = CI->getArgOperand(0)->getType(); + Type *Ty = CI->getArgOperand(0)->getType(); new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), Constant::getNullValue(Ty), CI); @@ -2090,7 +2153,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { ++NumPartialNoops; // Clone the call into each predecessor that has a non-null value. CallInst *CInst = cast<CallInst>(Inst); - const Type *ParamTy = CInst->getArgOperand(0)->getType(); + Type *ParamTy = CInst->getArgOperand(0)->getType(); for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *Incoming = StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); @@ -2132,41 +2195,49 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); bool SomeSuccHasSame = false; bool AllSuccsHaveSame = true; - for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) - switch (BBStates[*SI].getPtrBottomUpState(Arg).GetSeq()) { + PtrState &S = MyStates.getPtrTopDownState(Arg); + for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { + PtrState &SuccS = BBStates[*SI].getPtrBottomUpState(Arg); + switch (SuccS.GetSeq()) { case S_None: - case S_CanRelease: - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); - SomeSuccHasSame = false; - break; + case S_CanRelease: { + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + S.ClearSequenceProgress(); + continue; + } case S_Use: SomeSuccHasSame = true; break; case S_Stop: case S_Release: case S_MovableRelease: - AllSuccsHaveSame = false; + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + AllSuccsHaveSame = false; break; case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); } + } // If the state at the other end of any of the successor edges // matches the current state, require all edges to match. This // guards against loops in the middle of a sequence. 
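Editorial aside: the OptimizeAutoreleaseRVCall hunk above replaces a scan of the pointer's direct users with a small worklist that also looks through bitcasts before deciding the value is not returned. A self-contained sketch of that worklist shape, with hypothetical node types rather than llvm::Value:

#include <vector>
#include <iostream>

// Illustrative value graph: each node knows its users and whether it is
// a no-op cast (which forwards the pointer) or a return.
struct Node {
  bool IsCast;
  bool IsReturn;
  std::vector<Node*> Users;
};

// Returns true if some (possibly cast-forwarded) user returns the value.
static bool hasReturnUser(Node *Root) {
  std::vector<Node*> Worklist{Root};
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    for (Node *U : N->Users) {
      if (U->IsReturn)
        return true;
      if (U->IsCast)            // look through bitcasts, as the patch does
        Worklist.push_back(U);
    }
  }
  return false;
}

int main() {
  Node Ret{false, true, {}};
  Node Cast{true, false, {&Ret}};
  Node Ptr{false, false, {&Cast}};
  std::cout << hasReturnUser(&Ptr) << "\n"; // 1: return reached through the cast
  return 0;
}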
if (SomeSuccHasSame && !AllSuccsHaveSame) - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); + S.ClearSequenceProgress(); } case S_CanRelease: { const Value *Arg = I->first; const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); bool SomeSuccHasSame = false; bool AllSuccsHaveSame = true; - for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) - switch (BBStates[*SI].getPtrBottomUpState(Arg).GetSeq()) { - case S_None: - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); - SomeSuccHasSame = false; - break; + PtrState &S = MyStates.getPtrTopDownState(Arg); + for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) { + PtrState &SuccS = BBStates[*SI].getPtrBottomUpState(Arg); + switch (SuccS.GetSeq()) { + case S_None: { + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + S.ClearSequenceProgress(); + continue; + } case S_CanRelease: SomeSuccHasSame = true; break; @@ -2174,16 +2245,18 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, case S_Release: case S_MovableRelease: case S_Use: - AllSuccsHaveSame = false; + if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe) + AllSuccsHaveSame = false; break; case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); } + } // If the state at the other end of any of the successor edges // matches the current state, require all edges to match. This // guards against loops in the middle of a sequence. if (SomeSuccHasSame && !AllSuccsHaveSame) - MyStates.getPtrTopDownState(Arg).ClearSequenceProgress(); + S.ClearSequenceProgress(); } } } @@ -2207,6 +2280,8 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, if (Succ == BB) continue; DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ); + // If we haven't seen this node yet, then we've found a CFG cycle. + // Be optimistic here; it's CheckForCFGHazards' job detect trouble. if (I == BBStates.end()) continue; MyStates.InitFromSucc(I->second); @@ -2245,11 +2320,12 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, S.SetSeqToRelease(Inst->getMetadata(ImpreciseReleaseMDKind)); S.RRI.clear(); - S.RRI.KnownIncremented = S.IsKnownIncremented(); + S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented(); S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); S.RRI.Calls.insert(Inst); S.IncrementRefCount(); + S.IncrementNestCount(); break; } case IC_RetainBlock: @@ -2259,6 +2335,13 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, PtrState &S = MyStates.getPtrBottomUpState(Arg); S.DecrementRefCount(); + S.SetAtLeastOneRefCount(); + S.DecrementNestCount(); + + // An objc_retainBlock call with just a use still needs to be kept, + // because it may be copying a block from the stack to the heap. + if (Class == IC_RetainBlock && S.GetSeq() == S_Use) + S.SetSeq(S_CanRelease); switch (S.GetSeq()) { case S_Stop: @@ -2281,7 +2364,7 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); } - break; + continue; } case IC_AutoreleasepoolPop: // Conservatively, clear MyStates for all known pointers. @@ -2305,26 +2388,22 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, PtrState &S = MI->second; Sequence Seq = S.GetSeq(); - // Check for possible retains and releases. + // Check for possible releases. if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - // Check for a retain (we're going bottom-up here). S.DecrementRefCount(); - - // Check for a release. 
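Editorial aside: the CheckForCFGHazards changes above only abandon a pointer's sequence progress when neither the top-down state nor the conflicting successor's bottom-up state is known safe, and they still require that if some successors match, all of them do. A heavily reduced model of that decision, with hypothetical names and a shrunken sequence lattice:

#include <vector>
#include <iostream>

enum Seq { None, CanRelease, Use, Release };

struct State {
  Seq S;
  bool KnownSafe;
};

// Decide whether the top-down state must abandon its progress, given the
// bottom-up states at the successors (modeled on the S_Use case above).
static bool mustClearProgress(const State &TopDown,
                              const std::vector<State> &Succs) {
  bool SomeSuccHasSame = false, AllSuccsHaveSame = true;
  for (const State &SuccS : Succs) {
    if (SuccS.S == Use) {
      SomeSuccHasSame = true;
    } else if (!TopDown.KnownSafe && !SuccS.KnownSafe) {
      if (SuccS.S == None || SuccS.S == CanRelease)
        return true;             // hazard on this edge: clear immediately
      AllSuccsHaveSame = false;  // release-like state: defer the decision
    }
  }
  return SomeSuccHasSame && !AllSuccsHaveSame;
}

int main() {
  State TD{Use, false};
  std::cout << mustClearProgress(TD, {{Use, false}, {Use, false}}) << "\n";        // 0
  std::cout << mustClearProgress(TD, {{Use, false}, {Release, false}}) << "\n";    // 1
  std::cout << mustClearProgress(TD, {{CanRelease, true}, {Use, false}}) << "\n";  // 0: known safe
  return 0;
}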
- if (!IsRetain(Class) && Class != IC_RetainBlock) - switch (Seq) { - case S_Use: - S.SetSeq(S_CanRelease); - continue; - case S_CanRelease: - case S_Release: - case S_MovableRelease: - case S_Stop: - case S_None: - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } + switch (Seq) { + case S_Use: + S.SetSeq(S_CanRelease); + continue; + case S_CanRelease: + case S_Release: + case S_MovableRelease: + case S_Stop: + case S_None: + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + } } // Check for possible direct uses. @@ -2332,14 +2411,14 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, case S_Release: case S_MovableRelease: if (CanUse(Inst, Ptr, PA, Class)) { - S.RRI.ReverseInsertPts.clear(); + assert(S.RRI.ReverseInsertPts.empty()); S.RRI.ReverseInsertPts.insert(Inst); S.SetSeq(S_Use); } else if (Seq == S_Release && (Class == IC_User || Class == IC_CallOrUser)) { // Non-movable releases depend on any possible objc pointer use. S.SetSeq(S_Stop); - S.RRI.ReverseInsertPts.clear(); + assert(S.RRI.ReverseInsertPts.empty()); S.RRI.ReverseInsertPts.insert(Inst); } break; @@ -2378,14 +2457,18 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, if (Pred == BB) continue; DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); - if (I == BBStates.end()) + assert(I != BBStates.end()); + // If we haven't seen this node yet, then we've found a CFG cycle. + // Be optimistic here; it's CheckForCFGHazards' job detect trouble. + if (!I->second.isVisitedTopDown()) continue; MyStates.InitFromPred(I->second); while (PI != PE) { Pred = *PI++; if (Pred != BB) { I = BBStates.find(Pred); - if (I != BBStates.end()) + assert(I != BBStates.end()); + if (I->second.isVisitedTopDown()) MyStates.MergePred(I->second); } } @@ -2422,18 +2505,23 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, S.SetSeq(S_Retain); S.RRI.clear(); S.RRI.IsRetainBlock = Class == IC_RetainBlock; - S.RRI.KnownIncremented = S.IsKnownIncremented(); + // Don't check S.IsKnownIncremented() here because it's not + // sufficient. + S.RRI.KnownSafe = S.IsKnownNested(); S.RRI.Calls.insert(Inst); } + S.SetAtLeastOneRefCount(); S.IncrementRefCount(); - break; + S.IncrementNestCount(); + continue; } case IC_Release: { Arg = GetObjCArg(Inst); PtrState &S = MyStates.getPtrTopDownState(Arg); S.DecrementRefCount(); + S.DecrementNestCount(); switch (S.GetSeq()) { case S_Retain: @@ -2478,16 +2566,12 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, Sequence Seq = S.GetSeq(); // Check for possible releases. - if (!IsRetain(Class) && Class != IC_RetainBlock && - CanAlterRefCount(Inst, Ptr, PA, Class)) { - // Check for a release. + if (CanAlterRefCount(Inst, Ptr, PA, Class)) { S.DecrementRefCount(); - - // Check for a release. switch (Seq) { case S_Retain: S.SetSeq(S_CanRelease); - S.RRI.ReverseInsertPts.clear(); + assert(S.RRI.ReverseInsertPts.empty()); S.RRI.ReverseInsertPts.insert(Inst); // One call can't cause a transition from S_Retain to S_CanRelease @@ -2511,8 +2595,18 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, if (CanUse(Inst, Ptr, PA, Class)) S.SetSeq(S_Use); break; - case S_Use: case S_Retain: + // An objc_retainBlock call may be responsible for copying the block + // data from the stack to the heap. Model this by moving it straight + // from S_Retain to S_Use. 
+ if (S.RRI.IsRetainBlock && + CanUse(Inst, Ptr, PA, Class)) { + assert(S.RRI.ReverseInsertPts.empty()); + S.RRI.ReverseInsertPts.insert(Inst); + S.SetSeq(S_Use); + } + break; + case S_Use: case S_None: break; case S_Stop: @@ -2533,28 +2627,43 @@ ObjCARCOpt::Visit(Function &F, DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases) { - // Use postorder for bottom-up, and reverse-postorder for top-down, because we + // Use reverse-postorder on the reverse CFG for bottom-up, because we // magically know that loops will be well behaved, i.e. they won't repeatedly - // call retain on a single pointer without doing a release. + // call retain on a single pointer without doing a release. We can't use + // ReversePostOrderTraversal here because we want to walk up from each + // function exit point. + SmallPtrSet<BasicBlock *, 16> Visited; + SmallVector<std::pair<BasicBlock *, pred_iterator>, 16> Stack; + SmallVector<BasicBlock *, 16> Order; + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *BB = I; + if (BB->getTerminator()->getNumSuccessors() == 0) + Stack.push_back(std::make_pair(BB, pred_begin(BB))); + } + while (!Stack.empty()) { + pred_iterator End = pred_end(Stack.back().first); + while (Stack.back().second != End) { + BasicBlock *BB = *Stack.back().second++; + if (Visited.insert(BB)) + Stack.push_back(std::make_pair(BB, pred_begin(BB))); + } + Order.push_back(Stack.pop_back_val().first); + } bool BottomUpNestingDetected = false; - SmallVector<BasicBlock *, 8> PostOrder; - for (po_iterator<Function *> I = po_begin(&F), E = po_end(&F); I != E; ++I) { + for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I = + Order.rbegin(), E = Order.rend(); I != E; ++I) { BasicBlock *BB = *I; - PostOrder.push_back(BB); - BottomUpNestingDetected |= VisitBottomUp(BB, BBStates, Retains); } - // Iterate through the post-order in reverse order, achieving a - // reverse-postorder traversal. We don't use the ReversePostOrderTraversal - // class here because it works by computing its own full postorder iteration, - // recording the sequence, and playing it back in reverse. Since we're already - // doing a full iteration above, we can just record the sequence manually and - // avoid the cost of having ReversePostOrderTraversal compute it. + // Use regular reverse-postorder for top-down. bool TopDownNestingDetected = false; - for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator - RI = PostOrder.rbegin(), RE = PostOrder.rend(); RI != RE; ++RI) - TopDownNestingDetected |= VisitTopDown(*RI, BBStates, Releases); + typedef ReversePostOrderTraversal<Function *> RPOTType; + RPOTType RPOT(&F); + for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I) { + BasicBlock *BB = *I; + TopDownNestingDetected |= VisitTopDown(BB, BBStates, Releases); + } return TopDownNestingDetected && BottomUpNestingDetected; } @@ -2565,12 +2674,10 @@ void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &ReleasesToMove, MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases, - SmallVectorImpl<Instruction *> &DeadInsts) { - const Type *ArgTy = Arg->getType(); - const Type *ParamTy = - (RetainRVFunc ? RetainRVFunc : - RetainFunc ? RetainFunc : - RetainBlockFunc)->arg_begin()->getType(); + SmallVectorImpl<Instruction *> &DeadInsts, + Module *M) { + Type *ArgTy = Arg->getType(); + Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext())); // Insert the new retain and release calls. 
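Editorial aside: the rewritten Visit() above computes the bottom-up visitation order by walking the reverse CFG from every exit block with an explicit stack, recording blocks in postorder, and then iterating that order in reverse. A standalone sketch of that order computation over plain integer block IDs (adjacency lists are assumptions of the example, not LLVM API):

#include <vector>
#include <set>
#include <utility>
#include <iostream>

// Walk predecessor edges from every exit block, recording a postorder of
// the reverse CFG; the caller then visits Order back-to-front.
static std::vector<int> reverseCFGPostorder(
    const std::vector<std::vector<int>> &Preds,
    const std::vector<std::vector<int>> &Succs) {
  std::set<int> Visited;
  std::vector<std::pair<int, size_t>> Stack;  // (block, next predecessor index)
  std::vector<int> Order;

  for (int BB = 0, E = (int)Preds.size(); BB != E; ++BB)
    if (Succs[BB].empty())                    // a function exit point
      Stack.push_back({BB, 0});

  while (!Stack.empty()) {
    int BB = Stack.back().first;
    size_t &NextPred = Stack.back().second;
    if (NextPred < Preds[BB].size()) {
      int P = Preds[BB][NextPred++];
      if (Visited.insert(P).second)
        Stack.push_back({P, 0});
    } else {
      Order.push_back(BB);                    // all predecessors done: postorder
      Stack.pop_back();
    }
  }
  return Order;
}

int main() {
  // 0 -> 1 -> 2 (exit)
  std::vector<std::vector<int>> Preds = {{}, {0}, {1}};
  std::vector<std::vector<int>> Succs = {{1}, {2}, {}};
  for (int BB : reverseCFGPostorder(Preds, Succs))
    std::cout << BB << " ";                   // prints "0 1 2"; visit it reversed
  std::cout << "\n";
  return 0;
}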
for (SmallPtrSet<Instruction *, 2>::const_iterator @@ -2581,7 +2688,7 @@ void ObjCARCOpt::MoveCalls(Value *Arg, new BitCastInst(Arg, ParamTy, "", InsertPt); CallInst *Call = CallInst::Create(RetainsToMove.IsRetainBlock ? - RetainBlockFunc : RetainFunc, + getRetainBlockCallee(M) : getRetainCallee(M), MyArg, "", InsertPt); Call->setDoesNotThrow(); if (!RetainsToMove.IsRetainBlock) @@ -2598,8 +2705,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, // The invoke's return value isn't available in the unwind block, // but our releases will never depend on it, because they must be // paired with retains from before the invoke. - InsertPts[0] = II->getNormalDest()->getFirstNonPHI(); - InsertPts[1] = II->getUnwindDest()->getFirstNonPHI(); + InsertPts[0] = II->getNormalDest()->getFirstInsertionPt(); + InsertPts[1] = II->getUnwindDest()->getFirstInsertionPt(); } else { // Insert code immediately after the last use. InsertPts[0] = llvm::next(BasicBlock::iterator(LastUse)); @@ -2609,7 +2716,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, Instruction *InsertPt = *I; Value *MyArg = ArgTy == ParamTy ? Arg : new BitCastInst(Arg, ParamTy, "", InsertPt); - CallInst *Call = CallInst::Create(ReleaseFunc, MyArg, "", InsertPt); + CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg, + "", InsertPt); // Attach a clang.imprecise_release metadata tag, if appropriate. if (MDNode *M = ReleasesToMove.ReleaseMetadata) Call->setMetadata(ImpreciseReleaseMDKind, M); @@ -2640,7 +2748,8 @@ bool ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains, - DenseMap<Value *, RRInfo> &Releases) { + DenseMap<Value *, RRInfo> &Releases, + Module *M) { bool AnyPairsCompletelyEliminated = false; RRInfo RetainsToMove; RRInfo ReleasesToMove; @@ -2649,21 +2758,36 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> SmallVector<Instruction *, 8> DeadInsts; for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(), - E = Retains.end(); I != E; ) { - Value *V = (I++)->first; + E = Retains.end(); I != E; ++I) { + Value *V = I->first; if (!V) continue; // blotted Instruction *Retain = cast<Instruction>(V); Value *Arg = GetObjCArg(Retain); - // If the object being released is in static or stack storage, we know it's + // If the object being released is in static storage, we know it's // not being managed by ObjC reference counting, so we can delete pairs // regardless of what possible decrements or uses lie between them. - bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg); + bool KnownSafe = isa<Constant>(Arg); + + // Same for stack storage, unless this is an objc_retainBlock call, + // which is responsible for copying the block data from the stack to + // the heap. + if (!I->second.IsRetainBlock && isa<AllocaInst>(Arg)) + KnownSafe = true; + + // A constant pointer can't be pointing to an object on the heap. It may + // be reference-counted, but it won't be deleted. + if (const LoadInst *LI = dyn_cast<LoadInst>(Arg)) + if (const GlobalVariable *GV = + dyn_cast<GlobalVariable>( + StripPointerCastsAndObjCCalls(LI->getPointerOperand()))) + if (GV->isConstant()) + KnownSafe = true; // If a pair happens in a region where it is known that the reference count // is already incremented, we can similarly ignore possible decrements. 
- bool KnownIncrementedTD = true, KnownIncrementedBU = true; + bool KnownSafeTD = true, KnownSafeBU = true; // Connect the dots between the top-down-collected RetainsToMove and // bottom-up-collected ReleasesToMove to form sets of related calls. @@ -2683,7 +2807,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain); assert(It != Retains.end()); const RRInfo &NewRetainRRI = It->second; - KnownIncrementedTD &= NewRetainRRI.KnownIncremented; + KnownSafeTD &= NewRetainRRI.KnownSafe; for (SmallPtrSet<Instruction *, 2>::const_iterator LI = NewRetainRRI.Calls.begin(), LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) { @@ -2739,7 +2863,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> Releases.find(NewRelease); assert(It != Releases.end()); const RRInfo &NewReleaseRRI = It->second; - KnownIncrementedBU &= NewReleaseRRI.KnownIncremented; + KnownSafeBU &= NewReleaseRRI.KnownSafe; for (SmallPtrSet<Instruction *, 2>::const_iterator LI = NewReleaseRRI.Calls.begin(), LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) { @@ -2787,12 +2911,19 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> if (NewRetains.empty()) break; } - // If the pointer is known incremented, we can safely delete the pair - // regardless of what's between them. - if (KnownIncrementedTD || KnownIncrementedBU) { + // If the pointer is known incremented or nested, we can safely delete the + // pair regardless of what's between them. + if (KnownSafeTD || KnownSafeBU) { RetainsToMove.ReverseInsertPts.clear(); ReleasesToMove.ReverseInsertPts.clear(); NewCount = 0; + } else { + // Determine whether the new insertion points we computed preserve the + // balance of retain and release calls through the program. + // TODO: If the fully aggressive solution isn't valid, try to find a + // less aggressive solution which is. + if (NewDelta != 0) + goto next_retain; } // Determine whether the original call points are balanced in the retain and @@ -2803,18 +2934,12 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> if (OldDelta != 0) goto next_retain; - // Determine whether the new insertion points we computed preserve the - // balance of retain and release calls through the program. - // TODO: If the fully aggressive solution isn't valid, try to find a - // less aggressive solution which is. - if (NewDelta != 0) - goto next_retain; - // Ok, everything checks out and we're all set. Let's move some code! Changed = true; AnyPairsCompletelyEliminated = NewCount == 0; NumRRs += OldCount - NewCount; - MoveCalls(Arg, RetainsToMove, ReleasesToMove, Retains, Releases, DeadInsts); + MoveCalls(Arg, RetainsToMove, ReleasesToMove, + Retains, Releases, DeadInsts, M); next_retain: NewReleases.clear(); @@ -2993,7 +3118,8 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) { bool NestingDetected = Visit(F, BBStates, Retains, Releases); // Transform. - return PerformCodePlacement(BBStates, Retains, Releases) && NestingDetected; + return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) && + NestingDetected; } /// OptimizeReturns - Look for this pattern: @@ -3072,7 +3198,8 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { // Check that there is nothing that can affect the reference // count between the retain and the call. - FindDependencies(CanChangeRetainCount, Arg, BB, Retain, + // Note that Retain need not be in BB. 
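Editorial aside: PerformCodePlacement above ANDs KnownSafe across the connected retain and release sets and only skips the delta checks when one side is provably safe; otherwise both the original call sites and the proposed insertion points must stay balanced. A reduced sketch of that decision, with illustrative field names:

#include <iostream>

struct PairingInfo {
  bool KnownSafeTD;  // every retain in the set was known safe (top-down)
  bool KnownSafeBU;  // every release in the set was known safe (bottom-up)
  int  OldDelta;     // retains minus releases at the original call sites
  int  NewDelta;     // retains minus releases at the proposed insertion points
};

// Mirror of the check order in the hunk above: known-safe sets ignore the
// insertion-point balance, everything else must be balanced both ways.
static bool canMoveCalls(const PairingInfo &P) {
  if (P.KnownSafeTD || P.KnownSafeBU)
    return P.OldDelta == 0;
  return P.OldDelta == 0 && P.NewDelta == 0;
}

int main() {
  std::cout << canMoveCalls({false, false, 0, 1}) << "\n"; // 0: would unbalance the program
  std::cout << canMoveCalls({true,  false, 0, 1}) << "\n"; // 1: safety makes the new delta moot
  return 0;
}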
+ FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain, DependingInstructions, Visited, PA); if (DependingInstructions.size() != 1) goto next_block; @@ -3117,12 +3244,6 @@ bool ObjCARCOpt::doInitialization(Module &M) { ImpreciseReleaseMDKind = M.getContext().getMDKindID("clang.imprecise_release"); - // Identify the declarations for objc_retain and friends. - RetainFunc = M.getFunction("objc_retain"); - RetainBlockFunc = M.getFunction("objc_retainBlock"); - RetainRVFunc = M.getFunction("objc_retainAutoreleasedReturnValue"); - ReleaseFunc = M.getFunction("objc_release"); - // Intuitively, objc_retain and others are nocapture, however in practice // they are not, because they return their argument value. And objc_release // calls finalizers. @@ -3132,6 +3253,7 @@ bool ObjCARCOpt::doInitialization(Module &M) { AutoreleaseRVCallee = 0; ReleaseCallee = 0; RetainCallee = 0; + RetainBlockCallee = 0; AutoreleaseCallee = 0; return false; @@ -3294,7 +3416,7 @@ Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); std::vector<Type *> Params; Params.push_back(I8X); - const FunctionType *FTy = + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); AttrListPtr Attributes; Attributes.addAttr(~0u, Attribute::NoUnwind); @@ -3310,7 +3432,7 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); std::vector<Type *> Params; Params.push_back(I8X); - const FunctionType *FTy = + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); AttrListPtr Attributes; Attributes.addAttr(~0u, Attribute::NoUnwind); @@ -3377,7 +3499,7 @@ ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, void ObjCARCContract::ContractRelease(Instruction *Release, inst_iterator &Iter) { LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release)); - if (!Load || Load->isVolatile()) return; + if (!Load || !Load->isSimple()) return; // For now, require everything to be in one basic block. BasicBlock *BB = Release->getParent(); @@ -3393,7 +3515,7 @@ void ObjCARCContract::ContractRelease(Instruction *Release, !(AA->getModRefInfo(I, Loc) & AliasAnalysis::Mod))) ++I; StoreInst *Store = dyn_cast<StoreInst>(I); - if (!Store || Store->isVolatile()) return; + if (!Store || !Store->isSimple()) return; if (Store->getPointerOperand() != Loc.Ptr) return; Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); @@ -3411,8 +3533,8 @@ void ObjCARCContract::ContractRelease(Instruction *Release, ++NumStoreStrongs; LLVMContext &C = Release->getContext(); - const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - const Type *I8XX = PointerType::getUnqual(I8X); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *I8XX = PointerType::getUnqual(I8X); Value *Args[] = { Load->getPointerOperand(), New }; if (Args[0]->getType() != I8XX) @@ -3548,7 +3670,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { if (Inst != UserInst && DT->dominates(Inst, UserInst)) { Changed = true; Instruction *Replacement = Inst; - const Type *UseTy = U.get()->getType(); + Type *UseTy = U.get()->getType(); if (PHINode *PHI = dyn_cast<PHINode>(UserInst)) { // For PHI nodes, insert the bitcast in the predecessor block. 
unsigned ValNo = diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp index e6341ae..8f98a5b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -309,7 +309,7 @@ void Reassociate::LinearizeExprTree(BinaryOperator *I, std::swap(LHS, RHS); bool Success = !I->swapOperands(); assert(Success && "swapOperands failed"); - Success = false; + (void)Success; MadeChange = true; } else if (RHSBO) { // Turn (A+B)+(C+D) -> (((A+B)+C)+D). This guarantees the RHS is not diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp index 083412e..196a847 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -156,7 +156,7 @@ namespace { /// class SCCPSolver : public InstVisitor<SCCPSolver> { const TargetData *TD; - SmallPtrSet<BasicBlock*, 8> BBExecutable;// The BBs that are executable. + SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable. DenseMap<Value*, LatticeVal> ValueState; // The state each value is in. /// StructValueState - This maintains ValueState for values that have @@ -241,7 +241,7 @@ public: /// this method must be called. void AddTrackedFunction(Function *F) { // Add an entry, F -> undef. - if (const StructType *STy = dyn_cast<StructType>(F->getReturnType())) { + if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) { MRVFunctionsTracked.insert(F); for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i), @@ -302,7 +302,7 @@ public: /// markAnythingOverdefined - Mark the specified value overdefined. This /// works with both scalars and structs. void markAnythingOverdefined(Value *V) { - if (const StructType *STy = dyn_cast<StructType>(V->getType())) + if (StructType *STy = dyn_cast<StructType>(V->getType())) for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) markOverdefined(getStructValueState(V, i), V); else @@ -417,7 +417,7 @@ private: else if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C)) LV.markConstant(CS->getOperand(i)); // Constants are constant. else if (isa<ConstantAggregateZero>(C)) { - const Type *FieldTy = cast<StructType>(V->getType())->getElementType(i); + Type *FieldTy = cast<StructType>(V->getType())->getElementType(i); LV.markConstant(Constant::getNullValue(FieldTy)); } else LV.markOverdefined(); // Unknown sort of constant. @@ -471,9 +471,9 @@ private: /// UsersOfOverdefinedPHIs map for PN, remove them now. void RemoveFromOverdefinedPHIs(Instruction *I, PHINode *PN) { if (UsersOfOverdefinedPHIs.empty()) return; - std::multimap<PHINode*, Instruction*>::iterator It, E; - tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN); - while (It != E) { + typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; + std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(PN); + for (ItTy It = Range.first, E = Range.second; It != E;) { if (It->second == I) UsersOfOverdefinedPHIs.erase(It++); else @@ -486,9 +486,9 @@ private: /// (Duplicate entries do not break anything directly, but can lead to /// exponential growth of the table in rare cases.) 
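Editorial aside: the Reassociate hunk above swaps a dead "Success = false;" for "(void)Success;". The point is that in NDEBUG builds the assert expands to nothing, so the cast-to-void is what keeps the variable "used" and the build warning-free. Minimal illustration:

#include <cassert>

static bool doWork() { return true; }

int main() {
  bool Success = doWork();
  assert(Success && "doWork failed");
  (void)Success;   // still referenced when assert() compiles away under NDEBUG
  return 0;
}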
void InsertInOverdefinedPHIs(Instruction *I, PHINode *PN) { - std::multimap<PHINode*, Instruction*>::iterator J, E; - tie(J, E) = UsersOfOverdefinedPHIs.equal_range(PN); - for (; J != E; ++J) + typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; + std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(PN); + for (ItTy J = Range.first, E = Range.second; J != E; ++J) if (J->second == I) return; UsersOfOverdefinedPHIs.insert(std::make_pair(PN, I)); @@ -515,6 +515,7 @@ private: void visitShuffleVectorInst(ShuffleVectorInst &I); void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); + void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); } // Instructions that cannot be folded away. void visitStoreInst (StoreInst &I); @@ -528,8 +529,12 @@ private: visitTerminatorInst(II); } void visitCallSite (CallSite CS); + void visitResumeInst (TerminatorInst &I) { /*returns void*/ } void visitUnwindInst (TerminatorInst &I) { /*returns void*/ } void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } + void visitFenceInst (FenceInst &I) { /*returns void*/ } + void visitAtomicCmpXchgInst (AtomicCmpXchgInst &I) { markOverdefined(&I); } + void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); } void visitAllocaInst (Instruction &I) { markOverdefined(&I); } void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); } @@ -577,6 +582,10 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, } if (SwitchInst *SI = dyn_cast<SwitchInst>(&TI)) { + if (TI.getNumSuccessors() < 2) { + Succs[0] = true; + return; + } LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); @@ -637,6 +646,9 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { return true; if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + if (SI->getNumSuccessors() < 2) + return true; + LatticeVal SCValue = getValueState(SI->getCondition()); ConstantInt *CI = SCValue.getConstantInt(); @@ -692,13 +704,14 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // There may be instructions using this PHI node that are not overdefined // themselves. If so, make sure that they know that the PHI node operand // changed. - std::multimap<PHINode*, Instruction*>::iterator I, E; - tie(I, E) = UsersOfOverdefinedPHIs.equal_range(&PN); - if (I == E) + typedef std::multimap<PHINode*, Instruction*>::iterator ItTy; + std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(&PN); + + if (Range.first == Range.second) return; SmallVector<Instruction*, 16> Users; - for (; I != E; ++I) + for (ItTy I = Range.first, E = Range.second; I != E; ++I) Users.push_back(I->second); while (!Users.empty()) visit(Users.pop_back_val()); @@ -772,7 +785,7 @@ void SCCPSolver::visitReturnInst(ReturnInst &I) { // Handle functions that return multiple values. 
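Editorial aside: the SCCP hunks above replace llvm::tie() with a plain std::pair of iterators from std::multimap::equal_range; the subtle part they preserve is the erase(It++) idiom, which removes entries while keeping the loop iterator valid. A standalone illustration in plain standard C++:

#include <map>
#include <iostream>

int main() {
  std::multimap<int, char> M{{1, 'a'}, {1, 'b'}, {1, 'a'}, {2, 'c'}};

  typedef std::multimap<int, char>::iterator ItTy;
  std::pair<ItTy, ItTy> Range = M.equal_range(1);
  for (ItTy It = Range.first, E = Range.second; It != E;) {
    if (It->second == 'a')
      M.erase(It++);   // erase invalidates only the erased iterator
    else
      ++It;
  }

  for (const auto &KV : M)
    std::cout << KV.first << ":" << KV.second << " "; // prints "1:b 2:c "
  std::cout << "\n";
  return 0;
}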
if (!TrackedMultipleRetVals.empty()) { - if (const StructType *STy = dyn_cast<StructType>(ResultOp->getType())) + if (StructType *STy = dyn_cast<StructType>(ResultOp->getType())) if (MRVFunctionsTracked.count(F)) for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F, @@ -825,7 +838,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) { } void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) { - const StructType *STy = dyn_cast<StructType>(IVI.getType()); + StructType *STy = dyn_cast<StructType>(IVI.getType()); if (STy == 0) return markOverdefined(&IVI); @@ -925,7 +938,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { // Could annihilate value. if (I.getOpcode() == Instruction::And) markConstant(IV, &I, Constant::getNullValue(I.getType())); - else if (const VectorType *PT = dyn_cast<VectorType>(I.getType())) + else if (VectorType *PT = dyn_cast<VectorType>(I.getType())) markConstant(IV, &I, Constant::getAllOnesValue(PT)); else markConstant(IV, &I, @@ -1179,8 +1192,8 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) { } Constant *Ptr = Operands[0]; - markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, &Operands[0]+1, - Operands.size()-1)); + ArrayRef<Constant *> Indices(Operands.begin() + 1, Operands.end()); + markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, Indices)); } void SCCPSolver::visitStoreInst(StoreInst &SI) { @@ -1278,7 +1291,7 @@ CallOverdefined: // If we can constant fold this, mark the result of the call as a // constant. - if (Constant *C = ConstantFoldCall(F, Operands.data(), Operands.size())) + if (Constant *C = ConstantFoldCall(F, Operands)) return markConstant(I, C); } @@ -1303,7 +1316,7 @@ CallOverdefined: continue; } - if (const StructType *STy = dyn_cast<StructType>(AI->getType())) { + if (StructType *STy = dyn_cast<StructType>(AI->getType())) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { LatticeVal CallArg = getStructValueState(*CAI, i); mergeInValue(getStructValueState(AI, i), AI, CallArg); @@ -1315,7 +1328,7 @@ CallOverdefined: } // If this is a single/zero retval case, see if we're tracking the function. - if (const StructType *STy = dyn_cast<StructType>(F->getReturnType())) { + if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) { if (!MRVFunctionsTracked.count(F)) goto CallOverdefined; // Not tracking this callee. @@ -1419,67 +1432,116 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // Look for instructions which produce undef values. if (I->getType()->isVoidTy()) continue; - if (const StructType *STy = dyn_cast<StructType>(I->getType())) { - // Only a few things that can be structs matter for undef. Just send - // all their results to overdefined. We could be more precise than this - // but it isn't worth bothering. - if (isa<CallInst>(I) || isa<SelectInst>(I)) { - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - LatticeVal &LV = getStructValueState(I, i); - if (LV.isUndefined()) - markOverdefined(LV, I); - } + if (StructType *STy = dyn_cast<StructType>(I->getType())) { + // Only a few things that can be structs matter for undef. + + // Tracked calls must never be marked overdefined in ResolvedUndefsIn. + if (CallSite CS = CallSite(I)) + if (Function *F = CS.getCalledFunction()) + if (MRVFunctionsTracked.count(F)) + continue; + + // extractvalue and insertvalue don't need to be marked; they are + // tracked as precisely as their operands. 
+ if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I)) + continue; + + // Send the results of everything else to overdefined. We could be + // more precise than this but it isn't worth bothering. + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + LatticeVal &LV = getStructValueState(I, i); + if (LV.isUndefined()) + markOverdefined(LV, I); } continue; } - + LatticeVal &LV = getValueState(I); if (!LV.isUndefined()) continue; - // No instructions using structs need disambiguation. - if (I->getOperand(0)->getType()->isStructTy()) + // extractvalue is safe; check here because the argument is a struct. + if (isa<ExtractValueInst>(I)) continue; - // Get the lattice values of the first two operands for use below. + // Compute the operand LatticeVals, for convenience below. + // Anything taking a struct is conservatively assumed to require + // overdefined markings. + if (I->getOperand(0)->getType()->isStructTy()) { + markOverdefined(I); + return true; + } LatticeVal Op0LV = getValueState(I->getOperand(0)); LatticeVal Op1LV; if (I->getNumOperands() == 2) { - // No instructions using structs need disambiguation. - if (I->getOperand(1)->getType()->isStructTy()) - continue; - - // If this is a two-operand instruction, and if both operands are - // undefs, the result stays undef. + if (I->getOperand(1)->getType()->isStructTy()) { + markOverdefined(I); + return true; + } + Op1LV = getValueState(I->getOperand(1)); - if (Op0LV.isUndefined() && Op1LV.isUndefined()) - continue; } - // If this is an instructions whose result is defined even if the input is // not fully defined, propagate the information. - const Type *ITy = I->getType(); + Type *ITy = I->getType(); switch (I->getOpcode()) { - default: break; // Leave the instruction as an undef. + case Instruction::Add: + case Instruction::Sub: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: + break; // Any undef -> undef + case Instruction::FSub: + case Instruction::FAdd: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + // Floating-point binary operation: be conservative. + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + markForcedConstant(I, Constant::getNullValue(ITy)); + else + markOverdefined(I); + return true; case Instruction::ZExt: - // After a zero extend, we know the top part is zero. SExt doesn't have - // to be handled here, because we don't know whether the top part is 1's - // or 0's. - case Instruction::SIToFP: // some FP values are not possible, just use 0. - case Instruction::UIToFP: // some FP values are not possible, just use 0. + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + // undef -> 0; some outputs are impossible markForcedConstant(I, Constant::getNullValue(ITy)); return true; case Instruction::Mul: case Instruction::And: + // Both operands undef -> undef + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + break; // undef * X -> 0. X could be zero. // undef & X -> 0. X could be zero. markForcedConstant(I, Constant::getNullValue(ITy)); return true; case Instruction::Or: + // Both operands undef -> undef + if (Op0LV.isUndefined() && Op1LV.isUndefined()) + break; // undef | X -> -1. X could be -1. 
markForcedConstant(I, Constant::getAllOnesValue(ITy)); return true; + case Instruction::Xor: + // undef ^ undef -> 0; strictly speaking, this is not strictly + // necessary, but we try to be nice to people who expect this + // behavior in simple cases + if (Op0LV.isUndefined() && Op1LV.isUndefined()) { + markForcedConstant(I, Constant::getNullValue(ITy)); + return true; + } + // undef ^ X -> undef + break; + case Instruction::SDiv: case Instruction::UDiv: case Instruction::SRem: @@ -1494,26 +1556,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { return true; case Instruction::AShr: - // undef >>s X -> undef. No change. - if (Op0LV.isUndefined()) break; - - // X >>s undef -> X. X could be 0, X could have the high-bit known set. - if (Op0LV.isConstant()) - markForcedConstant(I, Op0LV.getConstant()); - else - markOverdefined(I); + // X >>a undef -> undef. + if (Op1LV.isUndefined()) break; + + // undef >>a X -> all ones + markForcedConstant(I, Constant::getAllOnesValue(ITy)); return true; case Instruction::LShr: case Instruction::Shl: - // undef >> X -> undef. No change. - // undef << X -> undef. No change. - if (Op0LV.isUndefined()) break; - - // X >> undef -> 0. X could be 0. - // X << undef -> 0. X could be 0. + // X << undef -> undef. + // X >> undef -> undef. + if (Op1LV.isUndefined()) break; + + // undef << X -> 0 + // undef >> X -> 0 markForcedConstant(I, Constant::getNullValue(ITy)); return true; case Instruction::Select: + Op1LV = getValueState(I->getOperand(1)); // undef ? X : Y -> X or Y. There could be commonality between X/Y. if (Op0LV.isUndefined()) { if (!Op1LV.isConstant()) // Pick the constant one if there is any. @@ -1533,9 +1593,35 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { else markOverdefined(I); return true; + case Instruction::Load: + // A load here means one of two things: a load of undef from a global, + // a load from an unknown pointer. Either way, having it return undef + // is okay. + break; + case Instruction::ICmp: + // X == undef -> undef. Other comparisons get more complicated. + if (cast<ICmpInst>(I)->isEquality()) + break; + markOverdefined(I); + return true; case Instruction::Call: - // If a call has an undef result, it is because it is constant foldable - // but one of the inputs was undef. Just force the result to + case Instruction::Invoke: { + // There are two reasons a call can have an undef result + // 1. It could be tracked. + // 2. It could be constant-foldable. + // Because of the way we solve return values, tracked calls must + // never be marked overdefined in ResolvedUndefsIn. + if (Function *F = CallSite(I).getCalledFunction()) + if (TrackedRetVals.count(F)) + break; + + // If the call is constant-foldable, we mark it overdefined because + // we do not know what return values are valid. + markOverdefined(I); + return true; + } + default: + // If we don't know what should happen here, conservatively mark it // overdefined. markOverdefined(I); return true; @@ -1621,15 +1707,25 @@ FunctionPass *llvm::createSCCPPass() { static void DeleteInstructionInBlock(BasicBlock *BB) { DEBUG(dbgs() << " BasicBlock Dead:" << *BB); ++NumDeadBlocks; - - // Delete the instructions backwards, as it has a reduced likelihood of - // having to update as many def-use and use-def chains. 
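Editorial aside: the revised undef-resolution rules above pick, for an undef shift operand, a concrete value that stays stable no matter what the other operand turns out to be: undef ashr X becomes all ones, undef lshr X and undef shl X become zero, while shifting by undef stays undef. A quick standalone check of those fixed points (IR ashr is a true arithmetic shift; the signed >> below matches that on the usual two's-complement implementations):

#include <cstdint>
#include <cassert>

int main() {
  const uint32_t AllOnes = 0xFFFFFFFFu;
  for (unsigned Sh = 0; Sh < 32; ++Sh) {
    // Arithmetic shift right of all ones keeps smearing the sign bit.
    assert(uint32_t(int32_t(AllOnes) >> Sh) == AllOnes);
    // Logical shifts of zero stay zero, so 0 is safe for lshr and shl.
    assert((0u >> Sh) == 0u && (0u << Sh) == 0u);
  }
  return 0;
}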
- while (!isa<TerminatorInst>(BB->begin())) { - Instruction *I = --BasicBlock::iterator(BB->getTerminator()); - - if (!I->use_empty()) - I->replaceAllUsesWith(UndefValue::get(I->getType())); - BB->getInstList().erase(I); + + // Check to see if there are non-terminating instructions to delete. + if (isa<TerminatorInst>(BB->begin())) + return; + + // Delete the instructions backwards, as it has a reduced likelihood of having + // to update as many def-use and use-def chains. + Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. + while (EndInst != BB->begin()) { + // Delete the next to last instruction. + BasicBlock::iterator I = EndInst; + Instruction *Inst = --I; + if (!Inst->use_empty()) + Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); + if (isa<LandingPadInst>(Inst)) { + EndInst = Inst; + continue; + } + BB->getInstList().erase(Inst); ++NumInstRemoved; } } diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index 302c287..f6918de 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -63,7 +63,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeCFGSimplifyPassPass(Registry); initializeSimplifyLibCallsPass(Registry); initializeSinkingPass(Registry); - initializeTailDupPass(Registry); initializeTailCallElimPass(Registry); } @@ -187,3 +186,7 @@ void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) { void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createBasicAliasAnalysisPass()); } + +void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLowerExpectIntrinsicPass()); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 7d6349c..c6d9123 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -129,11 +129,11 @@ namespace { AllocaInfo &Info); void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info); void isSafeMemAccess(uint64_t Offset, uint64_t MemSize, - const Type *MemOpType, bool isStore, AllocaInfo &Info, + Type *MemOpType, bool isStore, AllocaInfo &Info, Instruction *TheAccess, bool AllowWholeAccess); - bool TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size); - uint64_t FindElementAndOffset(const Type *&T, uint64_t &Offset, - const Type *&IdxTy); + bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size); + uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset, + Type *&IdxTy); void DoScalarReplacement(AllocaInst *AI, std::vector<AllocaInst*> &WorkList); @@ -145,6 +145,9 @@ namespace { SmallVector<AllocaInst*, 32> &NewElts); void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVector<AllocaInst*, 32> &NewElts); + void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, + uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts); void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts); @@ -253,7 +256,7 @@ class ConvertToScalarInfo { /// VectorTy - This tracks the type that we should promote the vector to if /// it is possible to turn it into a vector. This starts out null, and if it /// isn't possible to turn into a vector type, it gets set to VoidTy. 
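Editorial aside: the rewritten DeleteInstructionInBlock above still deletes backwards from the terminator, but now slides the "end" marker over any instruction that must survive (the landingpad in the real code) instead of erasing it. A string-based stand-in for a basic block showing the same loop shape:

#include <list>
#include <string>
#include <iostream>

int main() {
  std::list<std::string> BB = {"landingpad", "add", "mul", "br"};

  auto EndInst = std::prev(BB.end());   // the terminator; never deleted
  while (EndInst != BB.begin()) {
    auto I = std::prev(EndInst);        // next-to-last remaining instruction
    if (*I == "landingpad") {
      EndInst = I;                      // keep it, continue deleting above it
      continue;
    }
    BB.erase(I);
  }

  for (const std::string &S : BB)
    std::cout << S << " ";              // prints "landingpad br "
  std::cout << "\n";
  return 0;
}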
- const VectorType *VectorTy; + VectorType *VectorTy; /// HadNonMemTransferAccess - True if there is at least one access to the /// alloca that is not a MemTransferInst. We don't want to turn structs into @@ -269,11 +272,11 @@ public: private: bool CanConvertToScalar(Value *V, uint64_t Offset); - void MergeInTypeForLoadOrStore(const Type *In, uint64_t Offset); - bool MergeInVectorType(const VectorType *VInTy, uint64_t Offset); + void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset); + bool MergeInVectorType(VectorType *VInTy, uint64_t Offset); void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset); - Value *ConvertScalar_ExtractValue(Value *NV, const Type *ToType, + Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType, uint64_t Offset, IRBuilder<> &Builder); Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal, uint64_t Offset, IRBuilder<> &Builder); @@ -295,8 +298,6 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { if (ScalarKind == Unknown) ScalarKind = Integer; - // FIXME: It should be possible to promote the vector type up to the alloca's - // size. if (ScalarKind == Vector && VectorTy->getBitWidth() != AllocaSize * 8) ScalarKind = Integer; @@ -306,7 +307,7 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // random stuff that doesn't use vectors (e.g. <9 x double>) because then // we just get a lot of insert/extracts. If at least one vector is // involved, then we probably really do have a union of vector/array. - const Type *NewTy; + Type *NewTy; if (ScalarKind == Vector) { assert(VectorTy && "Missing type for vector scalar."); DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = " @@ -331,20 +332,16 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { /// (VectorTy) so far at the offset specified by Offset (which is specified in /// bytes). /// -/// There are three cases we handle here: +/// There are two cases we handle here: /// 1) A union of vector types of the same size and potentially its elements. /// Here we turn element accesses into insert/extract element operations. /// This promotes a <4 x float> with a store of float to the third element /// into a <4 x float> that uses insert element. -/// 2) A union of vector types with power-of-2 size differences, e.g. a float, -/// <2 x float> and <4 x float>. Here we turn element accesses into insert -/// and extract element operations, and <2 x float> accesses into a cast to -/// <2 x double>, an extract, and a cast back to <2 x float>. -/// 3) A fully general blob of memory, which we turn into some (potentially +/// 2) A fully general blob of memory, which we turn into some (potentially /// large) integer type with extract and insert operations where the loads /// and stores would mutate the memory. We mark this by setting VectorTy /// to VoidTy. -void ConvertToScalarInfo::MergeInTypeForLoadOrStore(const Type *In, +void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In, uint64_t Offset) { // If we already decided to turn this into a blob of integer memory, there is // nothing to be done. @@ -355,7 +352,7 @@ void ConvertToScalarInfo::MergeInTypeForLoadOrStore(const Type *In, // If the In type is a vector that is the same size as the alloca, see if it // matches the existing VecTy. 
- if (const VectorType *VInTy = dyn_cast<VectorType>(In)) { + if (VectorType *VInTy = dyn_cast<VectorType>(In)) { if (MergeInVectorType(VInTy, Offset)) return; } else if (In->isFloatTy() || In->isDoubleTy() || @@ -371,20 +368,13 @@ void ConvertToScalarInfo::MergeInTypeForLoadOrStore(const Type *In, // if the implied vector agrees with what we already have and if Offset is // compatible with it. if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 && - (!VectorTy || Offset * 8 < VectorTy->getPrimitiveSizeInBits())) { + (!VectorTy || EltSize == VectorTy->getElementType() + ->getPrimitiveSizeInBits()/8)) { if (!VectorTy) { ScalarKind = ImplicitVector; VectorTy = VectorType::get(In, AllocaSize/EltSize); - return; } - - unsigned CurrentEltSize = VectorTy->getElementType() - ->getPrimitiveSizeInBits()/8; - if (EltSize == CurrentEltSize) - return; - - if (In->isIntegerTy() && isPowerOf2_32(AllocaSize / EltSize)) - return; + return; } } @@ -395,74 +385,21 @@ void ConvertToScalarInfo::MergeInTypeForLoadOrStore(const Type *In, /// MergeInVectorType - Handles the vector case of MergeInTypeForLoadOrStore, /// returning true if the type was successfully merged and false otherwise. -bool ConvertToScalarInfo::MergeInVectorType(const VectorType *VInTy, +bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy, uint64_t Offset) { - // TODO: Support nonzero offsets? - if (Offset != 0) - return false; - - // Only allow vectors that are a power-of-2 away from the size of the alloca. - if (!isPowerOf2_64(AllocaSize / (VInTy->getBitWidth() / 8))) - return false; - - // If this the first vector we see, remember the type so that we know the - // element size. - if (!VectorTy) { + if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) { + // If we're storing/loading a vector of the right size, allow it as a + // vector. If this the first vector we see, remember the type so that + // we know the element size. If this is a subsequent access, ignore it + // even if it is a differing type but the same size. Worst case we can + // bitcast the resultant vectors. + if (!VectorTy) + VectorTy = VInTy; ScalarKind = Vector; - VectorTy = VInTy; return true; } - unsigned BitWidth = VectorTy->getBitWidth(); - unsigned InBitWidth = VInTy->getBitWidth(); - - // Vectors of the same size can be converted using a simple bitcast. - if (InBitWidth == BitWidth && AllocaSize == (InBitWidth / 8)) { - ScalarKind = Vector; - return true; - } - - const Type *ElementTy = VectorTy->getElementType(); - const Type *InElementTy = VInTy->getElementType(); - - // Do not allow mixed integer and floating-point accesses from vectors of - // different sizes. - if (ElementTy->isFloatingPointTy() != InElementTy->isFloatingPointTy()) - return false; - - if (ElementTy->isFloatingPointTy()) { - // Only allow floating-point vectors of different sizes if they have the - // same element type. - // TODO: This could be loosened a bit, but would anything benefit? - if (ElementTy != InElementTy) - return false; - - // There are no arbitrary-precision floating-point types, which limits the - // number of legal vector types with larger element types that we can form - // to bitcast and extract a subvector. - // TODO: We could support some more cases with mixed fp128 and double here. 
- if (!(BitWidth == 64 || BitWidth == 128) || - !(InBitWidth == 64 || InBitWidth == 128)) - return false; - } else { - assert(ElementTy->isIntegerTy() && "Vector elements must be either integer " - "or floating-point."); - unsigned BitWidth = ElementTy->getPrimitiveSizeInBits(); - unsigned InBitWidth = InElementTy->getPrimitiveSizeInBits(); - - // Do not allow integer types smaller than a byte or types whose widths are - // not a multiple of a byte. - if (BitWidth < 8 || InBitWidth < 8 || - BitWidth % 8 != 0 || InBitWidth % 8 != 0) - return false; - } - - // Pick the largest of the two vector types. - ScalarKind = Vector; - if (InBitWidth > BitWidth) - VectorTy = VInTy; - - return true; + return false; } /// CanConvertToScalar - V is a pointer. If we can convert the pointee and all @@ -480,7 +417,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { if (LoadInst *LI = dyn_cast<LoadInst>(User)) { // Don't break volatile loads. - if (LI->isVolatile()) + if (!LI->isSimple()) return false; // Don't touch MMX operations. if (LI->getType()->isX86_MMXTy()) @@ -492,7 +429,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Storing the pointer, not into the value? - if (SI->getOperand(0) == V || SI->isVolatile()) return false; + if (SI->getOperand(0) == V || !SI->isSimple()) return false; // Don't touch MMX operations. if (SI->getOperand(0)->getType()->isX86_MMXTy()) return false; @@ -502,7 +439,8 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { } if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { - IsNotTrivial = true; // Can't be mem2reg'd. + if (!onlyUsedByLifetimeMarkers(BCI)) + IsNotTrivial = true; // Can't be mem2reg'd. if (!CanConvertToScalar(BCI, Offset)) return false; continue; @@ -516,7 +454,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), - &Indices[0], Indices.size()); + Indices); // See if all uses can be converted. if (!CanConvertToScalar(GEP, Offset+GEPOffset)) return false; @@ -560,6 +498,14 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { continue; } + // If this is a lifetime intrinsic, we can handle it. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + continue; + } + } + // Otherwise, we cannot handle this! return false; } @@ -589,7 +535,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), - &Indices[0], Indices.size()); + Indices); ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8); GEP->eraseFromParent(); continue; @@ -599,7 +545,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, if (LoadInst *LI = dyn_cast<LoadInst>(User)) { // The load is a bit extract from NewAI shifted right by Offset bits. 
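Editorial aside: the simplified MergeInVectorType above drops the partial-vector and mixed-width cases and only adopts a candidate vector that covers the whole alloca at offset zero; the first such vector fixes the type, and later same-size types are treated as bitcast-compatible. A reduced model of that decision with hypothetical types:

#include <iostream>

struct VecTy { unsigned BitWidth; };

struct ScalarPlan {
  unsigned AllocaSizeBytes;
  const VecTy *VectorTy;

  bool mergeInVectorType(const VecTy &VInTy, unsigned OffsetBytes) {
    if (VInTy.BitWidth / 8 == AllocaSizeBytes && OffsetBytes == 0) {
      if (!VectorTy)
        VectorTy = &VInTy;   // remember the first full-size vector type
      return true;           // later same-size types can simply be bitcast
    }
    return false;            // partial or offset accesses fall back to an integer blob
  }
};

int main() {
  VecTy V4f{128}, V2f{64};
  ScalarPlan P{16, nullptr};
  std::cout << P.mergeInVectorType(V4f, 0) << " "   // 1: whole-alloca <4 x float>
            << P.mergeInVectorType(V2f, 0) << "\n"; // 0: half-size vector now rejected
  return 0;
}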
- Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp"); + Value *LoadedVal = Builder.CreateLoad(NewAI); Value *NewLoadVal = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder); LI->replaceAllUsesWith(NewLoadVal); @@ -668,8 +614,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // pointer (bitcasted), then a store to our new alloca. assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?"); Value *SrcPtr = MTI->getSource(); - const PointerType* SPTy = cast<PointerType>(SrcPtr->getType()); - const PointerType* AIPTy = cast<PointerType>(NewAI->getType()); + PointerType* SPTy = cast<PointerType>(SrcPtr->getType()); + PointerType* AIPTy = cast<PointerType>(NewAI->getType()); if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) { AIPTy = PointerType::get(AIPTy->getElementType(), SPTy->getAddressSpace()); @@ -685,8 +631,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?"); LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval"); - const PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType()); - const PointerType* AIPTy = cast<PointerType>(NewAI->getType()); + PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType()); + PointerType* AIPTy = cast<PointerType>(NewAI->getType()); if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) { AIPTy = PointerType::get(AIPTy->getElementType(), DPTy->getAddressSpace()); @@ -703,65 +649,18 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, continue; } - llvm_unreachable("Unsupported operation!"); - } -} + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + // There's no need to preserve these, as the resulting alloca will be + // converted to a register anyways. + II->eraseFromParent(); + continue; + } + } -/// getScaledElementType - Gets a scaled element type for a partial vector -/// access of an alloca. The input types must be integer or floating-point -/// scalar or vector types, and the resulting type is an integer, float or -/// double. -static const Type *getScaledElementType(const Type *Ty1, const Type *Ty2, - unsigned NewBitWidth) { - bool IsFP1 = Ty1->isFloatingPointTy() || - (Ty1->isVectorTy() && - cast<VectorType>(Ty1)->getElementType()->isFloatingPointTy()); - bool IsFP2 = Ty2->isFloatingPointTy() || - (Ty2->isVectorTy() && - cast<VectorType>(Ty2)->getElementType()->isFloatingPointTy()); - - LLVMContext &Context = Ty1->getContext(); - - // Prefer floating-point types over integer types, as integer types may have - // been created by earlier scalar replacement. - if (IsFP1 || IsFP2) { - if (NewBitWidth == 32) - return Type::getFloatTy(Context); - if (NewBitWidth == 64) - return Type::getDoubleTy(Context); + llvm_unreachable("Unsupported operation!"); } - - return Type::getIntNTy(Context, NewBitWidth); -} - -/// CreateShuffleVectorCast - Creates a shuffle vector to convert one vector -/// to another vector of the same element type which has the same allocation -/// size but different primitive sizes (e.g. <3 x i32> and <4 x i32>). 
-static Value *CreateShuffleVectorCast(Value *FromVal, const Type *ToType, - IRBuilder<> &Builder) { - const Type *FromType = FromVal->getType(); - const VectorType *FromVTy = cast<VectorType>(FromType); - const VectorType *ToVTy = cast<VectorType>(ToType); - assert((ToVTy->getElementType() == FromVTy->getElementType()) && - "Vectors must have the same element type"); - Value *UnV = UndefValue::get(FromType); - unsigned numEltsFrom = FromVTy->getNumElements(); - unsigned numEltsTo = ToVTy->getNumElements(); - - SmallVector<Constant*, 3> Args; - const Type* Int32Ty = Builder.getInt32Ty(); - unsigned minNumElts = std::min(numEltsFrom, numEltsTo); - unsigned i; - for (i=0; i != minNumElts; ++i) - Args.push_back(ConstantInt::get(Int32Ty, i)); - - if (i < numEltsTo) { - Constant* UnC = UndefValue::get(Int32Ty); - for (; i != numEltsTo; ++i) - Args.push_back(UnC); - } - Constant *Mask = ConstantVector::get(Args); - return Builder.CreateShuffleVector(FromVal, UnV, Mask, "tmpV"); } /// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer @@ -775,50 +674,20 @@ static Value *CreateShuffleVectorCast(Value *FromVal, const Type *ToType, /// Offset is an offset from the original alloca, in bits that need to be /// shifted to the right. Value *ConvertToScalarInfo:: -ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, +ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, uint64_t Offset, IRBuilder<> &Builder) { // If the load is of the whole new alloca, no conversion is needed. - const Type *FromType = FromVal->getType(); + Type *FromType = FromVal->getType(); if (FromType == ToType && Offset == 0) return FromVal; // If the result alloca is a vector type, this is either an element // access or a bitcast to another vector type of the same size. - if (const VectorType *VTy = dyn_cast<VectorType>(FromType)) { + if (VectorType *VTy = dyn_cast<VectorType>(FromType)) { unsigned FromTypeSize = TD.getTypeAllocSize(FromType); unsigned ToTypeSize = TD.getTypeAllocSize(ToType); - if (FromTypeSize == ToTypeSize) { - // If the two types have the same primitive size, use a bit cast. - // Otherwise, it is two vectors with the same element type that has - // the same allocation size but different number of elements so use - // a shuffle vector. - if (FromType->getPrimitiveSizeInBits() == - ToType->getPrimitiveSizeInBits()) - return Builder.CreateBitCast(FromVal, ToType, "tmp"); - else - return CreateShuffleVectorCast(FromVal, ToType, Builder); - } - - if (isPowerOf2_64(FromTypeSize / ToTypeSize)) { - assert(!(ToType->isVectorTy() && Offset != 0) && "Can't extract a value " - "of a smaller vector type at a nonzero offset."); - - const Type *CastElementTy = getScaledElementType(FromType, ToType, - ToTypeSize * 8); - unsigned NumCastVectorElements = FromTypeSize / ToTypeSize; - - LLVMContext &Context = FromVal->getContext(); - const Type *CastTy = VectorType::get(CastElementTy, - NumCastVectorElements); - Value *Cast = Builder.CreateBitCast(FromVal, CastTy, "tmp"); - - unsigned EltSize = TD.getTypeAllocSizeInBits(CastElementTy); - unsigned Elt = Offset/EltSize; - assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); - Value *Extract = Builder.CreateExtractElement(Cast, ConstantInt::get( - Type::getInt32Ty(Context), Elt), "tmp"); - return Builder.CreateBitCast(Extract, ToType, "tmp"); - } + if (FromTypeSize == ToTypeSize) + return Builder.CreateBitCast(FromVal, ToType); // Otherwise it must be an element access. 
unsigned Elt = 0; @@ -828,40 +697,39 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); } // Return the element extracted out of it. - Value *V = Builder.CreateExtractElement(FromVal, ConstantInt::get( - Type::getInt32Ty(FromVal->getContext()), Elt), "tmp"); + Value *V = Builder.CreateExtractElement(FromVal, Builder.getInt32(Elt)); if (V->getType() != ToType) - V = Builder.CreateBitCast(V, ToType, "tmp"); + V = Builder.CreateBitCast(V, ToType); return V; } // If ToType is a first class aggregate, extract out each of the pieces and // use insertvalue's to form the FCA. - if (const StructType *ST = dyn_cast<StructType>(ToType)) { + if (StructType *ST = dyn_cast<StructType>(ToType)) { const StructLayout &Layout = *TD.getStructLayout(ST); Value *Res = UndefValue::get(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), Offset+Layout.getElementOffsetInBits(i), Builder); - Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); + Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; } - if (const ArrayType *AT = dyn_cast<ArrayType>(ToType)) { + if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) { uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); Value *Res = UndefValue::get(AT); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), Offset+i*EltSize, Builder); - Res = Builder.CreateInsertValue(Res, Elt, i, "tmp"); + Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; } // Otherwise, this must be a union that was converted to an integer value. - const IntegerType *NTy = cast<IntegerType>(FromVal->getType()); + IntegerType *NTy = cast<IntegerType>(FromVal->getType()); // If this is a big-endian system and the load is narrower than the // full alloca type, we need to do a shift to get the right bits. @@ -881,33 +749,31 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, // only some bits are used. if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth()) FromVal = Builder.CreateLShr(FromVal, - ConstantInt::get(FromVal->getType(), - ShAmt), "tmp"); + ConstantInt::get(FromVal->getType(), ShAmt)); else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth()) FromVal = Builder.CreateShl(FromVal, - ConstantInt::get(FromVal->getType(), - -ShAmt), "tmp"); + ConstantInt::get(FromVal->getType(), -ShAmt)); // Finally, unconditionally truncate the integer to the right width. unsigned LIBitWidth = TD.getTypeSizeInBits(ToType); if (LIBitWidth < NTy->getBitWidth()) FromVal = Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), - LIBitWidth), "tmp"); + LIBitWidth)); else if (LIBitWidth > NTy->getBitWidth()) FromVal = Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(), - LIBitWidth), "tmp"); + LIBitWidth)); // If the result is an integer, this is a trunc or bitcast. if (ToType->isIntegerTy()) { // Should be done. } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) { // Just do a bitcast, we know the sizes match up. - FromVal = Builder.CreateBitCast(FromVal, ToType, "tmp"); + FromVal = Builder.CreateBitCast(FromVal, ToType); } else { // Otherwise must be a pointer. 
- FromVal = Builder.CreateIntToPtr(FromVal, ToType, "tmp"); + FromVal = Builder.CreateIntToPtr(FromVal, ToType); } assert(FromVal->getType() == ToType && "Didn't convert right?"); return FromVal; @@ -927,65 +793,30 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, uint64_t Offset, IRBuilder<> &Builder) { // Convert the stored type to the actual type, shift it left to insert // then 'or' into place. - const Type *AllocaType = Old->getType(); + Type *AllocaType = Old->getType(); LLVMContext &Context = Old->getContext(); - if (const VectorType *VTy = dyn_cast<VectorType>(AllocaType)) { + if (VectorType *VTy = dyn_cast<VectorType>(AllocaType)) { uint64_t VecSize = TD.getTypeAllocSizeInBits(VTy); uint64_t ValSize = TD.getTypeAllocSizeInBits(SV->getType()); // Changing the whole vector with memset or with an access of a different // vector type? - if (ValSize == VecSize) { - // If the two types have the same primitive size, use a bit cast. - // Otherwise, it is two vectors with the same element type that has - // the same allocation size but different number of elements so use - // a shuffle vector. - if (VTy->getPrimitiveSizeInBits() == - SV->getType()->getPrimitiveSizeInBits()) - return Builder.CreateBitCast(SV, AllocaType, "tmp"); - else - return CreateShuffleVectorCast(SV, VTy, Builder); - } - - if (isPowerOf2_64(VecSize / ValSize)) { - assert(!(SV->getType()->isVectorTy() && Offset != 0) && "Can't insert a " - "value of a smaller vector type at a nonzero offset."); - - const Type *CastElementTy = getScaledElementType(VTy, SV->getType(), - ValSize); - unsigned NumCastVectorElements = VecSize / ValSize; - - LLVMContext &Context = SV->getContext(); - const Type *OldCastTy = VectorType::get(CastElementTy, - NumCastVectorElements); - Value *OldCast = Builder.CreateBitCast(Old, OldCastTy, "tmp"); - - Value *SVCast = Builder.CreateBitCast(SV, CastElementTy, "tmp"); - - unsigned EltSize = TD.getTypeAllocSizeInBits(CastElementTy); - unsigned Elt = Offset/EltSize; - assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); - Value *Insert = - Builder.CreateInsertElement(OldCast, SVCast, ConstantInt::get( - Type::getInt32Ty(Context), Elt), "tmp"); - return Builder.CreateBitCast(Insert, AllocaType, "tmp"); - } + if (ValSize == VecSize) + return Builder.CreateBitCast(SV, AllocaType); // Must be an element insertion. assert(SV->getType() == VTy->getElementType()); uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType()); unsigned Elt = Offset/EltSize; - return Builder.CreateInsertElement(Old, SV, - ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt), - "tmp"); + return Builder.CreateInsertElement(Old, SV, Builder.getInt32(Elt)); } // If SV is a first-class aggregate value, insert each value recursively. 
- if (const StructType *ST = dyn_cast<StructType>(SV->getType())) { + if (StructType *ST = dyn_cast<StructType>(SV->getType())) { const StructLayout &Layout = *TD.getStructLayout(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { - Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); + Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+Layout.getElementOffsetInBits(i), Builder); @@ -993,10 +824,10 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, return Old; } - if (const ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { + if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { - Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); + Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder); } return Old; @@ -1009,20 +840,19 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType()); unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType); if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) - SV = Builder.CreateBitCast(SV, - IntegerType::get(SV->getContext(),SrcWidth), "tmp"); + SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth)); else if (SV->getType()->isPointerTy()) - SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()), "tmp"); + SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext())); // Zero extend or truncate the value if needed. if (SV->getType() != AllocaType) { if (SV->getType()->getPrimitiveSizeInBits() < AllocaType->getPrimitiveSizeInBits()) - SV = Builder.CreateZExt(SV, AllocaType, "tmp"); + SV = Builder.CreateZExt(SV, AllocaType); else { // Truncation may be needed if storing more than the alloca can hold // (undefined behavior). - SV = Builder.CreateTrunc(SV, AllocaType, "tmp"); + SV = Builder.CreateTrunc(SV, AllocaType); SrcWidth = DestWidth; SrcStoreWidth = DestStoreWidth; } @@ -1045,12 +875,10 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, // only some bits in the structure are set. APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth)); if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) { - SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), - ShAmt), "tmp"); + SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt)); Mask <<= ShAmt; } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) { - SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), - -ShAmt), "tmp"); + SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt)); Mask = Mask.lshr(-ShAmt); } @@ -1196,7 +1024,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end(); UI != UE; ++UI) { LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || LI->isVolatile()) return false; + if (LI == 0 || !LI->isSimple()) return false; // Both operands to the select need to be dereferencable, either absolutely // (e.g. allocas) or at this point because we can see other accesses to it. 
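The speculation and promotion hunks in this file consistently replace the old LI->isVolatile() / SI->isVolatile() bail-outs with !LI->isSimple() / !SI->isSimple(). In LLVM a "simple" load or store is one that is neither volatile nor atomic, so the new predicate also refuses to speculate or promote atomic accesses. A minimal standalone sketch of the tightened check (illustrative only; the struct and field names below are placeholders, not the LLVM classes):

    #include <cassert>

    // Hypothetical stand-in for a memory access; the real checks live on
    // llvm::LoadInst / llvm::StoreInst.
    struct MemAccess {
      bool IsVolatile = false;
      bool IsAtomic = false;
    };

    // Old guard: only volatile accesses were rejected.
    static bool rejectOld(const MemAccess &A) { return A.IsVolatile; }

    // New guard: reject anything that is not "simple", i.e. volatile OR atomic.
    static bool isSimple(const MemAccess &A) { return !A.IsVolatile && !A.IsAtomic; }
    static bool rejectNew(const MemAccess &A) { return !isSimple(A); }

    int main() {
      MemAccess Atomic;
      Atomic.IsAtomic = true;
      assert(!rejectOld(Atomic) && rejectNew(Atomic)); // atomics are now rejected too
      return 0;
    }
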
@@ -1237,7 +1065,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end(); UI != UE; ++UI) { LoadInst *LI = dyn_cast<LoadInst>(*UI); - if (LI == 0 || LI->isVolatile()) return false; + if (LI == 0 || !LI->isSimple()) return false; // For now we only allow loads in the same block as the PHI. This is a // common case that happens when instcombine merges two loads through a PHI. @@ -1258,17 +1086,21 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { // trapping load in the predecessor if it is a critical edge. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); + Value *InVal = PN->getIncomingValue(i); + + // If the terminator of the predecessor has side-effects (an invoke), + // there is no safe place to put a load in the predecessor. + if (Pred->getTerminator()->mayHaveSideEffects()) + return false; + + // If the value is produced by the terminator of the predecessor + // (an invoke), there is no valid place to put a load in the predecessor. + if (Pred->getTerminator() == InVal) + return false; // If the predecessor has a single successor, then the edge isn't critical. if (Pred->getTerminator()->getNumSuccessors() == 1) continue; - - Value *InVal = PN->getIncomingValue(i); - - // If the InVal is an invoke in the pred, we can't put a load on the edge. - if (InvokeInst *II = dyn_cast<InvokeInst>(InVal)) - if (II->getParent() == Pred) - return false; // If this pointer is always safe to load, or if we can prove that there is // already a load in the block, then we can move the load to the pred block. @@ -1295,13 +1127,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { UI != UE; ++UI) { User *U = *UI; if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - if (LI->isVolatile()) + if (!LI->isSimple()) return false; continue; } if (StoreInst *SI = dyn_cast<StoreInst>(U)) { - if (SI->getOperand(0) == AI || SI->isVolatile()) + if (SI->getOperand(0) == AI || !SI->isSimple()) return false; // Don't allow a store OF the AI, only INTO the AI. continue; } @@ -1343,6 +1175,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { continue; } + if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { + if (onlyUsedByLifetimeMarkers(BCI)) { + InstsToRewrite.insert(BCI); + continue; + } + } + return false; } @@ -1354,6 +1193,18 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { // If we have instructions that need to be rewritten for this to be promotable // take care of it now. for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) { + if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) { + // This could only be a bitcast used by nothing but lifetime intrinsics. + for (BitCastInst::use_iterator I = BCI->use_begin(), E = BCI->use_end(); + I != E;) { + Use &U = I.getUse(); + ++I; + cast<Instruction>(U.getUser())->eraseFromParent(); + } + BCI->eraseFromParent(); + continue; + } + if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) { // Selects in InstsToRewrite only have load uses. Rewrite each as two // loads with a new select. 
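The tryToMakeAllocaBePromotable hunks above also start discarding bitcasts whose only users are llvm.lifetime.start/end markers, since the markers add nothing once the alloca is promoted to a register. The trailing comment refers to the existing select rewrite: a select between two alloca pointers whose only uses are loads becomes a load of each operand plus a select of the loaded values, so no pointer to either alloca survives. Roughly, at the source level (a sketch of the effect, not the IR the pass emits):

    #include <cassert>

    // Before: the pointer select keeps both allocas address-taken.
    static int beforeRewrite(bool c) {
      int a = 1, b = 2;
      int *p = c ? &a : &b; // select of pointers, used only by a load
      return *p;
    }

    // After: speculate both loads, then select the loaded values; with no
    // pointer uses left, both allocas can be promoted by mem2reg.
    static int afterRewrite(bool c) {
      int a = 1, b = 2;
      int va = a, vb = b;   // the two speculated loads
      return c ? va : vb;   // select of values
    }

    int main() {
      assert(beforeRewrite(true) == afterRewrite(true));
      assert(beforeRewrite(false) == afterRewrite(false));
      return 0;
    }
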
@@ -1393,7 +1244,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { continue; } - const Type *LoadTy = cast<PointerType>(PN->getType())->getElementType(); + Type *LoadTy = cast<PointerType>(PN->getType())->getElementType(); PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(), PN->getName()+".ld", PN); @@ -1483,13 +1334,13 @@ bool SROA::performPromotion(Function &F) { /// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for /// SROA. It must be a struct or array type with a small number of elements. static bool ShouldAttemptScalarRepl(AllocaInst *AI) { - const Type *T = AI->getAllocatedType(); + Type *T = AI->getAllocatedType(); // Do not promote any struct into more than 32 separate vars. - if (const StructType *ST = dyn_cast<StructType>(T)) + if (StructType *ST = dyn_cast<StructType>(T)) return ST->getNumElements() <= 32; // Arrays are much less likely to be safe for SROA; only consider // them if they are very small. - if (const ArrayType *AT = dyn_cast<ArrayType>(T)) + if (ArrayType *AT = dyn_cast<ArrayType>(T)) return AT->getNumElements() <= 8; return false; } @@ -1594,7 +1445,7 @@ void SROA::DoScalarReplacement(AllocaInst *AI, std::vector<AllocaInst*> &WorkList) { DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n'); SmallVector<AllocaInst*, 32> ElementAllocas; - if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { + if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { ElementAllocas.reserve(ST->getNumContainedTypes()); for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) { AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0, @@ -1604,9 +1455,9 @@ void SROA::DoScalarReplacement(AllocaInst *AI, WorkList.push_back(NA); // Add to worklist for recursive processing } } else { - const ArrayType *AT = cast<ArrayType>(AI->getAllocatedType()); + ArrayType *AT = cast<ArrayType>(AI->getAllocatedType()); ElementAllocas.reserve(AT->getNumElements()); - const Type *ElTy = AT->getElementType(); + Type *ElTy = AT->getElementType(); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { AllocaInst *NA = new AllocaInst(ElTy, 0, AI->getAlignment(), AI->getName() + "." 
+ Twine(i), AI); @@ -1670,22 +1521,26 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, UI.getOperandNo() == 0, Info, MI, true /*AllowWholeAccess*/); } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - if (LI->isVolatile()) + if (!LI->isSimple()) return MarkUnsafe(Info, User); - const Type *LIType = LI->getType(); + Type *LIType = LI->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), LIType, false, Info, LI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer - if (SI->isVolatile() || SI->getOperand(0) == I) + if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); - const Type *SIType = SI->getOperand(0)->getType(); + Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), SIType, true, Info, SI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() != Intrinsic::lifetime_start && + II->getIntrinsicID() != Intrinsic::lifetime_end) + return MarkUnsafe(Info, User); } else if (isa<PHINode>(User) || isa<SelectInst>(User)) { isSafePHISelectUseForScalarRepl(User, Offset, Info); } else { @@ -1725,19 +1580,19 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, return MarkUnsafe(Info, User); isSafePHISelectUseForScalarRepl(GEPI, Offset, Info); } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - if (LI->isVolatile()) + if (!LI->isSimple()) return MarkUnsafe(Info, User); - const Type *LIType = LI->getType(); + Type *LIType = LI->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), LIType, false, Info, LI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer - if (SI->isVolatile() || SI->getOperand(0) == I) + if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); - const Type *SIType = SI->getOperand(0)->getType(); + Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), SIType, true, Info, SI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; @@ -1776,8 +1631,7 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, // Compute the offset due to this GEP and check if the alloca has a // component element at that offset. SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); - Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), - &Indices[0], Indices.size()); + Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, 0)) MarkUnsafe(Info, GEPI); } @@ -1786,14 +1640,14 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, /// elements of the same type (which is always true for arrays). If so, /// return true with NumElts and EltTy set to the number of elements and the /// element type, respectively. -static bool isHomogeneousAggregate(const Type *T, unsigned &NumElts, - const Type *&EltTy) { - if (const ArrayType *AT = dyn_cast<ArrayType>(T)) { +static bool isHomogeneousAggregate(Type *T, unsigned &NumElts, + Type *&EltTy) { + if (ArrayType *AT = dyn_cast<ArrayType>(T)) { NumElts = AT->getNumElements(); EltTy = (NumElts == 0 ? 
0 : AT->getElementType()); return true; } - if (const StructType *ST = dyn_cast<StructType>(T)) { + if (StructType *ST = dyn_cast<StructType>(T)) { NumElts = ST->getNumContainedTypes(); EltTy = (NumElts == 0 ? 0 : ST->getContainedType(0)); for (unsigned n = 1; n < NumElts; ++n) { @@ -1807,12 +1661,12 @@ static bool isHomogeneousAggregate(const Type *T, unsigned &NumElts, /// isCompatibleAggregate - Check if T1 and T2 are either the same type or are /// "homogeneous" aggregates with the same element type and number of elements. -static bool isCompatibleAggregate(const Type *T1, const Type *T2) { +static bool isCompatibleAggregate(Type *T1, Type *T2) { if (T1 == T2) return true; unsigned NumElts1, NumElts2; - const Type *EltTy1, *EltTy2; + Type *EltTy1, *EltTy2; if (isHomogeneousAggregate(T1, NumElts1, EltTy1) && isHomogeneousAggregate(T2, NumElts2, EltTy2) && NumElts1 == NumElts2 && @@ -1830,7 +1684,7 @@ static bool isCompatibleAggregate(const Type *T1, const Type *T2) { /// If AllowWholeAccess is true, then this allows uses of the entire alloca as a /// unit. If false, it only allows accesses known to be in a single element. void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, - const Type *MemOpType, bool isStore, + Type *MemOpType, bool isStore, AllocaInfo &Info, Instruction *TheAccess, bool AllowWholeAccess) { // Check if this is a load/store of the entire alloca. @@ -1857,7 +1711,7 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, } } // Check if the offset/size correspond to a component within the alloca type. - const Type *T = Info.AI->getAllocatedType(); + Type *T = Info.AI->getAllocatedType(); if (TypeHasComponent(T, Offset, MemSize)) { Info.hasSubelementAccess = true; return; @@ -1868,16 +1722,16 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, /// TypeHasComponent - Return true if T has a component type with the /// specified offset and size. If Size is zero, do not check the size. -bool SROA::TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size) { - const Type *EltTy; +bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { + Type *EltTy; uint64_t EltSize; - if (const StructType *ST = dyn_cast<StructType>(T)) { + if (StructType *ST = dyn_cast<StructType>(T)) { const StructLayout *Layout = TD->getStructLayout(ST); unsigned EltIdx = Layout->getElementContainingOffset(Offset); EltTy = ST->getContainedType(EltIdx); EltSize = TD->getTypeAllocSize(EltTy); Offset -= Layout->getElementOffset(EltIdx); - } else if (const ArrayType *AT = dyn_cast<ArrayType>(T)) { + } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { EltTy = AT->getElementType(); EltSize = TD->getTypeAllocSize(EltTy); if (Offset >= AT->getNumElements() * EltSize) @@ -1924,9 +1778,17 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, // address operand will be updated, so nothing else needs to be done. 
continue; } + + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end) { + RewriteLifetimeIntrinsic(II, AI, Offset, NewElts); + } + continue; + } if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - const Type *LIType = LI->getType(); + Type *LIType = LI->getType(); if (isCompatibleAggregate(LIType, AI->getAllocatedType())) { // Replace: @@ -1956,7 +1818,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, if (StoreInst *SI = dyn_cast<StoreInst>(User)) { Value *Val = SI->getOperand(0); - const Type *SIType = Val->getType(); + Type *SIType = Val->getType(); if (isCompatibleAggregate(SIType, AI->getAllocatedType())) { // Replace: // store { i32, i32 } %val, { i32, i32 }* %alloc @@ -2026,10 +1888,10 @@ void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, /// Sets T to the type of the element and Offset to the offset within that /// element. IdxTy is set to the type of the index result to be used in a /// GEP instruction. -uint64_t SROA::FindElementAndOffset(const Type *&T, uint64_t &Offset, - const Type *&IdxTy) { +uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, + Type *&IdxTy) { uint64_t Idx = 0; - if (const StructType *ST = dyn_cast<StructType>(T)) { + if (StructType *ST = dyn_cast<StructType>(T)) { const StructLayout *Layout = TD->getStructLayout(ST); Idx = Layout->getElementContainingOffset(Offset); T = ST->getContainedType(Idx); @@ -2037,7 +1899,7 @@ uint64_t SROA::FindElementAndOffset(const Type *&T, uint64_t &Offset, IdxTy = Type::getInt32Ty(T->getContext()); return Idx; } - const ArrayType *AT = cast<ArrayType>(T); + ArrayType *AT = cast<ArrayType>(T); T = AT->getElementType(); uint64_t EltSize = TD->getTypeAllocSize(T); Idx = Offset / EltSize; @@ -2053,13 +1915,12 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVector<AllocaInst*, 32> &NewElts) { uint64_t OldOffset = Offset; SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); - Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), - &Indices[0], Indices.size()); + Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); RewriteForScalarRepl(GEPI, AI, Offset, NewElts); - const Type *T = AI->getAllocatedType(); - const Type *IdxTy; + Type *T = AI->getAllocatedType(); + Type *IdxTy; uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy); if (GEPI->getOperand(0) == AI) OldIdx = ~0ULL; // Force the GEP to be rewritten. @@ -2073,7 +1934,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, if (Idx == OldIdx) return; - const Type *i32Ty = Type::getInt32Ty(AI->getContext()); + Type *i32Ty = Type::getInt32Ty(AI->getContext()); SmallVector<Value*, 8> NewArgs; NewArgs.push_back(Constant::getNullValue(i32Ty)); while (EltOffset != 0) { @@ -2082,8 +1943,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, } Instruction *Val = NewElts[Idx]; if (NewArgs.size() > 1) { - Val = GetElementPtrInst::CreateInBounds(Val, NewArgs.begin(), - NewArgs.end(), "", GEPI); + Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI); Val->takeName(GEPI); } if (Val->getType() != GEPI->getType()) @@ -2092,6 +1952,62 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, DeadInsts.push_back(GEPI); } +/// RewriteLifetimeIntrinsic - II is a lifetime.start/lifetime.end. 
Rewrite it +/// to mark the lifetime of the scalarized memory. +void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI, + uint64_t Offset, + SmallVector<AllocaInst*, 32> &NewElts) { + ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0)); + // Put matching lifetime markers on everything from Offset up to + // Offset+OldSize. + Type *AIType = AI->getAllocatedType(); + uint64_t NewOffset = Offset; + Type *IdxTy; + uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy); + + IRBuilder<> Builder(II); + uint64_t Size = OldSize->getLimitedValue(); + + if (NewOffset) { + // Splice the first element and index 'NewOffset' bytes in. SROA will + // split the alloca again later. + Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy()); + V = Builder.CreateGEP(V, Builder.getInt64(NewOffset)); + + IdxTy = NewElts[Idx]->getAllocatedType(); + uint64_t EltSize = TD->getTypeAllocSize(IdxTy) - NewOffset; + if (EltSize > Size) { + EltSize = Size; + Size = 0; + } else { + Size -= EltSize; + } + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + Builder.CreateLifetimeStart(V, Builder.getInt64(EltSize)); + else + Builder.CreateLifetimeEnd(V, Builder.getInt64(EltSize)); + ++Idx; + } + + for (; Idx != NewElts.size() && Size; ++Idx) { + IdxTy = NewElts[Idx]->getAllocatedType(); + uint64_t EltSize = TD->getTypeAllocSize(IdxTy); + if (EltSize > Size) { + EltSize = Size; + Size = 0; + } else { + Size -= EltSize; + } + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + Builder.CreateLifetimeStart(NewElts[Idx], + Builder.getInt64(EltSize)); + else + Builder.CreateLifetimeEnd(NewElts[Idx], + Builder.getInt64(EltSize)); + } + DeadInsts.push_back(II); +} + /// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI. /// Rewrite it to copy or set the elements of the scalarized memory. void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, @@ -2139,7 +2055,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // If the pointer is not the right type, insert a bitcast to the right // type. 
- const Type *NewTy = + Type *NewTy = PointerType::get(AI->getType()->getElementType(), AddrSpace); if (OtherPtr->getType() != NewTy) @@ -2159,16 +2075,16 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, if (OtherPtr) { Value *Idx[2] = { Zero, ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) }; - OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, Idx + 2, + OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, OtherPtr->getName()+"."+Twine(i), MI); uint64_t EltOffset; - const PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType()); - const Type *OtherTy = OtherPtrTy->getElementType(); - if (const StructType *ST = dyn_cast<StructType>(OtherTy)) { + PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType()); + Type *OtherTy = OtherPtrTy->getElementType(); + if (StructType *ST = dyn_cast<StructType>(OtherTy)) { EltOffset = TD->getStructLayout(ST)->getElementOffset(i); } else { - const Type *EltTy = cast<SequentialType>(OtherTy)->getElementType(); + Type *EltTy = cast<SequentialType>(OtherTy)->getElementType(); EltOffset = TD->getTypeAllocSize(EltTy)*i; } @@ -2181,7 +2097,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, } Value *EltPtr = NewElts[i]; - const Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType(); + Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType(); // If we got down to a scalar, insert a load or store as appropriate. if (EltTy->isSingleValueType()) { @@ -2207,7 +2123,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, StoreVal = Constant::getNullValue(EltTy); // 0.0, null, 0, <0,0> } else { // If EltTy is a vector type, get the element type. - const Type *ValTy = EltTy->getScalarType(); + Type *ValTy = EltTy->getScalarType(); // Construct an integer with the right value. unsigned EltSize = TD->getTypeSizeInBits(ValTy); @@ -2228,8 +2144,8 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, assert(StoreVal->getType() == ValTy && "Type mismatch!"); // If the requested value was a vector constant, create it. - if (EltTy != ValTy) { - unsigned NumElts = cast<VectorType>(ValTy)->getNumElements(); + if (EltTy->isVectorTy()) { + unsigned NumElts = cast<VectorType>(EltTy)->getNumElements(); SmallVector<Constant*, 16> Elts(NumElts, StoreVal); StoreVal = ConstantVector::get(Elts); } @@ -2271,7 +2187,7 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, // Extract each element out of the integer according to its structure offset // and store the element value to the individual alloca. Value *SrcVal = SI->getOperand(0); - const Type *AllocaEltTy = AI->getAllocatedType(); + Type *AllocaEltTy = AI->getAllocatedType(); uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); IRBuilder<> Builder(SI); @@ -2286,12 +2202,12 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, // There are two forms here: AI could be an array or struct. Both cases // have different ways to compute the element offset. - if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { + if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { const StructLayout *Layout = TD->getStructLayout(EltSTy); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Get the number of bits to shift SrcVal to get the value. 
- const Type *FieldTy = EltSTy->getElementType(i); + Type *FieldTy = EltSTy->getElementType(i); uint64_t Shift = Layout->getElementOffsetInBits(i); if (TD->isBigEndian()) @@ -2327,8 +2243,8 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, } } else { - const ArrayType *ATy = cast<ArrayType>(AllocaEltTy); - const Type *ArrayEltTy = ATy->getElementType(); + ArrayType *ATy = cast<ArrayType>(AllocaEltTy); + Type *ArrayEltTy = ATy->getElementType(); uint64_t ElementOffset = TD->getTypeAllocSizeInBits(ArrayEltTy); uint64_t ElementSizeBits = TD->getTypeSizeInBits(ArrayEltTy); @@ -2384,7 +2300,7 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts) { // Extract each element out of the NewElts according to its structure offset // and form the result value. - const Type *AllocaEltTy = AI->getAllocatedType(); + Type *AllocaEltTy = AI->getAllocatedType(); uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI @@ -2394,10 +2310,10 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, // have different ways to compute the element offset. const StructLayout *Layout = 0; uint64_t ArrayEltBitOffset = 0; - if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { + if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { Layout = TD->getStructLayout(EltSTy); } else { - const Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType(); + Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType(); ArrayEltBitOffset = TD->getTypeAllocSizeInBits(ArrayEltTy); } @@ -2408,14 +2324,14 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, // Load the value from the alloca. If the NewElt is an aggregate, cast // the pointer to an integer of the same size before doing the load. Value *SrcField = NewElts[i]; - const Type *FieldTy = + Type *FieldTy = cast<PointerType>(SrcField->getType())->getElementType(); uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy); // Ignore zero sized fields like {}, they obviously contain no data. if (FieldSizeBits == 0) continue; - const IntegerType *FieldIntTy = IntegerType::get(LI->getContext(), + IntegerType *FieldIntTy = IntegerType::get(LI->getContext(), FieldSizeBits); if (!FieldTy->isIntegerTy() && !FieldTy->isFloatingPointTy() && !FieldTy->isVectorTy()) @@ -2468,14 +2384,14 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, /// HasPadding - Return true if the specified type has any structure or /// alignment padding in between the elements that would be split apart /// by SROA; return false otherwise. -static bool HasPadding(const Type *Ty, const TargetData &TD) { - if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { +static bool HasPadding(Type *Ty, const TargetData &TD) { + if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { Ty = ATy->getElementType(); return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty); } // SROA currently handles only Arrays and Structs. - const StructType *STy = cast<StructType>(Ty); + StructType *STy = cast<StructType>(Ty); const StructLayout *SL = TD.getStructLayout(STy); unsigned PrevFieldBitOffset = 0; for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { @@ -2530,7 +2446,7 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { // and fusion code. if (!Info.hasSubelementAccess && Info.hasALoadOrStore) { // If the struct/array just has one element, use basic SRoA. 
- if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { + if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { if (ST->getNumElements() > 1) return false; } else { if (cast<ArrayType>(AI->getAllocatedType())->getNumElements() > 1) @@ -2576,7 +2492,7 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, if (LoadInst *LI = dyn_cast<LoadInst>(U)) { // Ignore non-volatile loads, they are always ok. - if (LI->isVolatile()) return false; + if (!LI->isSimple()) return false; continue; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp index 7c415e5..fbb9465 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -134,7 +134,7 @@ namespace { struct StrCatOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strcat" function prototype. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || @@ -184,7 +184,7 @@ struct StrCatOpt : public LibCallOptimization { struct StrNCatOpt : public StrCatOpt { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strncat" function prototype. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || @@ -232,7 +232,7 @@ struct StrNCatOpt : public StrCatOpt { struct StrChrOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strchr" function prototype. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || @@ -282,7 +282,7 @@ struct StrChrOpt : public LibCallOptimization { struct StrRChrOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strrchr" function prototype. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || @@ -323,7 +323,7 @@ struct StrRChrOpt : public LibCallOptimization { struct StrCmpOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strcmp" function prototype. 
- const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getReturnType()->isIntegerTy(32) || FT->getParamType(0) != FT->getParamType(1) || @@ -338,16 +338,17 @@ struct StrCmpOpt : public LibCallOptimization { bool HasStr1 = GetConstantStringInfo(Str1P, Str1); bool HasStr2 = GetConstantStringInfo(Str2P, Str2); - if (HasStr1 && Str1.empty()) // strcmp("", x) -> *x - return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType()); - - if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x - return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - // strcmp(x, y) -> cnst (if both x and y are constant strings) if (HasStr1 && HasStr2) return ConstantInt::get(CI->getType(), - strcmp(Str1.c_str(),Str2.c_str())); + StringRef(Str1).compare(Str2)); + + if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x + return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), + CI->getType())); + + if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x + return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); // strcmp(P, "x") -> memcmp(P, "x", 2) uint64_t Len1 = GetStringLength(Str1P); @@ -371,7 +372,7 @@ struct StrCmpOpt : public LibCallOptimization { struct StrNCmpOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strncmp" function prototype. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || !FT->getReturnType()->isIntegerTy(32) || FT->getParamType(0) != FT->getParamType(1) || @@ -400,16 +401,20 @@ struct StrNCmpOpt : public LibCallOptimization { bool HasStr1 = GetConstantStringInfo(Str1P, Str1); bool HasStr2 = GetConstantStringInfo(Str2P, Str2); - if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> *x - return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType()); + // strncmp(x, y) -> cnst (if both x and y are constant strings) + if (HasStr1 && HasStr2) { + StringRef SubStr1 = StringRef(Str1).substr(0, Length); + StringRef SubStr2 = StringRef(Str2).substr(0, Length); + return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2)); + } + + if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x + return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), + CI->getType())); if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType()); - // strncmp(x, y) -> cnst (if both x and y are constant strings) - if (HasStr1 && HasStr2) - return ConstantInt::get(CI->getType(), - strncmp(Str1.c_str(), Str2.c_str(), Length)); return 0; } }; @@ -426,7 +431,7 @@ struct StrCpyOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Verify the "strcpy" function prototype. unsigned NumParams = OptChkCall ? 
3 : 2; - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != NumParams || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || @@ -462,7 +467,7 @@ struct StrCpyOpt : public LibCallOptimization { struct StrNCpyOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != B.getInt8PtrTy() || @@ -511,7 +516,7 @@ struct StrNCpyOpt : public LibCallOptimization { struct StrLenOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 1 || FT->getParamType(0) != B.getInt8PtrTy() || !FT->getReturnType()->isIntegerTy()) @@ -537,7 +542,7 @@ struct StrLenOpt : public LibCallOptimization { struct StrPBrkOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || @@ -575,7 +580,7 @@ struct StrPBrkOpt : public LibCallOptimization { struct StrToOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy()) @@ -597,7 +602,7 @@ struct StrToOpt : public LibCallOptimization { struct StrSpnOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || @@ -626,7 +631,7 @@ struct StrSpnOpt : public LibCallOptimization { struct StrCSpnOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || FT->getParamType(0) != B.getInt8PtrTy() || FT->getParamType(1) != FT->getParamType(0) || @@ -658,7 +663,7 @@ struct StrCSpnOpt : public LibCallOptimization { struct StrStrOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || @@ -722,7 +727,7 @@ struct StrStrOpt : public LibCallOptimization { struct MemCmpOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || 
!FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy(32)) @@ -773,7 +778,7 @@ struct MemCpyOpt : public LibCallOptimization { // These optimizations require TargetData. if (!TD) return 0; - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || @@ -795,7 +800,7 @@ struct MemMoveOpt : public LibCallOptimization { // These optimizations require TargetData. if (!TD) return 0; - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || @@ -817,7 +822,7 @@ struct MemSetOpt : public LibCallOptimization { // These optimizations require TargetData. if (!TD) return 0; - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy() || @@ -840,7 +845,7 @@ struct MemSetOpt : public LibCallOptimization { struct PowOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 2 arguments of the same FP type, which match the // result type. if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) || @@ -874,8 +879,8 @@ struct PowOpt : public LibCallOptimization { Callee->getAttributes()); Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B, Callee->getAttributes()); - Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf, "tmp"); - Value *Sel = B.CreateSelect(FCmp, Inf, FAbs, "tmp"); + Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf); + Value *Sel = B.CreateSelect(FCmp, Inf, FAbs); return Sel; } @@ -895,7 +900,7 @@ struct PowOpt : public LibCallOptimization { struct Exp2Opt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 1 argument of FP type, which matches the // result type. 
if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || @@ -908,10 +913,10 @@ struct Exp2Opt : public LibCallOptimization { Value *LdExpArg = 0; if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) - LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty(), "tmp"); + LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) - LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty(), "tmp"); + LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty()); } if (LdExpArg) { @@ -946,7 +951,7 @@ struct Exp2Opt : public LibCallOptimization { struct UnaryDoubleFPOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() || !FT->getParamType(0)->isDoubleTy()) return 0; @@ -973,7 +978,7 @@ struct UnaryDoubleFPOpt : public LibCallOptimization { struct FFSOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); // Just make sure this has 2 arguments of the same FP type, which match the // result type. if (FT->getNumParams() != 1 || @@ -996,10 +1001,10 @@ struct FFSOpt : public LibCallOptimization { Value *F = Intrinsic::getDeclaration(Callee->getParent(), Intrinsic::cttz, ArgType); Value *V = B.CreateCall(F, Op, "cttz"); - V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1), "tmp"); - V = B.CreateIntCast(V, B.getInt32Ty(), false, "tmp"); + V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1)); + V = B.CreateIntCast(V, B.getInt32Ty(), false); - Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType), "tmp"); + Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType)); return B.CreateSelect(Cond, V, B.getInt32(0)); } }; @@ -1009,7 +1014,7 @@ struct FFSOpt : public LibCallOptimization { struct IsDigitOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); // We require integer(i32) if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || !FT->getParamType(0)->isIntegerTy(32)) @@ -1028,7 +1033,7 @@ struct IsDigitOpt : public LibCallOptimization { struct IsAsciiOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); // We require integer(i32) if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || !FT->getParamType(0)->isIntegerTy(32)) @@ -1046,7 +1051,7 @@ struct IsAsciiOpt : public LibCallOptimization { struct AbsOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); // We require integer(integer) where the types agree. 
if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() || FT->getParamType(0) != FT->getReturnType()) @@ -1067,7 +1072,7 @@ struct AbsOpt : public LibCallOptimization { struct ToAsciiOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); // We require i32(i32) if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isIntegerTy(32)) @@ -1147,7 +1152,7 @@ struct PrintFOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Require one fixed pointer argument and an integer/void result. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || !(FT->getReturnType()->isIntegerTy() || FT->getReturnType()->isVoidTy())) @@ -1241,7 +1246,7 @@ struct SPrintFOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Require two fixed pointer arguments and an integer result. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy()) @@ -1272,7 +1277,7 @@ struct SPrintFOpt : public LibCallOptimization { struct FWriteOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Require a pointer, an integer, an integer, a pointer, returning integer. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy() || !FT->getParamType(2)->isIntegerTy() || @@ -1310,7 +1315,7 @@ struct FPutsOpt : public LibCallOptimization { if (!TD) return 0; // Require two pointers. Also, we can't optimize if return value is used. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !CI->use_empty()) @@ -1379,7 +1384,7 @@ struct FPrintFOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Require two fixed paramters as pointers and integer result. - const FunctionType *FT = Callee->getFunctionType(); + FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || !FT->getReturnType()->isIntegerTy()) @@ -1410,7 +1415,7 @@ struct FPrintFOpt : public LibCallOptimization { struct PutsOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { // Require one fixed pointer argument and an integer/void result. 
-  const FunctionType *FT = Callee->getFunctionType();
+  FunctionType *FT = Callee->getFunctionType();
   if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
       !(FT->getReturnType()->isIntegerTy() ||
         FT->getReturnType()->isVoidTy()))
@@ -1685,7 +1690,7 @@ void SimplifyLibCalls::setDoesNotAlias(Function &F, unsigned n) {
 void SimplifyLibCalls::inferPrototypeAttributes(Function &F) {
-  const FunctionType *FTy = F.getFunctionType();
+  FunctionType *FTy = F.getFunctionType();
   StringRef Name = F.getName();
   switch (Name[0]) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
index 705f442..c83f56c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -153,9 +153,13 @@ bool Sinking::ProcessBlock(BasicBlock &BB) {
 static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
                          SmallPtrSet<Instruction *, 8> &Stores) {
-  if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
-    if (L->isVolatile()) return false;
+  if (Inst->mayWriteToMemory()) {
+    Stores.insert(Inst);
+    return false;
+  }
+
+  if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
     AliasAnalysis::Location Loc = AA->getLocation(L);
     for (SmallPtrSet<Instruction *, 8>::iterator I = Stores.begin(),
          E = Stores.end(); I != E; ++I)
@@ -163,11 +167,6 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
       return false;
   }
-  if (Inst->mayWriteToMemory()) {
-    Stores.insert(Inst);
-    return false;
-  }
-
   if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst))
     return false;
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailDuplication.cpp b/contrib/llvm/lib/Transforms/Scalar/TailDuplication.cpp
deleted file mode 100644
index 9dd83c0..0000000
--- a/contrib/llvm/lib/Transforms/Scalar/TailDuplication.cpp
+++ /dev/null
@@ -1,373 +0,0 @@
-//===- TailDuplication.cpp - Simplify CFG through tail duplication --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs a limited form of tail duplication, intended to simplify
-// CFGs by removing some unconditional branches. This pass is necessary to
-// straighten out loops created by the C front-end, but also is capable of
-// making other code nicer. After this pass is run, the CFG simplify pass
-// should be run to clean up the mess.
-//
-// This pass could be enhanced in the future to use profile information to be
-// more aggressive.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "tailduplicate"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Constant.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Pass.h"
-#include "llvm/Type.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <map>
-using namespace llvm;
-
-STATISTIC(NumEliminated, "Number of unconditional branches eliminated");
-
-static cl::opt<unsigned>
-TailDupThreshold("taildup-threshold",
-                 cl::desc("Max block size to tail duplicate"),
-                 cl::init(1), cl::Hidden);
-
-namespace {
-  class TailDup : public FunctionPass {
-    bool runOnFunction(Function &F);
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    TailDup() : FunctionPass(ID) {
-      initializeTailDupPass(*PassRegistry::getPassRegistry());
-    }
-
-  private:
-    inline bool shouldEliminateUnconditionalBranch(TerminatorInst *, unsigned);
-    inline void eliminateUnconditionalBranch(BranchInst *BI);
-    SmallPtrSet<BasicBlock*, 4> CycleDetector;
-  };
-}
-
-char TailDup::ID = 0;
-INITIALIZE_PASS(TailDup, "tailduplicate", "Tail Duplication", false, false)
-
-// Public interface to the Tail Duplication pass
-FunctionPass *llvm::createTailDuplicationPass() { return new TailDup(); }
-
-/// runOnFunction - Top level algorithm - Loop over each unconditional branch in
-/// the function, eliminating it if it looks attractive enough. CycleDetector
-/// prevents infinite loops by checking that we aren't redirecting a branch to
-/// a place it already pointed to earlier; see PR 2323.
-bool TailDup::runOnFunction(Function &F) {
-  bool Changed = false;
-  CycleDetector.clear();
-  for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
-    if (shouldEliminateUnconditionalBranch(I->getTerminator(),
-                                           TailDupThreshold)) {
-      eliminateUnconditionalBranch(cast<BranchInst>(I->getTerminator()));
-      Changed = true;
-    } else {
-      ++I;
-      CycleDetector.clear();
-    }
-  }
-  return Changed;
-}
-
-/// shouldEliminateUnconditionalBranch - Return true if this branch looks
-/// attractive to eliminate. We eliminate the branch if the destination basic
-/// block has <= 5 instructions in it, not counting PHI nodes. In practice,
-/// since one of these is a terminator instruction, this means that we will add
-/// up to 4 instructions to the new block.
-///
-/// We don't count PHI nodes in the count since they will be removed when the
-/// contents of the block are copied over.
-///
-bool TailDup::shouldEliminateUnconditionalBranch(TerminatorInst *TI,
-                                                 unsigned Threshold) {
-  BranchInst *BI = dyn_cast<BranchInst>(TI);
-  if (!BI || !BI->isUnconditional()) return false;  // Not an uncond branch!
-
-  BasicBlock *Dest = BI->getSuccessor(0);
-  if (Dest == BI->getParent()) return false;        // Do not loop infinitely!
-
-  // Do not inline a block if we will just get another branch to the same block!
-  TerminatorInst *DTI = Dest->getTerminator();
-  if (BranchInst *DBI = dyn_cast<BranchInst>(DTI))
-    if (DBI->isUnconditional() && DBI->getSuccessor(0) == Dest)
-      return false;                                 // Do not loop infinitely!
-
-  // FIXME: DemoteRegToStack cannot yet demote invoke instructions to the stack,
-  // because doing so would require breaking critical edges. This should be
-  // fixed eventually.
-  if (!DTI->use_empty())
-    return false;
-
-  // Do not bother with blocks with only a single predecessor: simplify
-  // CFG will fold these two blocks together!
-  pred_iterator PI = pred_begin(Dest), PE = pred_end(Dest);
-  ++PI;
-  if (PI == PE) return false;  // Exactly one predecessor!
-
-  BasicBlock::iterator I = Dest->getFirstNonPHI();
-
-  for (unsigned Size = 0; I != Dest->end(); ++I) {
-    if (Size == Threshold) return false;  // The block is too large.
-
-    // Don't tail duplicate call instructions. They are very large compared to
-    // other instructions.
-    if (isa<CallInst>(I) || isa<InvokeInst>(I)) return false;
-
-    // Also alloca and malloc.
-    if (isa<AllocaInst>(I)) return false;
-
-    // Some vector instructions can expand into a number of instructions.
-    if (isa<ShuffleVectorInst>(I) || isa<ExtractElementInst>(I) ||
-        isa<InsertElementInst>(I)) return false;
-
-    // Only count instructions that are not debugger intrinsics.
-    if (!isa<DbgInfoIntrinsic>(I)) ++Size;
-  }
-
-  // Do not tail duplicate a block that has thousands of successors into a block
-  // with a single successor if the block has many other predecessors. This can
-  // cause an N^2 explosion in CFG edges (and PHI node entries), as seen in
-  // cases that have a large number of indirect gotos.
-  unsigned NumSuccs = DTI->getNumSuccessors();
-  if (NumSuccs > 8) {
-    unsigned TooMany = 128;
-    if (NumSuccs >= TooMany) return false;
-    TooMany = TooMany/NumSuccs;
-    for (; PI != PE; ++PI)
-      if (TooMany-- == 0) return false;
-  }
-
-  // If this unconditional branch is a fall-through, be careful about
-  // tail duplicating it. In particular, we don't want to taildup it if the
-  // original block will still be there after taildup is completed: doing so
-  // would eliminate the fall-through, requiring unconditional branches.
-  Function::iterator DestI = Dest;
-  if (&*--DestI == BI->getParent()) {
-    // The uncond branch is a fall-through. Tail duplication of the block is
-    // will eliminate the fall-through-ness and end up cloning the terminator
-    // at the end of the Dest block. Since the original Dest block will
-    // continue to exist, this means that one or the other will not be able to
-    // fall through. One typical example that this helps with is code like:
-    //   if (a)
-    //     foo();
-    //   if (b)
-    //     foo();
-    // Cloning the 'if b' block into the end of the first foo block is messy.
-
-    // The messy case is when the fall-through block falls through to other
-    // blocks. This is what we would be preventing if we cloned the block.
-    DestI = Dest;
-    if (++DestI != Dest->getParent()->end()) {
-      BasicBlock *DestSucc = DestI;
-      // If any of Dest's successors are fall-throughs, don't do this xform.
-      for (succ_iterator SI = succ_begin(Dest), SE = succ_end(Dest);
-           SI != SE; ++SI)
-        if (*SI == DestSucc)
-          return false;
-    }
-  }
-
-  // Finally, check that we haven't redirected to this target block earlier;
-  // there are cases where we loop forever if we don't check this (PR 2323).
-  if (!CycleDetector.insert(Dest))
-    return false;
-
-  return true;
-}
-
-/// FindObviousSharedDomOf - We know there is a branch from SrcBlock to
-/// DestBlock, and that SrcBlock is not the only predecessor of DstBlock. If we
-/// can find a predecessor of SrcBlock that is a dominator of both SrcBlock and
-/// DstBlock, return it.
-static BasicBlock *FindObviousSharedDomOf(BasicBlock *SrcBlock,
-                                          BasicBlock *DstBlock) {
-  // SrcBlock must have a single predecessor.
-  pred_iterator PI = pred_begin(SrcBlock), PE = pred_end(SrcBlock);
-  if (PI == PE || ++PI != PE) return 0;
-
-  BasicBlock *SrcPred = *pred_begin(SrcBlock);
-
-  // Look at the predecessors of DstBlock. One of them will be SrcBlock. If
-  // there is only one other pred, get it, otherwise we can't handle it.
-  PI = pred_begin(DstBlock); PE = pred_end(DstBlock);
-  BasicBlock *DstOtherPred = 0;
-  BasicBlock *P = *PI;
-  if (P == SrcBlock) {
-    if (++PI == PE) return 0;
-    DstOtherPred = *PI;
-    if (++PI != PE) return 0;
-  } else {
-    DstOtherPred = P;
-    if (++PI == PE || *PI != SrcBlock || ++PI != PE) return 0;
-  }
-
-  // We can handle two situations here: "if then" and "if then else" blocks. An
-  // 'if then' situation is just where DstOtherPred == SrcPred.
-  if (DstOtherPred == SrcPred)
-    return SrcPred;
-
-  // Check to see if we have an "if then else" situation, which means that
-  // DstOtherPred will have a single predecessor and it will be SrcPred.
-  PI = pred_begin(DstOtherPred); PE = pred_end(DstOtherPred);
-  if (PI != PE && *PI == SrcPred) {
-    if (++PI != PE) return 0;  // Not a single pred.
-    return SrcPred;  // Otherwise, it's an "if then" situation. Return the if.
-  }
-
-  // Otherwise, this is something we can't handle.
-  return 0;
-}
-
-
-/// eliminateUnconditionalBranch - Clone the instructions from the destination
-/// block into the source block, eliminating the specified unconditional branch.
-/// If the destination block defines values used by successors of the dest
-/// block, we may need to insert PHI nodes.
-///
-void TailDup::eliminateUnconditionalBranch(BranchInst *Branch) {
-  BasicBlock *SourceBlock = Branch->getParent();
-  BasicBlock *DestBlock = Branch->getSuccessor(0);
-  assert(SourceBlock != DestBlock && "Our predicate is broken!");
-
-  DEBUG(dbgs() << "TailDuplication[" << SourceBlock->getParent()->getName()
-               << "]: Eliminating branch: " << *Branch);
-
-  // See if we can avoid duplicating code by moving it up to a dominator of both
-  // blocks.
-  if (BasicBlock *DomBlock = FindObviousSharedDomOf(SourceBlock, DestBlock)) {
-    DEBUG(dbgs() << "Found shared dominator: " << DomBlock->getName() << "\n");
-
-    // If there are non-phi instructions in DestBlock that have no operands
-    // defined in DestBlock, and if the instruction has no side effects, we can
-    // move the instruction to DomBlock instead of duplicating it.
-    BasicBlock::iterator BBI = DestBlock->getFirstNonPHI();
-    while (!isa<TerminatorInst>(BBI)) {
-      Instruction *I = BBI++;
-
-      bool CanHoist = I->isSafeToSpeculativelyExecute() &&
-                      !I->mayReadFromMemory();
-      if (CanHoist) {
-        for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
-          if (Instruction *OpI = dyn_cast<Instruction>(I->getOperand(op)))
-            if (OpI->getParent() == DestBlock ||
-                (isa<InvokeInst>(OpI) && OpI->getParent() == DomBlock)) {
-              CanHoist = false;
-              break;
-            }
-        if (CanHoist) {
-          // Remove from DestBlock, move right before the term in DomBlock.
-          DestBlock->getInstList().remove(I);
-          DomBlock->getInstList().insert(DomBlock->getTerminator(), I);
-          DEBUG(dbgs() << "Hoisted: " << *I);
-        }
-      }
-    }
-  }
-
-  // Tail duplication can not update SSA properties correctly if the values
-  // defined in the duplicated tail are used outside of the tail itself. For
-  // this reason, we spill all values that are used outside of the tail to the
-  // stack.
-  for (BasicBlock::iterator I = DestBlock->begin(); I != DestBlock->end(); ++I)
-    if (I->isUsedOutsideOfBlock(DestBlock)) {
-      // We found a use outside of the tail. Create a new stack slot to
-      // break this inter-block usage pattern.
-      DemoteRegToStack(*I);
-    }
-
-  // We are going to have to map operands from the original block B to the new
-  // copy of the block B'. If there are PHI nodes in the DestBlock, these PHI
-  // nodes also define part of this mapping. Loop over these PHI nodes, adding
-  // them to our mapping.
-  //
-  std::map<Value*, Value*> ValueMapping;
-
-  BasicBlock::iterator BI = DestBlock->begin();
-  bool HadPHINodes = isa<PHINode>(BI);
-  for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
-    ValueMapping[PN] = PN->getIncomingValueForBlock(SourceBlock);
-
-  // Clone the non-phi instructions of the dest block into the source block,
-  // keeping track of the mapping...
-  //
-  for (; BI != DestBlock->end(); ++BI) {
-    Instruction *New = BI->clone();
-    New->setName(BI->getName());
-    SourceBlock->getInstList().push_back(New);
-    ValueMapping[BI] = New;
-  }
-
-  // Now that we have built the mapping information and cloned all of the
-  // instructions (giving us a new terminator, among other things), walk the new
-  // instructions, rewriting references of old instructions to use new
-  // instructions.
-  //
-  BI = Branch; ++BI;  // Get an iterator to the first new instruction
-  for (; BI != SourceBlock->end(); ++BI)
-    for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
-      std::map<Value*, Value*>::const_iterator I =
-        ValueMapping.find(BI->getOperand(i));
-      if (I != ValueMapping.end())
-        BI->setOperand(i, I->second);
-    }
-
-  // Next we check to see if any of the successors of DestBlock had PHI nodes.
-  // If so, we need to add entries to the PHI nodes for SourceBlock now.
-  for (succ_iterator SI = succ_begin(DestBlock), SE = succ_end(DestBlock);
-       SI != SE; ++SI) {
-    BasicBlock *Succ = *SI;
-    for (BasicBlock::iterator PNI = Succ->begin(); isa<PHINode>(PNI); ++PNI) {
-      PHINode *PN = cast<PHINode>(PNI);
-      // Ok, we have a PHI node. Figure out what the incoming value was for the
-      // DestBlock.
-      Value *IV = PN->getIncomingValueForBlock(DestBlock);
-
-      // Remap the value if necessary...
-      std::map<Value*, Value*>::const_iterator I = ValueMapping.find(IV);
-      if (I != ValueMapping.end())
-        IV = I->second;
-      PN->addIncoming(IV, SourceBlock);
-    }
-  }
-
-  // Next, remove the old branch instruction, and any PHI node entries that we
-  // had.
-  BI = Branch; ++BI;  // Get an iterator to the first new instruction
-  DestBlock->removePredecessor(SourceBlock);  // Remove entries in PHI nodes...
-  SourceBlock->getInstList().erase(Branch);   // Destroy the uncond branch...
-
-  // Final step: now that we have finished everything up, walk the cloned
-  // instructions one last time, constant propagating and DCE'ing them, because
-  // they may not be needed anymore.
-  //
-  if (HadPHINodes) {
-    while (BI != SourceBlock->end()) {
-      Instruction *Inst = BI++;
-      if (isInstructionTriviallyDead(Inst))
-        Inst->eraseFromParent();
-      else if (Value *V = SimplifyInstruction(Inst)) {
-        Inst->replaceAllUsesWith(V);
-        Inst->eraseFromParent();
-      }
-    }
-  }
-
-  ++NumEliminated;  // We just killed a branch!
-}
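
Editor's note on the Sink.cpp hunks above: the patch hoists the mayWriteToMemory() test to the top of isSafeToMove(), so every potential writer is recorded in Stores before any alias query runs, and the explicit isVolatile() check on loads goes away (volatile loads already report themselves through mayWriteToMemory(), if I read this LLVM version's Instruction predicates correctly). The sketch below restates the resulting check order in one place; it is not the committed code, it assumes the LLVM 3.0-era AliasAnalysis interface, and the getModRefInfo() line is my reconstruction of a context line the hunk elides.

  // Sketch of the post-patch check order in isSafeToMove(); assumed APIs noted.
  #include "llvm/Instructions.h"
  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/ADT/SmallPtrSet.h"
  using namespace llvm;

  static bool isSafeToMoveSketch(Instruction *Inst, AliasAnalysis *AA,
                                 SmallPtrSet<Instruction *, 8> &Stores) {
    // 1. Record anything that may write memory; such instructions never sink.
    if (Inst->mayWriteToMemory()) {
      Stores.insert(Inst);
      return false;
    }

    // 2. A load may sink only if no recorded writer can modify its location.
    if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
      AliasAnalysis::Location Loc = AA->getLocation(L);
      for (SmallPtrSet<Instruction *, 8>::iterator I = Stores.begin(),
           E = Stores.end(); I != E; ++I)
        if (AA->getModRefInfo(*I, Loc) & AliasAnalysis::Mod)  // assumed call
          return false;
    }

    // 3. Terminators and PHI nodes stay put (the real function goes on to
    //    further checks that are elided here).
    return !isa<TerminatorInst>(Inst) && !isa<PHINode>(Inst);
  }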
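
For readers who only see the pass being deleted here, the core of eliminateUnconditionalBranch() was a clone-and-remap step: copy the destination block's non-PHI instructions onto the end of the predecessor, build an old-to-new value map (PHIs map to the value that flowed in from the predecessor), then rewrite the clones through that map. The sketch below isolates just that step, assuming the same LLVM 3.0-era APIs used in the deleted file; the helper name cloneTailInto is mine, and the PHI patching of successors, the DemoteRegToStack spilling, and the final cleanup that the real pass performed are all omitted.

  // Sketch only: clone Dest's non-PHI instructions into Source and remap them.
  // The caller is still expected to erase Source's original unconditional
  // branch and fix up PHI nodes in Dest's successors, as the deleted pass did.
  #include "llvm/BasicBlock.h"
  #include "llvm/Instructions.h"
  #include <map>
  using namespace llvm;

  static void cloneTailInto(BasicBlock *Source, BasicBlock *Dest) {
    std::map<Value*, Value*> ValueMapping;

    // PHI nodes in Dest are not cloned; each one simply maps to the value it
    // receives along the edge from Source.
    BasicBlock::iterator BI = Dest->begin();
    for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
      ValueMapping[PN] = PN->getIncomingValueForBlock(Source);

    // Clone the rest (terminator included) onto the end of Source, recording
    // old -> new as we go.
    Instruction *FirstClone = 0;
    for (; BI != Dest->end(); ++BI) {
      Instruction *New = BI->clone();
      New->setName(BI->getName());
      Source->getInstList().push_back(New);
      if (!FirstClone) FirstClone = New;
      ValueMapping[BI] = New;
    }

    // Rewrite only the clones, so references to Dest's instructions become
    // references to their copies (or to the incoming PHI values).
    for (BasicBlock::iterator I = FirstClone; I != Source->end(); ++I)
      for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) {
        std::map<Value*, Value*>::const_iterator It =
          ValueMapping.find(I->getOperand(op));
        if (It != ValueMapping.end())
          I->setOperand(op, It->second);
      }
  }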
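
FindObviousSharedDomOf() in the deleted file recognizes only two CFG shapes, which is easier to see at the source level than from its predecessor-iterator bookkeeping. The sketch below uses the block names from that function; the C-level code and the function names in it are mine. In both shapes, SrcPred contains the branch and dominates SrcBlock and DstBlock, which is why side-effect-free instructions in DstBlock could be hoisted into SrcPred instead of being duplicated.

  // Illustrative only: the two shapes the deleted helper recognized.
  void then_block();
  void else_block();
  void join();

  void if_then(int a) {        // "if then": DstOtherPred == SrcPred
    if (a)                     // SrcPred ends in the conditional branch
      then_block();            // SrcBlock (single predecessor: SrcPred)
    join();                    // DstBlock (preds: SrcBlock and SrcPred)
  }

  void if_then_else(int a) {   // "if then else": DstOtherPred's only pred is SrcPred
    if (a)                     // SrcPred
      then_block();            // SrcBlock
    else
      else_block();            // DstOtherPred
    join();                    // DstBlock (preds: SrcBlock and DstOtherPred)
  }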