diff options
Diffstat (limited to 'contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp')
-rw-r--r-- | contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp | 298 |
1 files changed, 168 insertions, 130 deletions
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index a96c46a..8a32215 100644 --- a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Metadata.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -54,10 +55,10 @@ STATISTIC(NumRuntimeUnrolled, /// - Branch around the original loop if the trip count is less /// than the unroll factor. /// -static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, +static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, BasicBlock *LastPrologBB, BasicBlock *PrologEnd, BasicBlock *OrigPH, BasicBlock *NewPH, - ValueToValueMapTy &LVMap, Pass *P) { + ValueToValueMapTy &VMap, Pass *P) { BasicBlock *Latch = L->getLoopLatch(); assert(Latch && "Loop must have a latch"); @@ -86,7 +87,7 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, Value *V = PN->getIncomingValueForBlock(Latch); if (Instruction *I = dyn_cast<Instruction>(V)) { if (L->contains(I)) { - V = LVMap[I]; + V = VMap[I]; } } // Adding a value to the new PHI node from the last prolog block @@ -104,12 +105,19 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, } } - // Create a branch around the orignal loop, which is taken if the - // trip count is less than the unroll factor. + // Create a branch around the orignal loop, which is taken if there are no + // iterations remaining to be executed after running the prologue. Instruction *InsertPt = PrologEnd->getTerminator(); + + assert(Count != 0 && "nonsensical Count!"); + + // If BECount <u (Count - 1) then (BECount + 1) & (Count - 1) == (BECount + 1) + // (since Count is a power of 2). This means %xtraiter is (BECount + 1) and + // and all of the iterations of this loop were executed by the prologue. Note + // that if BECount <u (Count - 1) then (BECount + 1) cannot unsigned-overflow. Instruction *BrLoopExit = - new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, TripCount, - ConstantInt::get(TripCount->getType(), Count)); + new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, BECount, + ConstantInt::get(BECount->getType(), Count - 1)); BasicBlock *Exit = L->getUniqueExitBlock(); assert(Exit && "Loop must have a single exit block only"); // Split the exit to maintain loop canonicalization guarantees @@ -127,76 +135,123 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, } /// Create a clone of the blocks in a loop and connect them together. -/// This function doesn't create a clone of the loop structure. +/// If UnrollProlog is true, loop structure will not be cloned, otherwise a new +/// loop will be created including all cloned blocks, and the iterator of it +/// switches to count NewIter down to 0. /// -/// There are two value maps that are defined and used. VMap is -/// for the values in the current loop instance. LVMap contains -/// the values from the last loop instance. We need the LVMap values -/// to update the initial values for the current loop instance. -/// -static void CloneLoopBlocks(Loop *L, - bool FirstCopy, - BasicBlock *InsertTop, - BasicBlock *InsertBot, +static void CloneLoopBlocks(Loop *L, Value *NewIter, const bool UnrollProlog, + BasicBlock *InsertTop, BasicBlock *InsertBot, std::vector<BasicBlock *> &NewBlocks, - LoopBlocksDFS &LoopBlocks, - ValueToValueMapTy &VMap, - ValueToValueMapTy &LVMap, + LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, LoopInfo *LI) { - BasicBlock *Preheader = L->getLoopPreheader(); BasicBlock *Header = L->getHeader(); BasicBlock *Latch = L->getLoopLatch(); Function *F = Header->getParent(); LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO(); LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO(); + Loop *NewLoop = 0; + Loop *ParentLoop = L->getParentLoop(); + if (!UnrollProlog) { + NewLoop = new Loop(); + if (ParentLoop) + ParentLoop->addChildLoop(NewLoop); + else + LI->addTopLevelLoop(NewLoop); + } + // For each block in the original loop, create a new copy, // and update the value map with the newly created values. for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { - BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".unr", F); + BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".prol", F); NewBlocks.push_back(NewBB); - if (Loop *ParentLoop = L->getParentLoop()) + if (NewLoop) + NewLoop->addBasicBlockToLoop(NewBB, LI->getBase()); + else if (ParentLoop) ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); VMap[*BB] = NewBB; if (Header == *BB) { // For the first block, add a CFG connection to this newly - // created block + // created block. InsertTop->getTerminator()->setSuccessor(0, NewBB); - // Change the incoming values to the ones defined in the - // previously cloned loop. - for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { - PHINode *NewPHI = cast<PHINode>(VMap[I]); - if (FirstCopy) { - // We replace the first phi node with the value from the preheader - VMap[I] = NewPHI->getIncomingValueForBlock(Preheader); - NewBB->getInstList().erase(NewPHI); - } else { - // Update VMap with values from the previous block - unsigned idx = NewPHI->getBasicBlockIndex(Latch); - Value *InVal = NewPHI->getIncomingValue(idx); - if (Instruction *I = dyn_cast<Instruction>(InVal)) - if (L->contains(I)) - InVal = LVMap[InVal]; - NewPHI->setIncomingValue(idx, InVal); - NewPHI->setIncomingBlock(idx, InsertTop); - } - } } - if (Latch == *BB) { + // For the last block, if UnrollProlog is true, create a direct jump to + // InsertBot. If not, create a loop back to cloned head. VMap.erase((*BB)->getTerminator()); - NewBB->getTerminator()->eraseFromParent(); - BranchInst::Create(InsertBot, NewBB); + BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]); + BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator()); + if (UnrollProlog) { + LatchBR->eraseFromParent(); + BranchInst::Create(InsertBot, NewBB); + } else { + PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2, "prol.iter", + FirstLoopBB->getFirstNonPHI()); + IRBuilder<> Builder(LatchBR); + Value *IdxSub = + Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1), + NewIdx->getName() + ".sub"); + Value *IdxCmp = + Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp"); + BranchInst::Create(FirstLoopBB, InsertBot, IdxCmp, NewBB); + NewIdx->addIncoming(NewIter, InsertTop); + NewIdx->addIncoming(IdxSub, NewBB); + LatchBR->eraseFromParent(); + } + } + } + + // Change the incoming values to the ones defined in the preheader or + // cloned loop. + for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { + PHINode *NewPHI = cast<PHINode>(VMap[I]); + if (UnrollProlog) { + VMap[I] = NewPHI->getIncomingValueForBlock(Preheader); + cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI); + } else { + unsigned idx = NewPHI->getBasicBlockIndex(Preheader); + NewPHI->setIncomingBlock(idx, InsertTop); + BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]); + idx = NewPHI->getBasicBlockIndex(Latch); + Value *InVal = NewPHI->getIncomingValue(idx); + NewPHI->setIncomingBlock(idx, NewLatch); + if (VMap[InVal]) + NewPHI->setIncomingValue(idx, VMap[InVal]); } } - // LastValueMap is updated with the values for the current loop - // which are used the next time this function is called. - for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); - VI != VE; ++VI) { - LVMap[VI->first] = VI->second; + if (NewLoop) { + // Add unroll disable metadata to disable future unrolling for this loop. + SmallVector<Metadata *, 4> MDs; + // Reserve first location for self reference to the LoopID metadata node. + MDs.push_back(nullptr); + MDNode *LoopID = NewLoop->getLoopID(); + if (LoopID) { + // First remove any existing loop unrolling metadata. + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + bool IsUnrollMetadata = false; + MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (MD) { + const MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll."); + } + if (!IsUnrollMetadata) + MDs.push_back(LoopID->getOperand(i)); + } + } + + LLVMContext &Context = NewLoop->getHeader()->getContext(); + SmallVector<Metadata *, 1> DisableOperands; + DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable")); + MDNode *DisableNode = MDNode::get(Context, DisableOperands); + MDs.push_back(DisableNode); + + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + NewLoop->setLoopID(NewLoopID); } } @@ -212,18 +267,16 @@ static void CloneLoopBlocks(Loop *L, /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for /// the switch instruction is generated. /// -/// extraiters = tripcount % loopfactor -/// if (extraiters == 0) jump Loop: -/// if (extraiters == loopfactor) jump L1 -/// if (extraiters == loopfactor-1) jump L2 -/// ... -/// L1: LoopBody; -/// L2: LoopBody; -/// ... -/// if tripcount < loopfactor jump End -/// Loop: -/// ... -/// End: +/// extraiters = tripcount % loopfactor +/// if (extraiters == 0) jump Loop: +/// else jump Prol +/// Prol: LoopBody; +/// extraiters -= 1 // Omitted if unroll factor is 2. +/// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2. +/// if (tripcount < loopfactor) jump End +/// Loop: +/// ... +/// End: /// bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, LPPassManager *LPM) { @@ -246,19 +299,28 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, // Only unroll loops with a computable trip count and the trip count needs // to be an int value (allowing a pointer type is a TODO item) - const SCEV *BECount = SE->getBackedgeTakenCount(L); - if (isa<SCEVCouldNotCompute>(BECount) || !BECount->getType()->isIntegerTy()) + const SCEV *BECountSC = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BECountSC) || + !BECountSC->getType()->isIntegerTy()) return false; + unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth(); + // Add 1 since the backedge count doesn't include the first loop iteration const SCEV *TripCountSC = - SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1)); + SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); if (isa<SCEVCouldNotCompute>(TripCountSC)) return false; // We only handle cases when the unroll factor is a power of 2. // Count is the loop unroll factor, the number of extra copies added + 1. - if ((Count & (Count-1)) != 0) + if (!isPowerOf2_32(Count)) + return false; + + // This constraint lets us deal with an overflowing trip count easily; see the + // comment on ModVal below. This check is equivalent to `Log2(Count) < + // BEWidth`. + if (static_cast<uint64_t>(Count) > (1ULL << BEWidth)) return false; // If this loop is nested, then the loop unroller changes the code in @@ -280,30 +342,32 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, SCEVExpander Expander(*SE, "loop-unroll"); Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(), PreHeaderBR); + Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(), + PreHeaderBR); IRBuilder<> B(PreHeaderBR); Value *ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter"); - // Check if for no extra iterations, then jump to unrolled loop. We have to - // check that the trip count computation didn't overflow when adding one to - // the backedge taken count. - Value *LCmp = B.CreateIsNotNull(ModVal, "lcmp.mod"); - Value *OverflowCheck = B.CreateIsNull(TripCount, "lcmp.overflow"); - Value *BranchVal = B.CreateOr(OverflowCheck, LCmp, "lcmp.or"); + // If ModVal is zero, we know that either + // 1. there are no iteration to be run in the prologue loop + // OR + // 2. the addition computing TripCount overflowed + // + // If (2) is true, we know that TripCount really is (1 << BEWidth) and so the + // number of iterations that remain to be run in the original loop is a + // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we + // explicitly check this above). + + Value *BranchVal = B.CreateIsNotNull(ModVal, "lcmp.mod"); - // Branch to either the extra iterations or the unrolled loop + // Branch to either the extra iterations or the cloned/unrolled loop // We will fix up the true branch label when adding loop body copies BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR); assert(PreHeaderBR->isUnconditional() && PreHeaderBR->getSuccessor(0) == PEnd && "CFG edges in Preheader are not correct"); PreHeaderBR->eraseFromParent(); - - ValueToValueMapTy LVMap; Function *F = Header->getParent(); - // These variables are used to update the CFG links in each iteration - BasicBlock *CompareBB = nullptr; - BasicBlock *LastLoopBB = PH; // Get an ordered list of blocks in the loop to help with the ordering of the // cloned blocks in the prolog code LoopBlocksDFS LoopBlocks(L); @@ -314,62 +378,36 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, // and generate a condition that branches to the copy depending on the // number of 'left over' iterations. // - for (unsigned leftOverIters = Count-1; leftOverIters > 0; --leftOverIters) { - std::vector<BasicBlock*> NewBlocks; - ValueToValueMapTy VMap; - - // Clone all the basic blocks in the loop, but we don't clone the loop - // This function adds the appropriate CFG connections. - CloneLoopBlocks(L, (leftOverIters == Count-1), LastLoopBB, PEnd, NewBlocks, - LoopBlocks, VMap, LVMap, LI); - LastLoopBB = cast<BasicBlock>(VMap[Latch]); - - // Insert the cloned blocks into function just before the original loop - F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), - NewBlocks[0], F->end()); - - // Generate the code for the comparison which determines if the loop - // prolog code needs to be executed. - if (leftOverIters == Count-1) { - // There is no compare block for the fall-thru case when for the last - // left over iteration - CompareBB = NewBlocks[0]; - } else { - // Create a new block for the comparison - BasicBlock *NewBB = BasicBlock::Create(CompareBB->getContext(), "unr.cmp", - F, CompareBB); - if (Loop *ParentLoop = L->getParentLoop()) { - // Add the new block to the parent loop, if needed - ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase()); - } - - // The comparison w/ the extra iteration value and branch - Type *CountTy = TripCount->getType(); - Value *BranchVal = new ICmpInst(*NewBB, ICmpInst::ICMP_EQ, ModVal, - ConstantInt::get(CountTy, leftOverIters), - "un.tmp"); - // Branch to either the extra iterations or the unrolled loop - BranchInst::Create(NewBlocks[0], CompareBB, - BranchVal, NewBB); - CompareBB = NewBB; - PH->getTerminator()->setSuccessor(0, NewBB); - VMap[NewPH] = CompareBB; - } - - // Rewrite the cloned instruction operands to use the values - // created when the clone is created. - for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) { - for (BasicBlock::iterator I = NewBlocks[i]->begin(), - E = NewBlocks[i]->end(); I != E; ++I) { - RemapInstruction(I, VMap, - RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); - } + std::vector<BasicBlock *> NewBlocks; + ValueToValueMapTy VMap; + + bool UnrollPrologue = Count == 2; + + // Clone all the basic blocks in the loop. If Count is 2, we don't clone + // the loop, otherwise we create a cloned loop to execute the extra + // iterations. This function adds the appropriate CFG connections. + CloneLoopBlocks(L, ModVal, UnrollPrologue, PH, PEnd, NewBlocks, LoopBlocks, + VMap, LI); + + // Insert the cloned blocks into function just before the original loop + F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(), NewBlocks[0], + F->end()); + + // Rewrite the cloned instruction operands to use the values + // created when the clone is created. + for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) { + for (BasicBlock::iterator I = NewBlocks[i]->begin(), + E = NewBlocks[i]->end(); + I != E; ++I) { + RemapInstruction(I, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); } } // Connect the prolog code to the original loop and update the // PHI functions. - ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, LVMap, + BasicBlock *LastLoopBB = cast<BasicBlock>(VMap[Latch]); + ConnectProlog(L, BECount, Count, LastLoopBB, PEnd, PH, NewPH, VMap, LPM->getAsPass()); NumRuntimeUnrolled++; return true; |