Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar')
50 files changed, 6225 insertions, 5515 deletions
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp index d6fc916..590a52d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -1,4 +1,4 @@ -//===- DCE.cpp - Code to perform dead code elimination --------------------===// +//===- ADCE.cpp - Code to perform dead code elimination -------------------===// // // The LLVM Compiler Infrastructure // @@ -14,52 +14,33 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/ADCE.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; #define DEBUG_TYPE "adce" STATISTIC(NumRemoved, "Number of instructions removed"); -namespace { -struct ADCE : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - ADCE() : FunctionPass(ID) { - initializeADCEPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function& F) override; - - void getAnalysisUsage(AnalysisUsage& AU) const override { - AU.setPreservesCFG(); - } -}; -} - -char ADCE::ID = 0; -INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false) - -bool ADCE::runOnFunction(Function& F) { - if (skipOptnoneFunction(F)) - return false; - +static bool aggressiveDCE(Function& F) { SmallPtrSet<Instruction*, 128> Alive; SmallVector<Instruction*, 128> Worklist; // Collect the set of "root" instructions that are known live. - for (Instruction &I : inst_range(F)) { - if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || - isa<LandingPadInst>(I) || I.mayHaveSideEffects()) { + for (Instruction &I : instructions(F)) { + if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || I.isEHPad() || + I.mayHaveSideEffects()) { Alive.insert(&I); Worklist.push_back(&I); } @@ -79,7 +60,7 @@ bool ADCE::runOnFunction(Function& F) { // which have no side effects and do not influence the control flow or return // value of the function, and may therefore be deleted safely. // NOTE: We reuse the Worklist vector here for memory efficiency. 
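Aside: this ADCE rewrite moves the legacy pass body into a static aggressiveDCE() helper so the new pass manager's ADCEPass::run and the legacy wrapper (both later in this hunk) can share it. The helper is a classic two-phase mark-and-sweep; the following self-contained sketch mirrors its shape (sketchAggressiveDCE is an illustrative name, not part of the patch, and the headers are the same 3.8-era ones the file includes):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

static bool sketchAggressiveDCE(Function &F) {
  SmallPtrSet<Instruction *, 128> Alive;
  SmallVector<Instruction *, 128> Worklist;
  // Mark: seed the live set with roots (terminators, EH pads, debug
  // intrinsics, anything with side effects), then pull every operand of a
  // live instruction into the live set as well.
  for (Instruction &I : instructions(F))
    if (isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || I.isEHPad() ||
        I.mayHaveSideEffects()) {
      Alive.insert(&I);
      Worklist.push_back(&I);
    }
  while (!Worklist.empty())
    for (Use &U : Worklist.pop_back_val()->operands())
      if (auto *Op = dyn_cast<Instruction>(U))
        if (Alive.insert(Op).second)
          Worklist.push_back(Op);
  // Sweep: everything unmarked is dead. Clearing operand lists first matters
  // because dead instructions may still use one another.
  Worklist.clear();
  for (Instruction &I : instructions(F))
    if (!Alive.count(&I)) {
      Worklist.push_back(&I);
      I.dropAllReferences();
    }
  for (Instruction *I : Worklist)
    I->eraseFromParent();
  return !Worklist.empty();
}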
- for (Instruction &I : inst_range(F)) { + for (Instruction &I : instructions(F)) { if (!Alive.count(&I)) { Worklist.push_back(&I); I.dropAllReferences(); @@ -94,6 +75,34 @@ bool ADCE::runOnFunction(Function& F) { return !Worklist.empty(); } -FunctionPass *llvm::createAggressiveDCEPass() { - return new ADCE(); +PreservedAnalyses ADCEPass::run(Function &F) { + if (aggressiveDCE(F)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); } + +namespace { +struct ADCELegacyPass : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + ADCELegacyPass() : FunctionPass(ID) { + initializeADCELegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function& F) override { + if (skipOptnoneFunction(F)) + return false; + return aggressiveDCE(F); + } + + void getAnalysisUsage(AnalysisUsage& AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } +}; +} + +char ADCELegacyPass::ID = 0; +INITIALIZE_PASS(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination", + false, false) + +FunctionPass *llvm::createAggressiveDCEPass() { return new ADCELegacyPass(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 8918909..4b721d3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -21,6 +21,8 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" @@ -54,13 +56,15 @@ struct AlignmentFromAssumptions : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.setPreservesCFG(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); } // For memory transfers, we need a common alignment for both the source and @@ -84,7 +88,7 @@ INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME, aip_name, false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME, aip_name, false, false) @@ -249,8 +253,7 @@ bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I, // The mask must have some trailing ones (otherwise the condition is // trivial and tells us nothing about the alignment of the left operand). 
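Aside: the TrailingOnes logic in the hunk just below encodes a small arithmetic fact: if the IR proves ((ptrtoint %p) & Mask) == 0 and Mask ends in N one-bits, then the low N bits of %p are zero, i.e. %p is 2^N-byte aligned. A minimal sketch of that derivation (alignmentFromMask is a hypothetical helper, not in the patch; countTrailingOnes is the APInt call the patch switches to):

#include "llvm/ADT/APInt.h"
#include <algorithm>
#include <cstdint>

// Returns the alignment proven by a mask with N trailing ones, or 0 when the
// mask has no trailing ones and therefore proves nothing about the low bits.
static uint64_t alignmentFromMask(const llvm::APInt &Mask) {
  unsigned TrailingOnes = Mask.countTrailingOnes();
  if (!TrailingOnes)
    return 0;
  // Cap the shift so it stays defined even for masks wider than 64 bits.
  return uint64_t(1) << std::min(TrailingOnes, 63u);
}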
- unsigned TrailingOnes = - MaskSCEV->getValue()->getValue().countTrailingOnes(); + unsigned TrailingOnes = MaskSCEV->getAPInt().countTrailingOnes(); if (!TrailingOnes) return false; @@ -270,7 +273,7 @@ bool AlignmentFromAssumptions::extractAlignmentInfo(CallInst *I, OffSCEV = nullptr; if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(AndLHS)) { AAPtr = PToI->getPointerOperand(); - OffSCEV = SE->getConstant(Int64Ty, 0); + OffSCEV = SE->getZero(Int64Ty); } else if (const SCEVAddExpr* AndLHSAddSCEV = dyn_cast<SCEVAddExpr>(AndLHSSCEV)) { // Try to find the ptrtoint; subtract it and the rest is the offset. @@ -410,7 +413,7 @@ bool AlignmentFromAssumptions::processAssumption(CallInst *ACall) { bool AlignmentFromAssumptions::runOnFunction(Function &F) { bool Changed = false; auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); NewDestAlignments.clear(); diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp index 09c605e..cb9b8b6 100644 --- a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp @@ -15,26 +15,18 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/BasicBlock.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/DemandedBits.h" #include "llvm/IR/CFG.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" - using namespace llvm; #define DEBUG_TYPE "bdce" @@ -53,342 +45,42 @@ struct BDCE : public FunctionPass { void getAnalysisUsage(AnalysisUsage& AU) const override { AU.setPreservesCFG(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<DemandedBits>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } - - void determineLiveOperandBits(const Instruction *UserI, - const Instruction *I, unsigned OperandNo, - const APInt &AOut, APInt &AB, - APInt &KnownZero, APInt &KnownOne, - APInt &KnownZero2, APInt &KnownOne2); - - AssumptionCache *AC; - DominatorTree *DT; }; } char BDCE::ID = 0; INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DemandedBits) INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination", false, false) -static bool isAlwaysLive(Instruction *I) { - return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) || - isa<LandingPadInst>(I) || I->mayHaveSideEffects(); -} - -void BDCE::determineLiveOperandBits(const Instruction *UserI, - const Instruction *I, unsigned OperandNo, - const APInt &AOut, APInt &AB, - APInt &KnownZero, APInt &KnownOne, - APInt &KnownZero2, APInt &KnownOne2) { - unsigned BitWidth = AB.getBitWidth(); - - // We're called once per operand, but 
for some instructions, we need to - // compute known bits of both operands in order to determine the live bits of - // either (when both operands are instructions themselves). We don't, - // however, want to do this twice, so we cache the result in APInts that live - // in the caller. For the two-relevant-operands case, both operand values are - // provided here. - auto ComputeKnownBits = - [&](unsigned BitWidth, const Value *V1, const Value *V2) { - const DataLayout &DL = I->getModule()->getDataLayout(); - KnownZero = APInt(BitWidth, 0); - KnownOne = APInt(BitWidth, 0); - computeKnownBits(const_cast<Value *>(V1), KnownZero, KnownOne, DL, 0, - AC, UserI, DT); - - if (V2) { - KnownZero2 = APInt(BitWidth, 0); - KnownOne2 = APInt(BitWidth, 0); - computeKnownBits(const_cast<Value *>(V2), KnownZero2, KnownOne2, DL, - 0, AC, UserI, DT); - } - }; - - switch (UserI->getOpcode()) { - default: break; - case Instruction::Call: - case Instruction::Invoke: - if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(UserI)) - switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::bswap: - // The alive bits of the input are the swapped alive bits of - // the output. - AB = AOut.byteSwap(); - break; - case Intrinsic::ctlz: - if (OperandNo == 0) { - // We need some output bits, so we need all bits of the - // input to the left of, and including, the leftmost bit - // known to be one. - ComputeKnownBits(BitWidth, I, nullptr); - AB = APInt::getHighBitsSet(BitWidth, - std::min(BitWidth, KnownOne.countLeadingZeros()+1)); - } - break; - case Intrinsic::cttz: - if (OperandNo == 0) { - // We need some output bits, so we need all bits of the - // input to the right of, and including, the rightmost bit - // known to be one. - ComputeKnownBits(BitWidth, I, nullptr); - AB = APInt::getLowBitsSet(BitWidth, - std::min(BitWidth, KnownOne.countTrailingZeros()+1)); - } - break; - } - break; - case Instruction::Add: - case Instruction::Sub: - // Find the highest live output bit. We don't need any more input - // bits than that (adds, and thus subtracts, ripple only to the - // left). - AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits()); - break; - case Instruction::Shl: - if (OperandNo == 0) - if (ConstantInt *CI = - dyn_cast<ConstantInt>(UserI->getOperand(1))) { - uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); - AB = AOut.lshr(ShiftAmt); - - // If the shift is nuw/nsw, then the high bits are not dead - // (because we've promised that they *must* be zero). - const ShlOperator *S = cast<ShlOperator>(UserI); - if (S->hasNoSignedWrap()) - AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1); - else if (S->hasNoUnsignedWrap()) - AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt); - } - break; - case Instruction::LShr: - if (OperandNo == 0) - if (ConstantInt *CI = - dyn_cast<ConstantInt>(UserI->getOperand(1))) { - uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); - AB = AOut.shl(ShiftAmt); - - // If the shift is exact, then the low bits are not dead - // (they must be zero). - if (cast<LShrOperator>(UserI)->isExact()) - AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); - } - break; - case Instruction::AShr: - if (OperandNo == 0) - if (ConstantInt *CI = - dyn_cast<ConstantInt>(UserI->getOperand(1))) { - uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1); - AB = AOut.shl(ShiftAmt); - // Because the high input bit is replicated into the - // high-order bits of the result, if we need any of those - // bits, then we must keep the highest input bit. 
- if ((AOut & APInt::getHighBitsSet(BitWidth, ShiftAmt)) - .getBoolValue()) - AB.setBit(BitWidth-1); - - // If the shift is exact, then the low bits are not dead - // (they must be zero). - if (cast<AShrOperator>(UserI)->isExact()) - AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt); - } - break; - case Instruction::And: - AB = AOut; - - // For bits that are known zero, the corresponding bits in the - // other operand are dead (unless they're both zero, in which - // case they can't both be dead, so just mark the LHS bits as - // dead). - if (OperandNo == 0) { - ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); - AB &= ~KnownZero2; - } else { - if (!isa<Instruction>(UserI->getOperand(0))) - ComputeKnownBits(BitWidth, UserI->getOperand(0), I); - AB &= ~(KnownZero & ~KnownZero2); - } - break; - case Instruction::Or: - AB = AOut; - - // For bits that are known one, the corresponding bits in the - // other operand are dead (unless they're both one, in which - // case they can't both be dead, so just mark the LHS bits as - // dead). - if (OperandNo == 0) { - ComputeKnownBits(BitWidth, I, UserI->getOperand(1)); - AB &= ~KnownOne2; - } else { - if (!isa<Instruction>(UserI->getOperand(0))) - ComputeKnownBits(BitWidth, UserI->getOperand(0), I); - AB &= ~(KnownOne & ~KnownOne2); - } - break; - case Instruction::Xor: - case Instruction::PHI: - AB = AOut; - break; - case Instruction::Trunc: - AB = AOut.zext(BitWidth); - break; - case Instruction::ZExt: - AB = AOut.trunc(BitWidth); - break; - case Instruction::SExt: - AB = AOut.trunc(BitWidth); - // Because the high input bit is replicated into the - // high-order bits of the result, if we need any of those - // bits, then we must keep the highest input bit. - if ((AOut & APInt::getHighBitsSet(AOut.getBitWidth(), - AOut.getBitWidth() - BitWidth)) - .getBoolValue()) - AB.setBit(BitWidth-1); - break; - case Instruction::Select: - if (OperandNo != 0) - AB = AOut; - break; - } -} - bool BDCE::runOnFunction(Function& F) { if (skipOptnoneFunction(F)) return false; + DemandedBits &DB = getAnalysis<DemandedBits>(); - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - - DenseMap<Instruction *, APInt> AliveBits; SmallVector<Instruction*, 128> Worklist; - - // The set of visited instructions (non-integer-typed only). - SmallPtrSet<Instruction*, 128> Visited; - - // Collect the set of "root" instructions that are known live. - for (Instruction &I : inst_range(F)) { - if (!isAlwaysLive(&I)) - continue; - - DEBUG(dbgs() << "BDCE: Root: " << I << "\n"); - // For integer-valued instructions, set up an initial empty set of alive - // bits and add the instruction to the work list. For other instructions - // add their operands to the work list (for integer values operands, mark - // all bits as live). - if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) { - if (!AliveBits.count(&I)) { - AliveBits[&I] = APInt(IT->getBitWidth(), 0); - Worklist.push_back(&I); - } - - continue; - } - - // Non-integer-typed instructions... - for (Use &OI : I.operands()) { - if (Instruction *J = dyn_cast<Instruction>(OI)) { - if (IntegerType *IT = dyn_cast<IntegerType>(J->getType())) - AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth()); - Worklist.push_back(J); - } - } - // To save memory, we don't add I to the Visited set here. 
Instead, we - // check isAlwaysLive on every instruction when searching for dead - // instructions later (we need to check isAlwaysLive for the - // integer-typed instructions anyway). - } - - // Propagate liveness backwards to operands. - while (!Worklist.empty()) { - Instruction *UserI = Worklist.pop_back_val(); - - DEBUG(dbgs() << "BDCE: Visiting: " << *UserI); - APInt AOut; - if (UserI->getType()->isIntegerTy()) { - AOut = AliveBits[UserI]; - DEBUG(dbgs() << " Alive Out: " << AOut); - } - DEBUG(dbgs() << "\n"); - - if (!UserI->getType()->isIntegerTy()) - Visited.insert(UserI); - - APInt KnownZero, KnownOne, KnownZero2, KnownOne2; - // Compute the set of alive bits for each operand. These are anded into the - // existing set, if any, and if that changes the set of alive bits, the - // operand is added to the work-list. - for (Use &OI : UserI->operands()) { - if (Instruction *I = dyn_cast<Instruction>(OI)) { - if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) { - unsigned BitWidth = IT->getBitWidth(); - APInt AB = APInt::getAllOnesValue(BitWidth); - if (UserI->getType()->isIntegerTy() && !AOut && - !isAlwaysLive(UserI)) { - AB = APInt(BitWidth, 0); - } else { - // If all bits of the output are dead, then all bits of the input - // Bits of each operand that are used to compute alive bits of the - // output are alive, all others are dead. - determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB, - KnownZero, KnownOne, - KnownZero2, KnownOne2); - } - - // If we've added to the set of alive bits (or the operand has not - // been previously visited), then re-queue the operand to be visited - // again. - APInt ABPrev(BitWidth, 0); - auto ABI = AliveBits.find(I); - if (ABI != AliveBits.end()) - ABPrev = ABI->second; - - APInt ABNew = AB | ABPrev; - if (ABNew != ABPrev || ABI == AliveBits.end()) { - AliveBits[I] = std::move(ABNew); - Worklist.push_back(I); - } - } else if (!Visited.count(I)) { - Worklist.push_back(I); - } - } - } - } - bool Changed = false; - // The inverse of the live set is the dead set. These are those instructions - // which have no side effects and do not influence the control flow or return - // value of the function, and may therefore be deleted safely. - // NOTE: We reuse the Worklist vector here for memory efficiency. - for (Instruction &I : inst_range(F)) { - // For live instructions that have all dead bits, first make them dead by - // replacing all uses with something else. Then, if they don't need to - // remain live (because they have side effects, etc.) we can remove them. - if (I.getType()->isIntegerTy()) { - auto ABI = AliveBits.find(&I); - if (ABI != AliveBits.end()) { - if (ABI->second.getBoolValue()) - continue; - - DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n"); - // FIXME: In theory we could substitute undef here instead of zero. - // This should be reconsidered once we settle on the semantics of - // undef, poison, etc. - Value *Zero = ConstantInt::get(I.getType(), 0); - ++NumSimplified; - I.replaceAllUsesWith(Zero); - Changed = true; - } - } else if (Visited.count(&I)) { - continue; + for (Instruction &I : instructions(F)) { + if (I.getType()->isIntegerTy() && + !DB.getDemandedBits(&I).getBoolValue()) { + // For live instructions that have all dead bits, first make them dead by + // replacing all uses with something else. Then, if they don't need to + // remain live (because they have side effects, etc.) we can remove them. 
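Aside: after this BDCE rewrite all of the bit tracking lives in the DemandedBits analysis, and the pass body (continued in the hunk below) merely acts on its answers. A condensed sketch of the resulting structure (sketchBDCE is an illustrative name; getDemandedBits and isInstructionDead are the 3.8-era DemandedBits queries this hunk requires via AU.addRequired<DemandedBits>()):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
using namespace llvm;

static bool sketchBDCE(Function &F, DemandedBits &DB) {
  bool Changed = false;
  SmallVector<Instruction *, 128> Worklist;
  for (Instruction &I : instructions(F)) {
    if (I.getType()->isIntegerTy() && !DB.getDemandedBits(&I).getBoolValue()) {
      // Every result bit is dead: forward a zero so users stop needing I.
      I.replaceAllUsesWith(ConstantInt::get(I.getType(), 0));
      Changed = true;
    }
    if (DB.isInstructionDead(&I))
      Worklist.push_back(&I); // defer deletion so iteration stays valid
  }
  for (Instruction *I : Worklist)
    I->dropAllReferences(); // dead instructions may still use one another
  for (Instruction *I : Worklist)
    I->eraseFromParent();
  return Changed || !Worklist.empty();
}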
+ DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n"); + // FIXME: In theory we could substitute undef here instead of zero. + // This should be reconsidered once we settle on the semantics of + // undef, poison, etc. + Value *Zero = ConstantInt::get(I.getType(), 0); + ++NumSimplified; + I.replaceAllUsesWith(Zero); + Changed = true; } - - if (isAlwaysLive(&I)) + if (!DB.isInstructionDead(&I)) continue; Worklist.push_back(&I); diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 4288742..84f7f5f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -223,10 +223,10 @@ Instruction *ConstantHoisting::findMatInsertPt(Instruction *Inst, } // The simple and common case. This also includes constant expressions. - if (!isa<PHINode>(Inst) && !isa<LandingPadInst>(Inst)) + if (!isa<PHINode>(Inst) && !Inst->isEHPad()) return Inst; - // We can't insert directly before a phi node or landing pad. Insert before + // We can't insert directly before a phi node or an eh pad. Insert before // the terminator of the incoming or dominating block. assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!"); if (Idx != ~0U && isa<PHINode>(Inst)) @@ -365,9 +365,9 @@ void ConstantHoisting::collectConstantCandidates(ConstCandMapType &ConstCandMap, /// into an instruction itself. void ConstantHoisting::collectConstantCandidates(Function &Fn) { ConstCandMapType ConstCandMap; - for (Function::iterator BB : Fn) - for (BasicBlock::iterator Inst : *BB) - collectConstantCandidates(ConstCandMap, Inst); + for (BasicBlock &BB : Fn) + for (Instruction &Inst : BB) + collectConstantCandidates(ConstCandMap, &Inst); } /// \brief Find the base constant within the given range and rebase all other diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 79624b2..686bd40 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/IR/CFG.h" @@ -32,6 +33,7 @@ STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumSelects, "Number of selects propagated"); STATISTIC(NumMemAccess, "Number of memory access targets propagated"); STATISTIC(NumCmps, "Number of comparisons propagated"); +STATISTIC(NumReturns, "Number of return values propagated"); STATISTIC(NumDeadCases, "Number of switch cases removed"); namespace { @@ -43,6 +45,11 @@ namespace { bool processMemAccess(Instruction *I); bool processCmp(CmpInst *C); bool processSwitch(SwitchInst *SI); + bool processCallSite(CallSite CS); + + /// Return a constant value for V usable at At and everything it + /// dominates. If no such Constant can be found, return nullptr. 
+ Constant *getConstantAt(Value *V, Instruction *At); public: static char ID; @@ -54,6 +61,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } }; } @@ -178,44 +186,33 @@ bool CorrelatedValuePropagation::processMemAccess(Instruction *I) { return true; } -/// processCmp - If the value of this comparison could be determined locally, -/// constant propagation would already have figured it out. Instead, walk -/// the predecessors and statically evaluate the comparison based on information -/// available on that edge. If a given static evaluation is true on ALL -/// incoming edges, then it's true universally and we can simplify the compare. +/// processCmp - See if LazyValueInfo's ability to exploit edge conditions, +/// or range information is sufficient to prove this comparison. Even for +/// local conditions, this can sometimes prove conditions instcombine can't by +/// exploiting range information. bool CorrelatedValuePropagation::processCmp(CmpInst *C) { Value *Op0 = C->getOperand(0); - if (isa<Instruction>(Op0) && - cast<Instruction>(Op0)->getParent() == C->getParent()) - return false; - Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); if (!Op1) return false; - pred_iterator PI = pred_begin(C->getParent()), PE = pred_end(C->getParent()); - if (PI == PE) return false; + // As a policy choice, we choose not to waste compile time on anything where + // the comparison is testing local values. While LVI can sometimes reason + // about such cases, it's not its primary purpose. We do make sure to do + // the block local query for uses from terminator instructions, but that's + // handled in the code for each terminator. + auto *I = dyn_cast<Instruction>(Op0); + if (I && I->getParent() == C->getParent()) + return false; - LazyValueInfo::Tristate Result = LVI->getPredicateOnEdge(C->getPredicate(), - C->getOperand(0), Op1, *PI, - C->getParent(), C); + LazyValueInfo::Tristate Result = + LVI->getPredicateAt(C->getPredicate(), Op0, Op1, C); if (Result == LazyValueInfo::Unknown) return false; - ++PI; - while (PI != PE) { - LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(C->getPredicate(), - C->getOperand(0), Op1, *PI, - C->getParent(), C); - if (Res != Result) return false; - ++PI; - } - ++NumCmps; - if (Result == LazyValueInfo::True) C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext())); else C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext())); - C->eraseFromParent(); return true; @@ -307,6 +304,59 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) { return Changed; } +/// processCallSite - Infer nonnull attributes for the arguments at the +/// specified callsite. 
+bool CorrelatedValuePropagation::processCallSite(CallSite CS) { + SmallVector<unsigned, 4> Indices; + unsigned ArgNo = 0; + + for (Value *V : CS.args()) { + PointerType *Type = dyn_cast<PointerType>(V->getType()); + + if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) && + LVI->getPredicateAt(ICmpInst::ICMP_EQ, V, + ConstantPointerNull::get(Type), + CS.getInstruction()) == LazyValueInfo::False) + Indices.push_back(ArgNo + 1); + ArgNo++; + } + + assert(ArgNo == CS.arg_size() && "sanity check"); + + if (Indices.empty()) + return false; + + AttributeSet AS = CS.getAttributes(); + LLVMContext &Ctx = CS.getInstruction()->getContext(); + AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull)); + CS.setAttributes(AS); + + return true; +} + +Constant *CorrelatedValuePropagation::getConstantAt(Value *V, Instruction *At) { + if (Constant *C = LVI->getConstant(V, At->getParent(), At)) + return C; + + // TODO: The following really should be sunk inside LVI's core algorithm, or + // at least the outer shims around such. + auto *C = dyn_cast<CmpInst>(V); + if (!C) return nullptr; + + Value *Op0 = C->getOperand(0); + Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); + if (!Op1) return nullptr; + + LazyValueInfo::Tristate Result = + LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At); + if (Result == LazyValueInfo::Unknown) + return nullptr; + + return (Result == LazyValueInfo::True) ? + ConstantInt::getTrue(C->getContext()) : + ConstantInt::getFalse(C->getContext()); +} + bool CorrelatedValuePropagation::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; @@ -318,7 +368,7 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { bool BBChanged = false; for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) { - Instruction *II = BI++; + Instruction *II = &*BI++; switch (II->getOpcode()) { case Instruction::Select: BBChanged |= processSelect(cast<SelectInst>(II)); @@ -334,6 +384,10 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { case Instruction::Store: BBChanged |= processMemAccess(II); break; + case Instruction::Call: + case Instruction::Invoke: + BBChanged |= processCallSite(CallSite(II)); + break; } } @@ -342,7 +396,21 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { case Instruction::Switch: BBChanged |= processSwitch(cast<SwitchInst>(Term)); break; + case Instruction::Ret: { + auto *RI = cast<ReturnInst>(Term); + // Try to determine the return value if we can. This is mainly here to + // simplify the writing of unit tests, but also helps to enable IPO by + // constant folding the return values of callees. 
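Aside: processCallSite above, and the return-value handling that follows, both reduce to single point queries against LazyValueInfo. The nonnull inference, for example, is just a refuted equality with null at the call instruction (provablyNonNull is a hypothetical wrapper, not in the patch; getPredicateAt is the LVI entry point used throughout this file):

#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// A pointer argument is provably nonnull at a call if LVI can prove that
// "Arg == null" is false at that program point.
static bool provablyNonNull(LazyValueInfo *LVI, Value *Arg, Instruction *Call) {
  auto *PT = dyn_cast<PointerType>(Arg->getType());
  if (!PT)
    return false;
  return LVI->getPredicateAt(ICmpInst::ICMP_EQ, Arg,
                             ConstantPointerNull::get(PT), Call) ==
         LazyValueInfo::False;
}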
+ auto *RetVal = RI->getReturnValue(); + if (!RetVal) break; // handle "ret void" + if (isa<Constant>(RetVal)) break; // nothing to do + if (auto *C = getConstantAt(RetVal, RI)) { + ++NumReturns; + RI->replaceUsesOfWith(RetVal, C); + BBChanged = true; + } } + }; FnChanged |= BBChanged; } diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp index 3b262a2..b67c3c7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp @@ -17,6 +17,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" @@ -46,7 +47,7 @@ namespace { TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; bool Changed = false; for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { - Instruction *Inst = DI++; + Instruction *Inst = &*DI++; if (isInstructionTriviallyDead(Inst, TLI)) { Inst->eraseFromParent(); Changed = true; @@ -92,6 +93,34 @@ namespace { char DCE::ID = 0; INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false) +static bool DCEInstruction(Instruction *I, + SmallSetVector<Instruction *, 16> &WorkList, + const TargetLibraryInfo *TLI) { + if (isInstructionTriviallyDead(I, TLI)) { + // Null out all of the instruction's operands to see if any operand becomes + // dead as we go. + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + Value *OpV = I->getOperand(i); + I->setOperand(i, nullptr); + + if (!OpV->use_empty() || I == OpV) + continue; + + // If the operand is an instruction that became dead as we nulled out the + // operand, and if it is 'trivially' dead, delete it in a future loop + // iteration. + if (Instruction *OpI = dyn_cast<Instruction>(OpV)) + if (isInstructionTriviallyDead(OpI, TLI)) + WorkList.insert(OpI); + } + + I->eraseFromParent(); + ++DCEEliminated; + return true; + } + return false; +} + bool DCE::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; @@ -99,39 +128,24 @@ bool DCE::runOnFunction(Function &F) { auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; - // Start out with all of the instructions in the worklist... - std::vector<Instruction*> WorkList; - for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) - WorkList.push_back(&*i); - - // Loop over the worklist finding instructions that are dead. If they are - // dead make them drop all of their uses, making other instructions - // potentially dead, and work until the worklist is empty. - // bool MadeChange = false; + SmallSetVector<Instruction *, 16> WorkList; + // Iterate over the original function, only adding insts to the worklist + // if they actually need to be revisited. This avoids having to pre-init + // the worklist with the entire function's worth of instructions. + for (inst_iterator FI = inst_begin(F), FE = inst_end(F); FI != FE;) { + Instruction *I = &*FI; + ++FI; + + // We're visiting this instruction now, so make sure it's not in the + // worklist from an earlier visit. + if (!WorkList.count(I)) + MadeChange |= DCEInstruction(I, WorkList, TLI); + } + while (!WorkList.empty()) { - Instruction *I = WorkList.back(); - WorkList.pop_back(); - - if (isInstructionTriviallyDead(I, TLI)) { // If the instruction is dead. 
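Aside: the new DCEInstruction above replaces the rescan-the-vector scheme being removed here. Its key trick is severing each use before testing the operand, so use_empty() really means "no remaining users". A self-contained sketch of that cascade (deleteAndCascade is an illustrative name; isInstructionTriviallyDead lives in llvm/Transforms/Utils/Local.h):

#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

static void deleteAndCascade(Instruction *Root,
                             SmallSetVector<Instruction *, 16> &WorkList,
                             const TargetLibraryInfo *TLI) {
  for (Use &U : Root->operands()) {
    Value *Op = U.get();
    U.set(nullptr); // sever this use before inspecting the operand
    if (Op == Root)
      continue; // guard against self-references in unreachable code
    if (auto *OpI = dyn_cast<Instruction>(Op))
      if (Op->use_empty() && isInstructionTriviallyDead(OpI, TLI))
        WorkList.insert(OpI); // newly stranded: revisit on a later iteration
  }
  Root->eraseFromParent();
}

Because SmallSetVector deduplicates, an instruction feeding several dead users is queued at most once, which is what lets the driver loop above avoid the old linear-time worklist erase.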
- // Loop over all of the values that the instruction uses, if there are - // instructions being used, add them to the worklist, because they might - // go dead after this one is removed. - // - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) - if (Instruction *Used = dyn_cast<Instruction>(*OI)) - WorkList.push_back(Used); - - // Remove the instruction. - I->eraseFromParent(); - - // Remove the instruction from the worklist if it still exists in it. - WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I), - WorkList.end()); - - MadeChange = true; - ++DCEEliminated; - } + Instruction *I = WorkList.pop_back_val(); + MadeChange |= DCEInstruction(I, WorkList, TLI); } return MadeChange; } diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index c505584..36ad0a5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -40,6 +41,7 @@ using namespace llvm; #define DEBUG_TYPE "dse" +STATISTIC(NumRedundantStores, "Number of redundant stores deleted"); STATISTIC(NumFastStores, "Number of stores deleted"); STATISTIC(NumFastOther , "Number of other instrs removed"); @@ -59,23 +61,24 @@ namespace { if (skipOptnoneFunction(F)) return false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); MD = &getAnalysis<MemoryDependenceAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - TLI = AA->getTargetLibraryInfo(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); bool Changed = false; - for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) + for (BasicBlock &I : F) // Only check non-dead blocks. Dead blocks may have strange pointer // cycles that will confuse alias analysis. 
- if (DT->isReachableFromEntry(I)) - Changed |= runOnBasicBlock(*I); + if (DT->isReachableFromEntry(&I)) + Changed |= runOnBasicBlock(I); AA = nullptr; MD = nullptr; DT = nullptr; return Changed; } bool runOnBasicBlock(BasicBlock &BB); + bool MemoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI); bool HandleFree(CallInst *F); bool handleEndBlock(BasicBlock &BB); void RemoveAccessedObjects(const MemoryLocation &LoadedLoc, @@ -85,10 +88,11 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); } }; @@ -97,8 +101,10 @@ namespace { char DSE::ID = 0; INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false) FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } @@ -115,7 +121,7 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } /// static void DeleteDeadInstruction(Instruction *I, MemoryDependenceAnalysis &MD, - const TargetLibraryInfo *TLI, + const TargetLibraryInfo &TLI, SmallSetVector<Value*, 16> *ValueSet = nullptr) { SmallVector<Instruction*, 32> NowDeadInsts; @@ -140,7 +146,7 @@ static void DeleteDeadInstruction(Instruction *I, if (!Op->use_empty()) continue; if (Instruction *OpI = dyn_cast<Instruction>(Op)) - if (isInstructionTriviallyDead(OpI, TLI)) + if (isInstructionTriviallyDead(OpI, &TLI)) NowDeadInsts.push_back(OpI); } @@ -153,7 +159,7 @@ static void DeleteDeadInstruction(Instruction *I, /// hasMemoryWrite - Does this instruction write some memory? This only returns /// true for things that we can analyze with other helpers below. 
-static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { +static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) { if (isa<StoreInst>(I)) return true; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { @@ -170,20 +176,20 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) { } if (auto CS = CallSite(I)) { if (Function *F = CS.getCalledFunction()) { - if (TLI && TLI->has(LibFunc::strcpy) && - F->getName() == TLI->getName(LibFunc::strcpy)) { + if (TLI.has(LibFunc::strcpy) && + F->getName() == TLI.getName(LibFunc::strcpy)) { return true; } - if (TLI && TLI->has(LibFunc::strncpy) && - F->getName() == TLI->getName(LibFunc::strncpy)) { + if (TLI.has(LibFunc::strncpy) && + F->getName() == TLI.getName(LibFunc::strncpy)) { return true; } - if (TLI && TLI->has(LibFunc::strcat) && - F->getName() == TLI->getName(LibFunc::strcat)) { + if (TLI.has(LibFunc::strcat) && + F->getName() == TLI.getName(LibFunc::strcat)) { return true; } - if (TLI && TLI->has(LibFunc::strncat) && - F->getName() == TLI->getName(LibFunc::strncat)) { + if (TLI.has(LibFunc::strncat) && + F->getName() == TLI.getName(LibFunc::strncat)) { return true; } } } @@ -224,9 +230,9 @@ static MemoryLocation getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { /// getLocForRead - Return the location read by the specified "hasMemoryWrite" /// instruction if any. -static MemoryLocation getLocForRead(Instruction *Inst, AliasAnalysis &AA) { - assert(hasMemoryWrite(Inst, AA.getTargetLibraryInfo()) && - "Unknown instruction case"); +static MemoryLocation getLocForRead(Instruction *Inst, + const TargetLibraryInfo &TLI) { + assert(hasMemoryWrite(Inst, TLI) && "Unknown instruction case"); // The only instructions that both read and write are the mem transfer // instructions (memcpy/memmove). @@ -313,9 +319,9 @@ static Value *getStoredPointerOperand(Instruction *I) { } static uint64_t getPointerSize(const Value *V, const DataLayout &DL, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo &TLI) { uint64_t Size; - if (getObjectSize(V, Size, DL, TLI)) + if (getObjectSize(V, Size, DL, &TLI)) return Size; return MemoryLocation::UnknownSize; } @@ -336,7 +342,7 @@ namespace { static OverwriteResult isOverwrite(const MemoryLocation &Later, const MemoryLocation &Earlier, const DataLayout &DL, - const TargetLibraryInfo *TLI, + const TargetLibraryInfo &TLI, int64_t &EarlierOff, int64_t &LaterOff) { const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); @@ -442,10 +448,12 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, /// because the DSE inducing instruction may be a self-read. static bool isPossibleSelfRead(Instruction *Inst, const MemoryLocation &InstStoreLoc, - Instruction *DepWrite, AliasAnalysis &AA) { + Instruction *DepWrite, + const TargetLibraryInfo &TLI, + AliasAnalysis &AA) { // Self reads can only happen for instructions that read memory. Get the // location read. - MemoryLocation InstReadLoc = getLocForRead(Inst, AA); + MemoryLocation InstReadLoc = getLocForRead(Inst, TLI); if (!InstReadLoc.Ptr) return false; // Not a reading instruction. // If the read and written loc obviously don't alias, it isn't a read. @@ -459,7 +467,7 @@ static bool isPossibleSelfRead(Instruction *Inst, // Here we don't know if A/B may alias, but we do know that B/B are must // aliases, so removing the first memcpy is safe (assuming it writes <= # // bytes as the second one).
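Aside: of the isOverwrite plumbing being rethreaded above, the common case is OverwriteComplete, which — once both stores are reduced to (offset, size) pairs over the same underlying object — is a pure interval-containment check. A minimal sketch of that test (laterCompletelyOverwrites is a hypothetical helper, not in the patch):

#include <cstdint>

// The later store kills the earlier one when it covers every byte the
// earlier store wrote; offsets are relative to a common underlying object.
static bool laterCompletelyOverwrites(int64_t EarlierOff, uint64_t EarlierSize,
                                      int64_t LaterOff, uint64_t LaterSize) {
  return LaterOff <= EarlierOff &&
         uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize;
}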
- MemoryLocation DepReadLoc = getLocForRead(DepWrite, AA); + MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI); if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr)) return false; @@ -475,11 +483,12 @@ static bool isPossibleSelfRead(Instruction *Inst, //===----------------------------------------------------------------------===// bool DSE::runOnBasicBlock(BasicBlock &BB) { + const DataLayout &DL = BB.getModule()->getDataLayout(); bool MadeChange = false; // Do a top-down walk on the BB. for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { - Instruction *Inst = BBI++; + Instruction *Inst = &*BBI++; // Handle 'free' calls specially. if (CallInst *F = isFreeCall(Inst, TLI)) { @@ -488,42 +497,68 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { } // If we find something that writes memory, get its memory dependence. - if (!hasMemoryWrite(Inst, TLI)) - continue; - - MemDepResult InstDep = MD->getDependency(Inst); - - // Ignore any store where we can't find a local dependence. - // FIXME: cross-block DSE would be fun. :) - if (!InstDep.isDef() && !InstDep.isClobber()) + if (!hasMemoryWrite(Inst, *TLI)) continue; // If we're storing the same value back to a pointer that we just // loaded from, then the store can be removed. if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) { + + auto RemoveDeadInstAndUpdateBBI = [&](Instruction *DeadInst) { + // DeleteDeadInstruction can delete the current instruction. Save BBI + // in case we need it. + WeakVH NextInst(&*BBI); + + DeleteDeadInstruction(DeadInst, *MD, *TLI); + + if (!NextInst) // Next instruction deleted. + BBI = BB.begin(); + else if (BBI != BB.begin()) // Revisit this instruction if possible. + --BBI; + ++NumRedundantStores; + MadeChange = true; + }; + + if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) { if (SI->getPointerOperand() == DepLoad->getPointerOperand() && - SI->getOperand(0) == DepLoad && isRemovable(SI)) { + isRemovable(SI) && + MemoryIsNotModifiedBetween(DepLoad, SI)) { + DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n " << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); - // DeleteDeadInstruction can delete the current instruction. Save BBI - // in case we need it. - WeakVH NextInst(BBI); + RemoveDeadInstAndUpdateBBI(SI); + continue; + } + } - DeleteDeadInstruction(SI, *MD, TLI); + // Remove null stores into the calloc'ed objects + Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand()); - if (!NextInst) // Next instruction deleted. - BBI = BB.begin(); - else if (BBI != BB.begin()) // Revisit this instruction if possible. - --BBI; - ++NumFastStores; - MadeChange = true; + if (StoredConstant && StoredConstant->isNullValue() && + isRemovable(SI)) { + Instruction *UnderlyingPointer = dyn_cast<Instruction>( + GetUnderlyingObject(SI->getPointerOperand(), DL)); + + if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) && + MemoryIsNotModifiedBetween(UnderlyingPointer, SI)) { + DEBUG(dbgs() + << "DSE: Remove null store to the calloc'ed object:\n DEAD: " + << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n'); + + RemoveDeadInstAndUpdateBBI(SI); continue; } } } + MemDepResult InstDep = MD->getDependency(Inst); + + // Ignore any store where we can't find a local dependence. + // FIXME: cross-block DSE would be fun. :) + if (!InstDep.isDef() && !InstDep.isClobber()) + continue; + // Figure out what location is being stored to. 
MemoryLocation Loc = getLocForWrite(Inst, *AA); @@ -549,24 +584,22 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // completely obliterated by the store to 'Loc', and c) which we know that // 'Inst' doesn't load from, then we can remove it. if (isRemovable(DepWrite) && - !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { + !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) { int64_t InstWriteOffset, DepWriteOffset; - const DataLayout &DL = BB.getModule()->getDataLayout(); OverwriteResult OR = - isOverwrite(Loc, DepLoc, DL, AA->getTargetLibraryInfo(), - DepWriteOffset, InstWriteOffset); + isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset); if (OR == OverwriteComplete) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. - DeleteDeadInstruction(DepWrite, *MD, TLI); + DeleteDeadInstruction(DepWrite, *MD, *TLI); ++NumFastStores; MadeChange = true; // DeleteDeadInstruction can delete the current instruction in loop // cases, reset BBI. - BBI = Inst; + BBI = Inst->getIterator(); if (BBI != BB.begin()) --BBI; break; @@ -609,10 +642,11 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { if (DepWrite == &BB.front()) break; // Can't look past this instruction if it might read 'Loc'. - if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref) + if (AA->getModRefInfo(DepWrite, Loc) & MRI_Ref) break; - InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB); + InstDep = MD->getPointerDependencyFrom(Loc, false, + DepWrite->getIterator(), &BB); } } @@ -624,6 +658,64 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { return MadeChange; } +/// Returns true if the memory which is accessed by the second instruction is not +/// modified between the first and the second instruction. +/// Precondition: Second instruction must be dominated by the first +/// instruction. +bool DSE::MemoryIsNotModifiedBetween(Instruction *FirstI, + Instruction *SecondI) { + SmallVector<BasicBlock *, 16> WorkList; + SmallPtrSet<BasicBlock *, 8> Visited; + BasicBlock::iterator FirstBBI(FirstI); + ++FirstBBI; + BasicBlock::iterator SecondBBI(SecondI); + BasicBlock *FirstBB = FirstI->getParent(); + BasicBlock *SecondBB = SecondI->getParent(); + MemoryLocation MemLoc = MemoryLocation::get(SecondI); + + // Start checking the store-block. + WorkList.push_back(SecondBB); + bool isFirstBlock = true; + + // Check all blocks going backward until we reach the load-block. + while (!WorkList.empty()) { + BasicBlock *B = WorkList.pop_back_val(); + + // Ignore instructions before LI if this is the FirstBB. + BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin()); + + BasicBlock::iterator EI; + if (isFirstBlock) { + // Ignore instructions after SI if this is the first visit of SecondBB. + assert(B == SecondBB && "first block is not the store block"); + EI = SecondBBI; + isFirstBlock = false; + } else { + // It's not SecondBB or (in case of a loop) the second visit of SecondBB. + // In this case we also have to look at instructions after SI. 
+ EI = B->end(); + } + for (; BI != EI; ++BI) { + Instruction *I = &*BI; + if (I->mayWriteToMemory() && I != SecondI) { + auto Res = AA->getModRefInfo(I, MemLoc); + if (Res != MRI_NoModRef) + return false; + } + } + if (B != FirstBB) { + assert(B != &FirstBB->getParent()->getEntryBlock() && + "Should not hit the entry block because SI must be dominated by LI"); + for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) { + if (!Visited.insert(*PredI).second) + continue; + WorkList.push_back(*PredI); + } + } + } + return true; +} + /// Find all blocks that will unconditionally lead to the block BB and append /// them to F. static void FindUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks, @@ -655,10 +747,11 @@ bool DSE::HandleFree(CallInst *F) { Instruction *InstPt = BB->getTerminator(); if (BB == F->getParent()) InstPt = F; - MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt, BB); + MemDepResult Dep = + MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB); while (Dep.isDef() || Dep.isClobber()) { Instruction *Dependency = Dep.getInst(); - if (!hasMemoryWrite(Dependency, TLI) || !isRemovable(Dependency)) + if (!hasMemoryWrite(Dependency, *TLI) || !isRemovable(Dependency)) break; Value *DepPointer = @@ -668,10 +761,10 @@ bool DSE::HandleFree(CallInst *F) { if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) break; - Instruction *Next = std::next(BasicBlock::iterator(Dependency)); + auto Next = ++Dependency->getIterator(); // DCE instructions only used to calculate that store - DeleteDeadInstruction(Dependency, *MD, TLI); + DeleteDeadInstruction(Dependency, *MD, *TLI); ++NumFastStores; MadeChange = true; @@ -704,23 +797,22 @@ bool DSE::handleEndBlock(BasicBlock &BB) { SmallSetVector<Value*, 16> DeadStackObjects; // Find all of the alloca'd pointers in the entry block. - BasicBlock *Entry = BB.getParent()->begin(); - for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) { - if (isa<AllocaInst>(I)) - DeadStackObjects.insert(I); + BasicBlock &Entry = BB.getParent()->front(); + for (Instruction &I : Entry) { + if (isa<AllocaInst>(&I)) + DeadStackObjects.insert(&I); // Okay, so these are dead heap objects, but if the pointer never escapes // then it's leaked by this function anyways. - else if (isAllocLikeFn(I, TLI) && !PointerMayBeCaptured(I, true, true)) - DeadStackObjects.insert(I); + else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true)) + DeadStackObjects.insert(&I); } // Treat byval or inalloca arguments the same, stores to them are dead at the // end of the function. - for (Function::arg_iterator AI = BB.getParent()->arg_begin(), - AE = BB.getParent()->arg_end(); AI != AE; ++AI) - if (AI->hasByValOrInAllocaAttr()) - DeadStackObjects.insert(AI); + for (Argument &AI : BB.getParent()->args()) + if (AI.hasByValOrInAllocaAttr()) + DeadStackObjects.insert(&AI); const DataLayout &DL = BB.getModule()->getDataLayout(); @@ -729,10 +821,10 @@ bool DSE::handleEndBlock(BasicBlock &BB) { --BBI; // If we find a store, check to see if it points into a dead stack value. - if (hasMemoryWrite(BBI, TLI) && isRemovable(BBI)) { + if (hasMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) { // See through pointer-to-pointer bitcasts SmallVector<Value *, 4> Pointers; - GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers, DL); + GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL); // Stores to stack values are valid candidates for removal. 
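Aside: MemoryIsNotModifiedBetween above is a bounded backward CFG walk: start at the second instruction's block, scan for anything that may write the queried location, and pull in predecessors until every path back to the first instruction has been covered. The sketch below is deliberately simplified — it scans whole blocks rather than trimming at FirstI/SecondI, and abstracts the AA mod/ref query into a callback; pathIsClobberFree is an illustrative name, not part of the patch:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

template <typename ClobberFn>
static bool pathIsClobberFree(BasicBlock *FirstBB, BasicBlock *SecondBB,
                              ClobberFn MayClobber) {
  SmallVector<BasicBlock *, 16> WorkList;
  SmallPtrSet<BasicBlock *, 8> Visited;
  WorkList.push_back(SecondBB);
  while (!WorkList.empty()) {
    BasicBlock *B = WorkList.pop_back_val();
    for (Instruction &I : *B)
      if (MayClobber(I))
        return false; // a possible write to the location: give up
    if (B == FirstBB)
      continue; // don't walk above the block that defines the value
    for (BasicBlock *Pred : predecessors(B))
      if (Visited.insert(Pred).second) // loops: visit each block once
        WorkList.push_back(Pred);
  }
  return true;
}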
bool AllDead = true; @@ -744,7 +836,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } if (AllDead) { - Instruction *Dead = BBI++; + Instruction *Dead = &*BBI++; DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " << *Dead << "\n Objects: "; @@ -757,7 +849,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { dbgs() << '\n'); // DCE instructions only used to calculate that store. - DeleteDeadInstruction(Dead, *MD, TLI, &DeadStackObjects); + DeleteDeadInstruction(Dead, *MD, *TLI, &DeadStackObjects); ++NumFastStores; MadeChange = true; continue; @@ -765,9 +857,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { } // Remove any dead non-memory-mutating instructions. - if (isInstructionTriviallyDead(BBI, TLI)) { - Instruction *Inst = BBI++; - DeleteDeadInstruction(Inst, *MD, TLI, &DeadStackObjects); + if (isInstructionTriviallyDead(&*BBI, TLI)) { + Instruction *Inst = &*BBI++; + DeleteDeadInstruction(Inst, *MD, *TLI, &DeadStackObjects); ++NumFastOther; MadeChange = true; continue; @@ -776,15 +868,15 @@ bool DSE::handleEndBlock(BasicBlock &BB) { if (isa<AllocaInst>(BBI)) { // Remove allocas from the list of dead stack objects; there can't be // any references before the definition. - DeadStackObjects.remove(BBI); + DeadStackObjects.remove(&*BBI); continue; } - if (auto CS = CallSite(BBI)) { + if (auto CS = CallSite(&*BBI)) { // Remove allocation function calls from the list of dead stack objects; // there can't be any references before the definition. - if (isAllocLikeFn(BBI, TLI)) - DeadStackObjects.remove(BBI); + if (isAllocLikeFn(&*BBI, TLI)) + DeadStackObjects.remove(&*BBI); // If this call does not access memory, it can't be loading any of our // pointers. @@ -795,10 +887,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // the call is live. DeadStackObjects.remove_if([&](Value *I) { // See if the call site touches the value. - AliasAnalysis::ModRefResult A = AA->getModRefInfo( - CS, I, getPointerSize(I, DL, AA->getTargetLibraryInfo())); + ModRefInfo A = AA->getModRefInfo(CS, I, getPointerSize(I, DL, *TLI)); - return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref; + return A == MRI_ModRef || A == MRI_Ref; }); // If all of the allocas were clobbered by the call then we're not going @@ -864,8 +955,7 @@ void DSE::RemoveAccessedObjects(const MemoryLocation &LoadedLoc, // Remove objects that could alias LoadedLoc. DeadStackObjects.remove_if([&](Value *I) { // See if the loaded location could alias the stack location. - MemoryLocation StackLoc(I, - getPointerSize(I, DL, AA->getTargetLibraryInfo())); + MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI)); return !AA->isNoAlias(StackLoc, LoadedLoc); }); } diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 029b44c..7ef062e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -263,7 +264,6 @@ namespace { /// expected that a later pass of GVN will catch the interesting/hard cases. class EarlyCSE { public: - Function &F; const TargetLibraryInfo &TLI; const TargetTransformInfo &TTI; DominatorTree &DT; @@ -281,20 +281,37 @@ public: /// that dominated values can succeed in their lookup. 
ScopedHTType AvailableValues; - /// \brief A scoped hash table of the current values of loads. + /// A scoped hash table of the current values of previously encountered memory + /// locations. /// - /// This allows us to get efficient access to dominating loads when we have - /// a fully redundant load. In addition to the most recent load, we keep - /// track of a generation count of the read, which is compared against the - /// current generation count. The current generation count is incremented + /// This allows us to get efficient access to dominating loads or stores when + /// we have a fully redundant load. In addition to the most recent load, we + /// keep track of a generation count of the read, which is compared against + /// the current generation count. The current generation count is incremented /// after every possibly writing memory operation, which ensures that we only - /// CSE loads with other loads that have no intervening store. - typedef RecyclingAllocator< - BumpPtrAllocator, - ScopedHashTableVal<Value *, std::pair<Value *, unsigned>>> + /// CSE loads with other loads that have no intervening store. Ordering + /// events (such as fences or atomic instructions) increment the generation + /// count as well; essentially, we model these as writes to all possible + /// locations. Note that atomic and/or volatile loads and stores can be + /// present in the table; it is the responsibility of the consumer to inspect + /// the atomicity/volatility if needed. + struct LoadValue { + Value *Data; + unsigned Generation; + int MatchingId; + bool IsAtomic; + LoadValue() + : Data(nullptr), Generation(0), MatchingId(-1), IsAtomic(false) {} + LoadValue(Value *Data, unsigned Generation, unsigned MatchingId, + bool IsAtomic) + : Data(Data), Generation(Generation), MatchingId(MatchingId), + IsAtomic(IsAtomic) {} + }; + typedef RecyclingAllocator<BumpPtrAllocator, + ScopedHashTableVal<Value *, LoadValue>> LoadMapAllocator; - typedef ScopedHashTable<Value *, std::pair<Value *, unsigned>, - DenseMapInfo<Value *>, LoadMapAllocator> LoadHTType; + typedef ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>, + LoadMapAllocator> LoadHTType; LoadHTType AvailableLoads; /// \brief A scoped hash table of the current values of read-only call @@ -308,10 +325,9 @@ public: unsigned CurrentGeneration; /// \brief Set up the EarlyCSE runner for a particular function.
- EarlyCSE(Function &F, const TargetLibraryInfo &TLI, - const TargetTransformInfo &TTI, DominatorTree &DT, - AssumptionCache &AC) - : F(F), TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {} + EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI, + DominatorTree &DT, AssumptionCache &AC) + : TLI(TLI), TTI(TTI), DT(DT), AC(AC), CurrentGeneration(0) {} bool run(); @@ -382,57 +398,91 @@ private: class ParseMemoryInst { public: ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) - : Load(false), Store(false), Vol(false), MayReadFromMemory(false), - MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) { - MayReadFromMemory = Inst->mayReadFromMemory(); - MayWriteToMemory = Inst->mayWriteToMemory(); - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { - MemIntrinsicInfo Info; - if (!TTI.getTgtMemIntrinsic(II, Info)) - return; - if (Info.NumMemRefs == 1) { - Store = Info.WriteMem; - Load = Info.ReadMem; - MatchingId = Info.MatchingId; - MayReadFromMemory = Info.ReadMem; - MayWriteToMemory = Info.WriteMem; - Vol = Info.Vol; - Ptr = Info.PtrVal; - } - } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { - Load = true; - Vol = !LI->isSimple(); - Ptr = LI->getPointerOperand(); + : IsTargetMemInst(false), Inst(Inst) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) + if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1) + IsTargetMemInst = true; + } + bool isLoad() const { + if (IsTargetMemInst) return Info.ReadMem; + return isa<LoadInst>(Inst); + } + bool isStore() const { + if (IsTargetMemInst) return Info.WriteMem; + return isa<StoreInst>(Inst); + } + bool isAtomic() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return false; + } + return Inst->isAtomic(); + } + bool isUnordered() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return true; + } + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + return LI->isUnordered(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + return SI->isUnordered(); + } + // Conservative answer + return !Inst->isAtomic(); + } + + bool isVolatile() const { + if (IsTargetMemInst) { + assert(Info.IsSimple && "need to refine IsSimple in TTI"); + return false; + } + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + return LI->isVolatile(); } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - Store = true; - Vol = !SI->isSimple(); - Ptr = SI->getPointerOperand(); + return SI->isVolatile(); } + // Conservative answer + return true; } - bool isLoad() { return Load; } - bool isStore() { return Store; } - bool isVolatile() { return Vol; } - bool isMatchingMemLoc(const ParseMemoryInst &Inst) { - return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId; + + + bool isMatchingMemLoc(const ParseMemoryInst &Inst) const { + return (getPointerOperand() == Inst.getPointerOperand() && + getMatchingId() == Inst.getMatchingId()); } - bool isValid() { return Ptr != nullptr; } - int getMatchingId() { return MatchingId; } - Value *getPtr() { return Ptr; } - bool mayReadFromMemory() { return MayReadFromMemory; } - bool mayWriteToMemory() { return MayWriteToMemory; } + bool isValid() const { return getPointerOperand() != nullptr; } - private: - bool Load; - bool Store; - bool Vol; - bool MayReadFromMemory; - bool MayWriteToMemory; // For regular (non-intrinsic) loads/stores, this is set to -1. For // intrinsic loads/stores, the id is retrieved from the corresponding // field in the MemIntrinsicInfo structure. 
That field contains // non-negative values only. - int MatchingId; - Value *Ptr; + int getMatchingId() const { + if (IsTargetMemInst) return Info.MatchingId; + return -1; + } + Value *getPointerOperand() const { + if (IsTargetMemInst) return Info.PtrVal; + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + return LI->getPointerOperand(); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + return SI->getPointerOperand(); + } + return nullptr; + } + bool mayReadFromMemory() const { + if (IsTargetMemInst) return Info.ReadMem; + return Inst->mayReadFromMemory(); + } + bool mayWriteToMemory() const { + if (IsTargetMemInst) return Info.WriteMem; + return Inst->mayWriteToMemory(); + } + + private: + bool IsTargetMemInst; + MemIntrinsicInfo Info; + Instruction *Inst; }; bool processNode(DomTreeNode *Node); @@ -497,7 +547,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // See if any instructions in the block can be eliminated. If so, do it. If // not, add them to AvailableValues. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; // Dead instructions should just be removed. if (isInstructionTriviallyDead(Inst, &TLI)) { @@ -548,24 +598,26 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { ParseMemoryInst MemInst(Inst, TTI); // If this is a non-volatile load, process it. if (MemInst.isValid() && MemInst.isLoad()) { - // Ignore volatile loads. - if (MemInst.isVolatile()) { + // (conservatively) we can't peek past the ordering implied by this + // operation, but we can add this load to our set of available values + if (MemInst.isVolatile() || !MemInst.isUnordered()) { LastStore = nullptr; - // Don't CSE across synchronization boundaries. - if (Inst->mayWriteToMemory()) - ++CurrentGeneration; - continue; + ++CurrentGeneration; } // If we have an available version of this load, and if it is the right // generation, replace this instruction. - std::pair<Value *, unsigned> InVal = - AvailableLoads.lookup(MemInst.getPtr()); - if (InVal.first != nullptr && InVal.second == CurrentGeneration) { - Value *Op = getOrCreateResult(InVal.first, Inst->getType()); + LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); + if (InVal.Data != nullptr && InVal.Generation == CurrentGeneration && + InVal.MatchingId == MemInst.getMatchingId() && + // We don't yet handle removing loads with ordering of any kind. + !MemInst.isVolatile() && MemInst.isUnordered() && + // We can't replace an atomic load with one which isn't also atomic. + InVal.IsAtomic >= MemInst.isAtomic()) { + Value *Op = getOrCreateResult(InVal.Data, Inst->getType()); if (Op != nullptr) { DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst - << " to: " << *InVal.first << '\n'); + << " to: " << *InVal.Data << '\n'); if (!Inst->use_empty()) Inst->replaceAllUsesWith(Op); Inst->eraseFromParent(); @@ -576,8 +628,10 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { } // Otherwise, remember that we have this instruction. - AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>( - Inst, CurrentGeneration)); + AvailableLoads.insert( + MemInst.getPointerOperand(), + LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(), + MemInst.isAtomic())); LastStore = nullptr; continue; } @@ -613,6 +667,44 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { continue; } + // A release fence requires that all stores complete before it, but does + // not prevent the reordering of following loads 'before' the fence.
As a + // result, we don't need to consider it as writing to memory and don't need + // to advance the generation. We do need to prevent DSE across the fence, + // but that's handled above. + if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) + if (FI->getOrdering() == Release) { + assert(Inst->mayReadFromMemory() && "relied on to prevent DSE above"); + continue; + } + + // Write-back DSE - If we write back the same value we just loaded from + // the same location and haven't passed any intervening writes or ordering + // operations, we can remove the write. The primary benefit is in allowing + // the available load table to remain valid and value forward past where + // the store originally was. + if (MemInst.isValid() && MemInst.isStore()) { + LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand()); + if (InVal.Data && + InVal.Data == getOrCreateResult(Inst, InVal.Data->getType()) && + InVal.Generation == CurrentGeneration && + InVal.MatchingId == MemInst.getMatchingId() && + // We don't yet handle removing stores with ordering of any kind. + !MemInst.isVolatile() && MemInst.isUnordered()) { + assert((!LastStore || + ParseMemoryInst(LastStore, TTI).getPointerOperand() == + MemInst.getPointerOperand()) && + "can't have an intervening store!"); + DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n'); + Inst->eraseFromParent(); + Changed = true; + ++NumDSE; + // We can avoid incrementing the generation count since we were able + // to eliminate this store. + continue; + } + } + // Okay, this isn't something we can CSE at all. Check to see if it is // something that could modify memory. If so, our available memory values // cannot be used so bump the generation count. @@ -622,8 +714,16 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (MemInst.isValid() && MemInst.isStore()) { // We do a trivial form of DSE if there are two stores to the same // location with no intervening loads. Delete the earlier store. + // At the moment, we don't remove ordered stores, but do remove + // unordered atomic stores. There's no special requirement (for + // unordered atomics) about removing atomic stores only in favor of + // other atomic stores since we're going to execute the non-atomic + // one anyway and the atomic one might never have become visible. if (LastStore) { ParseMemoryInst LastStoreMemInst(LastStore, TTI); + assert(LastStoreMemInst.isUnordered() && + !LastStoreMemInst.isVolatile() && + "Violated invariant"); if (LastStoreMemInst.isMatchingMemLoc(MemInst)) { DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore << " due to: " << *Inst << '\n'); @@ -640,12 +740,22 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // version of the pointer. It is safe to forward from volatile stores // to non-volatile loads, so we don't have to check for volatility of // the store. - AvailableLoads.insert(MemInst.getPtr(), std::pair<Value *, unsigned>( - Inst, CurrentGeneration)); - - // Remember that this was the last store we saw for DSE. - if (!MemInst.isVolatile()) + AvailableLoads.insert( + MemInst.getPointerOperand(), + LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(), + MemInst.isAtomic())); + + // Remember that this was the last unordered store we saw for DSE. We + // don't yet handle DSE on ordered or volatile stores since we don't + // have a good way to model the ordering requirement for following + // passes once the store is removed.
We could insert a fence, but + // since fences are slightly stronger than stores in their ordering, + // it's not clear this is a profitable transform. Another option would + // be to merge the ordering with that of the post dominating store. + if (MemInst.isUnordered() && !MemInst.isVolatile()) LastStore = Inst; + else + LastStore = nullptr; } } } @@ -714,7 +824,7 @@ PreservedAnalyses EarlyCSEPass::run(Function &F, auto &DT = AM->getResult<DominatorTreeAnalysis>(F); auto &AC = AM->getResult<AssumptionAnalysis>(F); - EarlyCSE CSE(F, TLI, TTI, DT, AC); + EarlyCSE CSE(TLI, TTI, DT, AC); if (!CSE.run()) return PreservedAnalyses::all(); @@ -751,7 +861,7 @@ public: auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - EarlyCSE CSE(F, TLI, TTI, DT, AC); + EarlyCSE CSE(TLI, TTI, DT, AC); return CSE.run(); } @@ -761,6 +871,7 @@ public: AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.setPreservesCFG(); } }; diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index 0430c18..185cdbd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -30,7 +30,7 @@ public: bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); } private: @@ -41,7 +41,7 @@ private: char FlattenCFGPass::ID = 0; INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, false) @@ -59,7 +59,7 @@ static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { // Loop over all of the basic blocks and remove them if they are unneeded... // for (Function::iterator BBIt = F.begin(); BBIt != F.end();) { - if (FlattenCFG(BBIt++, AA)) { + if (FlattenCFG(&*BBIt++, AA)) { LocalChange = true; } } @@ -69,7 +69,7 @@ static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { } bool FlattenCFGPass::runOnFunction(Function &F) { - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool EverChanged = false; // iterativelyFlattenCFG can make some blocks dead. while (iterativelyFlattenCFG(F, AA)) { diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp index c931422..7f5d786 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -19,6 +19,8 @@ #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" @@ -41,7 +43,7 @@ using namespace llvm; // integer domain inputs, produce an integer output; fadd, for example. // // If a non-mappable instruction is seen, this entire def-use graph is marked -// as non-transformable. If we see an instruction that converts from the +// as non-transformable. 
If we see an instruction that converts from the // integer domain to FP domain (uitofp,sitofp), we terminate our walk. /// The largest integer type worth dealing with. @@ -60,6 +62,7 @@ namespace { bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addPreserved<GlobalsAAWrapperPass>(); } void findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots); @@ -82,7 +85,9 @@ namespace { } char Float2Int::ID = 0; -INITIALIZE_PASS(Float2Int, "float2int", "Float to int", false, false) +INITIALIZE_PASS_BEGIN(Float2Int, "float2int", "Float to int", false, false) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_END(Float2Int, "float2int", "Float to int", false, false) // Given a FCmp predicate, return a matching ICmp predicate if one // exists, otherwise return BAD_ICMP_PREDICATE. @@ -125,7 +130,9 @@ static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) { // Find the roots - instructions that convert from the FP domain to // integer domain. void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { - for (auto &I : inst_range(F)) { + for (auto &I : instructions(F)) { + if (isa<VectorType>(I.getType())) + continue; switch (I.getOpcode()) { default: break; case Instruction::FPToUI: @@ -133,7 +140,7 @@ void Float2Int::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { Roots.insert(&I); break; case Instruction::FCmp: - if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != + if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != CmpInst::BAD_ICMP_PREDICATE) Roots.insert(&I); break; @@ -176,7 +183,7 @@ ConstantRange Float2Int::validateRange(ConstantRange R) { // - walkForwards: Iterate over SeenInsts in reverse order, so we visit // defs before their uses. Calculate the real range info. -// Breadth-first walk of the use-def graph; determine the set of nodes +// Breadth-first walk of the use-def graph; determine the set of nodes // we care about and eagerly determine if some of them are poisonous. void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { std::deque<Instruction*> Worklist(Roots.begin(), Roots.end()); @@ -222,14 +229,14 @@ void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { seen(I, unknownRange()); break; } - + for (Value *O : I->operands()) { if (Instruction *OI = dyn_cast<Instruction>(O)) { // Unify def-use chains if they interfere. ECs.unionSets(I, OI); - if (SeenInsts.find(I)->second != badRange()) + if (SeenInsts.find(I)->second != badRange()) Worklist.push_back(OI); - } else if (!isa<ConstantFP>(O)) { + } else if (!isa<ConstantFP>(O)) { // Not an instruction or ConstantFP? we can't do anything. seen(I, badRange()); } @@ -240,11 +247,11 @@ void Float2Int::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) { // Walk forwards down the list of seen instructions, so we visit defs before // uses. void Float2Int::walkForwards() { - for (auto It = SeenInsts.rbegin(), E = SeenInsts.rend(); It != E; ++It) { - if (It->second != unknownRange()) + for (auto &It : make_range(SeenInsts.rbegin(), SeenInsts.rend())) { + if (It.second != unknownRange()) continue; - Instruction *I = It->first; + Instruction *I = It.first; std::function<ConstantRange(ArrayRef<ConstantRange>)> Op; switch (I->getOpcode()) { // FIXME: Handle select and phi nodes. 
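To make the intent concrete: once walkForwards proves that every node in such a def-use graph carries an integer range, FP arithmetic that merely round-trips integer values can be done in the integer domain. A minimal standalone illustration (hypothetical code, not taken from the pass, valid because the operand ranges here trivially fit in int32_t):

#include <cassert>
#include <cstdint>

// What the pass sees:  fptosi(fadd(sitofp(a), sitofp(b)))
int32_t viaFloat(int32_t a, int32_t b) {
  return static_cast<int32_t>(static_cast<double>(a) +
                              static_cast<double>(b));
}

// What it rewrites to, once the ranges are proven to fit losslessly.
int32_t viaInt(int32_t a, int32_t b) { return a + b; }

int main() {
  for (int32_t a = -1000; a <= 1000; a += 37)
    for (int32_t b = -1000; b <= 1000; b += 41)
      assert(viaFloat(a, b) == viaInt(a, b));
}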
@@ -299,7 +306,7 @@ void Float2Int::walkForwards() { for (Value *O : I->operands()) { if (Instruction *OI = dyn_cast<Instruction>(O)) { assert(SeenInsts.find(OI) != SeenInsts.end() && - "def not seen before use!"); + "def not seen before use!"); OpRanges.push_back(SeenInsts.find(OI)->second); } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) { // Work out if the floating point number can be losslessly represented @@ -314,11 +321,11 @@ void Float2Int::walkForwards() { APFloat F = CF->getValueAPF(); // First, weed out obviously incorrect values. Non-finite numbers - // can't be represented and neither can negative zero, unless + // can't be represented and neither can negative zero, unless // we're in fast math mode. if (!F.isFinite() || (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) && - !I->hasNoSignedZeros())) { + !I->hasNoSignedZeros())) { seen(I, badRange()); Abort = true; break; @@ -345,7 +352,7 @@ void Float2Int::walkForwards() { // Reduce the operands' ranges to a single range and return. if (!Abort) - seen(I, Op(OpRanges)); + seen(I, Op(OpRanges)); } } @@ -395,7 +402,7 @@ bool Float2Int::validateAndTransform() { R.isFullSet() || R.isSignWrappedSet()) continue; assert(ConvertedToTy && "Must have set the convertedtoty by this point!"); - + // The number of bits required is the maximum of the upper and // lower limits, plus one so it can be signed. unsigned MinBW = std::max(R.getLower().getMinSignedBits(), @@ -505,9 +512,8 @@ Value *Float2Int::convert(Instruction *I, Type *ToTy) { // Perform dead code elimination on the instructions we just modified. void Float2Int::cleanup() { - for (auto I = ConvertedInsts.rbegin(), E = ConvertedInsts.rend(); - I != E; ++I) - I->first->eraseFromParent(); + for (auto &I : make_range(ConvertedInsts.rbegin(), ConvertedInsts.rend())) + I.first->eraseFromParent(); } bool Float2Int::runOnFunction(Function &F) { @@ -534,7 +540,4 @@ bool Float2Int::runOnFunction(Function &F) { return Modified; } -FunctionPass *llvm::createFloat2IntPass() { - return new Float2Int(); -} - +FunctionPass *llvm::createFloat2IntPass() { return new Float2Int(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index 89a0d0a..a028b8c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" @@ -128,6 +129,7 @@ namespace { uint32_t lookup(Value *V) const; uint32_t lookup_or_add_cmp(unsigned Opcode, CmpInst::Predicate Pred, Value *LHS, Value *RHS); + bool exists(Value *V) const; void add(Value *V, uint32_t num); void clear(); void erase(Value *v); @@ -388,6 +390,9 @@ uint32_t ValueTable::lookup_or_add_call(CallInst *C) { } } +/// Returns true if a value number exists for the specified value. +bool ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; } + /// lookup_or_add - Returns the value number for the specified value, assigning /// it a new number if it did not have one before. uint32_t ValueTable::lookup_or_add(Value *V) { @@ -608,6 +613,10 @@ namespace { DenseMap<uint32_t, LeaderTableEntry> LeaderTable; BumpPtrAllocator TableAllocator; + // Block-local map of equivalent values to their leader, does not + // propagate to any successors. 
Entries added mid-block are applied + // to the remaining instructions in the block. + SmallMapVector<llvm::Value *, llvm::Constant *, 4> ReplaceWithConstMap; SmallVector<Instruction*, 8> InstrsToErase; typedef SmallVector<NonLocalDepResult, 64> LoadDepVect; @@ -689,16 +698,17 @@ namespace { AU.addRequired<TargetLibraryInfoWrapperPass>(); if (!NoLoads) AU.addRequired<MemoryDependenceAnalysis>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } - // Helper fuctions of redundant load elimination + // Helper functions of redundant load elimination bool processLoad(LoadInst *L); bool processNonLocalLoad(LoadInst *L); + bool processAssumeIntrinsic(IntrinsicInst *II); void AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, AvailValInBlkVect &ValuesPerBlock, UnavailBlkVect &UnavailableBlocks); @@ -719,7 +729,9 @@ namespace { void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ); - bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root); + bool replaceOperandsWithConsts(Instruction *I) const; + bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, + bool DominatesByEdge); bool processFoldableCondBr(BranchInst *BI); void addDeadBlock(BasicBlock *BB); void assignValNumForDeadCode(); @@ -738,7 +750,8 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1290,8 +1303,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, SSAUpdater SSAUpdate(&NewPHIs); SSAUpdate.Initialize(LI->getType(), LI->getName()); - for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { - const AvailableValueInBlock &AV = ValuesPerBlock[i]; + for (const AvailableValueInBlock &AV : ValuesPerBlock) { BasicBlock *BB = AV.BB; if (SSAUpdate.HasValueForBlock(BB)) @@ -1301,24 +1313,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, } // Perform PHI construction. - Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); - - // If new PHI nodes were created, notify alias analysis. - if (V->getType()->getScalarType()->isPointerTy()) { - AliasAnalysis *AA = gvn.getAliasAnalysis(); - - // Scan the new PHIs and inform alias analysis that we've added potentially - // escaping uses to any values that are operands to these PHIs. - for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) { - PHINode *P = NewPHIs[i]; - for (unsigned ii = 0, ee = P->getNumIncomingValues(); ii != ee; ++ii) { - unsigned jj = PHINode::getOperandNumForIncomingValue(ii); - AA->addEscapingUse(P->getOperandUse(jj)); - } - } - } - - return V; + return SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); } Value *AvailableValueInBlock::MaterializeAdjustedValue(LoadInst *LI, @@ -1518,9 +1513,8 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // that we only have to insert *one* load (which means we're basically moving // the load, not inserting a new one). 
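The "*one* load" condition above is the heart of the profitability check. A deliberately simplified stand-in (hypothetical; the real PerformLoadPRE additionally handles critical-edge splitting, EH pads, and speculation safety):

#include <cassert>
#include <vector>

// Given per-predecessor availability of the loaded value, PRE is only a
// win when exactly one predecessor lacks it: we move the load there
// instead of inserting new loads on several paths.
bool shouldPerformLoadPRE(const std::vector<bool> &AvailableInPred) {
  int Unavailable = 0;
  for (bool Avail : AvailableInPred)
    if (!Avail)
      ++Unavailable;
  return Unavailable == 1;
}

int main() {
  assert(shouldPerformLoadPRE({true, false}));         // classic diamond
  assert(!shouldPerformLoadPRE({false, false, true})); // would add 2 loads
  assert(!shouldPerformLoadPRE({true, true})); // fully redundant: not PRE
}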
- SmallPtrSet<BasicBlock *, 4> Blockers; - for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) - Blockers.insert(UnavailableBlocks[i]); + SmallPtrSet<BasicBlock *, 4> Blockers(UnavailableBlocks.begin(), + UnavailableBlocks.end()); // Let's find the first basic block with more than one predecessor. Walk // backwards through predecessors if needed. @@ -1550,15 +1544,22 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // available. MapVector<BasicBlock *, Value *> PredLoads; DenseMap<BasicBlock*, char> FullyAvailableBlocks; - for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) - FullyAvailableBlocks[ValuesPerBlock[i].BB] = true; - for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i) - FullyAvailableBlocks[UnavailableBlocks[i]] = false; + for (const AvailableValueInBlock &AV : ValuesPerBlock) + FullyAvailableBlocks[AV.BB] = true; + for (BasicBlock *UnavailableBB : UnavailableBlocks) + FullyAvailableBlocks[UnavailableBB] = false; SmallVector<BasicBlock *, 4> CriticalEdgePred; - for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); - PI != E; ++PI) { - BasicBlock *Pred = *PI; + for (BasicBlock *Pred : predecessors(LoadBB)) { + // If any predecessor block is an EH pad that does not allow non-PHI + // instructions before the terminator, we can't PRE the load. + if (Pred->getTerminator()->isEHPad()) { + DEBUG(dbgs() + << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '" + << Pred->getName() << "': " << *LI << '\n'); + return false; + } + if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { continue; } @@ -1570,9 +1571,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, return false; } - if (LoadBB->isLandingPad()) { + if (LoadBB->isEHPad()) { DEBUG(dbgs() - << "COULD NOT PRE LOAD BECAUSE OF LANDING PAD CRITICAL EDGE '" + << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '" << Pred->getName() << "': " << *LI << '\n'); return false; } @@ -1655,12 +1656,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, << *NewInsts.back() << '\n'); // Assign value numbers to the new instructions. - for (unsigned i = 0, e = NewInsts.size(); i != e; ++i) { + for (Instruction *I : NewInsts) { // FIXME: We really _ought_ to insert these value numbers into their // parent's availability map. However, in doing so, we risk getting into // ordering issues. If a block hasn't been processed yet, we would be // marking a value as AVAIL-IN, which isn't what we intend. - VN.lookup_or_add(NewInsts[i]); + VN.lookup_or_add(I); } for (const auto &PredLoad : PredLoads) { @@ -1677,6 +1678,11 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (Tags) NewLoad->setAAMetadata(Tags); + if (auto *MD = LI->getMetadata(LLVMContext::MD_invariant_load)) + NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD); + if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group)) + NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD); + // Transfer DebugLoc. NewLoad->setDebugLoc(LI->getDebugLoc()); @@ -1704,6 +1710,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, /// Attempt to eliminate a load whose dependencies are /// non-local by performing PHI construction. bool GVN::processNonLocalLoad(LoadInst *LI) { + // non-local speculations are not allowed under asan. + if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeAddress)) + return false; + // Step 1: Find the non-local dependencies of the load. 
LoadDepVect Deps; MD->getNonLocalPointerDependency(LI, Deps); @@ -1777,6 +1787,63 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks); } +bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { + assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume && + "This function can only be called with llvm.assume intrinsic"); + Value *V = IntrinsicI->getArgOperand(0); + + if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) { + if (Cond->isZero()) { + Type *Int8Ty = Type::getInt8Ty(V->getContext()); + // Insert a store to null before the intrinsic to indicate that + // this code is not reachable. FIXME: We could insert an unreachable + // instruction directly because we can modify the CFG. + new StoreInst(UndefValue::get(Int8Ty), + Constant::getNullValue(Int8Ty->getPointerTo()), + IntrinsicI); + } + markInstructionForDeletion(IntrinsicI); + return false; + } + + Constant *True = ConstantInt::getTrue(V->getContext()); + bool Changed = false; + + for (BasicBlock *Successor : successors(IntrinsicI->getParent())) { + BasicBlockEdge Edge(IntrinsicI->getParent(), Successor); + + // This property is only true in dominated successors; propagateEquality + // will check dominance for us. + Changed |= propagateEquality(V, True, Edge, false); + } + + // We can replace the assumed value with true, which covers cases like this: + // call void @llvm.assume(i1 %cmp) + // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true + ReplaceWithConstMap[V] = True; + + // If one operand of an *eq compare is a constant, adding it to the map will cover this: + // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen + // call void @llvm.assume(i1 %cmp) + // ret float %0 ; will change it to ret float 3.000000e+00 + if (auto *CmpI = dyn_cast<CmpInst>(V)) { + if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ || + CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ || + (CmpI->getPredicate() == CmpInst::Predicate::FCMP_UEQ && + CmpI->getFastMathFlags().noNaNs())) { + Value *CmpLHS = CmpI->getOperand(0); + Value *CmpRHS = CmpI->getOperand(1); + if (isa<Constant>(CmpLHS)) + std::swap(CmpLHS, CmpRHS); + auto *RHSConst = dyn_cast<Constant>(CmpRHS); + + // If only one operand is constant. + if (RHSConst != nullptr && !isa<Constant>(CmpLHS)) + ReplaceWithConstMap[CmpLHS] = RHSConst; + } + } + return Changed; +} static void patchReplacementInstruction(Instruction *I, Value *Repl) { // Patch the replacement so that it is not more restrictive than the value @@ -1789,7 +1856,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) { // FIXME: If both the original and replacement value are part of the // same control-flow region (meaning that the execution of one - // guarentees the executation of the other), then we can combine the + // guarantees the execution of the other), then we can combine the // noalias scopes here and do better than the general conservative // answer used in combineMetadata(). // In general, GVN unifies expressions over different control-flow // regions, and so we need a conservative combination of the noalias // scopes.
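A standalone sketch of the block-local replacement that processAssumeIntrinsic feeds (toy types; the real map is from llvm::Value* to llvm::Constant*): after assume(x == 42), later uses of x in the same block can be rewritten to 42.

#include <cassert>
#include <map>
#include <string>
#include <vector>

// A toy instruction: just its operand names.
typedef std::vector<std::string> Inst;

// Mirror of replaceOperandsWithConsts: rewrite any operand that the
// block-local map pins to a constant.
bool replaceOperands(Inst &I, const std::map<std::string, std::string> &M) {
  bool Changed = false;
  for (std::size_t Op = 0; Op < I.size(); ++Op) {
    auto It = M.find(I[Op]);
    if (It != M.end()) {
      I[Op] = It->second; // stand-in for Instr->setOperand(OpNum, Const)
      Changed = true;
    }
  }
  return Changed;
}

int main() {
  // assume(x == 42) inserts {"x" -> "42"} into the map...
  std::map<std::string, std::string> ReplaceWithConstMap;
  ReplaceWithConstMap["x"] = "42";
  // ...so a later "ret x" in the same block becomes "ret 42".
  Inst Ret;
  Ret.push_back("x");
  assert(replaceOperands(Ret, ReplaceWithConstMap) && Ret[0] == "42");
}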
static const unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_range, - LLVMContext::MD_fpmath, - LLVMContext::MD_invariant_load, - }; + LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_range, + LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, + LLVMContext::MD_invariant_group}; combineMetadata(ReplInst, I, KnownIDs); } } @@ -1890,10 +1954,8 @@ bool GVN::processLoad(LoadInst *L) { ++NumGVNLoad; return true; } - } - // If the value isn't available, don't do anything! - if (Dep.isClobber()) { + // If the value isn't available, don't do anything! DEBUG( // fast print dep, using operator<< on instruction is too slow. dbgs() << "GVN: load "; @@ -2049,11 +2111,31 @@ static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E, return Pred != nullptr; } +// Tries to replace an instruction's operands with constants, using information +// from ReplaceWithConstMap. +bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { + bool Changed = false; + for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) { + Value *Operand = Instr->getOperand(OpNum); + auto it = ReplaceWithConstMap.find(Operand); + if (it != ReplaceWithConstMap.end()) { + assert(!isa<Constant>(Operand) && + "Replacing constants with constants is invalid"); + DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second + << " in instruction " << *Instr << '\n'); + Instr->setOperand(OpNum, it->second); + Changed = true; + } + } + return Changed; +} + /// The given values are known to be equal in every block /// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with /// 'RHS' everywhere in the scope. Returns whether a change was made. -bool GVN::propagateEquality(Value *LHS, Value *RHS, - const BasicBlockEdge &Root) { +/// If DominatesByEdge is false, uses are instead replaced in every block dominated by Root.getEnd(). +bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, + bool DominatesByEdge) { SmallVector<std::pair<Value*, Value*>, 4> Worklist; Worklist.push_back(std::make_pair(LHS, RHS)); bool Changed = false; @@ -2065,11 +2147,13 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, std::pair<Value*, Value*> Item = Worklist.pop_back_val(); LHS = Item.first; RHS = Item.second; - if (LHS == RHS) continue; + if (LHS == RHS) + continue; assert(LHS->getType() == RHS->getType() && "Equality but unequal types!"); // Don't try to propagate equalities between constants. - if (isa<Constant>(LHS) && isa<Constant>(RHS)) continue; + if (isa<Constant>(LHS) && isa<Constant>(RHS)) + continue; // Prefer a constant on the right-hand side, or an Argument if no constants. if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS))) @@ -2108,7 +2192,11 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, // LHS always has at least one use that is not dominated by Root, this will // never do anything if LHS has only one use. if (!LHS->hasOneUse()) { - unsigned NumReplacements = replaceDominatedUsesWith(LHS, RHS, *DT, Root); + unsigned NumReplacements = + DominatesByEdge + ?
replaceDominatedUsesWith(LHS, RHS, *DT, Root) + : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getEnd()); + Changed |= NumReplacements > 0; NumGVNEqProp += NumReplacements; } @@ -2180,7 +2268,10 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, Value *NotCmp = findLeader(Root.getEnd(), Num); if (NotCmp && isa<Instruction>(NotCmp)) { unsigned NumReplacements = - replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root); + DominatesByEdge + ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root) + : replaceDominatedUsesWith(NotCmp, NotVal, *DT, + Root.getEnd()); Changed |= NumReplacements > 0; NumGVNEqProp += NumReplacements; } @@ -2220,6 +2311,10 @@ bool GVN::processInstruction(Instruction *I) { return true; } + if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I)) + if (IntrinsicI->getIntrinsicID() == Intrinsic::assume) + return processAssumeIntrinsic(IntrinsicI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { if (processLoad(LI)) return true; @@ -2250,11 +2345,11 @@ bool GVN::processInstruction(Instruction *I) { Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext()); BasicBlockEdge TrueE(Parent, TrueSucc); - Changed |= propagateEquality(BranchCond, TrueVal, TrueE); + Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true); Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext()); BasicBlockEdge FalseE(Parent, FalseSucc); - Changed |= propagateEquality(BranchCond, FalseVal, FalseE); + Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true); return Changed; } @@ -2276,7 +2371,7 @@ bool GVN::processInstruction(Instruction *I) { // If there is only a single edge, propagate the case value into it. if (SwitchEdges.lookup(Dst) == 1) { BasicBlockEdge E(Parent, Dst); - Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E); + Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E, true); } } return Changed; @@ -2284,7 +2379,8 @@ bool GVN::processInstruction(Instruction *I) { // Instructions with void type don't return a value, so there's // no point in trying to find redundancies in them. - if (I->getType()->isVoidTy()) return false; + if (I->getType()->isVoidTy()) + return false; uint32_t NextNum = VN.getNextUnusedValueNumber(); unsigned Num = VN.lookup_or_add(I); @@ -2306,17 +2402,21 @@ bool GVN::processInstruction(Instruction *I) { // Perform fast-path value-number based elimination of values inherited from // dominators. - Value *repl = findLeader(I->getParent(), Num); - if (!repl) { + Value *Repl = findLeader(I->getParent(), Num); + if (!Repl) { // Failure, just remember this instance for future use. addToLeaderTable(Num, I, I->getParent()); return false; + } else if (Repl == I) { + // If I was the result of a shortcut PRE, it might already be in the table + // and the best replacement for itself. Nothing to do. + return false; } // Remove it! 
- patchAndReplaceAllUsesWith(I, repl); - if (MD && repl->getType()->getScalarType()->isPointerTy()) - MD->invalidateCachedPointerInfo(repl); + patchAndReplaceAllUsesWith(I, Repl); + if (MD && Repl->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(Repl); markInstructionForDeletion(I); return true; } @@ -2331,7 +2431,7 @@ bool GVN::runOnFunction(Function& F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); + VN.setAliasAnalysis(&getAnalysis<AAResultsWrapperPass>().getAAResults()); VN.setMemDep(MD); VN.setDomTree(DT); @@ -2341,10 +2441,10 @@ bool GVN::runOnFunction(Function& F) { // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) { - BasicBlock *BB = FI++; + BasicBlock *BB = &*FI++; - bool removedBlock = MergeBlockIntoPredecessor( - BB, DT, /* LoopInfo */ nullptr, VN.getAliasAnalysis(), MD); + bool removedBlock = + MergeBlockIntoPredecessor(BB, DT, /* LoopInfo */ nullptr, MD); if (removedBlock) ++NumGVNBlocks; Changed |= removedBlock; @@ -2382,7 +2482,6 @@ bool GVN::runOnFunction(Function& F) { return Changed; } - bool GVN::processBlock(BasicBlock *BB) { // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function // (and incrementing BI before processing an instruction). @@ -2391,11 +2490,16 @@ bool GVN::processBlock(BasicBlock *BB) { if (DeadBlocks.count(BB)) return false; + // Clear the map before every BB because its entries are valid for a single BB only. + ReplaceWithConstMap.clear(); bool ChangedFunction = false; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - ChangedFunction |= processInstruction(BI); + if (!ReplaceWithConstMap.empty()) + ChangedFunction |= replaceOperandsWithConsts(&*BI); + ChangedFunction |= processInstruction(&*BI); + if (InstrsToErase.empty()) { ++BI; continue; } @@ -2439,7 +2543,14 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, Value *Op = Instr->getOperand(i); if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) continue; - + // This could be a newly inserted instruction, in which case, we won't + // find a value number, and should give up before we hurt ourselves. + // FIXME: Rewrite the infrastructure to make it easier to value number + // and process newly inserted instructions. + if (!VN.exists(Op)) { + success = false; + break; + } if (Value *V = findLeader(Pred, VN.lookup(Op))) { Instr->setOperand(i, V); } else { @@ -2499,9 +2610,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { BasicBlock *CurrentBlock = CurInst->getParent(); predMap.clear(); - for (pred_iterator PI = pred_begin(CurrentBlock), PE = pred_end(CurrentBlock); - PI != PE; ++PI) { - BasicBlock *P = *PI; + for (BasicBlock *P : predecessors(CurrentBlock)) { // We're not interested in PRE where the block is its // own predecessor, or in blocks with predecessors // that are not reachable. @@ -2570,7 +2679,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { // Create a PHI to make the value available in this block.
PHINode *Phi = PHINode::Create(CurInst->getType(), predMap.size(), - CurInst->getName() + ".pre-phi", CurrentBlock->begin()); + CurInst->getName() + ".pre-phi", &CurrentBlock->front()); for (unsigned i = 0, e = predMap.size(); i != e; ++i) { if (Value *V = predMap[i].first) Phi->addIncoming(V, predMap[i].second); @@ -2582,18 +2691,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) { addToLeaderTable(ValNo, Phi, CurrentBlock); Phi->setDebugLoc(CurInst->getDebugLoc()); CurInst->replaceAllUsesWith(Phi); - if (Phi->getType()->getScalarType()->isPointerTy()) { - // Because we have added a PHI-use of the pointer value, it has now - // "escaped" from alias analysis' perspective. We need to inform - // AA of this. - for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; ++ii) { - unsigned jj = PHINode::getOperandNumForIncomingValue(ii); - VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); - } - - if (MD) - MD->invalidateCachedPointerInfo(Phi); - } + if (MD && Phi->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(Phi); VN.erase(CurInst); removeFromLeaderTable(ValNo, CurInst, CurrentBlock); @@ -2616,15 +2715,15 @@ bool GVN::performPRE(Function &F) { if (CurrentBlock == &F.getEntryBlock()) continue; - // Don't perform PRE on a landing pad. - if (CurrentBlock->isLandingPad()) + // Don't perform PRE on an EH pad. + if (CurrentBlock->isEHPad()) continue; for (BasicBlock::iterator BI = CurrentBlock->begin(), BE = CurrentBlock->end(); BI != BE;) { - Instruction *CurInst = BI++; - Changed = performScalarPRE(CurInst); + Instruction *CurInst = &*BI++; + Changed |= performScalarPRE(CurInst); } } @@ -2637,8 +2736,8 @@ bool GVN::performPRE(Function &F) { /// Split the critical edge connecting the given two blocks, and return /// the block inserted to the critical edge. BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { - BasicBlock *BB = SplitCriticalEdge( - Pred, Succ, CriticalEdgeSplittingOptions(getAliasAnalysis(), DT)); + BasicBlock *BB = + SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT)); if (MD) MD->invalidateCachedPredecessors(); return BB; @@ -2652,7 +2751,7 @@ bool GVN::splitCriticalEdges() { do { std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val(); SplitCriticalEdge(Edge.first, Edge.second, - CriticalEdgeSplittingOptions(getAliasAnalysis(), DT)); + CriticalEdgeSplittingOptions(DT)); } while (!toSplit.empty()); if (MD) MD->invalidateCachedPredecessors(); return true; @@ -2728,17 +2827,14 @@ void GVN::addDeadBlock(BasicBlock *BB) { DeadBlocks.insert(Dom.begin(), Dom.end()); // Figure out the dominance-frontier(D). 
- for (SmallVectorImpl<BasicBlock *>::iterator I = Dom.begin(), - E = Dom.end(); I != E; I++) { - BasicBlock *B = *I; - for (succ_iterator SI = succ_begin(B), SE = succ_end(B); SI != SE; SI++) { - BasicBlock *S = *SI; + for (BasicBlock *B : Dom) { + for (BasicBlock *S : successors(B)) { if (DeadBlocks.count(S)) continue; bool AllPredDead = true; - for (pred_iterator PI = pred_begin(S), PE = pred_end(S); PI != PE; PI++) - if (!DeadBlocks.count(*PI)) { + for (BasicBlock *P : predecessors(S)) + if (!DeadBlocks.count(P)) { AllPredDead = false; break; } @@ -2766,10 +2862,7 @@ void GVN::addDeadBlock(BasicBlock *BB) { continue; SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B)); - for (SmallVectorImpl<BasicBlock *>::iterator PI = Preds.begin(), - PE = Preds.end(); PI != PE; PI++) { - BasicBlock *P = *PI; - + for (BasicBlock *P : Preds) { if (!DeadBlocks.count(P)) continue; @@ -2794,7 +2887,7 @@ void GVN::addDeadBlock(BasicBlock *BB) { // R be the target of the dead out-coming edge. // 1) Identify the set of dead blocks implied by the branch's dead outcoming // edge. The result of this step will be {X| X is dominated by R} -// 2) Identify those blocks which haves at least one dead prodecessor. The +// 2) Identify those blocks which have at least one dead predecessor. The // result of this step will be dominance-frontier(R). // 3) Update the PHIs in DF(R) by replacing the operands corresponding to // dead blocks with "UndefVal" in an hope these PHIs will optimized away. @@ -2829,14 +2922,10 @@ bool GVN::processFoldableCondBr(BranchInst *BI) { // instructions, it makes more sense just to "fabricate" a val-number for the // dead code than checking if instruction involved is dead or not. void GVN::assignValNumForDeadCode() { - for (SetVector<BasicBlock *>::iterator I = DeadBlocks.begin(), - E = DeadBlocks.end(); I != E; I++) { - BasicBlock *BB = *I; - for (BasicBlock::iterator II = BB->begin(), EE = BB->end(); - II != EE; II++) { - Instruction *Inst = &*II; - unsigned ValNum = VN.lookup_or_add(Inst); - addToLeaderTable(ValNum, Inst, BB); + for (BasicBlock *BB : DeadBlocks) { + for (Instruction &Inst : *BB) { + unsigned ValNum = VN.lookup_or_add(&Inst); + addToLeaderTable(ValNum, &Inst, BB); } } } diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 2a954d9..ec5e15f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -28,9 +28,11 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" @@ -48,6 +50,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" using namespace llvm; @@ -83,64 +86,62 @@ static cl::opt<ReplaceExitVal> ReplaceExitValue( namespace { struct RewritePhi; -} -namespace { - class IndVarSimplify : public LoopPass { - LoopInfo *LI; - ScalarEvolution *SE; - DominatorTree *DT; - TargetLibraryInfo *TLI; - const TargetTransformInfo *TTI; - - SmallVector<WeakVH, 16>
DeadInsts; - bool Changed; - public: - - static char ID; // Pass identification, replacement for typeid - IndVarSimplify() - : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) { - initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); - } +class IndVarSimplify : public LoopPass { + LoopInfo *LI; + ScalarEvolution *SE; + DominatorTree *DT; + TargetLibraryInfo *TLI; + const TargetTransformInfo *TTI; - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); - AU.addRequiredID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); - AU.addPreservedID(LoopSimplifyID); - AU.addPreservedID(LCSSAID); - AU.setPreservesCFG(); - } + SmallVector<WeakVH, 16> DeadInsts; + bool Changed; +public: - private: - void releaseMemory() override { - DeadInsts.clear(); - } + static char ID; // Pass identification, replacement for typeid + IndVarSimplify() + : LoopPass(ID), LI(nullptr), SE(nullptr), DT(nullptr), Changed(false) { + initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); + } - bool isValidRewrite(Value *FromVal, Value *ToVal); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreservedID(LoopSimplifyID); + AU.addPreservedID(LCSSAID); + AU.setPreservesCFG(); + } - void HandleFloatingPointIV(Loop *L, PHINode *PH); - void RewriteNonIntegerIVs(Loop *L); +private: + void releaseMemory() override { + DeadInsts.clear(); + } - void SimplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LPPassManager &LPM); + bool isValidRewrite(Value *FromVal, Value *ToVal); - bool CanLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet); - void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); + void handleFloatingPointIV(Loop *L, PHINode *PH); + void rewriteNonIntegerIVs(Loop *L); - Value *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, - PHINode *IndVar, SCEVExpander &Rewriter); + void simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI); - void SinkUnusedInvariants(Loop *L); + bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet); + void rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); - Value *ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, - Instruction *InsertPt, Type *Ty, - bool &IsHighCostExpansion); - }; + Value *linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, + PHINode *IndVar, SCEVExpander &Rewriter); + + void sinkUnusedInvariants(Loop *L); + + Value *expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, + Instruction *InsertPt, Type *Ty); +}; } char IndVarSimplify::ID = 0; @@ -148,7 +149,7 @@ INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars", "Induction Variable Simplification", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) 
INITIALIZE_PASS_END(IndVarSimplify, "indvars", @@ -158,10 +159,10 @@ Pass *llvm::createIndVarSimplifyPass() { return new IndVarSimplify(); } -/// isValidRewrite - Return true if the SCEV expansion generated by the -/// rewriter can replace the original value. SCEV guarantees that it -/// produces the same value, but the way it is produced may be illegal IR. -/// Ideally, this function will only be called for verification. +/// Return true if the SCEV expansion generated by the rewriter can replace the +/// original value. SCEV guarantees that it produces the same value, but the way +/// it is produced may be illegal IR. Ideally, this function will only be +/// called for verification. bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { // If an SCEV expression subsumed multiple pointers, its expansion could // reassociate the GEP changing the base pointer. This is illegal because the @@ -175,10 +176,10 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { // because it understands lcssa phis while SCEV does not. Value *FromPtr = FromVal; Value *ToPtr = ToVal; - if (GEPOperator *GEP = dyn_cast<GEPOperator>(FromVal)) { + if (auto *GEP = dyn_cast<GEPOperator>(FromVal)) { FromPtr = GEP->getPointerOperand(); } - if (GEPOperator *GEP = dyn_cast<GEPOperator>(ToVal)) { + if (auto *GEP = dyn_cast<GEPOperator>(ToVal)) { ToPtr = GEP->getPointerOperand(); } if (FromPtr != FromVal || ToPtr != ToVal) { @@ -215,7 +216,7 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) { /// loop. For PHI nodes, there may be multiple uses, so compute the nearest /// common dominator for the incoming blocks. static Instruction *getInsertPointForUses(Instruction *User, Value *Def, - DominatorTree *DT) { + DominatorTree *DT, LoopInfo *LI) { PHINode *PHI = dyn_cast<PHINode>(User); if (!PHI) return User; @@ -234,17 +235,28 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def, InsertPt = InsertBB->getTerminator(); } assert(InsertPt && "Missing phi operand"); - assert((!isa<Instruction>(Def) || - DT->dominates(cast<Instruction>(Def), InsertPt)) && - "def does not dominate all uses"); - return InsertPt; + + auto *DefI = dyn_cast<Instruction>(Def); + if (!DefI) + return InsertPt; + + assert(DT->dominates(DefI, InsertPt) && "def does not dominate all uses"); + + auto *L = LI->getLoopFor(DefI->getParent()); + assert(!L || L->contains(LI->getLoopFor(InsertPt->getParent()))); + + for (auto *DTN = (*DT)[InsertPt->getParent()]; DTN; DTN = DTN->getIDom()) + if (LI->getLoopFor(DTN->getBlock()) == L) + return DTN->getBlock()->getTerminator(); + + llvm_unreachable("DefI dominates InsertPt!"); } //===----------------------------------------------------------------------===// -// RewriteNonIntegerIVs and helpers. Prefer integer IVs. +// rewriteNonIntegerIVs and helpers. Prefer integer IVs. //===----------------------------------------------------------------------===// -/// ConvertToSInt - Convert APF to an integer, if possible. +/// Convert APF to an integer, if possible. static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { bool isExact = false; // See if we can convert this to an int64_t @@ -256,8 +268,8 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { return true; } -/// HandleFloatingPointIV - If the loop has floating induction variable -/// then insert corresponding integer induction variable if possible. +/// If the loop has a floating-point induction variable, then insert a +/// corresponding integer induction variable if possible.
/// For example, /// for(double i = 0; i < 10000; ++i) /// bar(i) @@ -265,13 +277,12 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { /// for(int i = 0; i < 10000; ++i) /// bar((double)i); /// -void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { +void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0)); unsigned BackEdge = IncomingEdge^1; // Check incoming value. - ConstantFP *InitValueVal = - dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); + auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); int64_t InitValue; if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue)) @@ -279,8 +290,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // Check IV increment. Reject this PN if increment operation is not // an add or increment value can not be represented by an integer. - BinaryOperator *Incr = - dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); + auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge)); if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return; // If this is not an add of the PHI with a constantfp, or if the constant fp @@ -456,14 +466,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) { // platforms. if (WeakPH) { Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv", - PN->getParent()->getFirstInsertionPt()); + &*PN->getParent()->getFirstInsertionPt()); PN->replaceAllUsesWith(Conv); RecursivelyDeleteTriviallyDeadInstructions(PN, TLI); } Changed = true; } -void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { +void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) { // First step. Check to see if there are any floating-point recurrences. // If there are, change them into integer recurrences, permitting analysis by // the SCEV routines. @@ -477,7 +487,7 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { for (unsigned i = 0, e = PHIs.size(); i != e; ++i) if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i])) - HandleFloatingPointIV(L, PN); + handleFloatingPointIV(L, PN); // If the loop previously had floating-point IV, ScalarEvolution // may not have been able to compute a trip count. Now that we've done some @@ -488,7 +498,7 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { namespace { // Collect information about PHI nodes which can be transformed in -// RewriteLoopExitValues. +// rewriteLoopExitValues. struct RewritePhi { PHINode *PN; unsigned Ith; // Ith incoming value. @@ -501,70 +511,37 @@ struct RewritePhi { }; } -Value *IndVarSimplify::ExpandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, +Value *IndVarSimplify::expandSCEVIfNeeded(SCEVExpander &Rewriter, const SCEV *S, Loop *L, Instruction *InsertPt, - Type *ResultTy, - bool &IsHighCostExpansion) { - using namespace llvm::PatternMatch; - - if (!Rewriter.isHighCostExpansion(S, L)) { - IsHighCostExpansion = false; - return Rewriter.expandCodeFor(S, ResultTy, InsertPt); - } - + Type *ResultTy) { // Before expanding S into an expensive LLVM expression, see if we can use an - // already existing value as the expansion for S. There is potential to make - // this significantly smarter, but this simple heuristic already gets some - // interesting cases. 
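Both the reuse check in expandSCEVIfNeeded here and rewriteLoopExitValues below chase the same goal: avoid keeping a loop (or an expensive expansion) alive just to produce a value that has a cheap closed form. A toy illustration of exit-value rewriting, under the assumption of a computable trip count (hypothetical code, not the pass's):

#include <cassert>

// Before: the use of Sum after the loop keeps the loop alive.
int exitValueByLoop(int N) {
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += 2; // simple recurrence: Sum = 2 * (iteration count)
  return Sum; // exit value used outside the loop
}

// After: a SCEV-style closed form substituted at the exit.
int exitValueRewritten(int N) { return N > 0 ? 2 * N : 0; }

int main() {
  for (int N = -3; N < 20; ++N)
    assert(exitValueByLoop(N) == exitValueRewritten(N));
}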
- - SmallVector<BasicBlock *, 4> Latches; - L->getLoopLatches(Latches); - - for (BasicBlock *BB : Latches) { - ICmpInst::Predicate Pred; - Instruction *LHS, *RHS; - BasicBlock *TrueBB, *FalseBB; - - if (!match(BB->getTerminator(), - m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)), - TrueBB, FalseBB))) - continue; - - if (SE->getSCEV(LHS) == S && DT->dominates(LHS, InsertPt)) { - IsHighCostExpansion = false; - return LHS; - } - - if (SE->getSCEV(RHS) == S && DT->dominates(RHS, InsertPt)) { - IsHighCostExpansion = false; - return RHS; - } - } + // already existing value as the expansion for S. + if (Value *ExistingValue = Rewriter.findExistingExpansion(S, InsertPt, L)) + if (ExistingValue->getType() == ResultTy) + return ExistingValue; // We didn't find anything, fall back to using SCEVExpander. - assert(Rewriter.isHighCostExpansion(S, L) && "this should not have changed!"); - IsHighCostExpansion = true; return Rewriter.expandCodeFor(S, ResultTy, InsertPt); } //===----------------------------------------------------------------------===// -// RewriteLoopExitValues - Optimize IV users outside the loop. +// rewriteLoopExitValues - Optimize IV users outside the loop. // As a side effect, reduces the amount of IV processing within the loop. //===----------------------------------------------------------------------===// -/// RewriteLoopExitValues - Check to see if this loop has a computable -/// loop-invariant execution count. If so, this means that we can compute the -/// final value of any expressions that are recurrent in the loop, and -/// substitute the exit values from the loop into any instructions outside of -/// the loop that use the final values of the current expressions. +/// Check to see if this loop has a computable loop-invariant execution count. +/// If so, this means that we can compute the final value of any expressions +/// that are recurrent in the loop, and substitute the exit values from the loop +/// into any instructions outside of the loop that use the final values of the +/// current expressions. /// /// This is mostly redundant with the regular IndVarSimplify activities that /// happen later, except that it's more powerful in some cases, because it's /// able to brute-force evaluate arbitrary instructions as long as they have /// constant operands at the beginning of the loop. -void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { - // Verify the input to the pass in already in LCSSA form. - assert(L->isLCSSAForm(*DT)); +void IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { + // Check a pre-condition. + assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!"); SmallVector<BasicBlock*, 8> ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); @@ -679,9 +656,9 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { continue; } - bool HighCost = false; - Value *ExitVal = ExpandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, - PN->getType(), HighCost); + bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst); + Value *ExitVal = + expandSCEVIfNeeded(Rewriter, ExitValue, L, Inst, PN->getType()); DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n' << " LoopVal = " << *Inst << "\n"); @@ -698,7 +675,7 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { } } - bool LoopCanBeDel = CanLoopBeDeleted(L, RewritePhiSet); + bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet); // Transformation. 
for (const RewritePhi &Phi : RewritePhiSet) { @@ -735,10 +712,10 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { Rewriter.clearInsertPoint(); } -/// CanLoopBeDeleted - Check whether it is possible to delete the loop after -/// rewriting exit value. If it is possible, ignore ReplaceExitValue and -/// do rewriting aggressively. -bool IndVarSimplify::CanLoopBeDeleted( +/// Check whether it is possible to delete the loop after rewriting exit +/// value. If it is possible, ignore ReplaceExitValue and do rewriting +/// aggressively. +bool IndVarSimplify::canLoopBeDeleted( Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) { BasicBlock *Preheader = L->getLoopPreheader(); @@ -782,14 +759,9 @@ bool IndVarSimplify::CanLoopBeDeleted( ++BI; } - for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); - LI != LE; ++LI) { - for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end(); BI != BE; - ++BI) { - if (BI->mayHaveSideEffects()) - return false; - } - } + for (auto *BB : L->blocks()) + if (any_of(*BB, [](Instruction &I) { return I.mayHaveSideEffects(); })) + return false; return true; } @@ -799,22 +771,19 @@ bool IndVarSimplify::CanLoopBeDeleted( //===----------------------------------------------------------------------===// namespace { - // Collect information about induction variables that are used by sign/zero - // extend operations. This information is recorded by CollectExtend and - // provides the input to WidenIV. - struct WideIVInfo { - PHINode *NarrowIV; - Type *WidestNativeType; // Widest integer type created [sz]ext - bool IsSigned; // Was a sext user seen before a zext? - - WideIVInfo() : NarrowIV(nullptr), WidestNativeType(nullptr), - IsSigned(false) {} - }; +// Collect information about induction variables that are used by sign/zero +// extend operations. This information is recorded by CollectExtend and provides +// the input to WidenIV. +struct WideIVInfo { + PHINode *NarrowIV = nullptr; + Type *WidestNativeType = nullptr; // Widest integer type created [sz]ext + bool IsSigned = false; // Was a sext user seen before a zext? +}; } -/// visitCast - Update information about the induction variable that is -/// extended by this sign or zero extend operation. This is used to determine -/// the final width of the IV before actually widening it. +/// Update information about the induction variable that is extended by this +/// sign or zero extend operation. This is used to determine the final width of +/// the IV before actually widening it. static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, const TargetTransformInfo *TTI) { bool IsSigned = Cast->getOpcode() == Instruction::SExt; @@ -855,24 +824,29 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, ScalarEvolution *SE, namespace { -/// NarrowIVDefUse - Record a link in the Narrow IV def-use chain along with the -/// WideIV that computes the same value as the Narrow IV def. This avoids -/// caching Use* pointers. +/// Record a link in the Narrow IV def-use chain along with the WideIV that +/// computes the same value as the Narrow IV def. This avoids caching Use* +/// pointers. 
struct NarrowIVDefUse { - Instruction *NarrowDef; - Instruction *NarrowUse; - Instruction *WideDef; - - NarrowIVDefUse(): NarrowDef(nullptr), NarrowUse(nullptr), WideDef(nullptr) {} - - NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD): - NarrowDef(ND), NarrowUse(NU), WideDef(WD) {} + Instruction *NarrowDef = nullptr; + Instruction *NarrowUse = nullptr; + Instruction *WideDef = nullptr; + + // True if the narrow def is never negative. Tracking this information lets + // us use a sign extension instead of a zero extension or vice versa, when + // profitable and legal. + bool NeverNegative = false; + + NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD, + bool NeverNegative) + : NarrowDef(ND), NarrowUse(NU), WideDef(WD), + NeverNegative(NeverNegative) {} }; -/// WidenIV - The goal of this transform is to remove sign and zero extends -/// without creating any new induction variables. To do this, it creates a new -/// phi of the wider type and redirects all users, either removing extends or -/// inserting truncs whenever we stop propagating the type. +/// The goal of this transform is to remove sign and zero extends without +/// creating any new induction variables. To do this, it creates a new phi of +/// the wider type and redirects all users, either removing extends or inserting +/// truncs whenever we stop propagating the type. /// class WidenIV { // Parameters @@ -913,32 +887,35 @@ public: assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV"); } - PHINode *CreateWideIV(SCEVExpander &Rewriter); + PHINode *createWideIV(SCEVExpander &Rewriter); protected: - Value *getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, - Instruction *Use); + Value *createExtendInst(Value *NarrowOper, Type *WideType, bool IsSigned, + Instruction *Use); - Instruction *CloneIVUser(NarrowIVDefUse DU); + Instruction *cloneIVUser(NarrowIVDefUse DU, const SCEVAddRecExpr *WideAR); + Instruction *cloneArithmeticIVUser(NarrowIVDefUse DU, + const SCEVAddRecExpr *WideAR); + Instruction *cloneBitwiseIVUser(NarrowIVDefUse DU); - const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse); + const SCEVAddRecExpr *getWideRecurrence(Instruction *NarrowUse); - const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU); + const SCEVAddRecExpr* getExtendedOperandRecurrence(NarrowIVDefUse DU); - const SCEV *GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, + const SCEV *getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, unsigned OpCode) const; - Instruction *WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); + Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter); - bool WidenLoopCompare(NarrowIVDefUse DU); + bool widenLoopCompare(NarrowIVDefUse DU); void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef); }; } // anonymous namespace -/// isLoopInvariant - Perform a quick domtree based check for loop invariance -/// assuming that V is used within the loop. LoopInfo::isLoopInvariant() seems -/// gratuitous for this purpose. +/// Perform a quick domtree based check for loop invariance assuming that V is +/// used within the loop. LoopInfo::isLoopInvariant() seems gratuitous for this +/// purpose. 
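The transform's end-to-end effect, at the source level (a sketch under the usual assumption that the narrow counter cannot wrap; names are illustrative):

  #include <cstdint>
  // Before widening: 'i' is an i32 IV and, on an LP64 target, every
  // array access needs a sign extension of 'i' to i64.
  int64_t sum(const int64_t *a, int32_t n) {
    int64_t s = 0;
    for (int32_t i = 0; i < n; ++i)
      s += a[i];               // each use is sext i32 %i to i64
    return s;
  }
  // After widening, the loop runs on an i64 phi, the per-iteration sign
  // extensions disappear, and the dead narrow phi is cleaned up later by
  // DeleteDeadPHIs.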
static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) { Instruction *Inst = dyn_cast<Instruction>(V); if (!Inst) return true; @@ -947,8 +924,8 @@ static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) { return DT->properlyDominates(Inst->getParent(), L->getHeader()); } -Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, - Instruction *Use) { +Value *WidenIV::createExtendInst(Value *NarrowOper, Type *WideType, + bool IsSigned, Instruction *Use) { // Set the debug location and conservative insertion point. IRBuilder<> Builder(Use); // Hoist the insertion point into loop preheaders as far as possible. @@ -961,10 +938,11 @@ Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned, Builder.CreateZExt(NarrowOper, WideType); } -/// CloneIVUser - Instantiate a wide operation to replace a narrow -/// operation. This only needs to handle operations that can evaluation to -/// SCEVAddRec. It can safely return 0 for any operation we decide not to clone. -Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { +/// Instantiate a wide operation to replace a narrow operation. This only needs +/// to handle operations that can evaluate to SCEVAddRec. It can safely return +/// 0 for any operation we decide not to clone. +Instruction *WidenIV::cloneIVUser(NarrowIVDefUse DU, + const SCEVAddRecExpr *WideAR) { unsigned Opcode = DU.NarrowUse->getOpcode(); switch (Opcode) { default: @@ -973,40 +951,140 @@ Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) { case Instruction::Mul: case Instruction::UDiv: case Instruction::Sub: + return cloneArithmeticIVUser(DU, WideAR); + case Instruction::And: case Instruction::Or: case Instruction::Xor: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: - DEBUG(dbgs() << "Cloning IVUser: " << *DU.NarrowUse << "\n"); - - // Replace NarrowDef operands with WideDef. Otherwise, we don't know - // anything about the narrow operand yet so must insert a [sz]ext. It is - // probably loop invariant and will be folded or hoisted. If it actually - // comes from a widened IV, it should be removed during a future call to - // WidenIVUse. - Value *LHS = (DU.NarrowUse->getOperand(0) == DU.NarrowDef) ? DU.WideDef : - getExtend(DU.NarrowUse->getOperand(0), WideType, IsSigned, DU.NarrowUse); - Value *RHS = (DU.NarrowUse->getOperand(1) == DU.NarrowDef) ? DU.WideDef : - getExtend(DU.NarrowUse->getOperand(1), WideType, IsSigned, DU.NarrowUse); - - BinaryOperator *NarrowBO = cast<BinaryOperator>(DU.NarrowUse); - BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), - LHS, RHS, - NarrowBO->getName()); - IRBuilder<> Builder(DU.NarrowUse); - Builder.Insert(WideBO); - if (const OverflowingBinaryOperator *OBO = - dyn_cast<OverflowingBinaryOperator>(NarrowBO)) { - if (OBO->hasNoUnsignedWrap()) WideBO->setHasNoUnsignedWrap(); - if (OBO->hasNoSignedWrap()) WideBO->setHasNoSignedWrap(); + return cloneBitwiseIVUser(DU); + } +} + +Instruction *WidenIV::cloneBitwiseIVUser(NarrowIVDefUse DU) { + Instruction *NarrowUse = DU.NarrowUse; + Instruction *NarrowDef = DU.NarrowDef; + Instruction *WideDef = DU.WideDef; + + DEBUG(dbgs() << "Cloning bitwise IVUser: " << *NarrowUse << "\n"); + + // Replace NarrowDef operands with WideDef. Otherwise, we don't know anything + // about the narrow operand yet so must insert a [sz]ext. It is probably loop + // invariant and will be folded or hoisted. If it actually comes from a + // widened IV, it should be removed during a future call to widenIVUse.
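For the bitwise path, a sketch of the kind of user cloneBitwiseIVUser handles (illustrative code, not from this patch):

  // With 'i' being widened to i64, the narrow 'i & 7' is re-created as
  // 'i.wide & 7'; the extended constant folds, and copyIRFlags carries
  // over exact/wrap flags for the shift cases.
  unsigned pick(const unsigned *table, int n) {
    unsigned acc = 0;
    for (int i = 0; i < n; ++i)
      acc ^= table[i & 7];
    return acc;
  }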
+ Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(0), WideType, + IsSigned, NarrowUse); + Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(1), WideType, + IsSigned, NarrowUse); + + auto *NarrowBO = cast<BinaryOperator>(NarrowUse); + auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, + NarrowBO->getName()); + IRBuilder<> Builder(NarrowUse); + Builder.Insert(WideBO); + WideBO->copyIRFlags(NarrowBO); + return WideBO; +} + +Instruction *WidenIV::cloneArithmeticIVUser(NarrowIVDefUse DU, + const SCEVAddRecExpr *WideAR) { + Instruction *NarrowUse = DU.NarrowUse; + Instruction *NarrowDef = DU.NarrowDef; + Instruction *WideDef = DU.WideDef; + + DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n"); + + unsigned IVOpIdx = (NarrowUse->getOperand(0) == NarrowDef) ? 0 : 1; + + // We're trying to find X such that + // + // Widen(NarrowDef `op` NonIVNarrowDef) == WideAR == WideDef `op.wide` X + // + // We guess two solutions to X, sext(NonIVNarrowDef) and zext(NonIVNarrowDef), + // and check using SCEV if any of them are correct. + + // Returns true if extending NonIVNarrowDef according to `SignExt` is a + // correct solution to X. + auto GuessNonIVOperand = [&](bool SignExt) { + const SCEV *WideLHS; + const SCEV *WideRHS; + + auto GetExtend = [this, SignExt](const SCEV *S, Type *Ty) { + if (SignExt) + return SE->getSignExtendExpr(S, Ty); + return SE->getZeroExtendExpr(S, Ty); + }; + + if (IVOpIdx == 0) { + WideLHS = SE->getSCEV(WideDef); + const SCEV *NarrowRHS = SE->getSCEV(NarrowUse->getOperand(1)); + WideRHS = GetExtend(NarrowRHS, WideType); + } else { + const SCEV *NarrowLHS = SE->getSCEV(NarrowUse->getOperand(0)); + WideLHS = GetExtend(NarrowLHS, WideType); + WideRHS = SE->getSCEV(WideDef); + } + + // WideUse is "WideDef `op.wide` X" as described in the comment. + const SCEV *WideUse = nullptr; + + switch (NarrowUse->getOpcode()) { + default: + llvm_unreachable("No other possibility!"); + + case Instruction::Add: + WideUse = SE->getAddExpr(WideLHS, WideRHS); + break; + + case Instruction::Mul: + WideUse = SE->getMulExpr(WideLHS, WideRHS); + break; + + case Instruction::UDiv: + WideUse = SE->getUDivExpr(WideLHS, WideRHS); + break; + + case Instruction::Sub: + WideUse = SE->getMinusSCEV(WideLHS, WideRHS); + break; } - return WideBO; + + return WideUse == WideAR; + }; + + bool SignExtend = IsSigned; + if (!GuessNonIVOperand(SignExtend)) { + SignExtend = !SignExtend; + if (!GuessNonIVOperand(SignExtend)) + return nullptr; } + + Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(0), WideType, + SignExtend, NarrowUse); + Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) + ? WideDef + : createExtendInst(NarrowUse->getOperand(1), WideType, + SignExtend, NarrowUse); + + auto *NarrowBO = cast<BinaryOperator>(NarrowUse); + auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS, + NarrowBO->getName()); + + IRBuilder<> Builder(NarrowUse); + Builder.Insert(WideBO); + WideBO->copyIRFlags(NarrowBO); + return WideBO; } -const SCEV *WidenIV::GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, +const SCEV *WidenIV::getSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, unsigned OpCode) const { if (OpCode == Instruction::Add) return SE->getAddExpr(LHS, RHS); @@ -1022,7 +1100,7 @@ const SCEV *WidenIV::GetSCEVByOpCode(const SCEV *LHS, const SCEV *RHS, /// operands. 
Generate the SCEV value for the widened operation without /// actually modifying the IR yet. If the expression after extending the /// operands is an AddRec for this loop, return it. -const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { +const SCEVAddRecExpr* WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) { // Handle the common case of add<nsw/nuw> const unsigned OpCode = DU.NarrowUse->getOpcode(); @@ -1062,19 +1140,18 @@ const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) { if (ExtendOperIdx == 0) std::swap(lhs, rhs); const SCEVAddRecExpr *AddRec = - dyn_cast<SCEVAddRecExpr>(GetSCEVByOpCode(lhs, rhs, OpCode)); + dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, OpCode)); if (!AddRec || AddRec->getLoop() != L) return nullptr; return AddRec; } -/// GetWideRecurrence - Is this instruction potentially interesting for further -/// simplification after widening it's type? In other words, can the -/// extend be safely hoisted out of the loop with SCEV reducing the value to a -/// recurrence on the same loop. If so, return the sign or zero extended -/// recurrence. Otherwise return NULL. -const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { +/// Is this instruction potentially interesting for further simplification after +/// widening its type? In other words, can the extend be safely hoisted out of +/// the loop with SCEV reducing the value to a recurrence on the same loop. If +/// so, return the sign or zero extended recurrence. Otherwise return NULL. +const SCEVAddRecExpr *WidenIV::getWideRecurrence(Instruction *NarrowUse) { if (!SE->isSCEVable(NarrowUse->getType())) return nullptr; @@ -1097,10 +1174,11 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) { /// This IV user cannot be widen. Replace this use of the original narrow IV /// with a truncation of the new wide IV to isolate and eliminate the narrow IV. -static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) { +static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT, LoopInfo *LI) { DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef << " for user " << *DU.NarrowUse << "\n"); - IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + IRBuilder<> Builder( + getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI)); Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); } @@ -1108,13 +1186,27 @@ static void truncateIVUse(NarrowIVDefUse DU, DominatorTree *DT) { /// If the narrow use is a compare instruction, then widen the compare // (and possibly the other operand). The extend operation is hoisted into the // loop preheader as far as possible. -bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) { +bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) { ICmpInst *Cmp = dyn_cast<ICmpInst>(DU.NarrowUse); if (!Cmp) return false; - // Sign of IV user and compare must match. - if (IsSigned != CmpInst::isSigned(Cmp->getPredicate())) + // We can legally widen the comparison in the following two cases: + // + // - The signedness of the IV extension and comparison match + // + // - The narrow IV is always positive (and thus its sign extension is equal + // to its zero extension). For instance, let's say we're zero extending + // %narrow for the following use + // + // icmp slt i32 %narrow, %val ... (A) + // + // and %narrow is always positive.
Then + // + // (A) == icmp slt i32 sext(%narrow), sext(%val) + // == icmp slt i32 zext(%narrow), sext(%val) + + if (!(DU.NeverNegative || IsSigned == Cmp->isSigned())) return false; Value *Op = Cmp->getOperand(Cmp->getOperand(0) == DU.NarrowDef ? 1 : 0); @@ -1123,20 +1215,21 @@ bool WidenIV::WidenLoopCompare(NarrowIVDefUse DU) { assert (CastWidth <= IVWidth && "Unexpected width while widening compare."); // Widen the compare instruction. - IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT)); + IRBuilder<> Builder( + getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI)); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef); // Widen the other operand of the compare, if necessary. if (CastWidth < IVWidth) { - Value *ExtOp = getExtend(Op, WideType, IsSigned, Cmp); + Value *ExtOp = createExtendInst(Op, WideType, Cmp->isSigned(), Cmp); DU.NarrowUse->replaceUsesOfWith(Op, ExtOp); } return true; } -/// WidenIVUse - Determine whether an individual user of the narrow IV can be -/// widened. If so, return the wide clone of the user. -Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { +/// Determine whether an individual user of the narrow IV can be widened. If so, +/// return the wide clone of the user. +Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // Stop traversing the def-use chain at inner-loop phis or post-loop phis. if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) { @@ -1145,13 +1238,13 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { // After SimplifyCFG most loop exit targets have a single predecessor. // Otherwise fall back to a truncate within the loop. if (UsePhi->getNumOperands() != 1) - truncateIVUse(DU, DT); + truncateIVUse(DU, DT, LI); else { PHINode *WidePhi = PHINode::Create(DU.WideDef->getType(), 1, UsePhi->getName() + ".wide", UsePhi); WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0)); - IRBuilder<> Builder(WidePhi->getParent()->getFirstInsertionPt()); + IRBuilder<> Builder(&*WidePhi->getParent()->getFirstInsertionPt()); Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType()); UsePhi->replaceAllUsesWith(Trunc); DeadInsts.emplace_back(UsePhi); @@ -1200,20 +1293,20 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { } // Does this user itself evaluate to a recurrence after widening? - const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse); + const SCEVAddRecExpr *WideAddRec = getWideRecurrence(DU.NarrowUse); if (!WideAddRec) - WideAddRec = GetExtendedOperandRecurrence(DU); + WideAddRec = getExtendedOperandRecurrence(DU); if (!WideAddRec) { // If use is a loop condition, try to promote the condition instead of // truncating the IV first. - if (WidenLoopCompare(DU)) + if (widenLoopCompare(DU)) return nullptr; // This user does not evaluate to a recurence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. - truncateIVUse(DU, DT); + truncateIVUse(DU, DT, LI); return nullptr; } // Assume block terminators cannot evaluate to a recurrence. 
We can't to @@ -1228,7 +1321,7 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { && Rewriter.hoistIVInc(WideInc, DU.NarrowUse)) WideUse = WideInc; else { - WideUse = CloneIVUser(DU); + WideUse = cloneIVUser(DU, WideAddRec); if (!WideUse) return nullptr; } @@ -1248,9 +1341,13 @@ Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { return WideUse; } -/// pushNarrowIVUsers - Add eligible users of NarrowDef to NarrowIVUsers. +/// Add eligible users of NarrowDef to NarrowIVUsers. /// void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { + const SCEV *NarrowSCEV = SE->getSCEV(NarrowDef); + bool NeverNegative = + SE->isKnownPredicate(ICmpInst::ICMP_SGE, NarrowSCEV, + SE->getConstant(NarrowSCEV->getType(), 0)); for (User *U : NarrowDef->users()) { Instruction *NarrowUser = cast<Instruction>(U); @@ -1258,21 +1355,21 @@ void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) { if (!Widened.insert(NarrowUser).second) continue; - NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUser, WideDef)); + NarrowIVUsers.push_back( + NarrowIVDefUse(NarrowDef, NarrowUser, WideDef, NeverNegative)); } } -/// CreateWideIV - Process a single induction variable. First use the -/// SCEVExpander to create a wide induction variable that evaluates to the same -/// recurrence as the original narrow IV. Then use a worklist to forward -/// traverse the narrow IV's def-use chain. After WidenIVUse has processed all -/// interesting IV users, the narrow IV will be isolated for removal by -/// DeleteDeadPHIs. +/// Process a single induction variable. First use the SCEVExpander to create a +/// wide induction variable that evaluates to the same recurrence as the +/// original narrow IV. Then use a worklist to forward traverse the narrow IV's +/// def-use chain. After widenIVUse has processed all interesting IV users, the +/// narrow IV will be isolated for removal by DeleteDeadPHIs. /// /// It would be simpler to delete uses as they are processed, but we must avoid /// invalidating SCEV expressions. /// -PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { +PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { // Is this phi an induction variable? const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi)); if (!AddRec) @@ -1302,11 +1399,11 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // either find an existing phi or materialize a new one. Either way, we // expect a well-formed cyclic phi-with-increments. i.e. any operand not part // of the phi-SCC dominates the loop entry. - Instruction *InsertPt = L->getHeader()->begin(); + Instruction *InsertPt = &L->getHeader()->front(); WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt)); // Remembering the WideIV increment generated by SCEVExpander allows - // WidenIVUse to reuse it when widening the narrow IV's increment. We don't + // widenIVUse to reuse it when widening the narrow IV's increment. We don't // employ a general reuse mechanism because the call above is the only call to // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses. if (BasicBlock *LatchBlock = L->getLoopLatch()) { @@ -1329,13 +1426,13 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { // Process a def-use edge. This may replace the use, so don't hold a // use_iterator across it. 
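The NeverNegative flag computed in pushNarrowIVUsers above rests on a simple identity: sign and zero extension agree on nonnegative values. A standalone sanity check of that identity (not part of the patch):

  #include <cassert>
  #include <cstdint>
  int main() {
    for (int32_t x : {0, 1, 42, INT32_MAX}) {
      // sext and zext of a nonnegative i32 produce the same i64 bits.
      assert((int64_t)x == (int64_t)(uint64_t)(uint32_t)x);
    }
    return 0;
  }

This is what allows widenLoopCompare to accept a signedness mismatch between the extension kind and the compare predicate once SCEV proves the narrow def is never negative.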
- Instruction *WideUse = WidenIVUse(DU, Rewriter); + Instruction *WideUse = widenIVUse(DU, Rewriter); // Follow all def-use edges from the previous narrow use. if (WideUse) pushNarrowIVUsers(DU.NarrowUse, WideUse); - // WidenIVUse may have removed the def-use edge. + // widenIVUse may have removed the def-use edge. if (DU.NarrowDef->use_empty()) DeadInsts.emplace_back(DU.NarrowDef); } @@ -1352,38 +1449,38 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) { //===----------------------------------------------------------------------===// namespace { - class IndVarSimplifyVisitor : public IVVisitor { - ScalarEvolution *SE; - const TargetTransformInfo *TTI; - PHINode *IVPhi; - - public: - WideIVInfo WI; - - IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, - const TargetTransformInfo *TTI, - const DominatorTree *DTree) - : SE(SCEV), TTI(TTI), IVPhi(IV) { - DT = DTree; - WI.NarrowIV = IVPhi; - if (ReduceLiveIVs) - setSplitOverflowIntrinsics(); - } +class IndVarSimplifyVisitor : public IVVisitor { + ScalarEvolution *SE; + const TargetTransformInfo *TTI; + PHINode *IVPhi; - // Implement the interface used by simplifyUsersOfIV. - void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } - }; +public: + WideIVInfo WI; + + IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV, + const TargetTransformInfo *TTI, + const DominatorTree *DTree) + : SE(SCEV), TTI(TTI), IVPhi(IV) { + DT = DTree; + WI.NarrowIV = IVPhi; + if (ReduceLiveIVs) + setSplitOverflowIntrinsics(); + } + + // Implement the interface used by simplifyUsersOfIV. + void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); } +}; } -/// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV -/// users. Each successive simplification may push more users which may -/// themselves be candidates for simplification. +/// Iteratively perform simplification on a worklist of IV users. Each +/// successive simplification may push more users which may themselves be +/// candidates for simplification. /// /// Sign/Zero extend elimination is interleaved with IV simplification. /// -void IndVarSimplify::SimplifyAndExtend(Loop *L, +void IndVarSimplify::simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, - LPPassManager &LPM) { + LoopInfo *LI) { SmallVector<WideIVInfo, 8> WideIVs; SmallVector<PHINode*, 8> LoopPhis; @@ -1400,14 +1497,14 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, // extension. The first time SCEV attempts to normalize sign/zero extension, // the result becomes final. So for the most predictable results, we delay // evaluation of sign/zero extend evaluation until needed, and avoid running - // other SCEV based analysis prior to SimplifyAndExtend. + // other SCEV based analysis prior to simplifyAndExtend. do { PHINode *CurrIV = LoopPhis.pop_back_val(); // Information about sign/zero extensions of CurrIV. 
IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT); - Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &Visitor); + Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, &Visitor); if (Visitor.WI.WidestNativeType) { WideIVs.push_back(Visitor.WI); @@ -1416,7 +1513,7 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, for (; !WideIVs.empty(); WideIVs.pop_back()) { WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts); - if (PHINode *WidePhi = Widener.CreateWideIV(Rewriter)) { + if (PHINode *WidePhi = Widener.createWideIV(Rewriter)) { Changed = true; LoopPhis.push_back(WidePhi); } @@ -1425,12 +1522,12 @@ void IndVarSimplify::SimplifyAndExtend(Loop *L, } //===----------------------------------------------------------------------===// -// LinearFunctionTestReplace and its kin. Rewrite the loop exit condition. +// linearFunctionTestReplace and its kin. Rewrite the loop exit condition. //===----------------------------------------------------------------------===// -/// canExpandBackedgeTakenCount - Return true if this loop's backedge taken -/// count expression can be safely and cheaply expanded into an instruction -/// sequence that can be used by LinearFunctionTestReplace. +/// Return true if this loop's backedge taken count expression can be safely and +/// cheaply expanded into an instruction sequence that can be used by +/// linearFunctionTestReplace. /// /// TODO: This fails for pointer-type loop counters with greater than one byte /// strides, consequently preventing LFTR from running. For the purpose of LFTR @@ -1461,8 +1558,7 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE, return true; } -/// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop -/// invariant value to the phi. +/// Return the loop header phi IFF IncV adds a loop invariant value to the phi. static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { Instruction *IncI = dyn_cast<Instruction>(IncV); if (!IncI) @@ -1513,8 +1609,8 @@ static ICmpInst *getLoopTest(Loop *L) { return dyn_cast<ICmpInst>(BI->getCondition()); } -/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show -/// that the current exit test is already sufficiently canonical. +/// linearFunctionTestReplace policy. Return true unless we can show that the +/// current exit test is already sufficiently canonical. static bool needsLFTR(Loop *L, DominatorTree *DT) { // Do LFTR to simplify the exit condition to an ICMP. ICmpInst *Cond = getLoopTest(L); @@ -1574,10 +1670,10 @@ static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited, return false; // Optimistically handle other instructions. - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) { - if (!Visited.insert(*OI).second) + for (Value *Op : I->operands()) { + if (!Visited.insert(Op).second) continue; - if (!hasConcreteDefImpl(*OI, Visited, Depth+1)) + if (!hasConcreteDefImpl(Op, Visited, Depth+1)) return false; } return true; @@ -1594,8 +1690,8 @@ static bool hasConcreteDef(Value *V) { return hasConcreteDefImpl(V, Visited, 0); } -/// AlmostDeadIV - Return true if this IV has any uses other than the (soon to -/// be rewritten) loop exit test. +/// Return true if this IV has any uses other than the (soon to be rewritten) +/// loop exit test. 
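The shape this policy steers loops toward, in source terms (an illustrative sketch assuming a nonnegative trip count; not from this patch):

  void fill(char *p, long n) {
    // Original: for (long i = 0; i < n; ++i) p[i] = 0;  (n >= 0)
    // After LFTR, conceptually:
    char *end = p + n;                  // limit materialized once by
    for (char *q = p; q != end; ++q)    // genLoopLimit; canonical != test
      *q = 0;
  }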
static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { int LatchIdx = Phi->getBasicBlockIndex(LatchBlock); Value *IncV = Phi->getIncomingValue(LatchIdx); @@ -1608,7 +1704,7 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { return true; } -/// FindLoopCounter - Find an affine IV in canonical form. +/// Find an affine IV in canonical form. /// /// BECount may be an i8* pointer type. The pointer difference is already /// valid count without scaling the address stride, so it remains a pointer @@ -1702,8 +1798,8 @@ static PHINode *FindLoopCounter(Loop *L, const SCEV *BECount, return BestPhi; } -/// genLoopLimit - Help LinearFunctionTestReplace by generating a value that -/// holds the RHS of the new loop test. +/// Help linearFunctionTestReplace by generating a value that holds the RHS of +/// the new loop test. static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, SCEVExpander &Rewriter, ScalarEvolution *SE) { const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); @@ -1785,13 +1881,13 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, } } -/// LinearFunctionTestReplace - This method rewrites the exit condition of the -/// loop to be a canonical != comparison against the incremented loop induction -/// variable. This pass is able to rewrite the exit tests of any loop where the -/// SCEV analysis can determine a loop-invariant trip count of the loop, which -/// is actually a much broader range than just linear tests. +/// This method rewrites the exit condition of the loop to be a canonical != +/// comparison against the incremented loop induction variable. This pass is +/// able to rewrite the exit tests of any loop where the SCEV analysis can +/// determine a loop-invariant trip count of the loop, which is actually a much +/// broader range than just linear tests. Value *IndVarSimplify:: -LinearFunctionTestReplace(Loop *L, +linearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount, PHINode *IndVar, SCEVExpander &Rewriter) { @@ -1809,7 +1905,7 @@ LinearFunctionTestReplace(Loop *L, // This addition may overflow, which is valid as long as the comparison is // truncated to BackedgeTakenCount->getType(). IVCount = SE->getAddExpr(BackedgeTakenCount, - SE->getConstant(BackedgeTakenCount->getType(), 1)); + SE->getOne(BackedgeTakenCount->getType())); // The BackedgeTaken expression contains the number of times that the // backedge branches to the loop header. This is one less than the // number of times the loop executes, so use the incremented indvar. @@ -1847,8 +1943,8 @@ LinearFunctionTestReplace(Loop *L, const SCEV *ARStep = AR->getStepRecurrence(*SE); // For constant IVCount, avoid truncation. if (isa<SCEVConstant>(ARStart) && isa<SCEVConstant>(IVCount)) { - const APInt &Start = cast<SCEVConstant>(ARStart)->getValue()->getValue(); - APInt Count = cast<SCEVConstant>(IVCount)->getValue()->getValue(); + const APInt &Start = cast<SCEVConstant>(ARStart)->getAPInt(); + APInt Count = cast<SCEVConstant>(IVCount)->getAPInt(); // Note that the post-inc value of BackedgeTakenCount may have overflowed // above such that IVCount is now zero. if (IVCount != BackedgeTakenCount && Count == 0) { @@ -1886,21 +1982,21 @@ LinearFunctionTestReplace(Loop *L, } //===----------------------------------------------------------------------===// -// SinkUnusedInvariants. A late subpass to cleanup loop preheaders. +// sinkUnusedInvariants. A late subpass to cleanup loop preheaders. 
//===----------------------------------------------------------------------===// /// If there's a single exit block, sink any loop-invariant values that /// were defined in the preheader but not used inside the loop into the /// exit block to reduce register pressure in the loop. -void IndVarSimplify::SinkUnusedInvariants(Loop *L) { +void IndVarSimplify::sinkUnusedInvariants(Loop *L) { BasicBlock *ExitBlock = L->getExitBlock(); if (!ExitBlock) return; BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) return; - Instruction *InsertPt = ExitBlock->getFirstInsertionPt(); - BasicBlock::iterator I = Preheader->getTerminator(); + Instruction *InsertPt = &*ExitBlock->getFirstInsertionPt(); + BasicBlock::iterator I(Preheader->getTerminator()); while (I != Preheader->begin()) { --I; // New instructions were inserted at the end of the preheader. @@ -1920,8 +2016,8 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { if (isa<DbgInfoIntrinsic>(I)) continue; - // Skip landingpad instructions. - if (isa<LandingPadInst>(I)) + // Skip eh pad instructions. + if (I->isEHPad()) continue; // Don't sink alloca: we never want to sink static alloca's out of the @@ -1953,7 +2049,7 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) { continue; // Otherwise, sink it to the exit block. - Instruction *ToMove = I; + Instruction *ToMove = &*I; bool Done = false; if (I != Preheader->begin()) { @@ -1994,7 +2090,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { return false; LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); TLI = TLIP ? &TLIP->getTLI() : nullptr; @@ -2007,7 +2103,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // If there are any floating-point recurrences, attempt to // transform them to use integer recurrences. - RewriteNonIntegerIVs(L); + rewriteNonIntegerIVs(L); const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); @@ -2024,7 +2120,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // other expressions involving loop IVs have been evaluated. This helps SCEV // set no-wrap flags before normalizing sign/zero extension. Rewriter.disableCanonicalMode(); - SimplifyAndExtend(L, Rewriter, LPM); + simplifyAndExtend(L, Rewriter, LI); // Check to see if this loop has a computable loop-invariant execution count. // If so, this means that we can compute the final value of any expressions @@ -2034,7 +2130,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // if (ReplaceExitValue != NeverRepl && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) - RewriteLoopExitValues(L, Rewriter); + rewriteLoopExitValues(L, Rewriter); // Eliminate redundant IV cycles. NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); @@ -2054,7 +2150,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // explicitly check any assumptions made by SCEV. Brittle. 
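A compact illustration of the sinkUnusedInvariants subpass defined earlier in this hunk (illustrative code, not from this patch):

  long g(long a, long b, long n) {
    long k = a * b;            // preheader value, unused inside the loop
    long s = 0;
    for (long i = 0; i < n; ++i)
      s += i;
    return s + k;              // sole use is past the loop, so 'k' sinks
                               // into the exit block, shortening its live
                               // range across the loop body
  }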
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount); if (!AR || AR->getLoop()->getLoopPreheader()) - (void)LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, + (void)linearFunctionTestReplace(L, BackedgeTakenCount, IndVar, Rewriter); } } @@ -2074,13 +2170,13 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Loop-invariant instructions in the preheader that aren't used in the // loop may be sunk below the loop to reduce register pressure. - SinkUnusedInvariants(L); + sinkUnusedInvariants(L); // Clean up dead instructions. Changed |= DeleteDeadPHIs(L->getHeader(), TLI); + // Check a post-condition. - assert(L->isLCSSAForm(*DT) && - "Indvars did not leave the loop in lcssa form!"); + assert(L->isRecursivelyLCSSAForm(*DT) && "Indvars did not preserve LCSSA!"); // Verify that LFTR, and any other change have not interfered with SCEV's // ability to compute trip count. diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index cbdacad..dea61f6 100644 --- a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -214,8 +214,8 @@ public: AU.addRequired<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - AU.addRequired<ScalarEvolution>(); - AU.addRequired<BranchProbabilityInfo>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<BranchProbabilityInfoWrapperPass>(); } bool runOnLoop(Loop *L, LPPassManager &LPM) override; @@ -224,8 +224,15 @@ public: char InductiveRangeCheckElimination::ID = 0; } -INITIALIZE_PASS(InductiveRangeCheckElimination, "irce", - "Inductive range check elimination", false, false) +INITIALIZE_PASS_BEGIN(InductiveRangeCheckElimination, "irce", + "Inductive range check elimination", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_END(InductiveRangeCheckElimination, "irce", + "Inductive range check elimination", false, false) const char *InductiveRangeCheck::rangeCheckKindToStr( InductiveRangeCheck::RangeCheckKind RCK) { @@ -1044,9 +1051,9 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd( auto BBInsertLocation = std::next(Function::iterator(LS.Latch)); RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector", - &F, BBInsertLocation); + &F, &*BBInsertLocation); RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F, - BBInsertLocation); + &*BBInsertLocation); BranchInst *PreheaderJump = cast<BranchInst>(&*Preheader->rbegin()); bool Increasing = LS.IndVarIncreasing; @@ -1399,8 +1406,9 @@ bool InductiveRangeCheckElimination::runOnLoop(Loop *L, LPPassManager &LPM) { LLVMContext &Context = Preheader->getContext(); InductiveRangeCheck::AllocatorTy IRCAlloc; SmallVector<InductiveRangeCheck *, 16> RangeChecks; - ScalarEvolution &SE = getAnalysis<ScalarEvolution>(); - BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + BranchProbabilityInfo &BPI = + getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); for (auto BBI : L->getBlocks()) if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator())) diff --git 
a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 1130d22..087ce8a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -18,15 +18,22 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" @@ -36,6 +43,8 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include <algorithm> +#include <memory> using namespace llvm; #define DEBUG_TYPE "jump-threading" @@ -49,6 +58,13 @@ BBDuplicateThreshold("jump-threading-threshold", cl::desc("Max block size to duplicate for jump threading"), cl::init(6), cl::Hidden); +static cl::opt<unsigned> +ImplicationSearchThreshold( + "jump-threading-implication-search-threshold", + cl::desc("The number of predecessors to search for a stronger " + "condition to use to thread over a weaker condition"), + cl::init(3), cl::Hidden); + namespace { // These are at global scope so static functions can use them too. 
typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo; @@ -80,6 +96,9 @@ namespace { class JumpThreading : public FunctionPass { TargetLibraryInfo *TLI; LazyValueInfo *LVI; + std::unique_ptr<BlockFrequencyInfo> BFI; + std::unique_ptr<BranchProbabilityInfo> BPI; + bool HasProfileData; #ifdef NDEBUG SmallPtrSet<BasicBlock*, 16> LoopHeaders; #else @@ -114,9 +133,15 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LazyValueInfo>(); AU.addPreserved<LazyValueInfo>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } + void releaseMemory() override { + BFI.reset(); + BPI.reset(); + } + void FindLoopHeaders(Function &F); bool ProcessBlock(BasicBlock *BB); bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs, @@ -134,9 +159,16 @@ namespace { bool ProcessBranchOnPHI(PHINode *PN); bool ProcessBranchOnXOR(BinaryOperator *BO); + bool ProcessImpliedCondition(BasicBlock *BB); bool SimplifyPartiallyRedundantLoad(LoadInst *LI); bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB); + + private: + BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds, + const char *Suffix); + void UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, BasicBlock *BB, + BasicBlock *NewBB, BasicBlock *SuccBB); }; } @@ -160,11 +192,21 @@ bool JumpThreading::runOnFunction(Function &F) { DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); LVI = &getAnalysis<LazyValueInfo>(); + BFI.reset(); + BPI.reset(); + // When profile data is available, we need to update edge weights after + // successful jump threading, which requires both BPI and BFI being available. + HasProfileData = F.getEntryCount().hasValue(); + if (HasProfileData) { + LoopInfo LI{DominatorTree(F)}; + BPI.reset(new BranchProbabilityInfo(F, LI)); + BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); + } // Remove unreachable blocks from function as they may result in infinite // loop. We do threading if we found something profitable. Jump threading a // branch can create other opportunities. If these opportunities form a cycle - // i.e. if any jump treading is undoing previous threading in the path, then + // i.e. if any jump threading is undoing previous threading in the path, then // we will loop forever. We take care of this issue by not jump threading for // back edges. This works for normal cases but not for unreachable blocks as // they may have cycle with no back edge. @@ -176,7 +218,7 @@ bool JumpThreading::runOnFunction(Function &F) { do { Changed = false; for (Function::iterator I = F.begin(), E = F.end(); I != E;) { - BasicBlock *BB = I; + BasicBlock *BB = &*I; // Thread all of the branches we can over this block. while (ProcessBlock(BB)) Changed = true; @@ -239,11 +281,26 @@ bool JumpThreading::runOnFunction(Function &F) { static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, unsigned Threshold) { /// Ignore PHI nodes, these will be flattened when duplication happens. - BasicBlock::const_iterator I = BB->getFirstNonPHI(); + BasicBlock::const_iterator I(BB->getFirstNonPHI()); // FIXME: THREADING will delete values that are just used to compute the // branch, so they shouldn't count against the duplication cost. + unsigned Bonus = 0; + const TerminatorInst *BBTerm = BB->getTerminator(); + // Threading through a switch statement is particularly profitable. 
If this + // block ends in a switch, decrease its cost to make it more likely to happen. + if (isa<SwitchInst>(BBTerm)) + Bonus = 6; + + // The same holds for indirect branches, but slightly more so. + if (isa<IndirectBrInst>(BBTerm)) + Bonus = 8; + + // Bump the threshold up so the early exit from the loop doesn't skip the + // terminator-based Size adjustment at the end. + Threshold += Bonus; + // Sum up the cost of each instruction until we get to the terminator. Don't // include the terminator because the copy won't include it. unsigned Size = 0; @@ -260,6 +317,11 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, if (isa<BitCastInst>(I) && I->getType()->isPointerTy()) continue; + // Bail out if this instruction gives back a token type, it is not possible + // to duplicate it if it is used outside this BB. + if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB)) + return ~0U; + // All other instructions count for at least one unit. ++Size; @@ -268,7 +330,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, // as having cost of 2 total, and if they are a vector intrinsic, we model // them as having cost 1. if (const CallInst *CI = dyn_cast<CallInst>(I)) { - if (CI->cannotDuplicate()) + if (CI->cannotDuplicate() || CI->isConvergent()) // Blocks with NoDuplicate are modelled as having infinite cost, so they // are never duplicated. return ~0U; @@ -279,16 +341,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, } } - // Threading through a switch statement is particularly profitable. If this - // block ends in a switch, decrease its cost to make it more likely to happen. - if (isa<SwitchInst>(I)) - Size = Size > 6 ? Size-6 : 0; - - // The same holds for indirect branches, but slightly more so. - if (isa<IndirectBrInst>(I)) - Size = Size > 8 ? Size-8 : 0; - - return Size; + return Size > Bonus ? Size - Bonus : 0; } /// FindLoopHeaders - We do not want jump threading to turn proper loop @@ -669,7 +722,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // because now the condition in this block can be threaded through // predecessors of our predecessor block. if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { - if (SinglePred->getTerminator()->getNumSuccessors() == 1 && + const TerminatorInst *TI = SinglePred->getTerminator(); + if (!TI->isExceptional() && TI->getNumSuccessors() == 1 && SinglePred != BB && !hasAddressTakenAndUsed(BB)) { // If SinglePred was a loop header, BB becomes one. if (LoopHeaders.erase(SinglePred)) @@ -761,7 +815,7 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { // If we're branching on a conditional, LVI might be able to determine // it's value at the branch instruction. We only handle comparisons // against a constant at this time. - // TODO: This should be extended to handle switches as well. + // TODO: This should be extended to handle switches as well. BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1)); if (CondBr && CondConst && CondBr->isConditional()) { @@ -829,9 +883,40 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator())) return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst)); + // Search for a stronger dominating condition that can be used to simplify a + // conditional branch leaving BB. 
+ if (ProcessImpliedCondition(BB)) + return true; + + return false; +} + +bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) { + auto *BI = dyn_cast<BranchInst>(BB->getTerminator()); + if (!BI || !BI->isConditional()) + return false; + + Value *Cond = BI->getCondition(); + BasicBlock *CurrentBB = BB; + BasicBlock *CurrentPred = BB->getSinglePredecessor(); + unsigned Iter = 0; + + auto &DL = BB->getModule()->getDataLayout(); + + while (CurrentPred && Iter++ < ImplicationSearchThreshold) { + auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator()); + if (!PBI || !PBI->isConditional() || PBI->getSuccessor(0) != CurrentBB) + return false; - // TODO: If we have: "br (X > 0)" and we have a predecessor where we know - // "(X == 4)", thread through this block. + if (isImpliedCondition(PBI->getCondition(), Cond, DL)) { + BI->getSuccessor(1)->removePredecessor(BB); + BranchInst::Create(BI->getSuccessor(0), BI); + BI->eraseFromParent(); + return true; + } + CurrentBB = CurrentPred; + CurrentPred = CurrentBB->getSinglePredecessor(); + } return false; } @@ -850,10 +935,10 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (LoadBB->getSinglePredecessor()) return false; - // If the load is defined in a landing pad, it can't be partially redundant, - // because the edges between the invoke and the landing pad cannot have other + // If the load is defined in an EH pad, it can't be partially redundant, + // because the edges between the invoke and the EH pad cannot have other // instructions between them. - if (LoadBB->isLandingPad()) + if (LoadBB->isEHPad()) return false; Value *LoadedPtr = LI->getOperand(0); @@ -866,11 +951,11 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan a few instructions up from the load, to see if it is obviously live at // the entry to its block. - BasicBlock::iterator BBIt = LI; + BasicBlock::iterator BBIt(LI); if (Value *AvailableVal = - FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, 6)) { - // If the value if the load is locally available within the block, just use + FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, DefMaxInstsToScan)) { + // If the value of the load is locally available within the block, just use // it. This frequently occurs for reg2mem'd allocas. //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n"; @@ -914,7 +999,8 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Scan the predecessor to see if the value is available in the pred. BBIt = PredBB->end(); AAMDNodes ThisAATags; - Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6, + Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, + DefMaxInstsToScan, nullptr, &ThisAATags); if (!PredAvailable) { OneUnavailablePred = PredBB; @@ -968,8 +1054,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { } // Split them out to their own block. - UnavailablePred = - SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split"); + UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split"); } // If the value isn't available in all predecessors, then there will be @@ -995,7 +1080,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Create a PHI node at the start of the block for the PRE'd load value. 
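ProcessImpliedCondition above implements exactly the case the old TODO described; in source terms (illustrative):

  int f(int x) {
    if (x == 4) {              // dominating edge: x == 4 holds here
      if (x > 0)               // implied by x == 4, so this branch folds
        return 1;              // to an unconditional jump and the dead
      return 0;                // false edge is removed from the CFG
    }
    return -1;
  }

The walk gives up after ImplicationSearchThreshold single-predecessor hops and only follows true-successor edges, since only those guarantee the predecessor's condition holds on entry.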
pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB); PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "", - LoadBB->begin()); + &LoadBB->front()); PN->takeName(LI); PN->setDebugLoc(LI->getDebugLoc()); @@ -1262,7 +1347,7 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { // Into: // BB': // %Y = icmp ne i32 %A, %B - // br i1 %Z, ... + // br i1 %Y, ... PredValueInfoTy XorOpValues; bool isLHS = true; @@ -1387,14 +1472,14 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { return false; } - // And finally, do it! Start by factoring the predecessors is needed. + // And finally, do it! Start by factoring the predecessors if needed. BasicBlock *PredBB; if (PredBBs.size() == 1) PredBB = PredBBs[0]; else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); + PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm"); } // And finally, do it! @@ -1415,6 +1500,13 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, BB->getParent(), BB); NewBB->moveAfter(PredBB); + // Set the block frequency of NewBB. + if (HasProfileData) { + auto NewBBFreq = + BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB); + BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); + } + BasicBlock::iterator BI = BB->begin(); for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); @@ -1425,7 +1517,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, Instruction *New = BI->clone(); New->setName(BI->getName()); NewBB->getInstList().push_back(New); - ValueMapping[BI] = New; + ValueMapping[&*BI] = New; // Remap operands to patch up intra-block references. for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) @@ -1438,7 +1530,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // We didn't copy the terminator from BB over to NewBB, because there is now // an unconditional jump to SuccBB. Insert the unconditional jump. - BranchInst *NewBI =BranchInst::Create(SuccBB, NewBB); + BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB); NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc()); // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the @@ -1475,8 +1567,8 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks // with the two values we know. SSAUpdate.Initialize(I->getType(), I->getName()); - SSAUpdate.AddAvailableValue(BB, I); - SSAUpdate.AddAvailableValue(NewBB, ValueMapping[I]); + SSAUpdate.AddAvailableValue(BB, &*I); + SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&*I]); while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); @@ -1499,11 +1591,98 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, // frequently happens because of phi translation. SimplifyInstructionsInBlock(NewBB, TLI); + // Update the edge weight from BB to SuccBB, which should be less than before. + UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB); + // Threaded an edge! ++NumThreads; return true; } +/// Create a new basic block that will be the predecessor of BB and successor of +/// all blocks in Preds. When profile data is available, update the frequency of +/// this new block. +BasicBlock *JumpThreading::SplitBlockPreds(BasicBlock *BB, + ArrayRef<BasicBlock *> Preds, + const char *Suffix) { + // Collect the frequencies of all predecessors of BB, which will be used to + // update the edge weight on BB->SuccBB.
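A worked instance of the frequency bookkeeping this hunk adds (invented numbers; the UpdateBlockFreqAndEdgeWeight definition follows below): if BB had frequency 100 and sent 80 of it to SuccBB, and threading peels off a NewBB of frequency 30 that now reaches SuccBB directly, BB keeps 70 and only 50 units still flow along BB->SuccBB before the out-edge weights are renormalized.

  #include <cassert>
  #include <cstdint>
  int main() {
    uint64_t BBOrigFreq = 100, NewBBFreq = 30, BB2SuccBBFreq = 80;
    uint64_t BBNewFreq = BBOrigFreq - NewBBFreq;     // BB keeps 70
    uint64_t SuccFlow  = BB2SuccBBFreq - NewBBFreq;  // 50 still via BB
    assert(BBNewFreq == 70 && SuccFlow == 50);
    return 0;
  }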
+ BlockFrequency PredBBFreq(0); + if (HasProfileData) + for (auto Pred : Preds) + PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB); + + BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix); + + // Set the block frequency of the newly created PredBB, which is the sum of + // frequencies of Preds. + if (HasProfileData) + BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency()); + return PredBB; +} + +/// Update the block frequency of BB and branch weight and the metadata on the +/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 - +/// Freq(PredBB->BB) / Freq(BB->SuccBB). +void JumpThreading::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, + BasicBlock *BB, + BasicBlock *NewBB, + BasicBlock *SuccBB) { + if (!HasProfileData) + return; + + assert(BFI && BPI && "BFI & BPI should have been created here"); + + // As the edge from PredBB to BB is deleted, we have to update the block + // frequency of BB. + auto BBOrigFreq = BFI->getBlockFreq(BB); + auto NewBBFreq = BFI->getBlockFreq(NewBB); + auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB); + auto BBNewFreq = BBOrigFreq - NewBBFreq; + BFI->setBlockFreq(BB, BBNewFreq.getFrequency()); + + // Collect updated outgoing edges' frequencies from BB and use them to update + // edge probabilities. + SmallVector<uint64_t, 4> BBSuccFreq; + for (auto I = succ_begin(BB), E = succ_end(BB); I != E; ++I) { + auto SuccFreq = (*I == SuccBB) + ? BB2SuccBBFreq - NewBBFreq + : BBOrigFreq * BPI->getEdgeProbability(BB, *I); + BBSuccFreq.push_back(SuccFreq.getFrequency()); + } + + uint64_t MaxBBSuccFreq = + *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end()); + + SmallVector<BranchProbability, 4> BBSuccProbs; + if (MaxBBSuccFreq == 0) + BBSuccProbs.assign(BBSuccFreq.size(), + {1, static_cast<unsigned>(BBSuccFreq.size())}); + else { + for (uint64_t Freq : BBSuccFreq) + BBSuccProbs.push_back( + BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq)); + // Normalize edge probabilities so that they sum up to one. + BranchProbability::normalizeProbabilities(BBSuccProbs.begin(), + BBSuccProbs.end()); + } + + // Update edge probabilities in BPI. + for (int I = 0, E = BBSuccProbs.size(); I < E; I++) + BPI->setEdgeProbability(BB, I, BBSuccProbs[I]); + + if (BBSuccProbs.size() >= 2) { + SmallVector<uint32_t, 4> Weights; + for (auto Prob : BBSuccProbs) + Weights.push_back(Prob.getNumerator()); + + auto TI = BB->getTerminator(); + TI->setMetadata( + LLVMContext::MD_prof, + MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights)); + } +} + /// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch /// to BB which contains an i1 PHI node and a conditional branch on that PHI. /// If we can duplicate the contents of BB up into PredBB do so now, this @@ -1530,14 +1709,14 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, return false; } - // And finally, do it! Start by factoring the predecessors is needed. + // And finally, do it! Start by factoring the predecessors if needed. BasicBlock *PredBB; if (PredBBs.size() == 1) PredBB = PredBBs[0]; else { DEBUG(dbgs() << " Factoring out " << PredBBs.size() << " common predecessors.\n"); - PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm"); + PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm"); } // Okay, we decided to do this! 
Clone all the instructions in BB onto the end @@ -1581,12 +1760,12 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, if (Value *IV = SimplifyInstruction(New, BB->getModule()->getDataLayout())) { delete New; - ValueMapping[BI] = IV; + ValueMapping[&*BI] = IV; } else { // Otherwise, insert the new instruction into the block. New->setName(BI->getName()); - PredBB->getInstList().insert(OldPredBranch, New); - ValueMapping[BI] = New; + PredBB->getInstList().insert(OldPredBranch->getIterator(), New); + ValueMapping[&*BI] = New; } } @@ -1628,8 +1807,8 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks // with the two values we know. SSAUpdate.Initialize(I->getType(), I->getName()); - SSAUpdate.AddAvailableValue(BB, I); - SSAUpdate.AddAvailableValue(PredBB, ValueMapping[I]); + SSAUpdate.AddAvailableValue(BB, &*I); + SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&*I]); while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index 43fc50e..6d70cdc 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -34,10 +34,13 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" @@ -118,9 +121,12 @@ namespace { AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<ScalarEvolution>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } @@ -164,9 +170,12 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) Pass *llvm::createLICMPass() { return new LICM(); } @@ -183,7 +192,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // Get our Loop and Alias Analysis information... 
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); @@ -264,9 +273,10 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // FIXME: This is really heavy handed. It would be a bit better to use an // SSAUpdater strategy during promotion that was LCSSA aware and reformed // it as it went. - if (Changed) - formLCSSARecursively(*L, *DT, LI, - getAnalysisIfAvailable<ScalarEvolution>()); + if (Changed) { + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + formLCSSARecursively(*L, *DT, LI, SEWP ? &SEWP->getSE() : nullptr); + } } // Check that neither this loop nor its parent have had LCSSA broken. LICM is @@ -402,7 +412,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, } /// Computes loop safety information, checks loop body & header -/// for the possiblity of may throw exception. +/// for the possibility of may throw exception. /// void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { assert(CurLoop != nullptr && "CurLoop cant be null"); @@ -410,7 +420,7 @@ void llvm::computeLICMSafetyInfo(LICMSafetyInfo * SafetyInfo, Loop * CurLoop) { // Setting default safety values. SafetyInfo->MayThrow = false; SafetyInfo->HeaderMayThrow = false; - // Iterate over header and compute dafety info. + // Iterate over header and compute safety info. for (BasicBlock::iterator I = Header->begin(), E = Header->end(); (I != E) && !SafetyInfo->HeaderMayThrow; ++I) SafetyInfo->HeaderMayThrow |= I->mayThrow(); @@ -445,7 +455,7 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, // Don't hoist loads which have may-aliased stores in loop. uint64_t Size = 0; if (LI->getType()->isSized()) - Size = AA->getTypeStoreSize(LI->getType()); + Size = I.getModule()->getDataLayout().getTypeStoreSize(LI->getType()); AAMDNodes AAInfo; LI->getAAMetadata(AAInfo); @@ -457,10 +467,21 @@ bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA, DominatorTree *DT, return false; // Handle simple cases by querying alias analysis. - AliasAnalysis::ModRefBehavior Behavior = AA->getModRefBehavior(CI); - if (Behavior == AliasAnalysis::DoesNotAccessMemory) + FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI); + if (Behavior == FMRB_DoesNotAccessMemory) return true; if (AliasAnalysis::onlyReadsMemory(Behavior)) { + // A readonly argmemonly function only reads from memory pointed to by + // its arguments with arbitrary offsets. If we can prove there are no + // writes to this memory in the loop, we can hoist or sink. + if (AliasAnalysis::onlyAccessesArgPointees(Behavior)) { + for (Value *Op : CI->arg_operands()) + if (Op->getType()->isPointerTy() && + pointerInvalidatedByLoop(Op, MemoryLocation::UnknownSize, + AAMDNodes(), CurAST)) + return false; + return true; + } // If this call only reads from memory and there are no writes to memory // in the loop, we can hoist or sink the call as appropriate.
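An aside on the new argmemonly path above: at the source level it covers read-only calls that dereference only their pointer arguments, which become hoistable once no store in the loop can touch that memory. A hedged C-level sketch (names invented; the real decision is made on IR attributes, and `pure`/`__restrict` merely approximate them):

```c++
// 'table_lookup' only reads through its pointer argument. Since nothing in
// the loop writes through 'table' (__restrict on 'out' rules out aliasing),
// the call is loop invariant and the new check allows LICM to hoist it.
__attribute__((pure)) int table_lookup(const int *table, int key);

void scale(int *__restrict out, const int *table, int key, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = table_lookup(table, key); // hoistable to the preheader
}
```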
bool FoundMod = false; @@ -566,7 +587,7 @@ static Instruction *CloneInstructionInExitBlock(const Instruction &I, if (!OLoop->contains(&PN)) { PHINode *OpPN = PHINode::Create(OInst->getType(), PN.getNumIncomingValues(), - OInst->getName() + ".lcssa", ExitBlock.begin()); + OInst->getName() + ".lcssa", &ExitBlock.front()); for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) OpPN->addIncoming(OInst, PN.getIncomingBlock(i)); *OI = OpPN; @@ -651,6 +672,10 @@ static bool hoist(Instruction &I, BasicBlock *Preheader) { // Move the new node to the Preheader, before its terminator. I.moveBefore(Preheader->getTerminator()); + // Metadata can be dependent on the condition we are hoisting above. + // Conservatively strip all metadata on the instruction. + I.dropUnknownNonDebugMetadata(); + if (isa<LoadInst>(I)) ++NumMovedLoads; else if (isa<CallInst>(I)) ++NumMovedCalls; ++NumHoisted; @@ -730,9 +755,9 @@ namespace { if (!L->contains(BB)) { // We need to create an LCSSA PHI node for the incoming value and // store that. - PHINode *PN = PHINode::Create( - I->getType(), PredCache.size(BB), - I->getName() + ".lcssa", BB->begin()); + PHINode *PN = + PHINode::Create(I->getType(), PredCache.size(BB), + I->getName() + ".lcssa", &BB->front()); for (BasicBlock *Pred : PredCache.get(BB)) PN->addIncoming(I, Pred); return PN; @@ -942,7 +967,7 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, CurLoop->getUniqueExitBlocks(ExitBlocks); InsertPts.resize(ExitBlocks.size()); for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - InsertPts[i] = ExitBlocks[i]->getFirstInsertionPt(); + InsertPts[i] = &*ExitBlocks[i]->getFirstInsertionPt(); } // We use the SSAUpdater interface to insert phi nodes as required. @@ -973,7 +998,7 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS, return Changed; } -/// Simple Analysis hook. Clone alias set info. +/// Simple analysis hook. Clone alias set info. 
/// void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) { AliasSetTracker *AST = LoopToAliasSetMap.lookup(L); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp index c19cd19..1648878 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetFolder.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -56,7 +57,7 @@ class LoadCombine : public BasicBlockPass { public: LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) { - initializeSROAPass(*PassRegistry::getPassRegistry()); + initializeLoadCombinePass(*PassRegistry::getPassRegistry()); } using llvm::Pass::doInitialization; @@ -223,7 +224,7 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { if (skipOptnoneFunction(BB)) return false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); IRBuilder<true, TargetFolder> TheBuilder( BB.getContext(), TargetFolder(BB.getModule()->getDataLayout())); @@ -262,8 +263,8 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { void LoadCombine::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } char LoadCombine::ID = 0; @@ -274,7 +275,8 @@ BasicBlockPass *llvm::createLoadCombinePass() { INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", "Combine Adjacent Loads", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(LoadCombine, "load-combine", "Combine Adjacent Loads", false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 98b068e..bc00ff3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -17,6 +17,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Dominators.h" @@ -35,18 +36,19 @@ namespace { } // Possibly eliminate loop L if it is dead. 
- bool runOnLoop(Loop *L, LPPassManager &LPM) override; + bool runOnLoop(Loop *L, LPPassManager &) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); } @@ -64,7 +66,7 @@ INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", "Delete dead loops", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopDeletion, "loop-deletion", @@ -130,7 +132,7 @@ bool LoopDeletion::isLoopDead(Loop *L, /// so could change the halting/non-halting nature of a program. /// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA /// in order to make various safety checks work. -bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { +bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &) { if (skipOptnoneFunction(L)) return false; @@ -169,7 +171,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { // Don't remove loops for which we can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. - ScalarEvolution &SE = getAnalysis<ScalarEvolution>(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const SCEV *S = SE.getMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) return Changed; @@ -242,9 +244,8 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { for (BasicBlock *BB : blocks) loopInfo.removeBlock(BB); - // The last step is to inform the loop pass manager that we've - // eliminated this loop. - LPM.deleteLoopFromQueue(L); + // The last step is to update LoopInfo now that we've eliminated this loop. + loopInfo.updateUnloop(L); Changed = true; ++NumDeleted; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 1b9859b..3d3cf3e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -34,6 +34,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopVersioning.h" #include <list> @@ -54,6 +55,11 @@ static cl::opt<bool> DistributeNonIfConvertible( "if-convertible by the loop vectorizer"), cl::init(false)); +static cl::opt<unsigned> DistributeSCEVCheckThreshold( + "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed for Loop " + "Distribution")); + STATISTIC(NumLoopsDistributed, "Number of loops distributed"); namespace { @@ -164,9 +170,7 @@ public: // Delete the instructions backwards, as it has a reduced likelihood of // having to update as many def-use and use-def chains. 
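The removal loop that follows switches to the reverse-iteration idiom: deleting in reverse order means most instructions have already lost their users by the time they are erased. The same pattern in isolation, as a sketch (helper name invented):

```c++
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Erase a list of dead instructions back to front, replacing any stray
// uses with undef so no dangling references remain.
static void eraseDeadBackwards(SmallVectorImpl<Instruction *> &Dead) {
  for (Instruction *Inst : make_range(Dead.rbegin(), Dead.rend())) {
    if (!Inst->use_empty())
      Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
    Inst->eraseFromParent();
  }
}
```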
- for (auto I = Unused.rbegin(), E = Unused.rend(); I != E; ++I) { - auto *Inst = *I; - + for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) { if (!Inst->use_empty()) Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); Inst->eraseFromParent(); @@ -373,7 +377,7 @@ public: /// \brief This performs the main chunk of the work of cloning the loops for /// the partitions. - void cloneLoops(Pass *P) { + void cloneLoops() { BasicBlock *OrigPH = L->getLoopPreheader(); // At this point the predecessor of the preheader is either the memcheck // block or the top part of the original preheader. @@ -547,11 +551,11 @@ public: MemoryInstructionDependences( const SmallVectorImpl<Instruction *> &Instructions, - const SmallVectorImpl<Dependence> &InterestingDependences) { + const SmallVectorImpl<Dependence> &Dependences) { Accesses.append(Instructions.begin(), Instructions.end()); DEBUG(dbgs() << "Backward dependences:\n"); - for (auto &Dep : InterestingDependences) + for (auto &Dep : Dependences) if (Dep.isPossiblyBackward()) { // Note that the designations source and destination follow the program // order, i.e. source is always first. (The direction is given by the @@ -567,25 +571,6 @@ private: AccessesType Accesses; }; -/// \brief Returns the instructions that use values defined in the loop. -static SmallVector<Instruction *, 8> findDefsUsedOutsideOfLoop(Loop *L) { - SmallVector<Instruction *, 8> UsedOutside; - - for (auto *Block : L->getBlocks()) - // FIXME: I believe that this could use copy_if if the Inst reference could - // be adapted into a pointer. - for (auto &Inst : *Block) { - auto Users = Inst.users(); - if (std::any_of(Users.begin(), Users.end(), [&](User *U) { - auto *Use = cast<Instruction>(U); - return !L->contains(Use->getParent()); - })) - UsedOutside.push_back(&Inst); - } - - return UsedOutside; -} - /// \brief The pass class. class LoopDistribute : public FunctionPass { public: @@ -597,6 +582,7 @@ public: LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); LAA = &getAnalysis<LoopAccessAnalysis>(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); // Build up a worklist of inner-loops to vectorize. This is necessary as the // act of distributing a loop creates new loops and can invalidate iterators @@ -619,6 +605,7 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<LoopAccessAnalysis>(); @@ -629,6 +616,45 @@ public: static char ID; private: + /// \brief Filter out checks between pointers from the same partition. + /// + /// \p PtrToPartition contains the partition number for pointers. Partition + /// number -1 means that the pointer is used in multiple partitions. In this + /// case we can't safely omit the check. 
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> + includeOnlyCrossPartitionChecks( + const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks, + const SmallVectorImpl<int> &PtrToPartition, + const RuntimePointerChecking *RtPtrChecking) { + SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; + + std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), + [&](const RuntimePointerChecking::PointerCheck &Check) { + for (unsigned PtrIdx1 : Check.first->Members) + for (unsigned PtrIdx2 : Check.second->Members) + // Only include this check if there is a pair of pointers + // that requires checking and the pointers fall into + // separate partitions. + // + // (Note that we already know at this point that the two + // pointer groups need checking but it doesn't follow + // that each pair of pointers within the two groups needs + // checking as well. + // + // In other words we don't want to include a check just + // because there is a pair of pointers between the two + // pointer groups that requires checks and a different + // pair whose pointers fall into different partitions.) + if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && + !RuntimePointerChecking::arePointersInSamePartition( + PtrToPartition, PtrIdx1, PtrIdx2)) + return true; + return false; + }); + + return Checks; + } + /// \brief Try to distribute an inner-most loop. bool processLoop(Loop *L) { assert(L->empty() && "Only process inner loops."); @@ -655,9 +681,8 @@ private: DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization"); return false; } - auto *InterestingDependences = - LAI.getDepChecker().getInterestingDependences(); - if (!InterestingDependences || InterestingDependences->empty()) { + auto *Dependences = LAI.getDepChecker().getDependences(); + if (!Dependences || Dependences->empty()) { DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate"); return false; } @@ -685,7 +710,7 @@ private: // NumUnsafeDependencesActive reaches 0. const MemoryDepChecker &DepChecker = LAI.getDepChecker(); MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), - *InterestingDependences); + *Dependences); int NumUnsafeDependencesActive = 0; for (auto &InstDep : MID) { @@ -735,6 +760,13 @@ private: return false; } + // Don't distribute the loop if we need too many SCEV run-time checks. + const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); + if (Pred.getComplexity() > DistributeSCEVCheckThreshold) { + DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); + return false; + } + DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); // We're done forming the partitions; set up the reverse mapping from // instructions to partitions. @@ -746,20 +778,25 @@ private: if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) SplitBlock(PH, PH->getTerminator(), DT, LI); - // If we need run-time checks to disambiguate pointers are run-time, version - // the loop now. + // If we need run-time checks, version the loop now.
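For reference, the partition test used by the filter above has simple semantics: a pointer tagged -1 is used by several partitions and can never be proven to share one, so any check involving it is kept. A sketch of what (as I read it) RuntimePointerChecking::arePointersInSamePartition computes:

```c++
#include "llvm/ADT/SmallVector.h"

// Partition -1 marks a pointer used by multiple partitions; it never
// counts as being in the same partition as anything else.
static bool inSamePartition(const llvm::SmallVectorImpl<int> &PtrToPartition,
                            unsigned Idx1, unsigned Idx2) {
  return PtrToPartition[Idx1] != -1 &&
         PtrToPartition[Idx1] == PtrToPartition[Idx2];
}
```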
auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI); - LoopVersioning LVer(LAI, L, LI, DT, &PtrToPartition); - if (LVer.needsRuntimeChecks()) { + const auto *RtPtrChecking = LAI.getRuntimePointerChecking(); + const auto &AllChecks = RtPtrChecking->getChecks(); + auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, + RtPtrChecking); + + if (!Pred.isAlwaysTrue() || !Checks.empty()) { DEBUG(dbgs() << "\nPointers:\n"); - DEBUG(LAI.getRuntimePointerChecking()->print(dbgs(), 0, &PtrToPartition)); - LVer.versionLoop(this); - LVer.addPHINodes(DefsUsedOutside); + DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); + LoopVersioning LVer(LAI, L, LI, DT, SE, false); + LVer.setAliasChecks(std::move(Checks)); + LVer.setSCEVChecks(LAI.PSE.getUnionPredicate()); + LVer.versionLoop(DefsUsedOutside); } // Create identical copies of the original loop for each partition and hook // them up sequentially. - Partitions.cloneLoops(this); + Partitions.cloneLoops(); // Now, we remove the instructions from each loop that don't belong to that // partition. @@ -780,6 +817,7 @@ private: LoopInfo *LI; LoopAccessAnalysis *LAA; DominatorTree *DT; + ScalarEvolution *SE; }; } // anonymous namespace @@ -790,6 +828,7 @@ INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) namespace llvm { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a21ca24..2d577de 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -31,11 +31,6 @@ // void foo(_Complex float *P) // for (i) { __real__(*P) = 0; __imag__(*P) = 0; } // -// We should enhance this to handle negative strides through memory. -// Alternatively (and perhaps better) we could rely on an earlier pass to force -// forward iteration through memory, which is generally better for cache -// behavior. Negative strides *do* happen for memset/memcpy loops. -// // This could recognize common matrix multiplies and dot product idioms and // replace them with calls to BLAS (if linked in??). // @@ -44,7 +39,10 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -67,149 +65,85 @@ STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); namespace { - class LoopIdiomRecognize; +class LoopIdiomRecognize : public LoopPass { + Loop *CurLoop; + AliasAnalysis *AA; + DominatorTree *DT; + LoopInfo *LI; + ScalarEvolution *SE; + TargetLibraryInfo *TLI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + static char ID; + explicit LoopIdiomRecognize() : LoopPass(ID) { + initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); + } - /// This class defines some utility functions for loop idiom recognization.
- class LIRUtil { - public: - /// Return true iff the block contains nothing but an uncondition branch - /// (aka goto instruction). - static bool isAlmostEmpty(BasicBlock *); + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG. + /// + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } - static BranchInst *getBranch(BasicBlock *BB) { - return dyn_cast<BranchInst>(BB->getTerminator()); - } +private: + typedef SmallVector<StoreInst *, 8> StoreList; + StoreList StoreRefs; - /// Derive the precondition block (i.e the block that guards the loop - /// preheader) from the given preheader. - static BasicBlock *getPrecondBb(BasicBlock *PreHead); - }; - - /// This class is to recoginize idioms of population-count conducted in - /// a noncountable loop. Currently it only recognizes this pattern: - /// \code - /// while(x) {cnt++; ...; x &= x - 1; ...} - /// \endcode - class NclPopcountRecognize { - LoopIdiomRecognize &LIR; - Loop *CurLoop; - BasicBlock *PreCondBB; - - typedef IRBuilder<> IRBuilderTy; - - public: - explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR); - bool recognize(); - - private: - /// Take a glimpse of the loop to see if we need to go ahead recoginizing - /// the idiom. - bool preliminaryScreen(); - - /// Check if the given conditional branch is based on the comparison - /// between a variable and zero, and if the variable is non-zero, the - /// control yields to the loop entry. If the branch matches the behavior, - /// the variable involved in the comparion is returned. This function will - /// be called to see if the precondition and postcondition of the loop - /// are in desirable form. - Value *matchCondition(BranchInst *Br, BasicBlock *NonZeroTarget) const; - - /// Return true iff the idiom is detected in the loop. and 1) \p CntInst - /// is set to the instruction counting the population bit. 2) \p CntPhi - /// is set to the corresponding phi node. 3) \p Var is set to the value - /// whose population bits are being counted. - bool detectIdiom - (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const; - - /// Insert ctpop intrinsic function and some obviously dead instructions. - void transform(Instruction *CntInst, PHINode *CntPhi, Value *Var); - - /// Create llvm.ctpop.* intrinsic function. 
- CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL); - }; - - class LoopIdiomRecognize : public LoopPass { - Loop *CurLoop; - DominatorTree *DT; - ScalarEvolution *SE; - TargetLibraryInfo *TLI; - const TargetTransformInfo *TTI; - public: - static char ID; - explicit LoopIdiomRecognize() : LoopPass(ID) { - initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); - DT = nullptr; - SE = nullptr; - TLI = nullptr; - TTI = nullptr; - } + /// \name Countable Loop Idiom Handling + /// @{ - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, - SmallVectorImpl<BasicBlock*> &ExitBlocks); - - bool processLoopStore(StoreInst *SI, const SCEV *BECount); - bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); - - bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, - unsigned StoreAlignment, - Value *SplatValue, Instruction *TheStore, - const SCEVAddRecExpr *Ev, - const SCEV *BECount); - bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, - const SCEVAddRecExpr *StoreEv, - const SCEVAddRecExpr *LoadEv, - const SCEV *BECount); - - /// This transformation requires natural loop information & requires that - /// loop preheaders be inserted into the CFG. - /// - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addRequired<AliasAnalysis>(); - AU.addPreserved<AliasAnalysis>(); - AU.addRequired<ScalarEvolution>(); - AU.addPreserved<ScalarEvolution>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - } + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock *> &ExitBlocks); - DominatorTree *getDominatorTree() { - return DT ? DT - : (DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree()); - } + void collectStores(BasicBlock *BB); + bool isLegalStore(StoreInst *SI); + bool processLoopStore(StoreInst *SI, const SCEV *BECount); + bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); - ScalarEvolution *getScalarEvolution() { - return SE ? SE : (SE = &getAnalysis<ScalarEvolution>()); - } + bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, + unsigned StoreAlignment, Value *SplatValue, + Instruction *TheStore, const SCEVAddRecExpr *Ev, + const SCEV *BECount, bool NegStride); + bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, + const SCEVAddRecExpr *StoreEv, + const SCEV *BECount, bool NegStride); - TargetLibraryInfo *getTargetLibraryInfo() { - if (!TLI) - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + /// @} + /// \name Noncountable Loop Idiom Handling + /// @{ - return TLI; - } + bool runOnNoncountableLoop(); - const TargetTransformInfo *getTargetTransformInfo() { - return TTI ? TTI - : (TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( - *CurLoop->getHeader()->getParent())); - } + bool recognizePopcount(); + void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, + PHINode *CntPhi, Value *Var); - Loop *getLoop() const { return CurLoop; } + /// @} +}; - private: - bool runOnNoncountableLoop(); - bool runOnCountableLoop(); - }; -} +} // End anonymous namespace. 
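Since the popcount machinery was just folded into the pass itself, it is worth recalling the idiom recognizePopcount() targets, quoted from the old class comment ("while(x) {cnt++; ...; x &= x - 1; ...}"). A C-level sketch:

```c++
// A noncountable loop: the trip count depends on the bit pattern of x, so
// ScalarEvolution cannot compute it, yet the loop is equivalent to a
// single llvm.ctpop call.
int count_bits(unsigned x) {
  int cnt = 0;
  while (x) {
    cnt++;
    x &= x - 1; // clear the lowest set bit
  }
  return cnt;   // rewritten to use ctpop(x), making the loop countable
}
```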
char LoopIdiomRecognize::ID = 0; INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", @@ -218,9 +152,12 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", false, false) @@ -242,406 +179,64 @@ static void deleteDeadInstruction(Instruction *I, //===----------------------------------------------------------------------===// // -// Implementation of LIRUtil -// -//===----------------------------------------------------------------------===// - -// This function will return true iff the given block contains nothing but goto. -// A typical usage of this function is to check if the preheader function is -// "almost" empty such that generated intrinsic functions can be moved across -// the preheader and be placed at the end of the precondition block without -// the concern of breaking data dependence. -bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { - if (BranchInst *Br = getBranch(BB)) { - return Br->isUnconditional() && Br == BB->begin(); - } - return false; -} - -BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) { - if (BasicBlock *BB = PreHead->getSinglePredecessor()) { - BranchInst *Br = getBranch(BB); - return Br && Br->isConditional() ? BB : nullptr; - } - return nullptr; -} - -//===----------------------------------------------------------------------===// -// -// Implementation of NclPopcountRecognize +// Implementation of LoopIdiomRecognize // //===----------------------------------------------------------------------===// -NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR): - LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(nullptr) { -} - -bool NclPopcountRecognize::preliminaryScreen() { - const TargetTransformInfo *TTI = LIR.getTargetTransformInfo(); - if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) - return false; - - // Counting population are usually conducted by few arithmetic instructions. - // Such instructions can be easilly "absorbed" by vacant slots in a - // non-compact loop. Therefore, recognizing popcount idiom only makes sense - // in a compact loop. - - // Give up if the loop has multiple blocks or multiple backedges. - if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) - return false; - - BasicBlock *LoopBody = *(CurLoop->block_begin()); - if (LoopBody->size() >= 20) { - // The loop is too big, bail out. - return false; - } - - // It should have a preheader containing nothing but a goto instruction. - BasicBlock *PreHead = CurLoop->getLoopPreheader(); - if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead)) +bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipOptnoneFunction(L)) return false; - // It should have a precondition block where the generated popcount instrinsic - // function will be inserted. 
- PreCondBB = LIRUtil::getPrecondBb(PreHead); - if (!PreCondBB) + CurLoop = L; + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!L->getLoopPreheader()) return false; - return true; -} - -Value *NclPopcountRecognize::matchCondition(BranchInst *Br, - BasicBlock *LoopEntry) const { - if (!Br || !Br->isConditional()) - return nullptr; - - ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition()); - if (!Cond) - return nullptr; - - ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); - if (!CmpZero || !CmpZero->isZero()) - return nullptr; - - ICmpInst::Predicate Pred = Cond->getPredicate(); - if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) || - (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry)) - return Cond->getOperand(0); - - return nullptr; -} - -bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, - PHINode *&CntPhi, - Value *&Var) const { - // Following code tries to detect this idiom: - // - // if (x0 != 0) - // goto loop-exit // the precondition of the loop - // cnt0 = init-val; - // do { - // x1 = phi (x0, x2); - // cnt1 = phi(cnt0, cnt2); - // - // cnt2 = cnt1 + 1; - // ... - // x2 = x1 & (x1 - 1); - // ... - // } while(x != 0); - // - // loop-exit: - // - - // step 1: Check to see if the look-back branch match this pattern: - // "if (a!=0) goto loop-entry". - BasicBlock *LoopEntry; - Instruction *DefX2, *CountInst; - Value *VarX1, *VarX0; - PHINode *PhiX, *CountPhi; - - DefX2 = CountInst = nullptr; - VarX1 = VarX0 = nullptr; - PhiX = CountPhi = nullptr; - LoopEntry = *(CurLoop->block_begin()); - - // step 1: Check if the loop-back branch is in desirable form. - { - if (Value *T = matchCondition (LIRUtil::getBranch(LoopEntry), LoopEntry)) - DefX2 = dyn_cast<Instruction>(T); - else - return false; - } - - // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" - { - if (!DefX2 || DefX2->getOpcode() != Instruction::And) - return false; - - BinaryOperator *SubOneOp; - - if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0)))) - VarX1 = DefX2->getOperand(1); - else { - VarX1 = DefX2->getOperand(0); - SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1)); - } - if (!SubOneOp) - return false; - - Instruction *SubInst = cast<Instruction>(SubOneOp); - ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1)); - if (!Dec || - !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || - (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) { - return false; - } - } - - // step 3: Check the recurrence of variable X - { - PhiX = dyn_cast<PHINode>(VarX1); - if (!PhiX || - (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { - return false; - } - } - - // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 - { - CountInst = nullptr; - for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(), - IterE = LoopEntry->end(); Iter != IterE; Iter++) { - Instruction *Inst = Iter; - if (Inst->getOpcode() != Instruction::Add) - continue; - - ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); - if (!Inc || !Inc->isOne()) - continue; - - PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0)); - if (!Phi || Phi->getParent() != LoopEntry) - continue; - - // Check if the result of the instruction is live of the loop. 
- bool LiveOutLoop = false; - for (User *U : Inst->users()) { - if ((cast<Instruction>(U))->getParent() != LoopEntry) { - LiveOutLoop = true; break; - } - } - - if (LiveOutLoop) { - CountInst = Inst; - CountPhi = Phi; - break; - } - } - - if (!CountInst) - return false; - } - - // step 5: check if the precondition is in this form: - // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" - { - BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB); - Value *T = matchCondition (PreCondBr, CurLoop->getLoopPreheader()); - if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) - return false; - - CntInst = CountInst; - CntPhi = CountPhi; - Var = T; - } - - return true; -} - -void NclPopcountRecognize::transform(Instruction *CntInst, - PHINode *CntPhi, Value *Var) { - - ScalarEvolution *SE = LIR.getScalarEvolution(); - TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo(); - BasicBlock *PreHead = CurLoop->getLoopPreheader(); - BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB); - const DebugLoc DL = CntInst->getDebugLoc(); - - // Assuming before transformation, the loop is following: - // if (x) // the precondition - // do { cnt++; x &= x - 1; } while(x); - - // Step 1: Insert the ctpop instruction at the end of the precondition block - IRBuilderTy Builder(PreCondBr); - Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; - { - PopCnt = createPopcntIntrinsic(Builder, Var, DL); - NewCount = PopCntZext = - Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType())); - - if (NewCount != PopCnt) - (cast<Instruction>(NewCount))->setDebugLoc(DL); - - // TripCnt is exactly the number of iterations the loop has - TripCnt = NewCount; - - // If the population counter's initial value is not zero, insert Add Inst. - Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); - ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); - if (!InitConst || !InitConst->isZero()) { - NewCount = Builder.CreateAdd(NewCount, CntInitVal); - (cast<Instruction>(NewCount))->setDebugLoc(DL); - } - } - - // Step 2: Replace the precondition from "if(x == 0) goto loop-exit" to - // "if(NewCount == 0) loop-exit". Withtout this change, the intrinsic - // function would be partial dead code, and downstream passes will drag - // it back from the precondition block to the preheader. - { - ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition()); - - Value *Opnd0 = PopCntZext; - Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0); - if (PreCond->getOperand(0) != Var) - std::swap(Opnd0, Opnd1); - - ICmpInst *NewPreCond = - cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); - PreCondBr->setCondition(NewPreCond); - - RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); - } - - // Step 3: Note that the population count is exactly the trip count of the - // loop in question, which enble us to to convert the loop from noncountable - // loop into a countable one. The benefit is twofold: - // - // - If the loop only counts population, the entire loop become dead after - // the transformation. It is lots easier to prove a countable loop dead - // than to prove a noncountable one. (In some C dialects, a infite loop - // isn't dead even if it computes nothing useful. In general, DCE needs - // to prove a noncountable loop finite before safely delete it.) - // - // - If the loop also performs something else, it remains alive. 
- // Since it is transformed to countable form, it can be aggressively - // optimized by some optimizations which are in general not applicable - // to a noncountable loop. - // - // After this step, this loop (conceptually) would look like following: - // newcnt = __builtin_ctpop(x); - // t = newcnt; - // if (x) - // do { cnt++; x &= x-1; t--) } while (t > 0); - BasicBlock *Body = *(CurLoop->block_begin()); - { - BranchInst *LbBr = LIRUtil::getBranch(Body); - ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); - Type *Ty = TripCnt->getType(); - - PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin()); - - Builder.SetInsertPoint(LbCond); - Value *Opnd1 = cast<Value>(TcPhi); - Value *Opnd2 = cast<Value>(ConstantInt::get(Ty, 1)); - Instruction *TcDec = - cast<Instruction>(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true)); - - TcPhi->addIncoming(TripCnt, PreHead); - TcPhi->addIncoming(TcDec, Body); - - CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ? - CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; - LbCond->setPredicate(Pred); - LbCond->setOperand(0, TcDec); - LbCond->setOperand(1, cast<Value>(ConstantInt::get(Ty, 0))); - } - - // Step 4: All the references to the original population counter outside - // the loop are replaced with the NewCount -- the value returned from - // __builtin_ctpop(). - CntInst->replaceUsesOutsideBlock(NewCount, Body); - - // step 5: Forget the "non-computable" trip-count SCEV associated with the - // loop. The loop would otherwise not be deleted even if it becomes empty. - SE->forgetLoop(CurLoop); -} - -CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, - Value *Val, DebugLoc DL) { - Value *Ops[] = { Val }; - Type *Tys[] = { Val->getType() }; - - Module *M = (*(CurLoop->block_begin()))->getParent()->getParent(); - Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); - CallInst *CI = IRBuilder.CreateCall(Func, Ops); - CI->setDebugLoc(DL); - - return CI; -} - -/// recognize - detect population count idiom in a non-countable loop. If -/// detected, transform the relevant code to popcount intrinsic function -/// call, and return true; otherwise, return false. -bool NclPopcountRecognize::recognize() { - - if (!LIR.getTargetTransformInfo()) + // Disable loop idiom recognition if the function's name is a common idiom. 
+ StringRef Name = L->getHeader()->getParent()->getName(); + if (Name == "memset" || Name == "memcpy") return false; - LIR.getScalarEvolution(); - - if (!preliminaryScreen()) - return false; + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *CurLoop->getHeader()->getParent()); + DL = &CurLoop->getHeader()->getModule()->getDataLayout(); - Instruction *CntInst; - PHINode *CntPhi; - Value *Val; - if (!detectIdiom(CntInst, CntPhi, Val)) - return false; + if (SE->hasLoopInvariantBackedgeTakenCount(L)) + return runOnCountableLoop(); - transform(CntInst, CntPhi, Val); - return true; + return runOnNoncountableLoop(); } -//===----------------------------------------------------------------------===// -// -// Implementation of LoopIdiomRecognize -// -//===----------------------------------------------------------------------===// - bool LoopIdiomRecognize::runOnCountableLoop() { const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop); assert(!isa<SCEVCouldNotCompute>(BECount) && - "runOnCountableLoop() called on a loop without a predictable" - "backedge-taken count"); + "runOnCountableLoop() called on a loop without a predictable " + "backedge-taken count"); // If this loop executes exactly one time, then it should be peeled, not // optimized by this pass. if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) - if (BECst->getValue()->getValue() == 0) + if (BECst->getAPInt() == 0) return false; - // set DT - (void)getDominatorTree(); - - LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - - // set TLI - (void)getTargetLibraryInfo(); - - SmallVector<BasicBlock*, 8> ExitBlocks; + SmallVector<BasicBlock *, 8> ExitBlocks; CurLoop->getUniqueExitBlocks(ExitBlocks); DEBUG(dbgs() << "loop-idiom Scanning: F[" - << CurLoop->getHeader()->getParent()->getName() - << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); + << CurLoop->getHeader()->getParent()->getName() << "] Loop %" + << CurLoop->getHeader()->getName() << "\n"); bool MadeChange = false; // Scan all the blocks in the loop that are not in subloops. for (auto *BB : CurLoop->getBlocks()) { // Ignore blocks in subloops.
- if (LI.getLoopFor(BB) != CurLoop) + if (LI->getLoopFor(BB) != CurLoop) continue; MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks); @@ -649,41 +244,109 @@ bool LoopIdiomRecognize::runOnCountableLoop() { return MadeChange; } -bool LoopIdiomRecognize::runOnNoncountableLoop() { - NclPopcountRecognize Popcount(*this); - if (Popcount.recognize()) - return true; +static unsigned getStoreSizeInBytes(StoreInst *SI, const DataLayout *DL) { + uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType()); + assert(((SizeInBits & 7) || (SizeInBits >> 32) == 0) && + "Don't overflow unsigned."); + return (unsigned)SizeInBits >> 3; +} - return false; +static unsigned getStoreStride(const SCEVAddRecExpr *StoreEv) { + const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1)); + return ConstStride->getAPInt().getZExtValue(); } -bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) +/// getMemSetPatternValue - If a strided store of the specified value is safe to +/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should +/// be passed in. Otherwise, return null. +/// +/// Note that we don't ever attempt to use memset_pattern8 or 4, because these +/// just replicate their input array and then pass on to memset_pattern16. +static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { + // If the value isn't a constant, we can't promote it to being in a constant + // array. We could theoretically do a store to an alloca or something, but + // that doesn't seem worthwhile. + Constant *C = dyn_cast<Constant>(V); + if (!C) + return nullptr; + + // Only handle simple values that are a power of two bytes in size. + uint64_t Size = DL->getTypeSizeInBits(V->getType()); + if (Size == 0 || (Size & 7) || (Size & (Size - 1))) + return nullptr; + + // Don't care enough about darwin/ppc to implement this. + if (DL->isBigEndian()) + return nullptr; + + // Convert to size in bytes. + Size /= 8; + + // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see + // if the top and bottom are the same (e.g. for vectors and large integers). + if (Size > 16) + return nullptr; + + // If the constant is exactly 16 bytes, just use it. + if (Size == 16) + return C; + + // Otherwise, we'll use an array of the constants. + unsigned ArraySize = 16 / Size; + ArrayType *AT = ArrayType::get(V->getType(), ArraySize); + return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C)); +} + +bool LoopIdiomRecognize::isLegalStore(StoreInst *SI) { + // Don't touch volatile stores. + if (!SI->isSimple()) return false; - CurLoop = L; + Value *StoredVal = SI->getValueOperand(); + Value *StorePtr = SI->getPointerOperand(); - // If the loop could not be converted to canonical form, it must have an - // indirectbr in it, just give up. - if (!L->getLoopPreheader()) + // Reject stores that are so large that they overflow an unsigned. + uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); + if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) return false; - // Disable loop idiom recognition if the function's name is a common idiom. - StringRef Name = L->getHeader()->getParent()->getName(); - if (Name == "memset" || Name == "memcpy") + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided store. If we have something else, it's a + // random store we can't handle. 
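To ground the AddRec requirement stated above: a store qualifies only when its pointer evolves as an affine recurrence {base,+,step} on the current loop. A C-level sketch of one loop that passes the test and one that fails it:

```c++
// The pointer of A[i] is the SCEV {A,+,4}<loop> (4-byte ints): a strided
// store the pass can turn into a memset/memcpy.
void strided(int *A, long n) {
  for (long i = 0; i < n; ++i)
    A[i] = 0;
}

// The pointer of A[idx[i]] is not an AddRec on the loop: a "random" store
// that isLegalStore rejects.
void random(int *A, const long *idx, long n) {
  for (long i = 0; i < n; ++i)
    A[idx[i]] = 0;
}
```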
+ const SCEVAddRecExpr *StoreEv = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); + if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) return false; - SE = &getAnalysis<ScalarEvolution>(); - if (SE->hasLoopInvariantBackedgeTakenCount(L)) - return runOnCountableLoop(); - return runOnNoncountableLoop(); + // Check to see if we have a constant stride. + if (!isa<SCEVConstant>(StoreEv->getOperand(1))) + return false; + + return true; +} + +void LoopIdiomRecognize::collectStores(BasicBlock *BB) { + StoreRefs.clear(); + for (Instruction &I : *BB) { + StoreInst *SI = dyn_cast<StoreInst>(&I); + if (!SI) + continue; + + // Make sure this is a strided store with a constant stride. + if (!isLegalStore(SI)) + continue; + + // Save the store locations. + StoreRefs.push_back(SI); + } } /// runOnLoopBlock - Process the specified block, which lives in a counted loop /// with the specified backedge count. This block is known to be in the current /// loop and not in any subloops. -bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, - SmallVectorImpl<BasicBlock*> &ExitBlocks) { +bool LoopIdiomRecognize::runOnLoopBlock( + BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock *> &ExitBlocks) { // We can only promote stores in this block if they are unconditionally // executed in the loop. For a block to be unconditionally executed, it has // to dominate all the exit blocks of the loop. Verify this now. @@ -692,25 +355,18 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, return false; bool MadeChange = false; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { - Instruction *Inst = I++; - // Look for store instructions, which may be optimized to memset/memcpy. - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - WeakVH InstPtr(I); - if (!processLoopStore(SI, BECount)) continue; - MadeChange = true; - - // If processing the store invalidated our iterator, start over from the - // top of the block. - if (!InstPtr) - I = BB->begin(); - continue; - } + // Look for store instructions, which may be optimized to memset/memcpy. + collectStores(BB); + for (auto &SI : StoreRefs) + MadeChange |= processLoopStore(SI, BECount); + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { + Instruction *Inst = &*I++; // Look for memset instructions, which may be optimized to a larger memset. - if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { - WeakVH InstPtr(I); - if (!processLoopMemSet(MSI, BECount)) continue; + if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { + WeakVH InstPtr(&*I); + if (!processLoopMemSet(MSI, BECount)) + continue; MadeChange = true; // If processing the memset invalidated our iterator, start over from the @@ -724,71 +380,38 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, return MadeChange; } - /// processLoopStore - See if this store can be promoted to a memset or memcpy. bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { - if (!SI->isSimple()) return false; + assert(SI->isSimple() && "Expected only non-volatile stores."); Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); - // Reject stores that are so large that they overflow an unsigned. 
- auto &DL = CurLoop->getHeader()->getModule()->getDataLayout(); - uint64_t SizeInBits = DL.getTypeSizeInBits(StoredVal->getType()); - if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) - return false; - - // See if the pointer expression is an AddRec like {base,+,1} on the current - // loop, which indicates a strided store. If we have something else, it's a - // random store we can't handle. - const SCEVAddRecExpr *StoreEv = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); - if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) - return false; - // Check to see if the stride matches the size of the store. If so, then we // know that every byte is touched in the loop. - unsigned StoreSize = (unsigned)SizeInBits >> 3; - const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1)); - - if (!Stride || StoreSize != Stride->getValue()->getValue()) { - // TODO: Could also handle negative stride here someday, that will require - // the validity check in mayLoopAccessLocation to be updated though. - // Enable this to print exact negative strides. - if (0 && Stride && StoreSize == -Stride->getValue()->getValue()) { - dbgs() << "NEGATIVE STRIDE: " << *SI << "\n"; - dbgs() << "BB: " << *SI->getParent(); - } - + const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); + unsigned Stride = getStoreStride(StoreEv); + unsigned StoreSize = getStoreSizeInBytes(SI, DL); + if (StoreSize != Stride && StoreSize != -Stride) return false; - } + + bool NegStride = StoreSize == -Stride; // See if we can optimize just this store in isolation. if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), - StoredVal, SI, StoreEv, BECount)) + StoredVal, SI, StoreEv, BECount, NegStride)) return true; - // If the stored value is a strided load in the same loop with the same stride - // this this may be transformable into a memcpy. This kicks in for stuff like - // for (i) A[i] = B[i]; - if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) { - const SCEVAddRecExpr *LoadEv = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0))); - if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() && - StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple()) - if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount)) - return true; - } - //errs() << "UNHANDLED strided store: " << *StoreEv << " - " << *SI << "\n"; - - return false; + // Optimize the store into a memcpy, if it feeds a similarly strided load. + return processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, BECount, NegStride); } /// processLoopMemSet - See if this memset can be promoted to a large memset. -bool LoopIdiomRecognize:: -processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { +bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, + const SCEV *BECount) { // We can only handle non-volatile memsets with a constant size. - if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) return false; + if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) + return false; // If we're not allowed to hack on memset, we fail.
if (!TLI->has(LibFunc::memset)) @@ -818,17 +441,16 @@ processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { return false; return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, - MSI->getAlignment(), MSI->getValue(), - MSI, Ev, BECount); + MSI->getAlignment(), MSI->getValue(), MSI, Ev, + BECount, /*NegStride=*/false); } - /// mayLoopAccessLocation - Return true if the specified loop might access the /// specified pointer location, which is a loop-strided access. The 'Access' /// argument specifies what the verboten forms of access are (read or write). -static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, - Loop *L, const SCEV *BECount, - unsigned StoreSize, AliasAnalysis &AA, +static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, + const SCEV *BECount, unsigned StoreSize, + AliasAnalysis &AA, Instruction *IgnoredStore) { // Get the location that may be stored across the loop. Since the access is // strided positively through memory, we say that the modified location starts @@ -838,7 +460,7 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, // If the loop iterates a fixed number of times, we can refine the access size // to be exactly the size of the memset, which is (BECount+1)*StoreSize if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) - AccessSize = (BECst->getValue()->getZExtValue()+1)*StoreSize; + AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize; // TODO: For this to be really effective, we have to dive into the pointer // operand in the store. Store to &A[i] of 100 will always return may alias @@ -849,59 +471,31 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; ++BI) for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) - if (&*I != IgnoredStore && - (AA.getModRefInfo(I, StoreLoc) & Access)) + if (&*I != IgnoredStore && (AA.getModRefInfo(&*I, StoreLoc) & Access)) return true; return false; } -/// getMemSetPatternValue - If a strided store of the specified value is safe to -/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should -/// be passed in. Otherwise, return null. -/// -/// Note that we don't ever attempt to use memset_pattern8 or 4, because these -/// just replicate their input array and then pass on to memset_pattern16. -static Constant *getMemSetPatternValue(Value *V, const DataLayout &DL) { - // If the value isn't a constant, we can't promote it to being in a constant - // array. We could theoretically do a store to an alloca or something, but - // that doesn't seem worthwhile. - Constant *C = dyn_cast<Constant>(V); - if (!C) return nullptr; - - // Only handle simple values that are a power of two bytes in size. - uint64_t Size = DL.getTypeSizeInBits(V->getType()); - if (Size == 0 || (Size & 7) || (Size & (Size-1))) - return nullptr; - - // Don't care enough about darwin/ppc to implement this. - if (DL.isBigEndian()) - return nullptr; - - // Convert to size in bytes. - Size /= 8; - - // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see - // if the top and bottom are the same (e.g. for vectors and large integers). - if (Size > 16) return nullptr; - - // If the constant is exactly 16 bytes, just use it. - if (Size == 16) return C; - - // Otherwise, we'll use an array of the constants. 
- unsigned ArraySize = 16/Size; - ArrayType *AT = ArrayType::get(V->getType(), ArraySize); - return ConstantArray::get(AT, std::vector<Constant*>(ArraySize, C)); +// If we have a negative stride, Start refers to the end of the memory location +// we're trying to memset. Therefore, we need to recompute the base pointer, +// which is just Start - BECount*Size. +static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount, + Type *IntPtr, unsigned StoreSize, + ScalarEvolution *SE) { + const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr); + if (StoreSize != 1) + Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize), + SCEV::FlagNUW); + return SE->getMinusSCEV(Start, Index); } - /// processLoopStridedStore - We see a strided store of some value. If we can /// transform this into a memset or memset_pattern in the loop preheader, do so. -bool LoopIdiomRecognize:: -processLoopStridedStore(Value *DestPtr, unsigned StoreSize, - unsigned StoreAlignment, Value *StoredVal, - Instruction *TheStore, const SCEVAddRecExpr *Ev, - const SCEV *BECount) { +bool LoopIdiomRecognize::processLoopStridedStore( + Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, + Value *StoredVal, Instruction *TheStore, const SCEVAddRecExpr *Ev, + const SCEV *BECount, bool NegStride) { // If the stored value is a byte-wise value (like i32 -1), then it may be // turned into a memset of i8 -1, assuming that all the consecutive bytes @@ -909,7 +503,6 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // but it can be turned into memset_pattern if the target supports it. Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = nullptr; - auto &DL = CurLoop->getHeader()->getModule()->getDataLayout(); unsigned DestAS = DestPtr->getType()->getPointerAddressSpace(); // If we're allowed to form a memset, and the stored value would be acceptable @@ -936,9 +529,15 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // header. This allows us to insert code for it in the preheader. BasicBlock *Preheader = CurLoop->getLoopPreheader(); IRBuilder<> Builder(Preheader->getTerminator()); - SCEVExpander Expander(*SE, DL, "loop-idiom"); + SCEVExpander Expander(*SE, *DL, "loop-idiom"); Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); + Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS); + + const SCEV *Start = Ev->getStart(); + // Handle negative strided loops. + if (NegStride) + Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE); // Okay, we have a strided store "p[i]" of a splattable value. We can turn // this into a memset in the loop preheader now if we want. However, this @@ -946,12 +545,9 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // or write to the aliased location. Check for any overlap by generating the // base pointer and checking the region. Value *BasePtr = - Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy, - Preheader->getTerminator()); - - if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef, - CurLoop, BECount, - StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) { + Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator()); + if (mayLoopAccessLocation(BasePtr, MRI_ModRef, CurLoop, BECount, StoreSize, + *AA, TheStore)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI); @@ -962,36 +558,30 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. 
Expand the trip count out to // pointer size if it isn't already. - Type *IntPtr = Builder.getIntPtrTy(DL, DestAS); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); - const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), - SCEV::FlagNUW); + const SCEV *NumBytesS = + SE->getAddExpr(BECount, SE->getOne(IntPtr), SCEV::FlagNUW); if (StoreSize != 1) { NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), SCEV::FlagNUW); } Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); CallInst *NewCall; if (SplatValue) { - NewCall = Builder.CreateMemSet(BasePtr, - SplatValue, - NumBytes, - StoreAlignment); + NewCall = + Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment); } else { // Everything is emitted in default address space Type *Int8PtrTy = DestInt8PtrTy; - Module *M = TheStore->getParent()->getParent()->getParent(); - Value *MSP = M->getOrInsertFunction("memset_pattern16", - Builder.getVoidTy(), - Int8PtrTy, - Int8PtrTy, - IntPtr, - (void*)nullptr); + Module *M = TheStore->getModule(); + Value *MSP = + M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), + Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr); // Otherwise we should form a memset_pattern16. PatternValue is known to be // a constant array of 16 bytes. Plop the value into a mergeable global. @@ -1015,26 +605,47 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, return true; } -/// processLoopStoreOfLoopLoad - We see a strided store whose value is a -/// same-strided load. -bool LoopIdiomRecognize:: -processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, - const SCEVAddRecExpr *StoreEv, - const SCEVAddRecExpr *LoadEv, - const SCEV *BECount) { +/// If the stored value is a strided load in the same loop with the same stride, +/// this may be transformable into a memcpy. This kicks in for stuff like +/// for (i) A[i] = B[i]; +bool LoopIdiomRecognize::processLoopStoreOfLoopLoad( + StoreInst *SI, unsigned StoreSize, const SCEVAddRecExpr *StoreEv, + const SCEV *BECount, bool NegStride) { // If we're not allowed to form memcpy, we fail. if (!TLI->has(LibFunc::memcpy)) return false; - LoadInst *LI = cast<LoadInst>(SI->getValueOperand()); + // The store must be feeding a non-volatile load. + LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand()); + if (!LI || !LI->isSimple()) + return false; + + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided load. If we have something else, it's a + // random load we can't handle. + const SCEVAddRecExpr *LoadEv = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand())); + if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine()) + return false; + + // The store and load must share the same stride. + if (StoreEv->getOperand(1) != LoadEv->getOperand(1)) + return false; // The trip count of the loop and the base pointer of the addrec SCEV are // guaranteed to be loop invariant, which means that they should dominate the // header. This allows us to insert code for it in the preheader.
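When the stored constant is not a splat of one byte, plain memset cannot encode it, but the branch above can still emit memset_pattern16 on targets whose runtime provides it. A hedged sketch of a loop that would take that path:

void fill_pattern(int *A, long N) {
  // 0x01020304 has four distinct bytes, so isBytewiseValue() yields no
  // SplatValue; the 4-byte constant is instead replicated into a 16-byte
  // mergeable global and passed to memset_pattern16.
  for (long i = 0; i < N; ++i)
    A[i] = 0x01020304;
}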
BasicBlock *Preheader = CurLoop->getLoopPreheader(); IRBuilder<> Builder(Preheader->getTerminator()); - const DataLayout &DL = Preheader->getModule()->getDataLayout(); - SCEVExpander Expander(*SE, DL, "loop-idiom"); + SCEVExpander Expander(*SE, *DL, "loop-idiom"); + + const SCEV *StrStart = StoreEv->getStart(); + unsigned StrAS = SI->getPointerAddressSpace(); + Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS); + + // Handle negative strided loops. + if (NegStride) + StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE); // Okay, we have a strided store "p[i]" of a loaded value. We can turn // this into a memcpy in the loop preheader now if we want. However, this @@ -1042,29 +653,31 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // or write the memory region we're storing to. This includes the load that // feeds the stores. Check for an alias by generating the base address and // checking everything. - Value *StoreBasePtr = - Expander.expandCodeFor(StoreEv->getStart(), - Builder.getInt8PtrTy(SI->getPointerAddressSpace()), - Preheader->getTerminator()); - - if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef, - CurLoop, BECount, StoreSize, - getAnalysis<AliasAnalysis>(), SI)) { + Value *StoreBasePtr = Expander.expandCodeFor( + StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator()); + + if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount, + StoreSize, *AA, SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI); return false; } + const SCEV *LdStart = LoadEv->getStart(); + unsigned LdAS = LI->getPointerAddressSpace(); + + // Handle negative strided loops. + if (NegStride) + LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE); + // For a memcpy, we have to make sure that the input array is not being // mutated by the loop. - Value *LoadBasePtr = - Expander.expandCodeFor(LoadEv->getStart(), - Builder.getInt8PtrTy(LI->getPointerAddressSpace()), - Preheader->getTerminator()); + Value *LoadBasePtr = Expander.expandCodeFor( + LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator()); - if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount, - StoreSize, getAnalysis<AliasAnalysis>(), SI)) { + if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize, + *AA, SI)) { Expander.clear(); // If we generated new code for the base pointer, clean up. RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI); @@ -1074,34 +687,368 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // Okay, everything is safe, we can transform this! - // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. 
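Two illustrative loops (hypothetical, not from the patch) showing what the paired mayLoopAccessLocation queries guard: nothing but the store may touch the destination region (MRI_ModRef on StoreBasePtr), and nothing may write the source region (MRI_Mod on LoadBasePtr):

void copy_ok(int *A, int *B, long N) {
  for (long i = 0; i < N; ++i)
    A[i] = B[i];     // B is only read and A only written; assuming alias
                     // analysis can tell them apart, this becomes one memcpy
}

void copy_bad(int *A, long N) {
  for (long i = 0; i + 1 < N; ++i)
    A[i + 1] = A[i]; // the load reads the would-be destination region, so the
                     // MRI_ModRef check on StoreBasePtr rejects the transform
}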
- Type *IntPtrTy = Builder.getIntPtrTy(DL, SI->getPointerAddressSpace()); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy); - const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1), - SCEV::FlagNUW); + const SCEV *NumBytesS = + SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW); if (StoreSize != 1) NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), SCEV::FlagNUW); Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); CallInst *NewCall = - Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, - std::min(SI->getAlignment(), LI->getAlignment())); + Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, + std::min(SI->getAlignment(), LI->getAlignment())); NewCall->setDebugLoc(SI->getDebugLoc()); DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" << " from store ptr=" << *StoreEv << " at: " << *SI << "\n"); - - // Okay, the memset has been formed. Zap the original store and anything that + // Okay, the memcpy has been formed. Zap the original store and anything that // feeds into it. deleteDeadInstruction(SI, TLI); ++NumMemCpy; return true; } + +bool LoopIdiomRecognize::runOnNoncountableLoop() { + return recognizePopcount(); +} + +/// Check if the given conditional branch is based on the comparison between +/// a variable and zero, and if the variable is non-zero, the control yields to +/// the loop entry. If the branch matches the behavior, the variable involved +/// in the comparison is returned. This function will be called to see if the +/// precondition and postcondition of the loop are in desirable form. +static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) { + if (!BI || !BI->isConditional()) + return nullptr; + + ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); + if (!Cond) + return nullptr; + + ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1)); + if (!CmpZero || !CmpZero->isZero()) + return nullptr; + + ICmpInst::Predicate Pred = Cond->getPredicate(); + if ((Pred == ICmpInst::ICMP_NE && BI->getSuccessor(0) == LoopEntry) || + (Pred == ICmpInst::ICMP_EQ && BI->getSuccessor(1) == LoopEntry)) + return Cond->getOperand(0); + + return nullptr; +} + +/// Return true iff the idiom is detected in the loop. +/// +/// Additionally: +/// 1) \p CntInst is set to the instruction counting the population bit. +/// 2) \p CntPhi is set to the corresponding phi node. +/// 3) \p Var is set to the value whose population bits are being counted. +/// +/// The core idiom we are trying to detect is: +/// \code +/// if (x0 == 0) +/// goto loop-exit // the precondition of the loop +/// cnt0 = init-val; +/// do { +/// x1 = phi (x0, x2); +/// cnt1 = phi(cnt0, cnt2); +/// +/// cnt2 = cnt1 + 1; +/// ... +/// x2 = x1 & (x1 - 1); +/// ... +/// } while(x != 0); +/// +/// loop-exit: +/// \endcode +static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, + Instruction *&CntInst, PHINode *&CntPhi, + Value *&Var) { + // step 1: Check to see if the look-back branch matches this pattern: + // "if (a!=0) goto loop-entry". + BasicBlock *LoopEntry; + Instruction *DefX2, *CountInst; + Value *VarX1, *VarX0; + PHINode *PhiX, *CountPhi; + + DefX2 = CountInst = nullptr; + VarX1 = VarX0 = nullptr; + PhiX = CountPhi = nullptr; + LoopEntry = *(CurLoop->block_begin()); + + // step 1: Check if the loop-back branch is in desirable form.
+ { + if (Value *T = matchCondition( + dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry)) + DefX2 = dyn_cast<Instruction>(T); + else + return false; + } + + // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" + { + if (!DefX2 || DefX2->getOpcode() != Instruction::And) + return false; + + BinaryOperator *SubOneOp; + + if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0)))) + VarX1 = DefX2->getOperand(1); + else { + VarX1 = DefX2->getOperand(0); + SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1)); + } + if (!SubOneOp) + return false; + + Instruction *SubInst = cast<Instruction>(SubOneOp); + ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1)); + if (!Dec || + !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || + (SubInst->getOpcode() == Instruction::Add && + Dec->isAllOnesValue()))) { + return false; + } + } + + // step 3: Check the recurrence of variable X + { + PhiX = dyn_cast<PHINode>(VarX1); + if (!PhiX || + (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { + return false; + } + } + + // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1 + { + CountInst = nullptr; + for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(), + IterE = LoopEntry->end(); + Iter != IterE; Iter++) { + Instruction *Inst = &*Iter; + if (Inst->getOpcode() != Instruction::Add) + continue; + + ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); + if (!Inc || !Inc->isOne()) + continue; + + PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0)); + if (!Phi || Phi->getParent() != LoopEntry) + continue; + + // Check if the result of the instruction is live out of the loop. + bool LiveOutLoop = false; + for (User *U : Inst->users()) { + if ((cast<Instruction>(U))->getParent() != LoopEntry) { + LiveOutLoop = true; + break; + } + } + + if (LiveOutLoop) { + CountInst = Inst; + CountPhi = Phi; + break; + } + } + + if (!CountInst) + return false; + } + + // step 5: check if the precondition is in this form: + // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" + { + auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator()); + Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader()); + if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) + return false; + + CntInst = CountInst; + CntPhi = CountPhi; + Var = T; + } + + return true; +} + +/// Recognizes a population count idiom in a non-countable loop. +/// +/// If detected, transforms the relevant code to issue the popcount intrinsic +/// function call, and returns true; otherwise, returns false. +bool LoopIdiomRecognize::recognizePopcount() { + if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) + return false; + + // Counting population is usually conducted by a few arithmetic instructions. + // Such instructions can be easily "absorbed" by vacant slots in a + // non-compact loop. Therefore, recognizing popcount idiom only makes sense + // in a compact loop. + + // Give up if the loop has multiple blocks or multiple backedges. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) + return false; + + BasicBlock *LoopBody = *(CurLoop->block_begin()); + if (LoopBody->size() >= 20) { + // The loop is too big, bail out. + return false; + } + + // It should have a preheader containing nothing but an unconditional branch.
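For reference, the complete source-level idiom that steps 1 through 5 above match, written as a hypothetical function (names mirror the comments; a rotated while loop yields the guarded do-while form the detector expects):

int popcount_idiom(unsigned x) {
  int cnt = 0;       // cnt0 = init-val
  while (x != 0) {   // precondition and loop-back branch, both matchCondition shapes
    ++cnt;           // cnt2 = cnt1 + 1; must be live out of the loop
    x = x & (x - 1); // x2 = x1 & (x1 - 1): clears the lowest set bit
  }
  return cnt;        // after the transform, fed by llvm.ctpop
}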
+ BasicBlock *PH = CurLoop->getLoopPreheader(); + if (!PH) + return false; + if (&PH->front() != PH->getTerminator()) + return false; + auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator()); + if (!EntryBI || EntryBI->isConditional()) + return false; + + // It should have a precondition block where the generated popcount intrinsic + // function can be inserted. + auto *PreCondBB = PH->getSinglePredecessor(); + if (!PreCondBB) + return false; + auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator()); + if (!PreCondBI || PreCondBI->isUnconditional()) + return false; + + Instruction *CntInst; + PHINode *CntPhi; + Value *Val; + if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val)) + return false; + + transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val); + return true; +} + +static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, + DebugLoc DL) { + Value *Ops[] = {Val}; + Type *Tys[] = {Val->getType()}; + + Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); + Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); + CallInst *CI = IRBuilder.CreateCall(Func, Ops); + CI->setDebugLoc(DL); + + return CI; +} + +void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, + Instruction *CntInst, + PHINode *CntPhi, Value *Var) { + BasicBlock *PreHead = CurLoop->getLoopPreheader(); + auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator()); + const DebugLoc DL = CntInst->getDebugLoc(); + + // Assume that before the transformation, the loop looks like: + // if (x) // the precondition + // do { cnt++; x &= x - 1; } while(x); + + // Step 1: Insert the ctpop instruction at the end of the precondition block + IRBuilder<> Builder(PreCondBr); + Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; + { + PopCnt = createPopcntIntrinsic(Builder, Var, DL); + NewCount = PopCntZext = + Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType())); + + if (NewCount != PopCnt) + (cast<Instruction>(NewCount))->setDebugLoc(DL); + + // TripCnt is exactly the number of iterations the loop has + TripCnt = NewCount; + + // If the population counter's initial value is not zero, insert Add Inst. + Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); + ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); + if (!InitConst || !InitConst->isZero()) { + NewCount = Builder.CreateAdd(NewCount, CntInitVal); + (cast<Instruction>(NewCount))->setDebugLoc(DL); + } + } + + // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to + // "if (NewCount == 0) loop-exit". Without this change, the intrinsic + // function would be partial dead code, and downstream passes will drag + // it back from the precondition block to the preheader. + { + ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition()); + + Value *Opnd0 = PopCntZext; + Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0); + if (PreCond->getOperand(0) != Var) + std::swap(Opnd0, Opnd1); + + ICmpInst *NewPreCond = cast<ICmpInst>( + Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); + PreCondBr->setCondition(NewPreCond); + + RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); + } + + // Step 3: Note that the population count is exactly the trip count of the + // loop in question, which enables us to convert the loop from a noncountable + // loop into a countable one. The benefit is twofold: + // + // - If the loop only counts population, the entire loop becomes dead after + // the transformation.
It is a lot easier to prove a countable loop dead + // than to prove a noncountable one. (In some C dialects, an infinite loop + // isn't dead even if it computes nothing useful. In general, DCE needs + // to prove a noncountable loop finite before safely deleting it.) + // + // - If the loop also performs something else, it remains alive. + // Since it is transformed to countable form, it can be aggressively + // optimized by some optimizations which are in general not applicable + // to a noncountable loop. + // + // After this step, this loop (conceptually) would look like the following: + // newcnt = __builtin_ctpop(x); + // t = newcnt; + // if (x) + // do { cnt++; x &= x-1; t--; } while (t > 0); + BasicBlock *Body = *(CurLoop->block_begin()); + { + auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator()); + ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); + Type *Ty = TripCnt->getType(); + + PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front()); + + Builder.SetInsertPoint(LbCond); + Instruction *TcDec = cast<Instruction>( + Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1), + "tcdec", false, true)); + + TcPhi->addIncoming(TripCnt, PreHead); + TcPhi->addIncoming(TcDec, Body); + + CmpInst::Predicate Pred = + (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; + LbCond->setPredicate(Pred); + LbCond->setOperand(0, TcDec); + LbCond->setOperand(1, ConstantInt::get(Ty, 0)); + } + + // Step 4: All the references to the original population counter outside + // the loop are replaced with the NewCount -- the value returned from + // __builtin_ctpop(). + CntInst->replaceUsesOutsideBlock(NewCount, Body); + + // Step 5: Forget the "non-computable" trip-count SCEV associated with the + // loop. The loop would otherwise not be deleted even if it becomes empty. + SE->forgetLoop(CurLoop); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index e125026..b4102fe 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -48,7 +48,7 @@ namespace { AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; @@ -112,7 +112,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // Simplify instructions in the current basic block. for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - Instruction *I = BI++; + Instruction *I = &*BI++; // The first time through the loop ToSimplify is empty and we try to // simplify all instructions. On later iterations ToSimplify is not diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 9d7e57f..4295235 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -99,7 +99,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, return false; if (St && !St->isSimple()) return false; - MemInstr.push_back(I); + MemInstr.push_back(&*I); } } @@ -176,7 +176,7 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, } } - // We don't have a DepMatrix to check legality return false + // We don't have a DepMatrix to check legality; return false.
if (DepMatrix.size() == 0) return false; return true; @@ -331,9 +331,9 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { class LoopInterchangeLegality { public: LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, - LoopInterchange *Pass) - : OuterLoop(Outer), InnerLoop(Inner), SE(SE), CurrentPass(Pass), - InnerLoopHasReduction(false) {} + LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), + PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {} /// Check if the loops can be interchanged. bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, @@ -357,9 +357,10 @@ private: Loop *OuterLoop; Loop *InnerLoop; - /// Scev analysis. ScalarEvolution *SE; - LoopInterchange *CurrentPass; + LoopInfo *LI; + DominatorTree *DT; + bool PreserveLCSSA; bool InnerLoopHasReduction; }; @@ -371,7 +372,7 @@ public: LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE) : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {} - /// Check if the loop interchange is profitable + /// Check if the loop interchange is profitable. bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix); @@ -385,12 +386,12 @@ private: ScalarEvolution *SE; }; -/// LoopInterchangeTransform interchanges the loop +/// LoopInterchangeTransform interchanges the loop. class LoopInterchangeTransform { public: LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, - LoopInterchange *Pass, BasicBlock *LoopNestExit, + BasicBlock *LoopNestExit, bool InnerLoopContainsReductions) : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LoopExit(LoopNestExit), @@ -424,21 +425,22 @@ private: bool InnerLoopHasReduction; }; -// Main LoopInterchange Pass +// Main LoopInterchange Pass. struct LoopInterchange : public FunctionPass { static char ID; ScalarEvolution *SE; LoopInfo *LI; DependenceAnalysis *DA; DominatorTree *DT; + bool PreserveLCSSA; LoopInterchange() : FunctionPass(ID), SE(nullptr), LI(nullptr), DA(nullptr), DT(nullptr) { initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<ScalarEvolution>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DependenceAnalysis>(); @@ -447,11 +449,13 @@ struct LoopInterchange : public FunctionPass { } bool runOnFunction(Function &F) override { - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DA = &getAnalysis<DependenceAnalysis>(); auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); + // Build up a worklist of loop pairs to analyze. SmallVector<LoopVector, 8> Worklist; @@ -489,7 +493,7 @@ struct LoopInterchange : public FunctionPass { unsigned selectLoopForInterchange(LoopVector LoopList) { // TODO: Add a better heuristic to select the loop to be interchanged based - // on the dependece matrix. Currently we select the innermost loop. + // on the dependence matrix. Currently we select the innermost loop. 
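A canonical example of the transformation this pass performs (illustrative source, not from the patch): interchanging the loops below turns a stride-N column-major walk of the arrays into a stride-1 row-major one, provided the dependence matrix proves the swap legal:

// Before interchange: the inner index j selects the row, so consecutive
// iterations touch addresses a whole row apart.
for (int i = 0; i < N; ++i)
  for (int j = 0; j < M; ++j)
    A[j][i] += B[j][i];

// After interchange: the inner index i walks each row contiguously.
for (int j = 0; j < M; ++j)
  for (int i = 0; i < N; ++i)
    A[j][i] += B[j][i];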
return LoopList.size() - 1; } @@ -544,7 +548,7 @@ struct LoopInterchange : public FunctionPass { } unsigned SelecLoopId = selectLoopForInterchange(LoopList); - // Move the selected loop outwards to the best posible position. + // Move the selected loop outwards to the best possible position. for (unsigned i = SelecLoopId; i > 0; i--) { bool Interchanged = processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix); @@ -574,7 +578,8 @@ struct LoopInterchange : public FunctionPass { Loop *InnerLoop = LoopList[InnerLoopId]; Loop *OuterLoop = LoopList[OuterLoopId]; - LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, this); + LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT, + PreserveLCSSA); if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n"); return false; @@ -586,7 +591,7 @@ struct LoopInterchange : public FunctionPass { return false; } - LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, this, + LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit, LIL.hasInnerLoopReduction()); LIT.transform(); DEBUG(dbgs() << "Loops interchanged\n"); @@ -655,7 +660,7 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch \n"); // We do not have any basic block in between now make sure the outer header - // and outer loop latch doesnt contain any unsafe instructions. + // and outer loop latch doesn't contain any unsafe instructions. if (containsUnsafeInstructionsInHeader(OuterLoopHeader) || containsUnsafeInstructionsInLatch(OuterLoopLatch)) return false; @@ -698,9 +703,9 @@ bool LoopInterchangeLegality::findInductionAndReductions( return false; for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { RecurrenceDescriptor RD; + InductionDescriptor ID; PHINode *PHI = cast<PHINode>(I); - ConstantInt *StepValue = nullptr; - if (isInductionPHI(PHI, SE, StepValue)) + if (InductionDescriptor::isInductionPHI(PHI, SE, ID)) Inductions.push_back(PHI); else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) Reductions.push_back(PHI); @@ -836,7 +841,7 @@ bool LoopInterchangeLegality::currentLimitations() { else FoundInduction = true; } - // The loop latch ended and we didnt find the induction variable return as + // The loop latch ended and we didn't find the induction variable return as // current limitation. if (!FoundInduction) return true; @@ -867,12 +872,14 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() || isa<PHINode>(OuterLoopPreHeader->begin()) || !OuterLoopPreHeader->getUniquePredecessor()) { - OuterLoopPreHeader = InsertPreheaderForLoop(OuterLoop, CurrentPass); + OuterLoopPreHeader = + InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA); } if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() || InnerLoopPreHeader == OuterLoop->getHeader()) { - InnerLoopPreHeader = InsertPreheaderForLoop(InnerLoop, CurrentPass); + InnerLoopPreHeader = + InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA); } // TODO: The loops could not be interchanged due to current limitations in the @@ -966,7 +973,7 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { - // TODO: Add Better Profitibility checks. + // TODO: Add better profitability checks. 
// e.g // 1) Construct dependency matrix and move the one with no loop carried dep // inside to enable vectorization. @@ -980,7 +987,7 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, if (Cost < 0) return true; - // It is not profitable as per current cache profitibility model. But check if + // It is not profitable as per current cache profitability model. But check if // we can move this loop outside to improve parallelism. bool ImprovesPar = isProfitabileForVectorization(InnerLoopId, OuterLoopId, DepMatrix); @@ -996,7 +1003,7 @@ void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, return; } } - assert(false && "Couldn't find loop"); + llvm_unreachable("Couldn't find loop"); } void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop, @@ -1045,7 +1052,7 @@ bool LoopInterchangeTransform::transform() { splitInnerLoopLatch(InnerIndexVar); DEBUG(dbgs() << "splitInnerLoopLatch Done\n"); - // Splits the inner loops phi nodes out into a seperate basic block. + // Splits the inner loops phi nodes out into a separate basic block. splitInnerLoopHeader(); DEBUG(dbgs() << "splitInnerLoopHeader Done\n"); } @@ -1113,8 +1120,8 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { auto &ToList = InsertBefore->getParent()->getInstList(); auto &FromList = FromBB->getInstList(); - ToList.splice(InsertBefore, FromList, FromList.begin(), - FromBB->getTerminator()); + ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(), + FromBB->getTerminator()->getIterator()); } void LoopInterchangeTransform::adjustOuterLoopPreheader() { @@ -1181,8 +1188,8 @@ bool LoopInterchangeTransform::adjustLoopBranches() { if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI) return false; - BasicBlock *InnerLoopHeaderSucessor = InnerLoopHeader->getUniqueSuccessor(); - if (!InnerLoopHeaderSucessor) + BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor(); + if (!InnerLoopHeaderSuccessor) return false; // Adjust Loop Preheader and headers @@ -1198,11 +1205,11 @@ bool LoopInterchangeTransform::adjustLoopBranches() { if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch) OuterLoopHeaderBI->setSuccessor(i, LoopExit); else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader) - OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSucessor); + OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor); } // Adjust reduction PHI's now that the incoming block has changed. 
- updateIncomingBlock(InnerLoopHeaderSucessor, InnerLoopHeader, + updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader, OuterLoopHeader); BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI); @@ -1286,10 +1293,10 @@ bool LoopInterchangeTransform::adjustLoopLinks() { char LoopInterchange::ID = 0; INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", "Interchanges loops for cache reuse", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(DependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp new file mode 100644 index 0000000..1064d08 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -0,0 +1,566 @@ +//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a loop-aware load elimination pass. +// +// It uses LoopAccessAnalysis to identify loop-carried dependences with a +// distance of one between stores and loads. These form the candidates for the +// transformation. The source value of each store is then propagated to the user +// of the corresponding load. This makes the load dead. +// +// The pass can also version the loop and add memchecks in order to prove that +// may-aliasing stores can't change the value in memory before it's read by the +// load. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" +#include <forward_list> + +#define LLE_OPTION "loop-load-elim" +#define DEBUG_TYPE LLE_OPTION + +using namespace llvm; + +static cl::opt<unsigned> CheckPerElim( + "runtime-check-per-loop-load-elim", cl::Hidden, + cl::desc("Max number of memchecks allowed per eliminated load on average"), + cl::init(1)); + +static cl::opt<unsigned> LoadElimSCEVCheckThreshold( + "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden, + cl::desc("The maximum number of SCEV checks allowed for Loop " + "Load Elimination")); + + +STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE"); + +namespace { + +/// \brief Represent a store-to-load forwarding candidate. +struct StoreToLoadForwardingCandidate { + LoadInst *Load; + StoreInst *Store; + + StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store) + : Load(Load), Store(Store) {} + + /// \brief Return true if the dependence from the store to the load has a + /// distance of one. E.g.
A[i+1] = A[i] + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE) const { + Value *LoadPtr = Load->getPointerOperand(); + Value *StorePtr = Store->getPointerOperand(); + Type *LoadPtrType = LoadPtr->getType(); + Type *LoadType = LoadPtrType->getPointerElementType(); + + assert(LoadPtrType->getPointerAddressSpace() == + StorePtr->getType()->getPointerAddressSpace() && + LoadType == StorePtr->getType()->getPointerElementType() && + "Should be a known dependence"); + + auto &DL = Load->getParent()->getModule()->getDataLayout(); + unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType)); + + auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr)); + auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr)); + + // We don't need to check non-wrapping here because forward/backward + // dependence wouldn't be valid if these weren't monotonic accesses. + auto *Dist = cast<SCEVConstant>( + PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV)); + const APInt &Val = Dist->getAPInt(); + return Val.abs() == TypeByteSize; + } + + Value *getLoadPtr() const { return Load->getPointerOperand(); } + +#ifndef NDEBUG + friend raw_ostream &operator<<(raw_ostream &OS, + const StoreToLoadForwardingCandidate &Cand) { + OS << *Cand.Store << " -->\n"; + OS.indent(2) << *Cand.Load << "\n"; + return OS; + } +#endif +}; + +/// \brief Check if the store dominates all latches, so as long as there is no +/// intervening store this value will be loaded in the next iteration. +bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L, + DominatorTree *DT) { + SmallVector<BasicBlock *, 8> Latches; + L->getLoopLatches(Latches); + return std::all_of(Latches.begin(), Latches.end(), + [&](const BasicBlock *Latch) { + return DT->dominates(StoreBlock, Latch); + }); +} + +/// \brief The per-loop class that does most of the work. +class LoadEliminationForLoop { +public: + LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI, + DominatorTree *DT) + : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.PSE) {} + + /// \brief Look through the loop-carried and loop-independent dependences in + /// this loop and find store->load dependences. + /// + /// Note that no candidate is returned if LAA has failed to analyze the loop + /// (e.g. if it's not bottom-tested, contains volatile memops, etc.) + std::forward_list<StoreToLoadForwardingCandidate> + findStoreToLoadDependences(const LoopAccessInfo &LAI) { + std::forward_list<StoreToLoadForwardingCandidate> Candidates; + + const auto *Deps = LAI.getDepChecker().getDependences(); + if (!Deps) + return Candidates; + + // Find store->load dependences (consequently true dep). Both lexically + // forward and backward dependences qualify. Disqualify loads that have + // other unknown dependences. + + SmallSet<Instruction *, 4> LoadsWithUnknownDepedence; + + for (const auto &Dep : *Deps) { + Instruction *Source = Dep.getSource(LAI); + Instruction *Destination = Dep.getDestination(LAI); + + if (Dep.Type == MemoryDepChecker::Dependence::Unknown) { + if (isa<LoadInst>(Source)) + LoadsWithUnknownDepedence.insert(Source); + if (isa<LoadInst>(Destination)) + LoadsWithUnknownDepedence.insert(Destination); + continue; + } + + if (Dep.isBackward()) + // Note that the designations source and destination follow the program + // order, i.e. source is always first. (The direction is given by the + // DepType.) 
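A source-level shape of a distance-one candidate (a hypothetical example): the load of A[i] reads exactly what the previous iteration stored to A[i+1], so once legality is established the load can be replaced by a PHI carrying the stored value across the backedge:

void forward(int *A, int *B, int N) {
  for (int i = 0; i < N - 1; ++i)
    A[i + 1] = A[i] + B[i]; // the store at i+1 feeds the load at i one
                            // iteration later: |distance| == sizeof(int)
}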
+ std::swap(Source, Destination); + else + assert(Dep.isForward() && "Needs to be a forward dependence"); + + auto *Store = dyn_cast<StoreInst>(Source); + if (!Store) + continue; + auto *Load = dyn_cast<LoadInst>(Destination); + if (!Load) + continue; + Candidates.emplace_front(Load, Store); + } + + if (!LoadsWithUnknownDepedence.empty()) + Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) { + return LoadsWithUnknownDepedence.count(C.Load); + }); + + return Candidates; + } + + /// \brief Return the index of the instruction according to program order. + unsigned getInstrIndex(Instruction *Inst) { + auto I = InstOrder.find(Inst); + assert(I != InstOrder.end() && "No index for instruction"); + return I->second; + } + + /// \brief If a load has multiple candidates associated (i.e. different + /// stores), it means that it could be forwarding from multiple stores + /// depending on control flow. Remove these candidates. + /// + /// Here, we rely on LAA to include the relevant loop-independent dependences. + /// LAA is known to omit these in the very simple case when the read and the + /// write within an alias set always take place using the *same* pointer. + /// + /// However, we know that this is not the case here, i.e. we can rely on LAA + /// to provide us with loop-independent dependences for the cases we're + /// interested in. Consider the case for example where a loop-independent + /// dependence S1->S2 invalidates the forwarding S3->S2. + /// + /// A[i] = ... (S1) + /// ... = A[i] (S2) + /// A[i+1] = ... (S3) + /// + /// LAA will perform dependence analysis here because there are two + /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]). + void removeDependencesFromMultipleStores( + std::forward_list<StoreToLoadForwardingCandidate> &Candidates) { + // If Store is nullptr it means that we have multiple stores forwarding to + // this load. + typedef DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *> + LoadToSingleCandT; + LoadToSingleCandT LoadToSingleCand; + + for (const auto &Cand : Candidates) { + bool NewElt; + LoadToSingleCandT::iterator Iter; + + std::tie(Iter, NewElt) = + LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand)); + if (!NewElt) { + const StoreToLoadForwardingCandidate *&OtherCand = Iter->second; + // Already multiple stores forward to this load. + if (OtherCand == nullptr) + continue; + + // Handle the very basic case when the two stores are in the same + // block so deciding which one forwards is easy. The later one forwards + // as long as they both have a dependence distance of one to the load. + if (Cand.Store->getParent() == OtherCand->Store->getParent() && + Cand.isDependenceDistanceOfOne(PSE) && + OtherCand->isDependenceDistanceOfOne(PSE)) { + // They are in the same block; the later one will forward to the load. + if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) + OtherCand = &Cand; + } else + OtherCand = nullptr; + } + } + + Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) { + if (LoadToSingleCand[Cand.Load] != &Cand) { + DEBUG(dbgs() << "Removing from candidates: \n" << Cand + << " The load may have multiple stores forwarding to " + << "it\n"); + return true; + } + return false; + }); + } + + /// \brief Given two pointer operations by their RuntimePointerChecking + /// indices, return true if they require an alias check. + /// + /// We need a check if one is a pointer for a candidate load and the other is + /// a pointer for a possibly intervening store.
+ bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2, + const SmallSet<Value *, 4> &PtrsWrittenOnFwdingPath, + const std::set<Value *> &CandLoadPtrs) { + Value *Ptr1 = + LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue; + Value *Ptr2 = + LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue; + return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) || + (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1))); + } + + /// \brief Return pointers that are possibly written to on the path from a + /// forwarding store to a load. + /// + /// These pointers need to be alias-checked against the forwarding candidates. + SmallSet<Value *, 4> findPointersWrittenOnForwardingPath( + const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) { + // From FirstStore to LastLoad neither of the elimination candidate loads + // should overlap with any of the stores. + // + // E.g.: + // + // st1 C[i] + // ld1 B[i] <-------, + // ld0 A[i] <----, | * LastLoad + // ... | | + // st2 E[i] | | + // st3 B[i+1] -- | -' * FirstStore + // st0 A[i+1] ---' + // st4 D[i] + // + // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with + // ld0. + + LoadInst *LastLoad = + std::max_element(Candidates.begin(), Candidates.end(), + [&](const StoreToLoadForwardingCandidate &A, + const StoreToLoadForwardingCandidate &B) { + return getInstrIndex(A.Load) < getInstrIndex(B.Load); + }) + ->Load; + StoreInst *FirstStore = + std::min_element(Candidates.begin(), Candidates.end(), + [&](const StoreToLoadForwardingCandidate &A, + const StoreToLoadForwardingCandidate &B) { + return getInstrIndex(A.Store) < + getInstrIndex(B.Store); + }) + ->Store; + + // We're looking for stores after the first forwarding store until the end + // of the loop, then from the beginning of the loop until the last + // forwarded-to load. Collect the pointer for the stores. + SmallSet<Value *, 4> PtrsWrittenOnFwdingPath; + + auto InsertStorePtr = [&](Instruction *I) { + if (auto *S = dyn_cast<StoreInst>(I)) + PtrsWrittenOnFwdingPath.insert(S->getPointerOperand()); + }; + const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions(); + std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1, + MemInstrs.end(), InsertStorePtr); + std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)], + InsertStorePtr); + + return PtrsWrittenOnFwdingPath; + } + + /// \brief Determine the pointer alias checks to prove that there are no + /// intervening stores. + SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks( + const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) { + + SmallSet<Value *, 4> PtrsWrittenOnFwdingPath = + findPointersWrittenOnForwardingPath(Candidates); + + // Collect the pointers of the candidate loads. + // FIXME: SmallSet does not work with std::inserter. 
+ std::set<Value *> CandLoadPtrs; + std::transform(Candidates.begin(), Candidates.end(), + std::inserter(CandLoadPtrs, CandLoadPtrs.begin()), + std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr)); + + const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks(); + SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; + + std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), + [&](const RuntimePointerChecking::PointerCheck &Check) { + for (auto PtrIdx1 : Check.first->Members) + for (auto PtrIdx2 : Check.second->Members) + if (needsChecking(PtrIdx1, PtrIdx2, + PtrsWrittenOnFwdingPath, CandLoadPtrs)) + return true; + return false; + }); + + DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n"); + DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); + + return Checks; + } + + /// \brief Perform the transformation for a candidate. + void + propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand, + SCEVExpander &SEE) { + // + // loop: + // %x = load %gep_i + // = ... %x + // store %y, %gep_i_plus_1 + // + // => + // + // ph: + // %x.initial = load %gep_0 + // loop: + // %x.storeforward = phi [%x.initial, %ph] [%y, %loop] + // %x = load %gep_i <---- now dead + // = ... %x.storeforward + // store %y, %gep_i_plus_1 + + Value *Ptr = Cand.Load->getPointerOperand(); + auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr)); + auto *PH = L->getLoopPreheader(); + Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(), + PH->getTerminator()); + Value *Initial = + new LoadInst(InitialPtr, "load_initial", PH->getTerminator()); + PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded", + &L->getHeader()->front()); + PHI->addIncoming(Initial, PH); + PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch()); + + Cand.Load->replaceAllUsesWith(PHI); + } + + /// \brief Top-level driver for each loop: find store->load forwarding + /// candidates, add run-time checks and perform transformation. + bool processLoop() { + DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName() + << "\" checking " << *L << "\n"); + // Look for store-to-load forwarding cases across the + // backedge. E.g.: + // + // loop: + // %x = load %gep_i + // = ... %x + // store %y, %gep_i_plus_1 + // + // => + // + // ph: + // %x.initial = load %gep_0 + // loop: + // %x.storeforward = phi [%x.initial, %ph] [%y, %loop] + // %x = load %gep_i <---- now dead + // = ... %x.storeforward + // store %y, %gep_i_plus_1 + + // First start with store->load dependences. + auto StoreToLoadDependences = findStoreToLoadDependences(LAI); + if (StoreToLoadDependences.empty()) + return false; + + // Generate an index for each load and store according to the original + // program order. This will be used later. + InstOrder = LAI.getDepChecker().generateInstructionOrderMap(); + + // To keep things simple for now, remove those where the load is potentially + // fed by multiple stores. + removeDependencesFromMultipleStores(StoreToLoadDependences); + if (StoreToLoadDependences.empty()) + return false; + + // Filter the candidates further. + SmallVector<StoreToLoadForwardingCandidate, 4> Candidates; + unsigned NumForwarding = 0; + for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) { + DEBUG(dbgs() << "Candidate " << Cand); + // Make sure that the stored value is available everywhere in the loop in + // the next iteration.
+ if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT)) + continue; + + // Check whether the SCEV difference is the same as the induction step, + // i.e. the load reads the value stored by the previous iteration. + if (!Cand.isDependenceDistanceOfOne(PSE)) + continue; + + ++NumForwarding; + DEBUG(dbgs() + << NumForwarding + << ". Valid store-to-load forwarding across the loop backedge\n"); + Candidates.push_back(Cand); + } + if (Candidates.empty()) + return false; + + // Check intervening may-alias stores. These need runtime checks for alias + // disambiguation. + SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks = + collectMemchecks(Candidates); + + // Too many checks are likely to outweigh the benefits of forwarding. + if (Checks.size() > Candidates.size() * CheckPerElim) { + DEBUG(dbgs() << "Too many run-time checks needed.\n"); + return false; + } + + if (LAI.PSE.getUnionPredicate().getComplexity() > + LoadElimSCEVCheckThreshold) { + DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); + return false; + } + + // Point of no-return, start the transformation. First, version the loop if + // necessary. + if (!Checks.empty() || !LAI.PSE.getUnionPredicate().isAlwaysTrue()) { + LoopVersioning LV(LAI, L, LI, DT, PSE.getSE(), false); + LV.setAliasChecks(std::move(Checks)); + LV.setSCEVChecks(LAI.PSE.getUnionPredicate()); + LV.versionLoop(); + } + + // Next, propagate the value stored by the store to the users of the load. + // Also for the first iteration, generate the initial value of the load. + SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(), + "storeforward"); + for (const auto &Cand : Candidates) + propagateStoredValueToLoadUsers(Cand, SEE); + NumLoopLoadEliminted += NumForwarding; + + return true; + } + +private: + Loop *L; + + /// \brief Maps the load/store instructions to their index according to + /// program order. + DenseMap<Instruction *, unsigned> InstOrder; + + // Analyses used. + LoopInfo *LI; + const LoopAccessInfo &LAI; + DominatorTree *DT; + PredicatedScalarEvolution PSE; +}; + +/// \brief The pass. Most of the work is delegated to the per-loop +/// LoadEliminationForLoop class. +class LoopLoadElimination : public FunctionPass { +public: + LoopLoadElimination() : FunctionPass(ID) { + initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto *LAA = &getAnalysis<LoopAccessAnalysis>(); + auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + + // Build up a worklist of inner-loops to process. This is necessary as the + // act of versioning a loop creates new loops and can invalidate iterators + // across the loops. + SmallVector<Loop *, 8> Worklist; + + for (Loop *TopLevelLoop : *LI) + for (Loop *L : depth_first(TopLevelLoop)) + // We only handle inner-most loops. + if (L->empty()) + Worklist.push_back(L); + + // Now walk the identified inner loops. + bool Changed = false; + for (Loop *L : Worklist) { + const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap()); + // The actual work is performed by LoadEliminationForLoop. + LoadEliminationForLoop LEL(L, LI, LAI, DT); + Changed |= LEL.processLoop(); + } + + // Process each loop nest in the function.
+ return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<LoopAccessAnalysis>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + } + + static char ID; +}; +} + +char LoopLoadElimination::ID; +static const char LLE_name[] = "Loop Load Elimination"; + +INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false) + +namespace llvm { +FunctionPass *createLoopLoadEliminationPass() { + return new LoopLoadElimination(); +} +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index ed103e6..27c2d88 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -147,12 +147,12 @@ namespace { bool runOnLoop(Loop *L, LPPassManager &LPM) override; void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } @@ -162,11 +162,15 @@ namespace { ScalarEvolution *SE; TargetLibraryInfo *TLI; DominatorTree *DT; + bool PreserveLCSSA; typedef SmallVector<Instruction *, 16> SmallInstructionVector; typedef SmallSet<Instruction *, 16> SmallInstructionSet; - // A chain of isomorphic instructions, indentified by a single-use PHI, + // Map between induction variable and its increment + DenseMap<Instruction *, int64_t> IVToIncMap; + + // A chain of isomorphic instructions, identified by a single-use PHI // representing a reduction. Only the last value may be used outside the // loop. struct SimpleLoopReduction { @@ -300,22 +304,6 @@ namespace { // The functions below can be called after we've finished processing all // instructions in the loop, and we know which reductions were selected. - // Is the provided instruction the PHI of a reduction selected for - // rerolling? 
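In source terms, the transformation sketched by the root diagram above (illustrative, not LLVM output): a manually unrolled body with roots at offsets +1 and +2 is folded back onto a single base instruction with a rescaled induction variable:

// Before rerolling: scale 3, one base instruction plus two roots.
for (int i = 0; i < 500; ++i) {
  x[3 * i]     = foo(0);
  x[3 * i + 1] = foo(0);
  x[3 * i + 2] = foo(0);
}

// After rerolling.
for (int i = 0; i < 1500; ++i)
  x[i] = foo(0);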
- bool isSelectedPHI(Instruction *J) { - if (!isa<PHINode>(J)) - return false; - - for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end(); - RI != RIE; ++RI) { - int i = *RI; - if (cast<Instruction>(J) == PossibleReds[i].getPHI()) - return true; - } - - return false; - } - bool validateSelected(); void replaceSelected(); @@ -335,7 +323,7 @@ namespace { // x[i*3+1] = y2 // x[i*3+2] = y3 // - // Base instruction -> i*3 + // Base instruction -> i*3 // +---+----+ // / | \ // ST[y1] +1 +2 <-- Roots @@ -366,8 +354,11 @@ namespace { struct DAGRootTracker { DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV, ScalarEvolution *SE, AliasAnalysis *AA, - TargetLibraryInfo *TLI) - : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), IV(IV) {} + TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI, + bool PreserveLCSSA, + DenseMap<Instruction *, int64_t> &IncrMap) + : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI), + PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap) {} /// Stage 1: Find all the DAG roots for the induction variable. bool findRoots(); @@ -413,11 +404,14 @@ namespace { ScalarEvolution *SE; AliasAnalysis *AA; TargetLibraryInfo *TLI; + DominatorTree *DT; + LoopInfo *LI; + bool PreserveLCSSA; // The loop induction variable. Instruction *IV; // Loop step amount. - uint64_t Inc; + int64_t Inc; // Loop reroll count; if Inc == 1, this records the scaling applied // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ; // If Inc is not 1, Scale = Inc. @@ -430,6 +424,8 @@ namespace { // they are used in (or specially, IL_All for instructions // used in the loop increment mechanism). UsesTy Uses; + // Map between induction variable and its increment + DenseMap<Instruction *, int64_t> &IVToIncMap; }; void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs); @@ -442,10 +438,10 @@ namespace { char LoopReroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false) @@ -477,21 +473,20 @@ void LoopReroll::collectPossibleIVs(Loop *L, continue; if (const SCEVAddRecExpr *PHISCEV = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) { + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) { if (PHISCEV->getLoop() != L) continue; if (!PHISCEV->isAffine()) continue; if (const SCEVConstant *IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) { - if (!IncSCEV->getValue()->getValue().isStrictlyPositive()) + const APInt &AInt = IncSCEV->getAPInt().abs(); + if (IncSCEV->getValue()->isZero() || AInt.uge(MaxInc)) continue; - if (IncSCEV->getValue()->uge(MaxInc)) - continue; - - DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << - *PHISCEV << "\n"); - PossibleIVs.push_back(I); + IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue(); + DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV + << "\n"); + PossibleIVs.push_back(&*I); } } } @@ -552,7 +547,7 @@ void LoopReroll::collectPossibleReductions(Loop *L, if (!I->getType()->isSingleValueType()) continue; - SimpleLoopReduction SLR(I, L); + SimpleLoopReduction SLR(&*I, L); if (!SLR.valid()) continue; @@ -699,17 +694,11 @@ collectPossibleRoots(Instruction *Base, 
std::map<int64_t,Instruction*> &Roots) { } } - int64_t V = CI->getValue().getSExtValue(); + int64_t V = std::abs(CI->getValue().getSExtValue()); if (Roots.find(V) != Roots.end()) // No duplicates, please. return false; - // FIXME: Add support for negative values. - if (V < 0) { - DEBUG(dbgs() << "LRR: Aborting due to negative value: " << V << "\n"); - return false; - } - Roots[V] = cast<Instruction>(I); } @@ -731,7 +720,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { unsigned NumBaseUses = BaseUsers.size(); if (NumBaseUses == 0) NumBaseUses = Roots.begin()->second->getNumUses(); - + // Check that every node has the same number of users. for (auto &KV : Roots) { if (KV.first == 0) @@ -744,7 +733,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { } } - return true; + return true; } bool LoopReroll::DAGRootTracker:: @@ -787,7 +776,7 @@ findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) { if (!collectPossibleRoots(IVU, V)) return false; - // If we didn't get a root for index zero, then IVU must be + // If we didn't get a root for index zero, then IVU must be // subsumed. if (V.find(0) == V.end()) SubsumedInsts.insert(IVU); @@ -818,13 +807,10 @@ findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) { } bool LoopReroll::DAGRootTracker::findRoots() { - - const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV)); - Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))-> - getValue()->getZExtValue(); + Inc = IVToIncMap[IV]; assert(RootSets.empty() && "Unclean state!"); - if (Inc == 1) { + if (std::abs(Inc) == 1) { for (auto *IVU : IV->users()) { if (isLoopIncrement(IVU, IV)) LoopIncs.push_back(cast<Instruction>(IVU)); @@ -996,6 +982,25 @@ bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I, return false; } +static bool isIgnorableInst(const Instruction *I) { + if (isa<DbgInfoIntrinsic>(I)) + return true; + const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I); + if (!II) + return false; + switch (II->getIntrinsicID()) { + default: + return false; + case llvm::Intrinsic::annotation: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + // TODO: the following intrinsics may also be whitelisted: + // lifetime_start, lifetime_end, invariant_start, invariant_end + return true; + } + return false; +} + bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { // We now need to check for equivalence of the use graph of each root with // that of the primary induction variable (excluding the roots). Our goal @@ -1029,7 +1034,7 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { // Make sure all instructions in the loop are in one and only one // set. for (auto &KV : Uses) { - if (KV.second.count() != 1) { + if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) { DEBUG(dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: " << *KV.first << " (#uses=" << KV.second.count() << ")\n"); return false; @@ -1103,15 +1108,15 @@ bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) { " vs. " << *RootInst << "\n"); return false; } - + RootIt = TryIt; RootInst = TryIt->first; } // All instructions between the last root and this root - // may belong to some other iteration. If they belong to a + // may belong to some other iteration. If they belong to a // future iteration, then they're dangerous to alias with. 
- // + // // Note that because we allow a limited amount of flexibility in the order // that we visit nodes, LastRootIt might be *before* RootIt, in which // case we've already checked this set of instructions so we shouldn't @@ -1267,6 +1272,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { ++J; } + bool Negative = IVToIncMap[IV] < 0; const DataLayout &DL = Header->getModule()->getDataLayout(); // We need to create a new induction variable for each different BaseInst. @@ -1275,13 +1281,12 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst)); const SCEV *Start = RealIVSCEV->getStart(); - const SCEVAddRecExpr *H = cast<SCEVAddRecExpr> - (SE->getAddRecExpr(Start, - SE->getConstant(RealIVSCEV->getType(), 1), - L, SCEV::FlagAnyWrap)); + const SCEVAddRecExpr *H = cast<SCEVAddRecExpr>(SE->getAddRecExpr( + Start, SE->getConstant(RealIVSCEV->getType(), Negative ? -1 : 1), L, + SCEV::FlagAnyWrap)); { // Limit the lifetime of SCEVExpander. SCEVExpander Expander(*SE, DL, "reroll"); - Value *NewIV = Expander.expandCodeFor(H, IV->getType(), Header->begin()); + Value *NewIV = Expander.expandCodeFor(H, IV->getType(), &Header->front()); for (auto &KV : Uses) { if (KV.second.find_first() == 0) @@ -1294,8 +1299,8 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE); // Iteration count SCEV minus 1 - const SCEV *ICMinus1SCEV = - SE->getMinusSCEV(ICSCEV, SE->getConstant(ICSCEV->getType(), 1)); + const SCEV *ICMinus1SCEV = SE->getMinusSCEV( + ICSCEV, SE->getConstant(ICSCEV->getType(), Negative ? -1 : 1)); Value *ICMinus1; // Iteration count minus 1 if (isa<SCEVConstant>(ICMinus1SCEV)) { @@ -1303,7 +1308,7 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *IterCount) { } else { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) - Preheader = InsertPreheaderForLoop(L, Parent); + Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); ICMinus1 = Expander.expandCodeFor(ICMinus1SCEV, NewIV->getType(), Preheader->getTerminator()); @@ -1444,13 +1449,14 @@ void LoopReroll::ReductionTracker::replaceSelected() { bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount, ReductionTracker &Reductions) { - DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI); + DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA, + IVToIncMap); if (!DAGRoots.findRoots()) return false; DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV << "\n"); - + if (!DAGRoots.validate(Reductions)) return false; if (!Reductions.validateSelected()) @@ -1469,11 +1475,12 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { if (skipOptnoneFunction(L)) return false; - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << @@ -1490,13 +1497,13 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { return Changed; const SCEV *LIBETC = SE->getBackedgeTakenCount(L); - const SCEV *IterCount = - 
SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1)); + const SCEV *IterCount = SE->getAddExpr(LIBETC, SE->getOne(LIBETC->getType())); DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n"); // First, we need to find the induction variable with respect to which we can // reroll (there may be several possible options). SmallInstructionVector PossibleIVs; + IVToIncMap.clear(); collectPossibleIVs(L, PossibleIVs); if (PossibleIVs.empty()) { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp index a675e12..5e6c2da 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -13,11 +13,15 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" @@ -41,95 +45,6 @@ DefaultRotationThreshold("rotation-max-header-size", cl::init(16), cl::Hidden, cl::desc("The default maximum header size for automatic loop rotation")); STATISTIC(NumRotated, "Number of loops rotated"); -namespace { - - class LoopRotate : public LoopPass { - public: - static char ID; // Pass ID, replacement for typeid - LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { - initializeLoopRotatePass(*PassRegistry::getPassRegistry()); - if (SpecifiedMaxHeaderSize == -1) - MaxHeaderSize = DefaultRotationThreshold; - else - MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); - } - - // LCSSA form makes instruction renaming easier. - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addPreserved<ScalarEvolution>(); - AU.addRequired<TargetTransformInfoWrapperPass>(); - } - - bool runOnLoop(Loop *L, LPPassManager &LPM) override; - bool simplifyLoopLatch(Loop *L); - bool rotateLoop(Loop *L, bool SimplifiedLatch); - - private: - unsigned MaxHeaderSize; - LoopInfo *LI; - const TargetTransformInfo *TTI; - AssumptionCache *AC; - DominatorTree *DT; - }; -} - -char LoopRotate::ID = 0; -INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopSimplify) -INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) - -Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { - return new LoopRotate(MaxHeaderSize); -} - -/// Rotate Loop L as many times as possible. Return true if -/// the loop is rotated at least once. -bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { - if (skipOptnoneFunction(L)) - return false; - - // Save the loop metadata. 
- MDNode *LoopMD = L->getLoopID(); - - Function &F = *L->getHeader()->getParent(); - - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; - - // Simplify the loop latch before attempting to rotate the header - // upward. Rotation may not be needed if the loop tail can be folded into the - // loop exit. - bool SimplifiedLatch = simplifyLoopLatch(L); - - // One loop can be rotated multiple times. - bool MadeChange = false; - while (rotateLoop(L, SimplifiedLatch)) { - MadeChange = true; - SimplifiedLatch = false; - } - - // Restore the loop metadata. - // NB! We presume LoopRotation DOESN'T ADD its own metadata. - if ((MadeChange || SimplifiedLatch) && LoopMD) - L->setLoopID(LoopMD); - - return MadeChange; -} /// RewriteUsesOfClonedInstructions - We just cloned the instructions from the /// old header into the preheader. If there were uses of the values produced by @@ -147,7 +62,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, // as necessary. SSAUpdater SSA; for (I = OrigHeader->begin(); I != E; ++I) { - Value *OrigHeaderVal = I; + Value *OrigHeaderVal = &*I; // If there are no uses of the value (e.g. because it returns void), there // is nothing to rewrite. @@ -196,127 +111,6 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, } } -/// Determine whether the instructions in this range may be safely and cheaply -/// speculated. This is not an important enough situation to develop complex -/// heuristics. We handle a single arithmetic instruction along with any type -/// conversions. -static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, - BasicBlock::iterator End, Loop *L) { - bool seenIncrement = false; - bool MultiExitLoop = false; - - if (!L->getExitingBlock()) - MultiExitLoop = true; - - for (BasicBlock::iterator I = Begin; I != End; ++I) { - - if (!isSafeToSpeculativelyExecute(I)) - return false; - - if (isa<DbgInfoIntrinsic>(I)) - continue; - - switch (I->getOpcode()) { - default: - return false; - case Instruction::GetElementPtr: - // GEPs are cheap if all indices are constant. - if (!cast<GEPOperator>(I)->hasAllConstantIndices()) - return false; - // fall-thru to increment case - case Instruction::Add: - case Instruction::Sub: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: { - Value *IVOpnd = !isa<Constant>(I->getOperand(0)) - ? I->getOperand(0) - : !isa<Constant>(I->getOperand(1)) - ? I->getOperand(1) - : nullptr; - if (!IVOpnd) - return false; - - // If increment operand is used outside of the loop, this speculation - // could cause extra live range interference. - if (MultiExitLoop) { - for (User *UseI : IVOpnd->users()) { - auto *UserInst = cast<Instruction>(UseI); - if (!L->contains(UserInst)) - return false; - } - } - - if (seenIncrement) - return false; - seenIncrement = true; - break; - } - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - // ignore type conversions - break; - } - } - return true; -} - -/// Fold the loop tail into the loop exit by speculating the loop tail -/// instructions. Typically, this is a single post-increment. 
In the case of a -/// simple 2-block loop, hoisting the increment can be much better than -/// duplicating the entire loop header. In the case of loops with early exits, -/// rotation will not work anyway, but simplifyLoopLatch will put the loop in -/// canonical form so downstream passes can handle it. -/// -/// I don't believe this invalidates SCEV. -bool LoopRotate::simplifyLoopLatch(Loop *L) { - BasicBlock *Latch = L->getLoopLatch(); - if (!Latch || Latch->hasAddressTaken()) - return false; - - BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); - if (!Jmp || !Jmp->isUnconditional()) - return false; - - BasicBlock *LastExit = Latch->getSinglePredecessor(); - if (!LastExit || !L->isLoopExiting(LastExit)) - return false; - - BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); - if (!BI) - return false; - - if (!shouldSpeculateInstrs(Latch->begin(), Jmp, L)) - return false; - - DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " - << LastExit->getName() << "\n"); - - // Hoist the instructions from Latch into LastExit. - LastExit->getInstList().splice(BI, Latch->getInstList(), Latch->begin(), Jmp); - - unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; - BasicBlock *Header = Jmp->getSuccessor(0); - assert(Header == L->getHeader() && "expected a backward branch"); - - // Remove Latch from the CFG so that LastExit becomes the new Latch. - BI->setSuccessor(FallThruPath, Header); - Latch->replaceSuccessorsPhiUsesWith(LastExit); - Jmp->eraseFromParent(); - - // Nuke the Latch block. - assert(Latch->empty() && "unable to evacuate Latch"); - LI->removeBlock(Latch); - if (DT) - DT->eraseNode(Latch); - Latch->eraseFromParent(); - return true; -} - /// Rotate loop LP. Return true if the loop is rotated. /// /// \param SimplifiedLatch is true if the latch was just folded into the final @@ -327,7 +121,10 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) { /// rotation. LoopRotate should be repeatable and converge to a canonical /// form. This property is satisfied because simplifying the loop latch can only /// happen once across multiple invocations of the LoopRotate pass. -bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { +static bool rotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE, + bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; @@ -382,7 +179,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // Anything ScalarEvolution may know about this loop or the PHI nodes // in its header will soon be invalidated. - if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) + if (SE) SE->forgetLoop(L); DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); @@ -420,7 +217,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // possible or create a clone in the OldPreHeader if not. TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); while (I != E) { - Instruction *Inst = I++; + Instruction *Inst = &*I++; // If the instruction's operands are invariant and it doesn't read or write // memory, then it is safe to hoist. Doing this doesn't change the order of @@ -465,8 +262,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's // successors by duplicating their incoming values for OrigHeader. 
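  // For intuition, the overall transformation in source terms; a minimal
  // illustrative sketch, not code from this patch (work() is an assumed
  // stand-in for the loop body):
  //
  //   int i = 0;                     int i = 0;
  //   while (i < n) {                if (i < n) {          // cloned test
  //     work(i);              =>       do {
  //     ++i;                             work(i); ++i;
  //   }                                } while (i < n);    // latch test
  //                                  }
  //
  // The PHI fixups below are needed because the cloned header terminator
  // makes OrigPreheader a new predecessor of the header's successors.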
TerminatorInst *TI = OrigHeader->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - for (BasicBlock::iterator BI = TI->getSuccessor(i)->begin(); + for (BasicBlock *SuccBB : TI->successors()) + for (BasicBlock::iterator BI = SuccBB->begin(); PHINode *PN = dyn_cast<PHINode>(BI); ++BI) PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); @@ -607,3 +404,221 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { ++NumRotated; return true; } + +/// Determine whether the instructions in this range may be safely and cheaply +/// speculated. This is not an important enough situation to develop complex +/// heuristics. We handle a single arithmetic instruction along with any type +/// conversions. +static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, + BasicBlock::iterator End, Loop *L) { + bool seenIncrement = false; + bool MultiExitLoop = false; + + if (!L->getExitingBlock()) + MultiExitLoop = true; + + for (BasicBlock::iterator I = Begin; I != End; ++I) { + + if (!isSafeToSpeculativelyExecute(&*I)) + return false; + + if (isa<DbgInfoIntrinsic>(I)) + continue; + + switch (I->getOpcode()) { + default: + return false; + case Instruction::GetElementPtr: + // GEPs are cheap if all indices are constant. + if (!cast<GEPOperator>(I)->hasAllConstantIndices()) + return false; + // fall-thru to increment case + case Instruction::Add: + case Instruction::Sub: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: { + Value *IVOpnd = !isa<Constant>(I->getOperand(0)) + ? I->getOperand(0) + : !isa<Constant>(I->getOperand(1)) + ? I->getOperand(1) + : nullptr; + if (!IVOpnd) + return false; + + // If increment operand is used outside of the loop, this speculation + // could cause extra live range interference. + if (MultiExitLoop) { + for (User *UseI : IVOpnd->users()) { + auto *UserInst = cast<Instruction>(UseI); + if (!L->contains(UserInst)) + return false; + } + } + + if (seenIncrement) + return false; + seenIncrement = true; + break; + } + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + // ignore type conversions + break; + } + } + return true; +} + +/// Fold the loop tail into the loop exit by speculating the loop tail +/// instructions. Typically, this is a single post-increment. In the case of a +/// simple 2-block loop, hoisting the increment can be much better than +/// duplicating the entire loop header. In the case of loops with early exits, +/// rotation will not work anyway, but simplifyLoopLatch will put the loop in +/// canonical form so downstream passes can handle it. +/// +/// I don't believe this invalidates SCEV. +static bool simplifyLoopLatch(Loop *L, LoopInfo *LI, DominatorTree *DT) { + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch || Latch->hasAddressTaken()) + return false; + + BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); + if (!Jmp || !Jmp->isUnconditional()) + return false; + + BasicBlock *LastExit = Latch->getSinglePredecessor(); + if (!LastExit || !L->isLoopExiting(LastExit)) + return false; + + BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); + if (!BI) + return false; + + if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L)) + return false; + + DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " + << LastExit->getName() << "\n"); + + // Hoist the instructions from Latch into LastExit. 
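  // A note on the iterator churn here and throughout this commit:
  // BasicBlock::iterator no longer converts implicitly to Instruction* and
  // back, so both directions are now spelled out explicitly:
  //
  //   Instruction *Inst = &*I;                        // iterator -> pointer
  //   BasicBlock::iterator It = Inst->getIterator();  // pointer -> iterator
  //
  // which is why the splice below names BI->getIterator() and
  // Jmp->getIterator() where bare pointers used to suffice.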
+ LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), + Latch->begin(), Jmp->getIterator()); + + unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; + BasicBlock *Header = Jmp->getSuccessor(0); + assert(Header == L->getHeader() && "expected a backward branch"); + + // Remove Latch from the CFG so that LastExit becomes the new Latch. + BI->setSuccessor(FallThruPath, Header); + Latch->replaceSuccessorsPhiUsesWith(LastExit); + Jmp->eraseFromParent(); + + // Nuke the Latch block. + assert(Latch->empty() && "unable to evacuate Latch"); + LI->removeBlock(Latch); + if (DT) + DT->eraseNode(Latch); + Latch->eraseFromParent(); + return true; +} + +/// Rotate \c L as many times as possible. Return true if the loop is rotated +/// at least once. +static bool iterativelyRotateLoop(Loop *L, unsigned MaxHeaderSize, LoopInfo *LI, + const TargetTransformInfo *TTI, + AssumptionCache *AC, DominatorTree *DT, + ScalarEvolution *SE) { + // Save the loop metadata. + MDNode *LoopMD = L->getLoopID(); + + // Simplify the loop latch before attempting to rotate the header + // upward. Rotation may not be needed if the loop tail can be folded into the + // loop exit. + bool SimplifiedLatch = simplifyLoopLatch(L, LI, DT); + + // One loop can be rotated multiple times. + bool MadeChange = false; + while (rotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE, SimplifiedLatch)) { + MadeChange = true; + SimplifiedLatch = false; + } + + // Restore the loop metadata. + // NB! We presume LoopRotation DOESN'T ADD its own metadata. + if ((MadeChange || SimplifiedLatch) && LoopMD) + L->setLoopID(LoopMD); + + return MadeChange; +} + +namespace { + +class LoopRotate : public LoopPass { + unsigned MaxHeaderSize; + +public: + static char ID; // Pass ID, replacement for typeid + LoopRotate(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) { + initializeLoopRotatePass(*PassRegistry::getPassRegistry()); + if (SpecifiedMaxHeaderSize == -1) + MaxHeaderSize = DefaultRotationThreshold; + else + MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize); + } + + // LCSSA form makes instruction renaming easier. + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<AAResultsWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addPreserved<ScalarEvolutionWrapperPass>(); + AU.addPreserved<SCEVAAWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<BasicAAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipOptnoneFunction(L)) + return false; + Function &F = *L->getHeader()->getParent(); + + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); + auto *SE = SEWP ? 
&SEWP->getSE() : nullptr; + + return iterativelyRotateLoop(L, MaxHeaderSize, LI, TTI, AC, DT, SE); + } +}; +} + +char LoopRotate::ID = 0; +INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) + +Pass *llvm::createLoopRotatePass(int MaxHeaderSize) { + return new LoopRotate(MaxHeaderSize); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 4b59f3d..2101225 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -105,10 +105,33 @@ static bool StressIVChain = false; namespace { -/// RegSortData - This class holds data which is used to order reuse candidates. +struct MemAccessTy { + /// Used in situations where the accessed memory type is unknown. + static const unsigned UnknownAddressSpace = ~0u; + + Type *MemTy; + unsigned AddrSpace; + + MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {} + + MemAccessTy(Type *Ty, unsigned AS) : + MemTy(Ty), AddrSpace(AS) {} + + bool operator==(MemAccessTy Other) const { + return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace; + } + + bool operator!=(MemAccessTy Other) const { return !(*this == Other); } + + static MemAccessTy getUnknown(LLVMContext &Ctx) { + return MemAccessTy(Type::getVoidTy(Ctx), UnknownAddressSpace); + } +}; + +/// This class holds data which is used to order reuse candidates. class RegSortData { public: - /// UsedByIndices - This represents the set of LSRUse indices which reference + /// This represents the set of LSRUse indices which reference /// a particular register. SmallBitVector UsedByIndices; @@ -122,16 +145,14 @@ void RegSortData::print(raw_ostream &OS) const { OS << "[NumUses=" << UsedByIndices.count() << ']'; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void RegSortData::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { -/// RegUseTracker - Map register candidates to information about how they are -/// used. +/// Map register candidates to information about how they are used. 
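// In miniature, with standard containers standing in for DenseMap and
// SmallBitVector (an illustrative sketch, not the class below):
//
//   std::map<const SCEV *, std::vector<bool>> UsedByIndices;
//   // countRegister(Reg, LUIdx):            UsedByIndices[Reg][LUIdx] = true
//   // isRegUsedByUsesOtherThan(Reg, LUIdx): any *other* bit set for Reg?
//
// One bit per LSRUse index makes "is this register shared with another use?"
// a cheap bitset query, which the formula-pruning logic leans on heavily.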
class RegUseTracker { typedef DenseMap<const SCEV *, RegSortData> RegUsesTy; @@ -139,9 +160,9 @@ class RegUseTracker { SmallVector<const SCEV *, 16> RegSequence; public: - void CountRegister(const SCEV *Reg, size_t LUIdx); - void DropRegister(const SCEV *Reg, size_t LUIdx); - void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx); + void countRegister(const SCEV *Reg, size_t LUIdx); + void dropRegister(const SCEV *Reg, size_t LUIdx); + void swapAndDropUse(size_t LUIdx, size_t LastLUIdx); bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const; @@ -160,7 +181,7 @@ public: } void -RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) { +RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) { std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.insert(std::make_pair(Reg, RegSortData())); RegSortData &RSD = Pair.first->second; @@ -171,7 +192,7 @@ RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) { } void -RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) { +RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) { RegUsesTy::iterator It = RegUsesMap.find(Reg); assert(It != RegUsesMap.end()); RegSortData &RSD = It->second; @@ -180,7 +201,7 @@ RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) { } void -RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) { +RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) { assert(LUIdx <= LastLUIdx); // Update RegUses. The data structure is not optimized for this purpose; @@ -219,9 +240,8 @@ void RegUseTracker::clear() { namespace { -/// Formula - This class holds information that describes a formula for -/// computing satisfying a use. It may include broken-out immediates and scaled -/// registers. +/// This class holds information that describes a formula for computing +/// satisfying a use. It may include broken-out immediates and scaled registers. struct Formula { /// Global base address used for complex addressing. GlobalValue *BaseGV; @@ -235,8 +255,8 @@ struct Formula { /// The scale of any complex addressing. int64_t Scale; - /// BaseRegs - The list of "base" registers for this use. When this is - /// non-empty. The canonical representation of a formula is + /// The list of "base" registers for this use. When this is non-empty. The + /// canonical representation of a formula is /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty(). /// #1 enforces that the scaled register is always used when at least two @@ -247,31 +267,31 @@ struct Formula { /// form. SmallVector<const SCEV *, 4> BaseRegs; - /// ScaledReg - The 'scaled' register for this use. This should be non-null - /// when Scale is not zero. + /// The 'scaled' register for this use. This should be non-null when Scale is + /// not zero. const SCEV *ScaledReg; - /// UnfoldedOffset - An additional constant offset which added near the - /// use. This requires a temporary register, but the offset itself can - /// live in an add immediate field rather than a register. + /// An additional constant offset which added near the use. This requires a + /// temporary register, but the offset itself can live in an add immediate + /// field rather than a register. 
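  // Putting the fields together, an address computed as
  //     GV + 8 + reg1 + 2*reg2     (plus, say, 40 the target cannot fold)
  // is held as BaseGV = GV, BaseOffset = 8, BaseRegs = {reg1},
  // ScaledReg = reg2, Scale = 2, UnfoldedOffset = 40. Canonical form then
  // guarantees that with two or more registers the scaled slot is occupied,
  // while a trivial 1*reg is kept as a plain base register (see
  // canonicalize() and unscale() below).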
int64_t UnfoldedOffset; Formula() : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0), ScaledReg(nullptr), UnfoldedOffset(0) {} - void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); + void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); bool isCanonical() const; - void Canonicalize(); + void canonicalize(); - bool Unscale(); + bool unscale(); size_t getNumRegs() const; Type *getType() const; - void DeleteBaseReg(const SCEV *&S); + void deleteBaseReg(const SCEV *&S); bool referencesReg(const SCEV *S) const; bool hasRegsUsedByUsesOtherThan(size_t LUIdx, @@ -283,7 +303,7 @@ struct Formula { } -/// DoInitialMatch - Recursion helper for InitialMatch. +/// Recursion helper for initialMatch. static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl<const SCEV *> &Good, SmallVectorImpl<const SCEV *> &Bad, @@ -336,10 +356,9 @@ static void DoInitialMatch(const SCEV *S, Loop *L, Bad.push_back(S); } -/// InitialMatch - Incorporate loop-variant parts of S into this Formula, -/// attempting to keep all loop-invariant and loop-computable values in a -/// single base register. -void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { +/// Incorporate loop-variant parts of S into this Formula, attempting to keep +/// all loop-invariant and loop-computable values in a single base register. +void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { SmallVector<const SCEV *, 4> Good; SmallVector<const SCEV *, 4> Bad; DoInitialMatch(S, L, Good, Bad, SE); @@ -355,7 +374,7 @@ void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { BaseRegs.push_back(Sum); HasBaseReg = true; } - Canonicalize(); + canonicalize(); } /// \brief Check whether or not this formula statisfies the canonical @@ -373,7 +392,7 @@ bool Formula::isCanonical() const { /// field. Otherwise, we would have to do special cases everywhere in LSR /// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ... /// On the other hand, 1*reg should be canonicalized into reg. -void Formula::Canonicalize() { +void Formula::canonicalize() { if (isCanonical()) return; // So far we did not need this case. This is easy to implement but it is @@ -394,7 +413,7 @@ void Formula::Canonicalize() { /// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2. /// \return true if it was possible to get rid of the scale, false otherwise. /// \note After this operation the formula may not be in the canonical form. -bool Formula::Unscale() { +bool Formula::unscale() { if (Scale != 1) return false; Scale = 0; @@ -403,15 +422,14 @@ bool Formula::Unscale() { return true; } -/// getNumRegs - Return the total number of register operands used by this -/// formula. This does not include register uses implied by non-constant -/// addrec strides. +/// Return the total number of register operands used by this formula. This does +/// not include register uses implied by non-constant addrec strides. size_t Formula::getNumRegs() const { return !!ScaledReg + BaseRegs.size(); } -/// getType - Return the type of this formula, if it has one, or null -/// otherwise. This type is meaningless except for the bit size. +/// Return the type of this formula, if it has one, or null otherwise. This type +/// is meaningless except for the bit size. Type *Formula::getType() const { return !BaseRegs.empty() ? BaseRegs.front()->getType() : ScaledReg ? 
ScaledReg->getType() : @@ -419,21 +437,21 @@ Type *Formula::getType() const { nullptr; } -/// DeleteBaseReg - Delete the given base reg from the BaseRegs list. -void Formula::DeleteBaseReg(const SCEV *&S) { +/// Delete the given base reg from the BaseRegs list. +void Formula::deleteBaseReg(const SCEV *&S) { if (&S != &BaseRegs.back()) std::swap(S, BaseRegs.back()); BaseRegs.pop_back(); } -/// referencesReg - Test if this formula references the given register. +/// Test if this formula references the given register. bool Formula::referencesReg(const SCEV *S) const { return S == ScaledReg || std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end(); } -/// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers -/// which are used by uses other than the use with the given index. +/// Test whether this formula uses registers which are used by uses other than +/// the use with the given index. bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx, const RegUseTracker &RegUses) const { if (ScaledReg) @@ -481,30 +499,29 @@ void Formula::print(raw_ostream &OS) const { } } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void Formula::dump() const { print(errs()); errs() << '\n'; } -#endif -/// isAddRecSExtable - Return true if the given addrec can be sign-extended -/// without changing its value. +/// Return true if the given addrec can be sign-extended without changing its +/// value. static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { Type *WideTy = IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1); return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy)); } -/// isAddSExtable - Return true if the given add can be sign-extended -/// without changing its value. +/// Return true if the given add can be sign-extended without changing its +/// value. static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) { Type *WideTy = IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1); return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy)); } -/// isMulSExtable - Return true if the given mul can be sign-extended -/// without changing its value. +/// Return true if the given mul can be sign-extended without changing its +/// value. static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) { Type *WideTy = IntegerType::get(SE.getContext(), @@ -512,12 +529,11 @@ static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) { return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy)); } -/// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined -/// and if the remainder is known to be zero, or null otherwise. If -/// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified -/// to Y, ignoring that the multiplication may overflow, which is useful when -/// the result will be used in a context where the most significant bits are -/// ignored. +/// Return an expression for LHS /s RHS, if it can be determined and if the +/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits +/// is true, expressions like (X * Y) /s Y are simplified to Y, ignoring that +/// the multiplication may overflow, which is useful when the result will be +/// used in a context where the most significant bits are ignored. 
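// Some concrete instances of that contract (illustrative, with X and Y
// standing for arbitrary SCEVs):
//
//   6*X /s 3      ->  2*X     (remainder known zero)
//   X /s -1       ->  -1 * X  (rewritten so ScalarEvolution can fold)
//   7 /s 3        ->  null    (7 srem 3 != 0)
//   (X * Y) /s Y  ->  null normally, but X when IgnoreSignificantBits is
//                     set, since any wrap only pollutes the high bits.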
static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits = false) { @@ -528,7 +544,7 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, // Handle a few RHS special cases. const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS); if (RC) { - const APInt &RA = RC->getValue()->getValue(); + const APInt &RA = RC->getAPInt(); // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do // some folding. if (RA.isAllOnesValue()) @@ -542,8 +558,8 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) { if (!RC) return nullptr; - const APInt &LA = C->getValue()->getValue(); - const APInt &RA = RC->getValue()->getValue(); + const APInt &LA = C->getAPInt(); + const APInt &RA = RC->getAPInt(); if (LA.srem(RA) != 0) return nullptr; return SE.getConstant(LA.sdiv(RA)); @@ -603,12 +619,11 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, return nullptr; } -/// ExtractImmediate - If S involves the addition of a constant integer value, -/// return that integer value, and mutate S to point to a new SCEV with that -/// value excluded. +/// If S involves the addition of a constant integer value, return that integer +/// value, and mutate S to point to a new SCEV with that value excluded. static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) { - if (C->getValue()->getValue().getMinSignedBits() <= 64) { + if (C->getAPInt().getMinSignedBits() <= 64) { S = SE.getConstant(C->getType(), 0); return C->getValue()->getSExtValue(); } @@ -630,9 +645,8 @@ static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { return 0; } -/// ExtractSymbol - If S involves the addition of a GlobalValue address, -/// return that symbol, and mutate S to point to a new SCEV with that -/// value excluded. +/// If S involves the addition of a GlobalValue address, return that symbol, and +/// mutate S to point to a new SCEV with that value excluded. static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) { @@ -657,8 +671,8 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { return nullptr; } -/// isAddressUse - Returns true if the specified instruction is using the -/// specified value as an address. +/// Returns true if the specified instruction is using the specified value as an +/// address. static bool isAddressUse(Instruction *Inst, Value *OperandVal) { bool isAddress = isa<LoadInst>(Inst); if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { @@ -682,12 +696,15 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { return isAddress; } -/// getAccessType - Return the type of the memory being accessed. -static Type *getAccessType(const Instruction *Inst) { - Type *AccessTy = Inst->getType(); - if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) - AccessTy = SI->getOperand(0)->getType(); - else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { +/// Return the type of the memory being accessed. 
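// For example (illustrative; typed-load syntax as of this release):
//   store i32 %v, i32 addrspace(3)* %p   ->  MemAccessTy { i32, AS = 3 }
//   load i32, i32* %q                    ->  MemAccessTy { i32, AS = 0 }
// Intrinsic users only establish the type, so their AddrSpace stays Unknown,
// and pointer-typed MemTys are canonicalized below to i1* in their own
// address space, since all pointers place the same demands on an addressing
// mode.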
+static MemAccessTy getAccessType(const Instruction *Inst) { + MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace); + if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + AccessTy.MemTy = SI->getOperand(0)->getType(); + AccessTy.AddrSpace = SI->getPointerAddressSpace(); + } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + AccessTy.AddrSpace = LI->getPointerAddressSpace(); + } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { // Addressing modes can also be folded into prefetches and a variety // of intrinsics. switch (II->getIntrinsicID()) { @@ -696,21 +713,21 @@ static Type *getAccessType(const Instruction *Inst) { case Intrinsic::x86_sse2_storeu_pd: case Intrinsic::x86_sse2_storeu_dq: case Intrinsic::x86_sse2_storel_dq: - AccessTy = II->getArgOperand(0)->getType(); + AccessTy.MemTy = II->getArgOperand(0)->getType(); break; } } // All pointers have the same requirements, so canonicalize them to an // arbitrary pointer type to minimize variation. - if (PointerType *PTy = dyn_cast<PointerType>(AccessTy)) - AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), - PTy->getAddressSpace()); + if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy)) + AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), + PTy->getAddressSpace()); return AccessTy; } -/// isExistingPhi - Return true if this AddRec is already a phi in its loop. +/// Return true if this AddRec is already a phi in its loop. static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) { @@ -793,9 +810,8 @@ static bool isHighCostExpansion(const SCEV *S, return true; } -/// DeleteTriviallyDeadInstructions - If any of the instructions is the -/// specified set are trivially dead, delete them and see if this makes any of -/// their operands subsequently dead. +/// If any of the instructions is the specified set are trivially dead, delete +/// them and see if this makes any of their operands subsequently dead. static bool DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { bool Changed = false; @@ -842,7 +858,7 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, namespace { -/// Cost - This class is used to measure and compare candidate formulae. +/// This class is used to measure and compare candidate formulae. class Cost { /// TODO: Some of these could be merged. Also, a lexical ordering /// isn't always optimal. @@ -905,7 +921,7 @@ private: } -/// RateRegister - Tally up interesting quantities from the given register. +/// Tally up interesting quantities from the given register. void Cost::RateRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, @@ -951,9 +967,9 @@ void Cost::RateRegister(const SCEV *Reg, SE.hasComputableLoopEvolution(Reg, L); } -/// RatePrimaryRegister - Record this register in the set. If we haven't seen it -/// before, rate it. Optional LoserRegs provides a way to declare any formula -/// that refers to one of those regs an instant loser. +/// Record this register in the set. If we haven't seen it before, rate +/// it. Optional LoserRegs provides a way to declare any formula that refers to +/// one of those regs an instant loser. 
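// In outline: a register already in LoserRegs costs the current formula the
// game immediately; a register seen for the first time is rated, and if the
// rating itself comes back losing, the register is remembered in LoserRegs
// so every later formula that touches it fails fast instead of being
// re-rated.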
void Cost::RatePrimaryRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, @@ -1024,7 +1040,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, assert(isValid() && "invalid cost"); } -/// Lose - Set this cost to a losing value. +/// Set this cost to a losing value. void Cost::Lose() { NumRegs = ~0u; AddRecCost = ~0u; @@ -1035,7 +1051,7 @@ void Cost::Lose() { ScaleCost = ~0u; } -/// operator< - Choose the lower cost. +/// Choose the lower cost. bool Cost::operator<(const Cost &Other) const { return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost, ImmCost, SetupCost) < @@ -1061,37 +1077,35 @@ void Cost::print(raw_ostream &OS) const { OS << ", plus " << SetupCost << " setup cost"; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void Cost::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { -/// LSRFixup - An operand value in an instruction which is to be replaced -/// with some equivalent, possibly strength-reduced, replacement. +/// An operand value in an instruction which is to be replaced with some +/// equivalent, possibly strength-reduced, replacement. struct LSRFixup { - /// UserInst - The instruction which will be updated. + /// The instruction which will be updated. Instruction *UserInst; - /// OperandValToReplace - The operand of the instruction which will - /// be replaced. The operand may be used more than once; every instance - /// will be replaced. + /// The operand of the instruction which will be replaced. The operand may be + /// used more than once; every instance will be replaced. Value *OperandValToReplace; - /// PostIncLoops - If this user is to use the post-incremented value of an - /// induction variable, this variable is non-null and holds the loop - /// associated with the induction variable. + /// If this user is to use the post-incremented value of an induction + /// variable, this variable is non-null and holds the loop associated with the + /// induction variable. PostIncLoopSet PostIncLoops; - /// LUIdx - The index of the LSRUse describing the expression which - /// this fixup needs, minus an offset (below). + /// The index of the LSRUse describing the expression which this fixup needs, + /// minus an offset (below). size_t LUIdx; - /// Offset - A constant offset to be added to the LSRUse expression. - /// This allows multiple fixups to share the same LSRUse with different - /// offsets, for example in an unrolled loop. + /// A constant offset to be added to the LSRUse expression. This allows + /// multiple fixups to share the same LSRUse with different offsets, for + /// example in an unrolled loop. int64_t Offset; bool isUseFullyOutsideLoop(const Loop *L) const; @@ -1108,8 +1122,7 @@ LSRFixup::LSRFixup() : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)), Offset(0) {} -/// isUseFullyOutsideLoop - Test whether this fixup always uses its -/// value outside of the given loop. +/// Test whether this fixup always uses its value outside of the given loop. bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { // PHI nodes use their value in their incoming blocks. 
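  // For example, a value feeding an LCSSA PHI in the exit block still counts
  // as a use inside the loop, because it must be available at the end of its
  // incoming block:
  //
  //   latch:                                  ; inside L
  //     %v = add i32 %iv, 1
  //     br i1 %c, label %exit, label %header
  //   exit:                                   ; outside L
  //     %p = phi i32 [ %v, %latch ]           ; use attributed to %latch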
if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) { @@ -1149,16 +1162,15 @@ void LSRFixup::print(raw_ostream &OS) const { OS << ", Offset=" << Offset; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRFixup::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { -/// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding -/// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*. +/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted +/// SmallVectors of const SCEV*. struct UniquifierDenseMapInfo { static SmallVector<const SCEV *, 4> getEmptyKey() { SmallVector<const SCEV *, 4> V; @@ -1182,17 +1194,17 @@ struct UniquifierDenseMapInfo { } }; -/// LSRUse - This class holds the state that LSR keeps for each use in -/// IVUsers, as well as uses invented by LSR itself. It includes information -/// about what kinds of things can be folded into the user, information about -/// the user itself, and information about how the use may be satisfied. -/// TODO: Represent multiple users of the same expression in common? +/// This class holds the state that LSR keeps for each use in IVUsers, as well +/// as uses invented by LSR itself. It includes information about what kinds of +/// things can be folded into the user, information about the user itself, and +/// information about how the use may be satisfied. TODO: Represent multiple +/// users of the same expression in common? class LSRUse { DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier; public: - /// KindType - An enum for a kind of use, indicating what types of - /// scaled and immediate operands it might support. + /// An enum for a kind of use, indicating what types of scaled and immediate + /// operands it might support. enum KindType { Basic, ///< A normal use, with no folding. Special, ///< A special case of basic, allowing -1 scales. @@ -1204,15 +1216,14 @@ public: typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair; KindType Kind; - Type *AccessTy; + MemAccessTy AccessTy; SmallVector<int64_t, 8> Offsets; int64_t MinOffset; int64_t MaxOffset; - /// AllFixupsOutsideLoop - This records whether all of the fixups using this - /// LSRUse are outside of the loop, in which case some special-case heuristics - /// may be used. + /// This records whether all of the fixups using this LSRUse are outside of + /// the loop, in which case some special-case heuristics may be used. bool AllFixupsOutsideLoop; /// RigidFormula is set to true to guarantee that this use will be associated @@ -1222,26 +1233,24 @@ public: /// changing the formula. bool RigidFormula; - /// WidestFixupType - This records the widest use type for any fixup using - /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different - /// max fixup widths to be equivalent, because the narrower one may be relying - /// on the implicit truncation to truncate away bogus bits. + /// This records the widest use type for any fixup using this + /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max + /// fixup widths to be equivalent, because the narrower one may be relying on + /// the implicit truncation to truncate away bogus bits. Type *WidestFixupType; - /// Formulae - A list of ways to build a value that can satisfy this user. - /// After the list is populated, one of these is selected heuristically and - /// used to formulate a replacement for OperandValToReplace in UserInst. 
+ /// A list of ways to build a value that can satisfy this user. After the + /// list is populated, one of these is selected heuristically and used to + /// formulate a replacement for OperandValToReplace in UserInst. SmallVector<Formula, 12> Formulae; - /// Regs - The set of register candidates used by all formulae in this LSRUse. + /// The set of register candidates used by all formulae in this LSRUse. SmallPtrSet<const SCEV *, 4> Regs; - LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T), - MinOffset(INT64_MAX), - MaxOffset(INT64_MIN), - AllFixupsOutsideLoop(true), - RigidFormula(false), - WidestFixupType(nullptr) {} + LSRUse(KindType K, MemAccessTy AT) + : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN), + AllFixupsOutsideLoop(true), RigidFormula(false), + WidestFixupType(nullptr) {} bool HasFormulaWithSameRegs(const Formula &F) const; bool InsertFormula(const Formula &F); @@ -1254,8 +1263,8 @@ public: } -/// HasFormula - Test whether this use as a formula which has the same -/// registers as the given formula. +/// Test whether this use as a formula which has the same registers as the given +/// formula. bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { SmallVector<const SCEV *, 4> Key = F.BaseRegs; if (F.ScaledReg) Key.push_back(F.ScaledReg); @@ -1264,9 +1273,8 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { return Uniquifier.count(Key); } -/// InsertFormula - If the given formula has not yet been inserted, add it to -/// the list, and return true. Return false otherwise. -/// The formula must be in canonical form. +/// If the given formula has not yet been inserted, add it to the list, and +/// return true. Return false otherwise. The formula must be in canonical form. bool LSRUse::InsertFormula(const Formula &F) { assert(F.isCanonical() && "Invalid canonical representation"); @@ -1300,14 +1308,14 @@ bool LSRUse::InsertFormula(const Formula &F) { return true; } -/// DeleteFormula - Remove the given formula from this use's list. +/// Remove the given formula from this use's list. void LSRUse::DeleteFormula(Formula &F) { if (&F != &Formulae.back()) std::swap(F, Formulae.back()); Formulae.pop_back(); } -/// RecomputeRegs - Recompute the Regs field, and update RegUses. +/// Recompute the Regs field, and update RegUses. void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { // Now that we've filtered out some formulae, recompute the Regs set. SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs); @@ -1320,7 +1328,7 @@ void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { // Update the RegTracker. 
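  // Only registers that no surviving formula references are released here;
  // anything still in Regs keeps its UsedByIndices bit for this use, so
  // cross-use reuse queries stay accurate after pruning.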
for (const SCEV *S : OldRegs) if (!Regs.count(S)) - RegUses.DropRegister(S, LUIdx); + RegUses.dropRegister(S, LUIdx); } void LSRUse::print(raw_ostream &OS) const { @@ -1331,10 +1339,13 @@ void LSRUse::print(raw_ostream &OS) const { case ICmpZero: OS << "ICmpZero"; break; case Address: OS << "Address of "; - if (AccessTy->isPointerTy()) + if (AccessTy.MemTy->isPointerTy()) OS << "pointer"; // the full pointer type could be really verbose - else - OS << *AccessTy; + else { + OS << *AccessTy.MemTy; + } + + OS << " in addrspace(" << AccessTy.AddrSpace << ')'; } OS << ", Offsets={"; @@ -1353,19 +1364,19 @@ void LSRUse::print(raw_ostream &OS) const { OS << ", widest fixup type: " << *WidestFixupType; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRUse::dump() const { print(errs()); errs() << '\n'; } -#endif static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { switch (Kind) { case LSRUse::Address: - return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); + return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset, + HasBaseReg, Scale, AccessTy.AddrSpace); case LSRUse::ICmpZero: // There's not even a target hook for querying whether it would be legal to @@ -1412,7 +1423,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, int64_t MinOffset, int64_t MaxOffset, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { // Check for overflow. @@ -1433,7 +1444,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, int64_t MinOffset, int64_t MaxOffset, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, const Formula &F) { // For the purpose of isAMCompletelyFolded either having a canonical formula // or a scale not equal to zero is correct. @@ -1447,11 +1458,11 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); } -/// isLegalUse - Test whether we know how to expand the current formula. +/// Test whether we know how to expand the current formula. static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, - int64_t Scale) { + int64_t MaxOffset, LSRUse::KindType Kind, + MemAccessTy AccessTy, GlobalValue *BaseGV, + int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { // We know how to expand completely foldable formulae. 
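  // For intuition: on x86-64, a formula such as GV + base + 4*index + 8 maps
  // onto a single addressing mode, e.g. movl sym+8(%rdi,%rsi,4), %eax, and
  // the target hook answers whether such a mode exists for this access type,
  // address space, and every offset in [MinOffset, MaxOffset].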
return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale) || @@ -1463,8 +1474,8 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, } static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, - const Formula &F) { + int64_t MaxOffset, LSRUse::KindType Kind, + MemAccessTy AccessTy, const Formula &F) { return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); } @@ -1490,14 +1501,12 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, switch (LU.Kind) { case LSRUse::Address: { // Check the scaling factor cost with both the min and max offsets. - int ScaleCostMinOffset = - TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, - F.BaseOffset + LU.MinOffset, - F.HasBaseReg, F.Scale); - int ScaleCostMaxOffset = - TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV, - F.BaseOffset + LU.MaxOffset, - F.HasBaseReg, F.Scale); + int ScaleCostMinOffset = TTI.getScalingFactorCost( + LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg, + F.Scale, LU.AccessTy.AddrSpace); + int ScaleCostMaxOffset = TTI.getScalingFactorCost( + LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg, + F.Scale, LU.AccessTy.AddrSpace); assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 && "Legal addressing mode has an illegal cost!"); @@ -1515,7 +1524,7 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, } static bool isAlwaysFoldable(const TargetTransformInfo &TTI, - LSRUse::KindType Kind, Type *AccessTy, + LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg) { // Fast-path: zero is always foldable. @@ -1539,7 +1548,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, static bool isAlwaysFoldable(const TargetTransformInfo &TTI, ScalarEvolution &SE, int64_t MinOffset, int64_t MaxOffset, LSRUse::KindType Kind, - Type *AccessTy, const SCEV *S, bool HasBaseReg) { + MemAccessTy AccessTy, const SCEV *S, + bool HasBaseReg) { // Fast-path: zero is always foldable. if (S->isZero()) return true; @@ -1564,9 +1574,9 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, namespace { -/// IVInc - An individual increment in a Chain of IV increments. -/// Relate an IV user to an expression that computes the IV it uses from the IV -/// used by the previous link in the Chain. +/// An individual increment in a Chain of IV increments. Relate an IV user to +/// an expression that computes the IV it uses from the IV used by the previous +/// link in the Chain. /// /// For the head of a chain, IncExpr holds the absolute SCEV expression for the /// original IVOperand. The head of the chain's IVOperand is only valid during @@ -1582,8 +1592,8 @@ struct IVInc { UserInst(U), IVOperand(O), IncExpr(E) {} }; -// IVChain - The list of IV increments in program order. -// We typically add the head of a chain without finding subsequent links. +// The list of IV increments in program order. We typically add the head of a +// chain without finding subsequent links. struct IVChain { SmallVector<IVInc,1> Incs; const SCEV *ExprBase; @@ -1595,7 +1605,7 @@ struct IVChain { typedef SmallVectorImpl<IVInc>::const_iterator const_iterator; - // begin - return the first increment in the chain. + // Return the first increment in the chain. 
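  // The head entry is skipped deliberately: its IncExpr holds the absolute
  // expression for the chain's first IVOperand rather than a step from a
  // previous link, which is also why hasIncs() demands at least two entries.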
const_iterator begin() const { assert(!Incs.empty()); return std::next(Incs.begin()); } @@ -1604,32 +1614,30 @@ struct IVChain { return Incs.end(); } - // hasIncs - Returns true if this chain contains any increments. + // Returns true if this chain contains any increments. bool hasIncs() const { return Incs.size() >= 2; } - // add - Add an IVInc to the end of this chain. + // Add an IVInc to the end of this chain. void add(const IVInc &X) { Incs.push_back(X); } - // tailUserInst - Returns the last UserInst in the chain. + // Returns the last UserInst in the chain. Instruction *tailUserInst() const { return Incs.back().UserInst; } - // isProfitableIncrement - Returns true if IncExpr can be profitably added to - // this chain. + // Returns true if IncExpr can be profitably added to this chain. bool isProfitableIncrement(const SCEV *OperExpr, const SCEV *IncExpr, ScalarEvolution&); }; -/// ChainUsers - Helper for CollectChains to track multiple IV increment uses. -/// Distinguish between FarUsers that definitely cross IV increments and -/// NearUsers that may be used between IV increments. +/// Helper for CollectChains to track multiple IV increment uses. Distinguish +/// between FarUsers that definitely cross IV increments and NearUsers that may +/// be used between IV increments. struct ChainUsers { SmallPtrSet<Instruction*, 4> FarUsers; SmallPtrSet<Instruction*, 4> NearUsers; }; -/// LSRInstance - This class holds state for the main loop strength reduction -/// logic. +/// This class holds state for the main loop strength reduction logic. class LSRInstance { IVUsers &IU; ScalarEvolution &SE; @@ -1639,25 +1647,25 @@ class LSRInstance { Loop *const L; bool Changed; - /// IVIncInsertPos - This is the insert position that the current loop's - /// induction variable increment should be placed. In simple loops, this is - /// the latch block's terminator. But in more complicated cases, this is a - /// position which will dominate all the in-loop post-increment users. + /// This is the insert position at which the current loop's induction + /// variable increment should be placed. In simple loops, this is the latch + /// block's terminator. But in more complicated cases, this is a position + /// which will dominate all the in-loop post-increment users. Instruction *IVIncInsertPos; - /// Factors - Interesting factors between use strides. + /// Interesting factors between use strides. SmallSetVector<int64_t, 8> Factors; - /// Types - Interesting use types, to facilitate truncation reuse. + /// Interesting use types, to facilitate truncation reuse. SmallSetVector<Type *, 4> Types; - /// Fixups - The list of operands which are to be replaced. + /// The list of operands which are to be replaced. SmallVector<LSRFixup, 16> Fixups; - /// Uses - The list of interesting uses. + /// The list of interesting uses. SmallVector<LSRUse, 16> Uses; - /// RegUses - Track which uses use which register candidates. + /// Track which uses use which register candidates. RegUseTracker RegUses; // Limit the number of chains to avoid quadratic behavior. We don't expect to @@ -1665,10 +1673,10 @@ class LSRInstance { // back to normal LSR behavior for those uses. static const unsigned MaxChains = 8; - /// IVChainVec - IV users can form a chain of IV increments. + /// IV users can form a chain of IV increments. SmallVector<IVChain, MaxChains> IVChainVec; - /// IVIncSet - IV users that belong to profitable IVChains. + /// IV users that belong to profitable IVChains.
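The IVChain shape above can be sketched standalone (stand-in types, illustrative only): the first entry is the chain head, so iteration over the increments deliberately skips it, and hasIncs() only fires once a second entry exists:

#include <cassert>
#include <iterator>
#include <vector>

struct IVIncSketch { int UserInstId; long IncExpr; };

struct IVChainSketch {
  std::vector<IVIncSketch> Incs;

  // A chain only contains real increments once a second entry exists.
  bool hasIncs() const { return Incs.size() >= 2; }

  // Skip the head: Incs.front() is the base IV user, not an increment.
  std::vector<IVIncSketch>::const_iterator begin() const {
    assert(!Incs.empty() && "no head yet");
    return std::next(Incs.begin());
  }
  std::vector<IVIncSketch>::const_iterator end() const { return Incs.end(); }
};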
SmallPtrSet<Use*, MaxChains> IVIncSet; void OptimizeShadowIV(); @@ -1696,11 +1704,10 @@ class LSRInstance { UseMapTy UseMap; bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, - LSRUse::KindType Kind, Type *AccessTy); + LSRUse::KindType Kind, MemAccessTy AccessTy); - std::pair<size_t, int64_t> getUse(const SCEV *&Expr, - LSRUse::KindType Kind, - Type *AccessTy); + std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind, + MemAccessTy AccessTy); void DeleteUse(LSRUse &LU, size_t LUIdx); @@ -1769,18 +1776,16 @@ class LSRInstance { void RewriteForPHI(PHINode *PN, const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const; + SmallVectorImpl<WeakVH> &DeadInsts) const; void Rewrite(const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const; - void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, - Pass *P); + SmallVectorImpl<WeakVH> &DeadInsts) const; + void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution); public: - LSRInstance(Loop *L, Pass *P); + LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, + LoopInfo &LI, const TargetTransformInfo &TTI); bool getChanged() const { return Changed; } @@ -1793,8 +1798,8 @@ public: } -/// OptimizeShadowIV - If IV is used in a int-to-float cast -/// inside the loop then try to eliminate the cast operation. +/// If IV is used in an int-to-float cast inside the loop then try to eliminate +/// the cast operation. void LSRInstance::OptimizeShadowIV() { const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) return; @@ -1902,9 +1907,8 @@ void LSRInstance::OptimizeShadowIV() { } } -/// FindIVUserForCond - If Cond has an operand that is an expression of an IV, -/// set the IV user and stride information and return true, otherwise return -/// false. +/// If Cond has an operand that is an expression of an IV, set the IV user and +/// stride information and return true, otherwise return false. bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { for (IVStrideUse &U : IU) if (U.getUser() == Cond) { @@ -1917,8 +1921,7 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { return false; } -/// OptimizeMax - Rewrite the loop's terminating condition if it uses -/// a max computation. +/// Rewrite the loop's terminating condition if it uses a max computation. /// /// This is a narrow solution to a specific, but acute, problem. For loops /// like this: @@ -2076,8 +2079,7 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { return NewCond; } -/// OptimizeLoopTermCond - Change loop terminating condition to use the -/// postinc iv when possible. +/// Change loop terminating condition to use the postinc iv when possible. void LSRInstance::OptimizeLoopTermCond() { SmallPtrSet<Instruction *, 4> PostIncs; @@ -2152,16 +2154,18 @@ LSRInstance::OptimizeLoopTermCond() { C->getValue().isMinSignedValue()) goto decline_post_inc; // Check for possible scaled-address reuse.
- Type *AccessTy = getAccessType(UI->getUser()); + MemAccessTy AccessTy = getAccessType(UI->getUser()); int64_t Scale = C->getSExtValue(); - if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, - /*BaseOffset=*/ 0, - /*HasBaseReg=*/ false, Scale)) + if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, + /*BaseOffset=*/0, + /*HasBaseReg=*/false, Scale, + AccessTy.AddrSpace)) goto decline_post_inc; Scale = -Scale; - if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, - /*BaseOffset=*/ 0, - /*HasBaseReg=*/ false, Scale)) + if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr, + /*BaseOffset=*/0, + /*HasBaseReg=*/false, Scale, + AccessTy.AddrSpace)) goto decline_post_inc; } } @@ -2180,7 +2184,7 @@ LSRInstance::OptimizeLoopTermCond() { ICmpInst *OldCond = Cond; Cond = cast<ICmpInst>(Cond->clone()); Cond->setName(L->getHeader()->getName() + ".termcond"); - ExitingBlock->getInstList().insert(TermBr, Cond); + ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond); // Clone the IVUse, as the old use still exists! CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace()); @@ -2213,15 +2217,14 @@ LSRInstance::OptimizeLoopTermCond() { } } -/// reconcileNewOffset - Determine if the given use can accommodate a fixup -/// at the given offset and other details. If so, update the use and -/// return true. -bool -LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, - LSRUse::KindType Kind, Type *AccessTy) { +/// Determine if the given use can accommodate a fixup at the given offset and +/// other details. If so, update the use and return true. +bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, + bool HasBaseReg, LSRUse::KindType Kind, + MemAccessTy AccessTy) { int64_t NewMinOffset = LU.MinOffset; int64_t NewMaxOffset = LU.MaxOffset; - Type *NewAccessTy = AccessTy; + MemAccessTy NewAccessTy = AccessTy; // Check for a mismatched kind. It's tempting to collapse mismatched kinds to // something conservative, however this can pessimize in the case that one of @@ -2232,8 +2235,10 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, // Check for a mismatched access type, and fall back conservatively as needed. // TODO: Be less conservative when the type is similar and can use the same // addressing modes. - if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) - NewAccessTy = Type::getVoidTy(AccessTy->getContext()); + if (Kind == LSRUse::Address) { + if (AccessTy != LU.AccessTy) + NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext()); + } // Conservatively assume HasBaseReg is true for now. if (NewOffset < LU.MinOffset) { @@ -2257,12 +2262,12 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, return true; } -/// getUse - Return an LSRUse index and an offset value for a fixup which -/// needs the given expression, with the given kind and optional access type. -/// Either reuse an existing use or create a new one, as needed. -std::pair<size_t, int64_t> -LSRInstance::getUse(const SCEV *&Expr, - LSRUse::KindType Kind, Type *AccessTy) { +/// Return an LSRUse index and an offset value for a fixup which needs the given +/// expression, with the given kind and optional access type. Either reuse an +/// existing use or create a new one, as needed. 
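A sketch of the getUse() flow just described, with invented stand-in types: strip the constant immediate, then key the lookup on what remains so fixups that differ only by offset share one LSRUse:

#include <cstdint>
#include <map>
#include <string>
#include <utility>

// Pretend expressions have already been split into (base, immediate).
static std::pair<size_t, int64_t>
getUseSketch(std::map<std::string, size_t> &UseMap, const std::string &Base,
             int64_t Offset, size_t &NextUseIdx) {
  auto It = UseMap.find(Base);
  if (It != UseMap.end())
    return {It->second, Offset};     // reuse an existing use; offsets differ
  size_t LUIdx = NextUseIdx++;       // otherwise create a new use
  UseMap.emplace(Base, LUIdx);
  return {LUIdx, Offset};
}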
+std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr, + LSRUse::KindType Kind, + MemAccessTy AccessTy) { const SCEV *Copy = Expr; int64_t Offset = ExtractImmediate(Expr, SE); @@ -2300,18 +2305,18 @@ LSRInstance::getUse(const SCEV *&Expr, return std::make_pair(LUIdx, Offset); } -/// DeleteUse - Delete the given use from the Uses list. +/// Delete the given use from the Uses list. void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) { if (&LU != &Uses.back()) std::swap(LU, Uses.back()); Uses.pop_back(); // Update RegUses. - RegUses.SwapAndDropUse(LUIdx, Uses.size()); + RegUses.swapAndDropUse(LUIdx, Uses.size()); } -/// FindUseWithFormula - Look for a use distinct from OrigLU which is has -/// a formula that has the same registers as the given formula. +/// Look for a use distinct from OrigLU which has a formula that has the same +/// registers as the given formula. LSRUse * LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, const LSRUse &OrigLU) { @@ -2396,14 +2401,14 @@ void LSRInstance::CollectInterestingTypesAndFactors() { if (const SCEVConstant *Factor = dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride, SE, true))) { - if (Factor->getValue()->getValue().getMinSignedBits() <= 64) - Factors.insert(Factor->getValue()->getValue().getSExtValue()); + if (Factor->getAPInt().getMinSignedBits() <= 64) + Factors.insert(Factor->getAPInt().getSExtValue()); } else if (const SCEVConstant *Factor = dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride, NewStride, SE, true))) { - if (Factor->getValue()->getValue().getMinSignedBits() <= 64) - Factors.insert(Factor->getValue()->getValue().getSExtValue()); + if (Factor->getAPInt().getMinSignedBits() <= 64) + Factors.insert(Factor->getAPInt().getSExtValue()); } } @@ -2415,9 +2420,9 @@ void LSRInstance::CollectInterestingTypesAndFactors() { DEBUG(print_factors_and_types(dbgs())); } -/// findIVOperand - Helper for CollectChains that finds an IV operand (computed -/// by an AddRec in this loop) within [OI,OE) or returns OE. If IVUsers mapped -/// Instructions to IVStrideUses, we could partially skip this. +/// Helper for CollectChains that finds an IV operand (computed by an AddRec in +/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to +/// IVStrideUses, we could partially skip this. static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE) { @@ -2436,29 +2441,28 @@ findIVOperand(User::op_iterator OI, User::op_iterator OE, return OI; } -/// getWideOperand - IVChain logic must consistenctly peek base TruncInst -/// operands, so wrap it in a convenient helper. +/// IVChain logic must consistently peek base TruncInst operands, so wrap it in +/// a convenient helper. static Value *getWideOperand(Value *Oper) { if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper)) return Trunc->getOperand(0); return Oper; } -/// isCompatibleIVType - Return true if we allow an IV chain to include both -/// types. +/// Return true if we allow an IV chain to include both types. static bool isCompatibleIVType(Value *LVal, Value *RVal) { Type *LType = LVal->getType(); Type *RType = RVal->getType(); return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy()); } -/// getExprBase - Return an approximation of this SCEV expression's "base", or -/// NULL for any constant. Returning the expression itself is -/// conservative. Returning a deeper subexpression is more precise and valid as -/// long as it isn't less complex than another subexpression.
For expressions -/// involving multiple unscaled values, we need to return the pointer-type -/// SCEVUnknown. This avoids forming chains across objects, such as: -/// PrevOper==a[i], IVOper==b[i], IVInc==b-a. +/// Return an approximation of this SCEV expression's "base", or NULL for any +/// constant. Returning the expression itself is conservative. Returning a +/// deeper subexpression is more precise and valid as long as it isn't less +/// complex than another subexpression. For expressions involving multiple +/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids +/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i], +/// IVInc==b-a. /// /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost /// SCEVUnknown, we simply return the rightmost SCEV operand. @@ -2601,8 +2605,7 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users, return cost < 0; } -/// ChainInstruction - Add this IV user to an existing chain or make it the head -/// of a new chain. +/// Add this IV user to an existing chain or make it the head of a new chain. void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, SmallVectorImpl<ChainUsers> &ChainUsersVec) { // When IVs are used as types of varying widths, they are generally converted @@ -2714,7 +2717,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, ChainUsersVec[ChainIdx].FarUsers.erase(UserInst); } -/// CollectChains - Populate the vector of Chains. +/// Populate the vector of Chains. /// /// This decreases ILP at the architecture level. Targets with ample registers, /// multiple memory ports, and no register renaming probably don't want @@ -2755,19 +2758,19 @@ void LSRInstance::CollectChains() { for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end(); I != E; ++I) { // Skip instructions that weren't seen by IVUsers analysis. - if (isa<PHINode>(I) || !IU.isIVUserOrOperand(I)) + if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&*I)) continue; // Ignore users that are part of a SCEV expression. This way we only // consider leaf IV Users. This effectively rediscovers a portion of // IVUsers analysis but in program order this time. - if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(I))) + if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(&*I))) continue; // Remove this instruction from any NearUsers set it may be in. for (unsigned ChainIdx = 0, NChains = IVChainVec.size(); ChainIdx < NChains; ++ChainIdx) { - ChainUsersVec[ChainIdx].NearUsers.erase(I); + ChainUsersVec[ChainIdx].NearUsers.erase(&*I); } // Search for operands that can be chained. SmallPtrSet<Instruction*, 4> UniqueOperands; @@ -2776,7 +2779,7 @@ void LSRInstance::CollectChains() { while (IVOpIter != IVOpEnd) { Instruction *IVOpInst = cast<Instruction>(*IVOpIter); if (UniqueOperands.insert(IVOpInst).second) - ChainInstruction(I, IVOpInst, ChainUsersVec); + ChainInstruction(&*I, IVOpInst, ChainUsersVec); IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE); } } // Continue walking down the instructions. 
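The recurring "- I / + &*I" rewrites in this hunk track an LLVM API change: ilist iterators stopped converting implicitly to Instruction*, so call sites now dereference and take the address explicitly. The same idiom with a std::list, as a standalone sketch:

#include <iostream>
#include <list>

int main() {
  std::list<int> Block = {1, 2, 3};
  for (auto I = Block.begin(), E = Block.end(); I != E; ++I) {
    int *P = &*I;                    // explicit &*I; no implicit conversion
    std::cout << *P << '\n';
  }
  return 0;
}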
@@ -2828,20 +2831,20 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, if (!IncConst || !isAddressUse(UserInst, Operand)) return false; - if (IncConst->getValue()->getValue().getMinSignedBits() > 64) + if (IncConst->getAPInt().getMinSignedBits() > 64) return false; + MemAccessTy AccessTy = getAccessType(UserInst); int64_t IncOffset = IncConst->getValue()->getSExtValue(); - if (!isAlwaysFoldable(TTI, LSRUse::Address, - getAccessType(UserInst), /*BaseGV=*/ nullptr, - IncOffset, /*HaseBaseReg=*/ false)) + if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr, + IncOffset, /*HasBaseReg=*/false)) return false; return true; } -/// GenerateIVChains - Generate an add or subtract for each IVInc in a chain to -/// materialize the IV user's operand from the previous IV user's operand. +/// Generate an add or subtract for each IVInc in a chain to materialize the IV +/// user's operand from the previous IV user's operand. void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, SmallVectorImpl<WeakVH> &DeadInsts) { // Find the new IVOperand for the head of the chain. It may have been replaced @@ -2961,7 +2964,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { LF.PostIncLoops = U.getPostIncLoops(); LSRUse::KindType Kind = LSRUse::Basic; - Type *AccessTy = nullptr; + MemAccessTy AccessTy; if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { Kind = LSRUse::Address; AccessTy = getAccessType(LF.UserInst); @@ -3027,9 +3030,8 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { DEBUG(print_fixups(dbgs())); } -/// InsertInitialFormula - Insert a formula for the given expression into -/// the given use, separating out loop-variant portions from loop-invariant -/// and loop-computable portions. +/// Insert a formula for the given expression into the given use, separating out +/// loop-variant portions from loop-invariant and loop-computable portions. void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { // Mark uses whose expressions cannot be expanded. @@ -3037,13 +3039,13 @@ LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { LU.RigidFormula = true; Formula F; - F.InitialMatch(S, L, SE); + F.initialMatch(S, L, SE); bool Inserted = InsertFormula(LU, LUIdx, F); assert(Inserted && "Initial formula already exists!"); (void)Inserted; } -/// InsertSupplementalFormula - Insert a simple single-register formula for -/// the given expression into the given use. +/// Insert a simple single-register formula for the given expression into the +/// given use. void LSRInstance::InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { @@ -3054,17 +3056,16 @@ LSRInstance::InsertSupplementalFormula(const SCEV *S, assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; } -/// CountRegisters - Note which registers are used by the given formula, -/// updating RegUses. +/// Note which registers are used by the given formula, updating RegUses. void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { if (F.ScaledReg) - RegUses.CountRegister(F.ScaledReg, LUIdx); + RegUses.countRegister(F.ScaledReg, LUIdx); for (const SCEV *BaseReg : F.BaseRegs) - RegUses.CountRegister(BaseReg, LUIdx); + RegUses.countRegister(BaseReg, LUIdx); } -/// InsertFormula - If the given formula has not yet been inserted, add it to -/// the list, and return true. Return false otherwise. +/// If the given formula has not yet been inserted, add it to the list, and +/// return true.
Return false otherwise. bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { // Do not insert formula that we will not be able to expand. assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) && @@ -3076,9 +3077,9 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { return true; } -/// CollectLoopInvariantFixupsAndFormulae - Check for other uses of -/// loop-invariant values which we're tracking. These other uses will pin these -/// values in registers, making them less profitable for elimination. +/// Check for other uses of loop-invariant values which we're tracking. These +/// other uses will pin these values in registers, making them less profitable +/// for elimination. /// TODO: This currently misses non-constant addrec step registers. /// TODO: Should this give more weight to users inside the loop? void @@ -3124,6 +3125,9 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { PHINode::getIncomingValueNumForOperand(U.getOperandNo())); if (!DT.dominates(L->getHeader(), UseBB)) continue; + // Don't bother if the instruction is in a BB which ends in an EHPad. + if (UseBB->getTerminator()->isEHPad()) + continue; // Ignore uses which are part of other SCEV expressions, to avoid // analyzing them multiple times. if (SE.isSCEVable(UserInst->getType())) { @@ -3148,7 +3152,8 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { LSRFixup &LF = getNewFixup(); LF.UserInst = const_cast<Instruction *>(UserInst); LF.OperandValToReplace = U; - std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, nullptr); + std::pair<size_t, int64_t> P = getUse( + S, LSRUse::Basic, MemAccessTy()); LF.LUIdx = P.first; LF.Offset = P.second; LSRUse &LU = Uses[LF.LUIdx]; @@ -3165,8 +3170,8 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { } } -/// CollectSubexprs - Split S into subexpressions which can be pulled out into -/// separate registers. If C is non-null, multiply each subexpression by C. +/// Split S into subexpressions which can be pulled out into separate +/// registers. If C is non-null, multiply each subexpression by C. /// /// Return remainder expression after factoring the subexpressions captured by /// Ops. If Ops is complete, return NULL. @@ -3300,7 +3305,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, F.BaseRegs.push_back(*J); // We may have changed the number of register in base regs, adjust the // formula accordingly. - F.Canonicalize(); + F.canonicalize(); if (InsertFormula(LU, LUIdx, F)) // If that formula hadn't been seen before, recurse to find more like @@ -3309,8 +3314,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, } } -/// GenerateReassociations - Split out subexpressions from adds and the bases of -/// addrecs. +/// Split out subexpressions from adds and the bases of addrecs. void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base, unsigned Depth) { assert(Base.isCanonical() && "Input must be in the canonical form"); @@ -3326,8 +3330,8 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, /* Idx */ -1, /* IsScaledReg */ true); } -/// GenerateCombinations - Generate a formula consisting of all of the -/// loop-dominating registers added into a single register. +/// Generate a formula consisting of all of the loop-dominating registers added +/// into a single register. 
void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base) { // This method is only interesting on a plurality of registers. @@ -3336,7 +3340,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before // processing the formula. - Base.Unscale(); + Base.unscale(); Formula F = Base; F.BaseRegs.clear(); SmallVector<const SCEV *, 4> Ops; @@ -3354,7 +3358,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // rather than proceed with zero in a register. if (!Sum->isZero()) { F.BaseRegs.push_back(Sum); - F.Canonicalize(); + F.canonicalize(); (void)InsertFormula(LU, LUIdx, F); } } @@ -3379,7 +3383,7 @@ void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, (void)InsertFormula(LU, LUIdx, F); } -/// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets. +/// Generate reuse formulae using symbolic offsets. void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base) { // We can't add a symbolic offset if the address already contains one. @@ -3410,8 +3414,8 @@ void LSRInstance::GenerateConstantOffsetsImpl( F.Scale = 0; F.ScaledReg = nullptr; } else - F.DeleteBaseReg(F.BaseRegs[Idx]); - F.Canonicalize(); + F.deleteBaseReg(F.BaseRegs[Idx]); + F.canonicalize(); } else if (IsScaledReg) F.ScaledReg = NewG; else @@ -3452,8 +3456,8 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, /* IsScaledReg */ true); } -/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up -/// the comparison. For example, x == y -> x*c == y*c. +/// For ICmpZero, check to see if we can scale up the comparison. For example, x +/// == y -> x*c == y*c. void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base) { if (LU.Kind != LSRUse::ICmpZero) return; @@ -3538,8 +3542,8 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, } } -/// GenerateScales - Generate stride factor reuse formulae by making use of -/// scaled-offset address modes, for example. +/// Generate stride factor reuse formulae by making use of scaled-offset address +/// modes, for example. void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // Determine the integer type for the base formula. Type *IntTy = Base.getType(); @@ -3547,10 +3551,10 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // If this Formula already has a scaled register, we can't add another one. // Try to unscale the formula to generate a better scale. - if (Base.Scale != 0 && !Base.Unscale()) + if (Base.Scale != 0 && !Base.unscale()) return; - assert(Base.Scale == 0 && "Unscale did not did its job!"); + assert(Base.Scale == 0 && "unscale did not do its job!"); // Check each interesting stride. for (int64_t Factor : Factors) { @@ -3587,7 +3591,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // TODO: This could be optimized to avoid all the copying. Formula F = Base; F.ScaledReg = Quotient; - F.DeleteBaseReg(F.BaseRegs[i]); + F.deleteBaseReg(F.BaseRegs[i]); // The canonical representation of 1*reg is reg, which is already in // Base. In that case, do not try to insert the formula, it will be // rejected anyway. @@ -3599,7 +3603,7 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { } } -/// GenerateTruncates - Generate reuse formulae from different IV types. +/// Generate reuse formulae from different IV types.
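The ICmpZero scaling documented above can be checked concretely. The equivalence x == y if and only if x*c == y*c holds only while the multiplications cannot wrap, which is why the real code guards against overflow first; a toy demonstration:

#include <cassert>

int main() {
  const long c = 4;                        // a stride with an existing use
  for (long i = 0, n = 40; i <= n; ++i)
    assert((i == n) == (i * c == n * c));  // safe while i*c and n*c can't wrap
  return 0;
}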
void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { // Don't bother truncating symbolic values. if (Base.BaseGV) return; @@ -3629,9 +3633,9 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { namespace { -/// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to -/// defer modifications so that the search phase doesn't have to worry about -/// the data structures moving underneath it. +/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer +/// modifications so that the search phase doesn't have to worry about the data +/// structures moving underneath it. struct WorkItem { size_t LUIdx; int64_t Imm; @@ -3651,14 +3655,13 @@ void WorkItem::print(raw_ostream &OS) const { << " , add offset " << Imm; } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void WorkItem::dump() const { print(errs()); errs() << '\n'; } -#endif -/// GenerateCrossUseConstantOffsets - Look for registers which are a constant -/// distance apart and try to form reuse opportunities between them. +/// Look for registers which are a constant distance apart and try to form reuse +/// opportunities between them. void LSRInstance::GenerateCrossUseConstantOffsets() { // Group the registers by their value without any added constant offset. typedef std::map<int64_t, const SCEV *> ImmMapTy; @@ -3751,7 +3754,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // very similar but slightly different. Investigate if they // could be merged. That way, we would not have to unscale the // Formula. - F.Unscale(); + F.unscale(); // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; @@ -3770,14 +3773,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // value to the immediate would produce a value closer to zero than the // immediate itself, then the formula isn't worthwhile. if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) - if (C->getValue()->isNegative() != - (NewF.BaseOffset < 0) && - (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale)) - .ule(std::abs(NewF.BaseOffset))) + if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) && + (C->getAPInt().abs() * APInt(BitWidth, F.Scale)) + .ule(std::abs(NewF.BaseOffset))) continue; // OK, looks good. - NewF.Canonicalize(); + NewF.canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); } else { // Use the immediate in a base register. @@ -3801,15 +3803,15 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // zero than the immediate itself, then the formula isn't worthwhile. for (const SCEV *NewReg : NewF.BaseRegs) if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) - if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt( - std::abs(NewF.BaseOffset)) && - (C->getValue()->getValue() + - NewF.BaseOffset).countTrailingZeros() >= - countTrailingZeros<uint64_t>(NewF.BaseOffset)) + if ((C->getAPInt() + NewF.BaseOffset) + .abs() + .slt(std::abs(NewF.BaseOffset)) && + (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >= + countTrailingZeros<uint64_t>(NewF.BaseOffset)) goto skip_formula; // Ok, looks good. - NewF.Canonicalize(); + NewF.canonicalize(); (void)InsertFormula(LU, LUIdx, NewF); break; skip_formula:; @@ -3819,7 +3821,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { } } -/// GenerateAllReuseFormulae - Generate formulae for each use. +/// Generate formulae for each use. 
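A standalone sketch (invented names) of the grouping step behind GenerateCrossUseConstantOffsets above: split each register into a base plus immediate, and bucket registers by base so candidates like A, A+4, A+8 become reuse opportunities:

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  // (base, immediate) pairs standing in for SCEVs with offsets extracted.
  std::vector<std::pair<std::string, int64_t>> Regs = {
      {"A", 0}, {"A", 4}, {"B", 0}, {"A", 8}};
  std::map<std::string, std::map<int64_t, unsigned>> Buckets;
  for (const auto &R : Regs)
    ++Buckets[R.first][R.second];
  for (const auto &B : Buckets)  // prints: A has 3 offsets, B has 1
    std::cout << B.first << " has " << B.second.size() << " offsets\n";
  return 0;
}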
void LSRInstance::GenerateAllReuseFormulae() { // This is split into multiple loops so that hasRegsUsedByUsesOtherThan @@ -3959,10 +3961,9 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // This is a rough guess that seems to work fairly well. static const size_t ComplexityLimit = UINT16_MAX; -/// EstimateSearchSpaceComplexity - Estimate the worst-case number of -/// solutions the solver might have to consider. It almost never considers -/// this many solutions because it prune the search space, but the pruning -/// isn't always sufficient. +/// Estimate the worst-case number of solutions the solver might have to +/// consider. It almost never considers this many solutions because it prunes the +/// search space, but the pruning isn't always sufficient. size_t LSRInstance::EstimateSearchSpaceComplexity() const { size_t Power = 1; for (const LSRUse &LU : Uses) { @@ -3978,10 +3979,9 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const { return Power; } -/// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset -/// of the registers of another formula, it won't help reduce register -/// pressure (though it may not necessarily hurt register pressure); remove -/// it to simplify the system. +/// When one formula uses a superset of the registers of another formula, it +/// won't help reduce register pressure (though it may not necessarily hurt +/// register pressure); remove it to simplify the system. void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { DEBUG(dbgs() << "The search space is too complex.\n"); @@ -4042,9 +4042,8 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { } } -/// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers -/// for expressions like A, A+1, A+2, etc., allocate a single register for -/// them. +/// When there are many registers for expressions like A, A+1, A+2, etc., +/// allocate a single register for them. void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { if (EstimateSearchSpaceComplexity() < ComplexityLimit) return; @@ -4121,8 +4120,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); } -/// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call -/// FilterOutUndesirableDedicatedRegisters again, if necessary, now that +/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that /// we've done more filtering, as it may be able to find more formulae to /// eliminate. void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ @@ -4139,9 +4137,9 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ } } -/// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely -/// to be profitable, and then in any use which has any reference to that -/// register, delete all formulae which do not reference that register. +/// Pick a register which seems likely to be profitable, and then in any use +/// which has any reference to that register, delete all formulae which do not +/// reference that register. void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { // With all other options exhausted, loop until the system is simple // enough to handle.
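The search-space estimate above is, in spirit, a saturating product of per-use formula counts; a simplified standalone rendering (the real code's details differ):

#include <cstddef>
#include <cstdint>
#include <vector>

static const std::size_t ComplexityLimitSketch = UINT16_MAX;

static std::size_t
estimateComplexitySketch(const std::vector<std::size_t> &FormulaCounts) {
  std::size_t Power = 1;
  for (std::size_t FNum : FormulaCounts) {
    if (FNum == 0)
      continue;
    if (Power > ComplexityLimitSketch / FNum)
      return ComplexityLimitSketch;  // saturate: already "too complex"
    Power *= FNum;
  }
  return Power;
}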
@@ -4202,10 +4200,10 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { } } -/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of -/// formulae to choose from, use some rough heuristics to prune down the number -/// of formulae. This keeps the main solver from taking an extraordinary amount -/// of time in some worst-case scenarios. +/// If there are an extraordinary number of formulae to choose from, use some +/// rough heuristics to prune down the number of formulae. This keeps the main +/// solver from taking an extraordinary amount of time in some worst-case +/// scenarios. void LSRInstance::NarrowSearchSpaceUsingHeuristics() { NarrowSearchSpaceByDetectingSupersets(); NarrowSearchSpaceByCollapsingUnrolledCode(); @@ -4213,7 +4211,7 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() { NarrowSearchSpaceByPickingWinnerRegs(); } -/// SolveRecurse - This is the recursive solver. +/// This is the recursive solver. void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, Cost &SolutionCost, SmallVectorImpl<const Formula *> &Workspace, @@ -4291,8 +4289,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, } } -/// Solve - Choose one formula from each use. Return the results in the given -/// Solution vector. +/// Choose one formula from each use. Return the results in the given Solution +/// vector. void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { SmallVector<const Formula *, 8> Workspace; Cost SolutionCost; @@ -4326,10 +4324,9 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { assert(Solution.size() == Uses.size() && "Malformed solution!"); } -/// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up -/// the dominator tree far as we can go while still being dominated by the -/// input positions. This helps canonicalize the insert position, which -/// encourages sharing. +/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far +/// as we can go while still being dominated by the input positions. This helps +/// canonicalize the insert position, which encourages sharing. BasicBlock::iterator LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, const SmallVectorImpl<Instruction *> &Inputs) @@ -4365,21 +4362,21 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, // instead of at the end, so that it can be used for other expansions. if (IDom == Inst->getParent() && (!BetterPos || !DT.dominates(Inst, BetterPos))) - BetterPos = std::next(BasicBlock::iterator(Inst)); + BetterPos = &*std::next(BasicBlock::iterator(Inst)); } if (!AllDominate) break; if (BetterPos) - IP = BetterPos; + IP = BetterPos->getIterator(); else - IP = Tentative; + IP = Tentative->getIterator(); } return IP; } -/// AdjustInsertPositionForExpand - Determine an input position which will be -/// dominated by the operands and which will dominate the result. +/// Determine an input position which will be dominated by the operands and +/// which will dominate the result.
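The dominator-tree climb in HoistInsertPosition can be sketched with toy dominator data (invented names): keep hoisting to the immediate dominator while every input still dominates the candidate, so equivalent expansions land at one canonical spot and get shared:

#include <map>
#include <set>
#include <vector>

static int hoistInsertPositionSketch(
    int IP, const std::map<int, int> &IDom,         // block -> immediate dom
    const std::map<int, std::set<int>> &Dominates,  // block -> blocks it doms
    const std::vector<int> &Inputs) {
  for (;;) {
    auto It = IDom.find(IP);
    if (It == IDom.end() || It->second == IP)
      return IP;                                    // reached the entry block
    int Cand = It->second;
    for (int In : Inputs)
      if (!Dominates.at(In).count(Cand))
        return IP;          // some input would no longer dominate: stop here
    IP = Cand;              // all inputs still dominate: keep climbing
  }
}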
BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, const LSRFixup &LF, @@ -4417,7 +4414,7 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, } } - assert(!isa<PHINode>(LowestIP) && !isa<LandingPadInst>(LowestIP) + assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() && !isa<DbgInfoIntrinsic>(LowestIP) && "Insertion point must be a normal instruction"); @@ -4429,7 +4426,7 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, while (isa<PHINode>(IP)) ++IP; // Ignore landingpad instructions. - while (isa<LandingPadInst>(IP)) ++IP; + while (!isa<TerminatorInst>(IP) && IP->isEHPad()) ++IP; // Ignore debug intrinsics. while (isa<DbgInfoIntrinsic>(IP)) ++IP; @@ -4437,13 +4434,14 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, // Set IP below instructions recently inserted by SCEVExpander. This keeps the // IP consistent across expansions and allows the previously inserted // instructions to be reused by subsequent expansion. - while (Rewriter.isInsertedInstruction(IP) && IP != LowestIP) ++IP; + while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP) + ++IP; return IP; } -/// Expand - Emit instructions for the leading candidate expression for this -/// LSRUse (this is called "expanding"). +/// Emit instructions for the leading candidate expression for this LSRUse (this +/// is called "expanding"). Value *LSRInstance::Expand(const LSRFixup &LF, const Formula &F, BasicBlock::iterator IP, @@ -4487,7 +4485,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, LF.UserInst, LF.OperandValToReplace, Loops, SE, DT); - Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, IP))); + Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, &*IP))); } // Expand the ScaledReg portion. @@ -4505,14 +4503,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Expand ScaleReg as if it was part of the base regs. if (F.Scale == 1) Ops.push_back( - SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP))); + SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP))); else { // An interesting way of "folding" with an icmp is to use a negated // scale, which we'll implement by inserting it into the other operand // of the icmp. assert(F.Scale == -1 && "The only scale supported by ICmpZero uses is -1!"); - ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, IP); + ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, &*IP); } } else { // Otherwise just expand the scaled register and an explicit scale, @@ -4522,11 +4520,11 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Unless the addressing mode will not be folded. if (!Ops.empty() && LU.Kind == LSRUse::Address && isAMCompletelyFolded(TTI, LU, F)) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } - ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP)); + ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP)); if (F.Scale != 1) ScaledS = SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); @@ -4538,7 +4536,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, if (F.BaseGV) { // Flush the operand list to suppress SCEVExpander hoisting. 
if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4548,7 +4546,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, // Flush the operand list to suppress SCEVExpander hoisting of both folded and // unfolded offsets. LSR assumes they both live next to their uses. if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); } @@ -4584,7 +4582,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF, const SCEV *FullS = Ops.empty() ? SE.getConstant(IntTy, 0) : SE.getAddExpr(Ops); - Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP); + Value *FullV = Rewriter.expandCodeFor(FullS, Ty, &*IP); // We're done expanding now, so reset the rewriter. Rewriter.clearPostInc(); @@ -4626,15 +4624,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF, return FullV; } -/// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use -/// of their operands effectively happens in their predecessor blocks, so the -/// expression may need to be expanded in multiple places. +/// Helper for Rewrite. PHI nodes are special because the use of their operands +/// effectively happens in their predecessor blocks, so the expression may need +/// to be expanded in multiple places. void LSRInstance::RewriteForPHI(PHINode *PN, const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const { + SmallVectorImpl<WeakVH> &DeadInsts) const { DenseMap<BasicBlock *, Value *> Inserted; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) if (PN->getIncomingValue(i) == LF.OperandValToReplace) { @@ -4658,8 +4655,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN, .setDontDeleteUselessPHIs()); } else { SmallVector<BasicBlock*, 2> NewBBs; - SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, - /*AliasAnalysis*/ nullptr, &DT, &LI); + SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI); NewBB = NewBBs[0]; } // If NewBB==NULL, then SplitCriticalEdge refused to split because all @@ -4685,7 +4681,8 @@ void LSRInstance::RewriteForPHI(PHINode *PN, if (!Pair.second) PN->setIncomingValue(i, Pair.first->second); else { - Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts); + Value *FullV = Expand(LF, F, BB->getTerminator()->getIterator(), + Rewriter, DeadInsts); // If this is reuse-by-noop-cast, insert the noop cast. Type *OpTy = LF.OperandValToReplace->getType(); @@ -4702,20 +4699,20 @@ void LSRInstance::RewriteForPHI(PHINode *PN, } } -/// Rewrite - Emit instructions for the leading candidate expression for this -/// LSRUse (this is called "expanding"), and update the UserInst to reference -/// the newly expanded value. +/// Emit instructions for the leading candidate expression for this LSRUse (this +/// is called "expanding"), and update the UserInst to reference the newly +/// expanded value. void LSRInstance::Rewrite(const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts, - Pass *P) const { + SmallVectorImpl<WeakVH> &DeadInsts) const { // First, find an insertion point that dominates UserInst. For PHI nodes, // find the nearest block which dominates all the relevant uses. 
if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) { - RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P); + RewriteForPHI(PN, LF, F, Rewriter, DeadInsts); } else { - Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts); + Value *FullV = + Expand(LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts); // If this is reuse-by-noop-cast, insert the noop cast. Type *OpTy = LF.OperandValToReplace->getType(); @@ -4740,11 +4737,10 @@ void LSRInstance::Rewrite(const LSRFixup &LF, DeadInsts.emplace_back(LF.OperandValToReplace); } -/// ImplementSolution - Rewrite all the fixup locations with new values, -/// following the chosen solution. -void -LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, - Pass *P) { +/// Rewrite all the fixup locations with new values, following the chosen +/// solution. +void LSRInstance::ImplementSolution( + const SmallVectorImpl<const Formula *> &Solution) { // Keep track of instructions we may have made dead, so that // we can remove them after we are done working. SmallVector<WeakVH, 16> DeadInsts; @@ -4766,7 +4762,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, // Expand the new value definitions and update the users. for (const LSRFixup &Fixup : Fixups) { - Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P); + Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts); Changed = true; } @@ -4782,13 +4778,11 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, Changed |= DeleteTriviallyDeadInstructions(DeadInsts); } -LSRInstance::LSRInstance(Loop *L, Pass *P) - : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), - DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()), - LI(P->getAnalysis<LoopInfoWrapperPass>().getLoopInfo()), - TTI(P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI( - *L->getHeader()->getParent())), - L(L), Changed(false), IVIncInsertPos(nullptr) { +LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, + DominatorTree &DT, LoopInfo &LI, + const TargetTransformInfo &TTI) + : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), Changed(false), + IVIncInsertPos(nullptr) { // If LoopSimplify form is not available, stay out of trouble. if (!L->isLoopSimplifyForm()) return; @@ -4879,7 +4873,7 @@ LSRInstance::LSRInstance(Loop *L, Pass *P) #endif // Now that we've decided what we want, make it so. 
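The constructor change above replaces Pass* plumbing with explicitly threaded analysis references: the legacy pass fetches its analyses once and hands them in, which decouples the transform from the legacy pass manager. Minimal shape of the pattern, with stand-in types:

struct IVUsersS {};
struct ScalarEvolutionS {};
struct DominatorTreeS {};

class LSRInstanceSketch {
  IVUsersS &IU;                 // analyses arrive as plain references
  ScalarEvolutionS &SE;
  DominatorTreeS &DT;
  bool Changed = false;
public:
  LSRInstanceSketch(IVUsersS &IU, ScalarEvolutionS &SE, DominatorTreeS &DT)
      : IU(IU), SE(SE), DT(DT) {}
  bool run() {
    (void)IU; (void)SE; (void)DT;   // the real work would use these
    return Changed;
  }
};

// Any caller that can produce the analyses (legacy pass, new pass manager,
// a unit test) can now drive the transform.
static bool runSketch(IVUsersS &IU, ScalarEvolutionS &SE, DominatorTreeS &DT) {
  return LSRInstanceSketch(IU, SE, DT).run();
}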
- ImplementSolution(Solution, P); + ImplementSolution(Solution); } void LSRInstance::print_factors_and_types(raw_ostream &OS) const { @@ -4931,11 +4925,10 @@ void LSRInstance::print(raw_ostream &OS) const { print_uses(OS); } -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRInstance::dump() const { print(errs()); errs() << '\n'; } -#endif namespace { @@ -4956,7 +4949,7 @@ INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", "Loop Strength Reduction", false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(IVUsers) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) @@ -4982,8 +4975,8 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredID(LoopSimplifyID); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); - AU.addPreserved<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); // Requiring LoopSimplify a second time here prevents IVUsers from running // twice, since LoopSimplify was invalidated by running ScalarEvolution. AU.addRequiredID(LoopSimplifyID); @@ -4996,17 +4989,24 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { if (skipOptnoneFunction(L)) return false; + auto &IU = getAnalysis<IVUsers>(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *L->getHeader()->getParent()); bool Changed = false; // Run the main LSR transformation. - Changed |= LSRInstance(L, this).getChanged(); + Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged(); // Remove any extra phis created by processing inner loops. Changed |= DeleteDeadPHIs(L->getHeader()); if (EnablePhiElim && L->isLoopSimplifyForm()) { SmallVector<WeakVH, 16> DeadInsts; const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), DL, "lsr"); + SCEVExpander Rewriter(getAnalysis<ScalarEvolutionWrapperPass>().getSE(), DL, + "lsr"); #ifndef NDEBUG Rewriter.setDebugType(DEBUG_TYPE); #endif diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index d78db6c..56ae5c0 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -130,27 +131,29 @@ namespace { bool UserAllowPartial; bool UserRuntime; - bool runOnLoop(Loop *L, LPPassManager &LPM) override; + bool runOnLoop(Loop *L, LPPassManager &) override; /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... 
/// void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); - AU.addRequired<ScalarEvolution>(); - AU.addPreserved<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info. // If loop unroll does not preserve dom info then LCSSA pass on next // loop will receive invalid dom info. // For now, recreate dom info, if loop is unrolled. AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } // Fill in the UnrollingPreferences parameter with values from the @@ -186,7 +189,7 @@ namespace { // total unrolled size. Parameters Threshold and PartialThreshold // are set to the maximum unrolled size for fully and partially // unrolled loops respectively. - void selectThresholds(const Loop *L, bool HasPragma, + void selectThresholds(const Loop *L, bool UsePragmaThreshold, const TargetTransformInfo::UnrollingPreferences &UP, unsigned &Threshold, unsigned &PartialThreshold, unsigned &PercentDynamicCostSavedThreshold, @@ -207,12 +210,13 @@ namespace { : UP.DynamicCostSavingsDiscount; if (!UserThreshold && + // FIXME: Use Function::optForSize(). L->getHeader()->getParent()->hasFnAttribute( Attribute::OptimizeForSize)) { Threshold = UP.OptSizeThreshold; PartialThreshold = UP.PartialOptSizeThreshold; } - if (HasPragma) { + if (UsePragmaThreshold) { // If the loop has an unrolling pragma, we want to be more // aggressive with unrolling limits. Set thresholds to at // least the PragmaThreshold value which is larger than the @@ -235,10 +239,11 @@ namespace { char LoopUnroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, @@ -278,8 +283,8 @@ class UnrolledInstAnalyzer : private InstVisitor<UnrolledInstAnalyzer, bool> { public: UnrolledInstAnalyzer(unsigned Iteration, DenseMap<Value *, Constant *> &SimplifiedValues, - const Loop *L, ScalarEvolution &SE) - : Iteration(Iteration), SimplifiedValues(SimplifiedValues), L(L), SE(SE) { + ScalarEvolution &SE) + : SimplifiedValues(SimplifiedValues), SE(SE) { IterationNumber = SE.getConstant(APInt(64, Iteration)); } @@ -295,13 +300,6 @@ private: /// results saved. DenseMap<Value *, SimplifiedAddress> SimplifiedAddresses; - /// \brief Number of currently simulated iteration. - /// - /// If an expression is ConstAddress+Constant, then the Constant is - /// Start + Iteration*Step, where Start and Step could be obtained from - /// SCEVGEPCache. - unsigned Iteration; - /// \brief SCEV expression corresponding to number of currently simulated /// iteration.
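The selectThresholds() renaming above pairs with the policy applied later in this file: a pragma only buys the larger pragma threshold when it specifies a count, or requests full/enabled unrolling on a loop whose trip count is known; otherwise the ordinary threshold stays in force. A compact sketch with invented names:

static unsigned selectThresholdSketch(unsigned PragmaCount, bool PragmaFull,
                                      bool PragmaEnable, unsigned TripCount,
                                      unsigned NormalThreshold,
                                      unsigned PragmaThreshold) {
  bool UsePragmaThreshold =
      PragmaCount > 0 || ((PragmaFull || PragmaEnable) && TripCount != 0);
  if (!UsePragmaThreshold)
    return NormalThreshold;
  return PragmaThreshold > NormalThreshold ? PragmaThreshold : NormalThreshold;
}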
const SCEV *IterationNumber; @@ -316,7 +314,6 @@ private: /// post-unrolling. DenseMap<Value *, Constant *> &SimplifiedValues; - const Loop *L; ScalarEvolution &SE; /// \brief Try to simplify instruction \param I using its SCEV expression. @@ -368,11 +365,9 @@ private: return simplifyInstWithSCEV(&I); } - /// TODO: Add visitors for other instruction types, e.g. ZExt, SExt. - /// Try to simplify binary operator I. /// - /// TODO: Probaly it's worth to hoist the code for estimating the + /// TODO: Probably it's worth to hoist the code for estimating the /// simplifications effects to a separate class, since we have a very similar /// code in InlineCost already. bool visitBinaryOperator(BinaryOperator &I) { @@ -412,7 +407,7 @@ private: auto *GV = dyn_cast<GlobalVariable>(AddressIt->second.Base); // We're only interested in loads that can be completely folded to a // constant. - if (!GV || !GV->hasInitializer()) + if (!GV || !GV->hasDefinitiveInitializer() || !GV->isConstant()) return false; ConstantDataSequential *CDS = @@ -420,6 +415,12 @@ private: if (!CDS) return false; + // We might have a vector load from an array. FIXME: for now we just bail + // out in this case, but we should be able to resolve and simplify such + // loads. + if(!CDS->isElementTypeCompatible(I.getType())) + return false; + int ElemSize = CDS->getElementType()->getPrimitiveSizeInBits() / 8U; assert(SimplifiedAddrOp->getValue().getActiveBits() < 64 && "Unexpectedly large index value."); @@ -436,6 +437,59 @@ private: return true; } + + bool visitCastInst(CastInst &I) { + // Propagate constants through casts. + Constant *COp = dyn_cast<Constant>(I.getOperand(0)); + if (!COp) + COp = SimplifiedValues.lookup(I.getOperand(0)); + if (COp) + if (Constant *C = + ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) { + SimplifiedValues[&I] = C; + return true; + } + + return Base::visitCastInst(I); + } + + bool visitCmpInst(CmpInst &I) { + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); + + // First try to handle simplified comparisons. + if (!isa<Constant>(LHS)) + if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) + LHS = SimpleLHS; + if (!isa<Constant>(RHS)) + if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) + RHS = SimpleRHS; + + if (!isa<Constant>(LHS) && !isa<Constant>(RHS)) { + auto SimplifiedLHS = SimplifiedAddresses.find(LHS); + if (SimplifiedLHS != SimplifiedAddresses.end()) { + auto SimplifiedRHS = SimplifiedAddresses.find(RHS); + if (SimplifiedRHS != SimplifiedAddresses.end()) { + SimplifiedAddress &LHSAddr = SimplifiedLHS->second; + SimplifiedAddress &RHSAddr = SimplifiedRHS->second; + if (LHSAddr.Base == RHSAddr.Base) { + LHS = LHSAddr.Offset; + RHS = RHSAddr.Offset; + } + } + } + } + + if (Constant *CLHS = dyn_cast<Constant>(LHS)) { + if (Constant *CRHS = dyn_cast<Constant>(RHS)) { + if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) { + SimplifiedValues[&I] = C; + return true; + } + } + } + + return Base::visitCmpInst(I); + } }; } // namespace @@ -443,11 +497,11 @@ private: namespace { struct EstimatedUnrollCost { /// \brief The estimated cost after unrolling. - unsigned UnrolledCost; + int UnrolledCost; /// \brief The estimated dynamic cost of executing the instructions in the /// rolled form. - unsigned RolledDynamicCost; + int RolledDynamicCost; }; } @@ -464,10 +518,10 @@ struct EstimatedUnrollCost { /// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. 
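The new visitCastInst/visitCmpInst above share one pattern: check an operand for being a literal constant, fall back to the simplified-values map, and record any folded result for later users. A standalone sketch with invented types:

#include <map>

using ValueIdSketch = int;

static bool
foldThroughMapSketch(ValueIdSketch Inst, ValueIdSketch Operand,
                     const std::map<ValueIdSketch, long> &Constants,
                     std::map<ValueIdSketch, long> &SimplifiedValues) {
  const long *C = nullptr;
  auto It = Constants.find(Operand);           // literal constant operand?
  if (It != Constants.end()) {
    C = &It->second;
  } else {
    auto It2 = SimplifiedValues.find(Operand); // simplified earlier?
    if (It2 != SimplifiedValues.end())
      C = &It2->second;
  }
  if (!C)
    return false;              // cannot fold; the real code defers to the base
  SimplifiedValues[Inst] = *C; // record the folded constant for later users
  return true;
}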
If /// the analysis failed (no benefits expected from the unrolling, or the loop is /// too big to analyze), the returned value is None. -Optional<EstimatedUnrollCost> -analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, - const TargetTransformInfo &TTI, - unsigned MaxUnrolledLoopSize) { +static Optional<EstimatedUnrollCost> +analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, + ScalarEvolution &SE, const TargetTransformInfo &TTI, + int MaxUnrolledLoopSize) { // We want to be able to scale offsets by the trip count and add more offsets // to them without checking for overflows, and we already don't want to // analyze *massive* trip counts, so we force the max to be reasonably small. @@ -481,24 +535,61 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, SmallSetVector<BasicBlock *, 16> BBWorklist; DenseMap<Value *, Constant *> SimplifiedValues; + SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues; // The estimated cost of the unrolled form of the loop. We try to estimate // this by simplifying as much as we can while computing the estimate. - unsigned UnrolledCost = 0; + int UnrolledCost = 0; // We also track the estimated dynamic (that is, actually executed) cost in // the rolled form. This helps identify cases when the savings from unrolling // aren't just exposing dead control flows, but actual reduced dynamic // instructions due to the simplifications which we expect to occur after // unrolling. - unsigned RolledDynamicCost = 0; + int RolledDynamicCost = 0; + + // Ensure that we don't violate the loop structure invariants relied on by + // this analysis. + assert(L->isLoopSimplifyForm() && "Must put loop into normal form first."); + assert(L->isLCSSAForm(DT) && + "Must have loops in LCSSA form to track live-out values."); + + DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n"); // Simulate execution of each iteration of the loop counting instructions, // which would be simplified. // Since the same load will take different values on different iterations, // we literally have to go through all loop's iterations. for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) { + DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n"); + + // Prepare for the iteration by collecting any simplified entry or backedge + // inputs. + for (Instruction &I : *L->getHeader()) { + auto *PHI = dyn_cast<PHINode>(&I); + if (!PHI) + break; + + // The loop header PHI nodes must have exactly two inputs: one from the + // loop preheader and one from the loop latch. + assert( + PHI->getNumIncomingValues() == 2 && + "Must have an incoming value only for the preheader and the latch."); + + Value *V = PHI->getIncomingValueForBlock( + Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch()); + Constant *C = dyn_cast<Constant>(V); + if (Iteration != 0 && !C) + C = SimplifiedValues.lookup(V); + if (C) + SimplifiedInputValues.push_back({PHI, C}); + } + + // Now clear and re-populate the map for the next iteration. SimplifiedValues.clear(); - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, L, SE); + while (!SimplifiedInputValues.empty()) + SimplifiedValues.insert(SimplifiedInputValues.pop_back_val()); + + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE); BBWorklist.clear(); BBWorklist.insert(L->getHeader()); @@ -510,21 +601,67 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, // it.
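A simplified standalone sketch (invented names) of the per-iteration PHI seeding added above: iteration 0 binds each header PHI to its preheader input, later iterations to whatever constant the previous iteration produced, and the map is rebuilt from those seeds:

#include <map>
#include <utility>
#include <vector>

struct HeaderPhiSketch { int Id; long PreheaderInit; };

static void seedIterationSketch(unsigned Iteration,
                                const std::vector<HeaderPhiSketch> &Phis,
                                std::map<int, long> &SimplifiedValues) {
  std::vector<std::pair<int, long>> SimplifiedInputValues;
  for (const HeaderPhiSketch &P : Phis) {
    if (Iteration == 0) {
      SimplifiedInputValues.push_back({P.Id, P.PreheaderInit});
    } else {
      // Latch input: reuse whatever constant the previous iteration produced
      // (a simplification of the real lookup through the PHI's latch value).
      auto It = SimplifiedValues.find(P.Id);
      if (It != SimplifiedValues.end())
        SimplifiedInputValues.push_back({P.Id, It->second});
    }
  }
  SimplifiedValues.clear();              // then re-seed for this iteration
  for (const auto &In : SimplifiedInputValues)
    SimplifiedValues.insert(In);
}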
We don't change the actual IR, just count optimization // opportunities. for (Instruction &I : *BB) { - unsigned InstCost = TTI.getUserCost(&I); + int InstCost = TTI.getUserCost(&I); // Visit the instruction to analyze its loop cost after unrolling, // and if the visitor returns false, include this instruction in the // unrolled cost. if (!Analyzer.visit(I)) UnrolledCost += InstCost; + else { + DEBUG(dbgs() << " " << I + << " would be simplified if loop is unrolled.\n"); + (void)0; + } // Also track this instruction's expected cost when executing the rolled // loop form. RolledDynamicCost += InstCost; // If unrolled body turns out to be too big, bail out. - if (UnrolledCost > MaxUnrolledLoopSize) + if (UnrolledCost > MaxUnrolledLoopSize) { + DEBUG(dbgs() << " Exceeded threshold.. exiting.\n" + << " UnrolledCost: " << UnrolledCost + << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize + << "\n"); return None; + } + } + + TerminatorInst *TI = BB->getTerminator(); + + // Add in the live successors by first checking whether we have a terminator + // that may be simplified based on the values simplified by this call. + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (BI->isConditional()) { + if (Constant *SimpleCond = + SimplifiedValues.lookup(BI->getCondition())) { + BasicBlock *Succ = nullptr; + // Just take the first successor if condition is undef + if (isa<UndefValue>(SimpleCond)) + Succ = BI->getSuccessor(0); + else + Succ = BI->getSuccessor( + cast<ConstantInt>(SimpleCond)->isZero() ? 1 : 0); + if (L->contains(Succ)) + BBWorklist.insert(Succ); + continue; + } + } + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { + if (Constant *SimpleCond = + SimplifiedValues.lookup(SI->getCondition())) { + BasicBlock *Succ = nullptr; + // Just take the first successor if condition is undef + if (isa<UndefValue>(SimpleCond)) + Succ = SI->getSuccessor(0); + else + Succ = SI->findCaseValue(cast<ConstantInt>(SimpleCond)) + .getCaseSuccessor(); + if (L->contains(Succ)) + BBWorklist.insert(Succ); + continue; + } } // Add BB's successors to the worklist. @@ -535,9 +672,15 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, ScalarEvolution &SE, // If we found no optimization opportunities on the first iteration, we // won't find them on later ones either. - if (UnrolledCost == RolledDynamicCost) + if (UnrolledCost == RolledDynamicCost) { + DEBUG(dbgs() << " No opportunities found.. exiting.\n" + << " UnrolledCost: " << UnrolledCost << "\n"); return None; + } } + DEBUG(dbgs() << "Analysis finished:\n" + << "UnrolledCost: " << UnrolledCost << ", " + << "RolledDynamicCost: " << RolledDynamicCost << "\n"); return {{UnrolledCost, RolledDynamicCost}}; } @@ -583,6 +726,12 @@ static bool HasUnrollFullPragma(const Loop *L) { return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full"); } +// Returns true if the loop has an unroll(enable) pragma. This metadata is used +// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives. +static bool HasUnrollEnablePragma(const Loop *L) { + return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable"); +} + // Returns true if the loop has an unroll(disable) pragma. static bool HasUnrollDisablePragma(const Loop *L) { return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable"); } @@ -708,7 +857,7 @@ unsigned LoopUnroll::selectUnrollCount( unsigned Count = UserCount ? CurrentCount : 0; // If there is no user-specified count, unroll pragmas have the next - // highest precendence. + // highest precedence.
if (Count == 0) { if (PragmaCount) { Count = PragmaCount; @@ -737,17 +886,19 @@ unsigned LoopUnroll::selectUnrollCount( return Count; } -bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { +bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &) { if (skipOptnoneFunction(L)) return false; Function &F = *L->getHeader()->getParent(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - ScalarEvolution *SE = &getAnalysis<ScalarEvolution>(); + ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); BasicBlock *Header = L->getHeader(); DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() @@ -757,8 +908,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { return false; } bool PragmaFullUnroll = HasUnrollFullPragma(L); + bool PragmaEnableUnroll = HasUnrollEnablePragma(L); unsigned PragmaCount = UnrollCountPragmaValue(L); - bool HasPragma = PragmaFullUnroll || PragmaCount > 0; + bool HasPragma = PragmaFullUnroll || PragmaEnableUnroll || PragmaCount > 0; TargetTransformInfo::UnrollingPreferences UP; getUnrollingPreferences(L, TTI, UP); @@ -806,7 +958,15 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { unsigned Threshold, PartialThreshold; unsigned PercentDynamicCostSavedThreshold; unsigned DynamicCostSavingsDiscount; - selectThresholds(L, HasPragma, UP, Threshold, PartialThreshold, + // Only use the high pragma threshold when we have a target unroll factor such + // as with "#pragma unroll N" or a pragma indicating full unrolling and the + // trip count is known. Otherwise we rely on the standard threshold to + // heuristically select a reasonable unroll count. + bool UsePragmaThreshold = + PragmaCount > 0 || + ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount != 0); + + selectThresholds(L, UsePragmaThreshold, UP, Threshold, PartialThreshold, PercentDynamicCostSavedThreshold, DynamicCostSavingsDiscount); @@ -824,8 +984,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // The loop isn't that small, but we still can fully unroll it if that // helps to remove a significant number of instructions. // To check that, run additional analysis on the loop. - if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( - L, TripCount, *SE, TTI, Threshold + DynamicCostSavingsDiscount)) + if (Optional<EstimatedUnrollCost> Cost = + analyzeLoopUnrollCost(L, TripCount, DT, *SE, TTI, + Threshold + DynamicCostSavingsDiscount)) if (canUnrollCompletely(L, Threshold, PercentDynamicCostSavedThreshold, DynamicCostSavingsDiscount, Cost->UnrolledCost, Cost->RolledDynamicCost)) { @@ -840,14 +1001,15 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Reduce count based on the type of unrolling and the threshold values. unsigned OriginalCount = Count; - bool AllowRuntime = - (PragmaCount > 0) || (UserRuntime ? CurrentRuntime : UP.Runtime); + bool AllowRuntime = PragmaEnableUnroll || (PragmaCount > 0) || + (UserRuntime ? CurrentRuntime : UP.Runtime); // Don't unroll a runtime trip count loop with unroll full pragma. if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) { AllowRuntime = false; } if (Unrolling == Partial) { - bool AllowPartial = UserAllowPartial ? 
CurrentAllowPartial : UP.Partial; + bool AllowPartial = PragmaEnableUnroll || + (UserAllowPartial ? CurrentAllowPartial : UP.Partial); if (!AllowPartial && !CountSetExplicitly) { DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); @@ -887,23 +1049,27 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { DebugLoc LoopLoc = L->getStartLoc(); Function *F = Header->getParent(); LLVMContext &Ctx = F->getContext(); - if (PragmaFullUnroll && PragmaCount == 0) { - if (TripCount && Count != TripCount) { - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(full) pragma " - "because unrolled size is too large."); - } else if (!TripCount) { - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(full) pragma " - "because loop has a runtime trip count."); - } - } else if (PragmaCount > 0 && Count != OriginalCount) { + if ((PragmaCount > 0) && Count != OriginalCount) { emitOptimizationRemarkMissed( Ctx, DEBUG_TYPE, *F, LoopLoc, "Unable to unroll loop the number of times directed by " "unroll_count pragma because unrolled size is too large."); + } else if (PragmaFullUnroll && !TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll(full) pragma " + "because loop has a runtime trip count."); + } else if (PragmaEnableUnroll && Count != TripCount && Count < 2) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to unroll loop as directed by unroll(enable) pragma because " + "unrolled size is too large."); + } else if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && + Count != TripCount) { + emitOptimizationRemarkMissed( + Ctx, DEBUG_TYPE, *F, LoopLoc, + "Unable to fully unroll loop as directed by unroll pragma because " + "unrolled size is too large."); } } @@ -915,7 +1081,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { // Unroll the loop. 
if (!UnrollLoop(L, Count, TripCount, AllowRuntime, UP.AllowExpensiveTripCount, - TripMultiple, LI, this, &LPM, &AC)) + TripMultiple, LI, SE, &DT, &AC, PreserveLCSSA)) return false; return true; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index cbc563b..95d7f8a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -30,6 +30,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -37,6 +38,10 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Support/BranchProbability.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" @@ -70,6 +75,19 @@ static cl::opt<unsigned> Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden); +static cl::opt<bool> +LoopUnswitchWithBlockFrequency("loop-unswitch-with-block-frequency", + cl::init(false), cl::Hidden, + cl::desc("Enable the use of the block frequency analysis to access PGO " + "heuristics to minimize code growth in cold regions.")); + +static cl::opt<unsigned> +ColdnessThreshold("loop-unswitch-coldness-threshold", cl::init(1), cl::Hidden, + cl::desc("Coldness threshold in percentage. The loop header frequency " + "(relative to the entry frequency) is compared with this " + "threshold to determine if non-trivial unswitching should be " + "enabled.")); + namespace { class LUAnalysisCache { @@ -148,12 +166,19 @@ namespace { LPPassManager *LPM; AssumptionCache *AC; - // LoopProcessWorklist - Used to check if second loop needs processing - // after RewriteLoopBodyWithConditionConstant rewrites first loop. + // Used to check if second loop needs processing after + // RewriteLoopBodyWithConditionConstant rewrites first loop. std::vector<Loop*> LoopProcessWorklist; LUAnalysisCache BranchesInfo; + bool EnabledPGO; + + // BFI and ColdEntryFreq are only used when PGO and + // LoopUnswitchWithBlockFrequency are enabled. + BlockFrequencyInfo BFI; + BlockFrequency ColdEntryFreq; + bool OptimizeForSize; bool redoLoop; @@ -192,9 +217,11 @@ namespace { AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } private: @@ -210,7 +237,10 @@ namespace { /// Split all of the edges from inside the loop to their exit blocks. /// Update the appropriate Phi nodes as we do so. 
- void SplitExitEdges(Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks); + void SplitExitEdges(Loop *L, + const SmallVectorImpl<BasicBlock *> &ExitBlocks); + + bool TryTrivialLoopUnswitch(bool &Changed); bool UnswitchIfProfitable(Value *LoopCond, Constant *Val, TerminatorInst *TI = nullptr); @@ -229,9 +259,6 @@ namespace { TerminatorInst *TI); void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L); - bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = nullptr, - BasicBlock **LoopExit = nullptr); - }; } @@ -367,9 +394,8 @@ Pass *llvm::createLoopUnswitchPass(bool Os) { return new LoopUnswitch(Os); } -/// FindLIVLoopCondition - Cond is a condition that occurs in L. If it is -/// invariant in the loop, or has an invariant piece, return the invariant. -/// Otherwise, return null. +/// Cond is a condition that occurs in L. If it is invariant in the loop, or has +/// an invariant piece, return the invariant. Otherwise, return null. static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { // We started analyzing a new instruction; increment the scanned instructions counter. @@ -411,11 +437,23 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { *L->getHeader()->getParent()); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); LPM = &LPM_Ref; - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); currentLoop = L; Function *F = currentLoop->getHeader()->getParent(); + + EnabledPGO = F->getEntryCount().hasValue(); + + if (LoopUnswitchWithBlockFrequency && EnabledPGO) { + BranchProbabilityInfo BPI(*F, *LI); + BFI.calculate(*L->getHeader()->getParent(), BPI, *LI); + + // Use BranchProbability to compute a minimum frequency based on + // function entry baseline frequency. Loops with headers below this + // frequency are considered cold. + const BranchProbability ColdProb(ColdnessThreshold, 100); + ColdEntryFreq = BlockFrequency(BFI.getEntryFreq()) * ColdProb; + } + bool Changed = false; do { assert(currentLoop->isLCSSAForm(*DT)); @@ -423,16 +461,13 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { redoLoop = false; Changed |= processCurrentLoop(); } while(redoLoop); - if (Changed) { - // FIXME: Reconstruct dom info, because it is not preserved properly. - if (DT) - DT->recalculate(*F); - } + // FIXME: Reconstruct dom info, because it is not preserved properly. + if (Changed) + DT->recalculate(*F); return Changed; } -/// processCurrentLoop - Do actual work and unswitch loop if possible -/// and profitable. +/// Do actual work and unswitch loop if possible and profitable. bool LoopUnswitch::processCurrentLoop() { bool Changed = false; @@ -452,14 +487,48 @@ bool LoopUnswitch::processCurrentLoop() { LLVMContext &Context = loopHeader->getContext(); - // Probably we reach the quota of branches for this loop. If so - // stop unswitching. + // Analyze loop cost, and stop unswitching if loop content cannot be duplicated. if (!BranchesInfo.countLoop( currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI( *currentLoop->getHeader()->getParent()), AC)) return false; + // Try trivial unswitch first, before looping over the other basic blocks in the loop. + if (TryTrivialLoopUnswitch(Changed)) { + return true; + } + + // Do not unswitch loops containing convergent operations, as we might be + // making them control dependent on the unswitch value when they were not + // before.
+ // FIXME: This could be refined to only bail if the convergent operation is + // not already control-dependent on the unswitch value. + for (const auto BB : currentLoop->blocks()) { + for (auto &I : *BB) { + auto CS = CallSite(&I); + if (!CS) continue; + if (CS.hasFnAttr(Attribute::Convergent)) + return false; + } + } + + // Do not do non-trivial unswitch while optimizing for size. + // FIXME: Use Function::optForSize(). + if (OptimizeForSize || + loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize)) + return false; + + if (LoopUnswitchWithBlockFrequency && EnabledPGO) { + // Compute the weighted frequency of the hottest block in the + // loop (loopHeader in this case since inner loops should be + // processed before the outer loop). If it is less than ColdEntryFreq, + // we should not unswitch. + BlockFrequency LoopEntryFreq = BFI.getBlockFreq(loopHeader); + if (LoopEntryFreq < ColdEntryFreq) + return false; + } + // Loop over all of the basic blocks in the loop. If we find an interior // block that is branching on a loop-invariant condition, we can unswitch this // loop. @@ -528,8 +597,8 @@ bool LoopUnswitch::processCurrentLoop() { return Changed; } -/// isTrivialLoopExitBlock - Check to see if all paths from BB exit the -/// loop with no side effects (including infinite loops). +/// Check to see if all paths from BB exit the loop with no side effects +/// (including infinite loops). /// /// If true, we return true and set ExitBB to the block we /// exit through. @@ -566,9 +635,9 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB, return true; } -/// isTrivialLoopExitBlock - Return true if the specified block unconditionally -/// leads to an exit from the specified loop, and has no side-effects in the -/// process. If so, return the block that is exited to, otherwise return null. +/// Return true if the specified block unconditionally leads to an exit from +/// the specified loop, and has no side-effects in the process. If so, return +/// the block that is exited to, otherwise return null. static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { std::set<BasicBlock*> Visited; Visited.insert(L->getHeader()); // Branches to header make infinite loops. @@ -578,105 +647,11 @@ static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { return nullptr; } -/// IsTrivialUnswitchCondition - Check to see if this unswitch condition is -/// trivial: that is, that the condition controls whether or not the loop does -/// anything at all. If this is a trivial condition, unswitching produces no -/// code duplications (equivalently, it produces a simpler loop and a new empty -/// loop, which gets deleted). -/// -/// If this is a trivial condition, return true, otherwise return false. When -/// returning true, this sets Cond and Val to the condition that controls the -/// trivial condition: when Cond dynamically equals Val, the loop is known to -/// exit. Finally, this sets LoopExit to the BB that the loop exits to when -/// Cond == Val. -/// -bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val, - BasicBlock **LoopExit) { - BasicBlock *Header = currentLoop->getHeader(); - TerminatorInst *HeaderTerm = Header->getTerminator(); - LLVMContext &Context = Header->getContext(); - - BasicBlock *LoopExitBB = nullptr; - if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) { - // If the header block doesn't end with a conditional branch on Cond, we - // can't handle it.
- if (!BI->isConditional() || BI->getCondition() != Cond) - return false; - - // Check to see if a successor of the branch is guaranteed to - // exit through a unique exit block without having any - // side-effects. If so, determine the value of Cond that causes it to do - // this. - if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, - BI->getSuccessor(0)))) { - if (Val) *Val = ConstantInt::getTrue(Context); - } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, - BI->getSuccessor(1)))) { - if (Val) *Val = ConstantInt::getFalse(Context); - } - } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) { - // If this isn't a switch on Cond, we can't handle it. - if (SI->getCondition() != Cond) return false; - - // Check to see if a successor of the switch is guaranteed to go to the - // latch block or exit through a one exit block without having any - // side-effects. If so, determine the value of Cond that causes it to do - // this. - // Note that we can't trivially unswitch on the default case or - // on already unswitched cases. - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); - i != e; ++i) { - BasicBlock *LoopExitCandidate; - if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, - i.getCaseSuccessor()))) { - // Okay, we found a trivial case, remember the value that is trivial. - ConstantInt *CaseVal = i.getCaseValue(); - - // Check that it was not unswitched before, since already unswitched - // trivial vals are looks trivial too. - if (BranchesInfo.isUnswitched(SI, CaseVal)) - continue; - LoopExitBB = LoopExitCandidate; - if (Val) *Val = CaseVal; - break; - } - } - } - - // If we didn't find a single unique LoopExit block, or if the loop exit block - // contains phi nodes, this isn't trivial. - if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) - return false; // Can't handle this. - - if (LoopExit) *LoopExit = LoopExitBB; - - // We already know that nothing uses any scalar values defined inside of this - // loop. As such, we just have to check to see if this loop will execute any - // side-effecting instructions (e.g. stores, calls, volatile loads) in the - // part of the loop that the code *would* execute. We already checked the - // tail, check the header now. - for (BasicBlock::iterator I = Header->begin(), E = Header->end(); I != E; ++I) - if (I->mayHaveSideEffects()) - return false; - return true; -} - -/// UnswitchIfProfitable - We have found that we can unswitch currentLoop when -/// LoopCond == Val to simplify the loop. If we decide that this is profitable, +/// We have found that we can unswitch currentLoop when LoopCond == Val to +/// simplify the loop. If we decide that this is profitable, /// unswitch the loop, reprocess the pieces, then return true. bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, TerminatorInst *TI) { - Function *F = loopHeader->getParent(); - Constant *CondVal = nullptr; - BasicBlock *ExitBlock = nullptr; - - if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) { - // If the condition is trivial, always unswitch. There is no code growth - // for this case. - UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, ExitBlock, TI); - return true; - } - // Check to see if it would be profitable to unswitch current loop. if (!BranchesInfo.CostAllowsUnswitching()) { DEBUG(dbgs() << "NOT unswitching loop %" @@ -687,32 +662,27 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, return false; } - // Do not do non-trivial unswitch while optimizing for size. 
- if (OptimizeForSize || F->hasFnAttribute(Attribute::OptimizeForSize)) - return false; - UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI); return true; } -/// CloneLoop - Recursively clone the specified loop and all of its children, +/// Recursively clone the specified loop and all of its children, /// mapping the blocks with the specified map. static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, LoopInfo *LI, LPPassManager *LPM) { - Loop *New = new Loop(); - LPM->insertLoop(New, PL); + Loop &New = LPM->addLoop(PL); // Add all of the blocks in L to the new loop. for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) if (LI->getLoopFor(*I) == L) - New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI); + New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI); // Add all of the subloops to the new loop. for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) - CloneLoop(*I, New, VM, LI, LPM); + CloneLoop(*I, &New, VM, LI, LPM); - return New; + return &New; } static void copyMetadata(Instruction *DstInst, const Instruction *SrcInst, @@ -744,15 +714,15 @@ static void copyMetadata(Instruction *DstInst, const Instruction *SrcInst, } } // fallthrough. + case LLVMContext::MD_make_implicit: case LLVMContext::MD_dbg: DstInst->setMetadata(MD.first, MD.second); } } } -/// EmitPreheaderBranchOnCondition - Emit a conditional branch on two values -/// if LIC == Val, branch to TrueDst, otherwise branch to FalseDest. Insert the -/// code immediately before InsertPt. +/// Emit a conditional branch on two values: if LIC == Val, branch to TrueDest, +/// otherwise branch to FalseDest. Insert the code immediately before InsertPt. void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest, @@ -782,11 +752,11 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, SplitCriticalEdge(BI, 1, Options); } -/// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable -/// condition in it (a cond branch from its header block to its latch block, -/// where the path through the loop that doesn't execute its body has no -/// side-effects), unswitch it. This doesn't involve any code duplication, just -/// moving the conditional branch outside of the loop and updating loop info. +/// Given a loop that has a trivial unswitchable condition in it (a cond branch +/// from its header block to its latch block, where the path through the loop +/// that doesn't execute its body has no side-effects), unswitch it. This +/// doesn't involve any code duplication, just moving the conditional branch +/// outside of the loop and updating loop info. void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, BasicBlock *ExitBlock, TerminatorInst *TI) { @@ -810,7 +780,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, // without actually branching to it (the exit block should be dominated by the // loop header, not the preheader). assert(!L->contains(ExitBlock) && "Exit block is in the loop?"); - BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), DT, LI); + BasicBlock *NewExit = SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI); // Okay, now we have a position to branch from and a position to branch to; // insert the new conditional branch.
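For intuition, a trivially unswitchable condition has this source-level shape (a minimal sketch, not code from this change; the function and variable names are hypothetical):

  // `Flag` is loop-invariant, and the path that leaves the loop when
  // `Flag` is false has no side effects, so the conditional branch can
  // be moved out of the loop without duplicating the body.
  void zeroIfEnabled(int *A, int N, bool Flag) {
    for (int i = 0; i < N; ++i) {
      if (!Flag)
        break; // Trivial exit: nothing happens on this path.
      A[i] = 0;
    }
  }
  // After trivial unswitching the code behaves like:
  //   if (Flag)
  //     for (int i = 0; i < N; ++i)
  //       A[i] = 0;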
@@ -829,8 +799,155 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, ++NumTrivial; } -/// SplitExitEdges - Split all of the edges from inside the loop to their exit -/// blocks. Update the appropriate Phi nodes as we do so. +/// Check if the first non-constant condition starting from the loop header is +/// a trivial unswitch condition: that is, a condition that controls whether or +/// not the loop does anything at all. If it is a trivial condition, unswitching +/// produces no code duplication (equivalently, it produces a simpler loop and +/// a new empty loop, which gets deleted). Therefore, always unswitch a trivial +/// condition. +bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { + BasicBlock *CurrentBB = currentLoop->getHeader(); + TerminatorInst *CurrentTerm = CurrentBB->getTerminator(); + LLVMContext &Context = CurrentBB->getContext(); + + // If the loop header has only one reachable successor (currently via an + // unconditional branch or constant foldable conditional branch, but + // we should also consider adding constant foldable switch instructions + // in the future), we should keep looking for trivial condition candidates in + // the successor as well. An alternative is to constant fold conditions + // and merge successors into the loop header (then we only need to check the + // header's terminator). The reason for not doing this in the LoopUnswitch + // pass is that it could potentially break LoopPassManager's invariants. + // Folding dead branches could either eliminate the current loop or make + // other loops unreachable. LCSSA form might also not be preserved after + // deleting branches. The following code keeps traversing the loop header's + // successors until it finds a trivial condition candidate (a condition that + // is not a constant). Since unswitching generates branches with constant + // conditions, this scenario could be very common in practice. + SmallSet<BasicBlock*, 8> Visited; + + while (true) { + // If we exit the loop or reach a previously visited block, then + // we cannot reach any trivial condition candidates (unfoldable + // branch instructions or switch instructions) and no unswitch + // can happen. Exit and return false. + if (!currentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second) + return false; + + // Check if this loop will execute any side-effecting instructions (e.g. + // stores, calls, volatile loads) in the part of the loop that the code + // *would* execute. Check the header first. + for (Instruction &I : *CurrentBB) + if (I.mayHaveSideEffects()) + return false; + + // FIXME: add check for constant foldable switch instructions. + if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) { + if (BI->isUnconditional()) { + CurrentBB = BI->getSuccessor(0); + } else if (BI->getCondition() == ConstantInt::getTrue(Context)) { + CurrentBB = BI->getSuccessor(0); + } else if (BI->getCondition() == ConstantInt::getFalse(Context)) { + CurrentBB = BI->getSuccessor(1); + } else { + // Found a trivial condition candidate: non-foldable conditional branch. + break; + } + } else { + break; + } + + CurrentTerm = CurrentBB->getTerminator(); + } + + // CondVal is the value of the trivial condition for which the loop is + // known to exit. LoopExitBB is the BasicBlock that the loop exits to when + // the trivial condition equals CondVal. + Constant *CondVal = nullptr; + BasicBlock *LoopExitBB = nullptr; + + if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) { + // If this isn't branching on an invariant condition, we can't unswitch it.
+ if (!BI->isConditional()) + return false; + + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), + currentLoop, Changed); + + // Unswitch only if the trivial condition itself is an LIV (not a + // partial LIV, which could occur in and/or). + if (!LoopCond || LoopCond != BI->getCondition()) + return false; + + // Check to see if a successor of the branch is guaranteed to + // exit through a unique exit block without having any + // side-effects. If so, determine the value of Cond that causes + // it to do this. + if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + BI->getSuccessor(0)))) { + CondVal = ConstantInt::getTrue(Context); + } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop, + BI->getSuccessor(1)))) { + CondVal = ConstantInt::getFalse(Context); + } + + // If we didn't find a single unique LoopExit block, or if the loop exit + // block contains phi nodes, this isn't trivial. + if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) + return false; // Can't handle this. + + UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB, + CurrentTerm); + ++NumBranches; + return true; + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) { + // If this isn't switching on an invariant condition, we can't unswitch it. + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), + currentLoop, Changed); + + // Unswitch only if the trivial condition itself is an LIV (not a + // partial LIV, which could occur in and/or). + if (!LoopCond || LoopCond != SI->getCondition()) + return false; + + // Check to see if a successor of the switch is guaranteed to go to the + // latch block or exit through one exit block without having any + // side-effects. If so, determine the value of Cond that causes it to do + // this. + // Note that we can't trivially unswitch on the default case or + // on already unswitched cases. + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + BasicBlock *LoopExitCandidate; + if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, + i.getCaseSuccessor()))) { + // Okay, we found a trivial case; remember the value that is trivial. + ConstantInt *CaseVal = i.getCaseValue(); + + // Check that it was not unswitched before, since already unswitched + // trivial values look trivial too. + if (BranchesInfo.isUnswitched(SI, CaseVal)) + continue; + LoopExitBB = LoopExitCandidate; + CondVal = CaseVal; + break; + } + } + + // If we didn't find a single unique LoopExit block, or if the loop exit + // block contains phi nodes, this isn't trivial. + if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin())) + return false; // Can't handle this. + + UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB, + nullptr); + ++NumSwitches; + return true; + } + return false; +} + +/// Split all of the edges from inside the loop to their exit blocks. +/// Update the appropriate Phi nodes as we do so. void LoopUnswitch::SplitExitEdges(Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks){ @@ -841,15 +958,14 @@ void LoopUnswitch::SplitExitEdges(Loop *L, // Although SplitBlockPredecessors doesn't preserve loop-simplify in // general, if we call it on all predecessors of all exits then it does. - SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", - /*AliasAnalysis*/ nullptr, DT, LI, + SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", DT, LI, /*PreserveLCSSA*/ true); } } -/// UnswitchNontrivialCondition - We determined that the loop is profitable -/// to unswitch when LIC equal Val.
Split it into loop versions and test the -/// condition outside of either loop. Return the loops created as Out1/Out2. +/// We determined that the loop is profitable to unswitch when LIC equals Val. +/// Split it into loop versions and test the condition outside of either loop. +/// Return the loops created as Out1/Out2. void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, Loop *L, TerminatorInst *TI) { Function *F = loopHeader->getParent(); @@ -858,8 +974,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, << " blocks] in Function " << F->getName() << " when '" << *Val << "' == " << *LIC << "\n"); - if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) - SE->forgetLoop(L); + if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>()) + SEWP->getSE().forgetLoop(L); LoopBlocks.clear(); NewBlocks.clear(); @@ -901,8 +1017,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // Splice the newly inserted blocks into the function right before the // original preheader. - F->getBasicBlockList().splice(NewPreheader, F->getBasicBlockList(), - NewBlocks[0], F->end()); + F->getBasicBlockList().splice(NewPreheader->getIterator(), + F->getBasicBlockList(), + NewBlocks[0]->getIterator(), F->end()); // FIXME: We could register any cloned assumptions instead of clearing the // whole function's cache. @@ -944,7 +1061,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, if (LandingPadInst *LPad = NewExit->getLandingPadInst()) { PHINode *PN = PHINode::Create(LPad->getType(), 0, "", - ExitSucc->getFirstInsertionPt()); + &*ExitSucc->getFirstInsertionPt()); for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc); I != E; ++I) { @@ -960,7 +1077,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) - RemapInstruction(I, VMap,RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); + RemapInstruction(&*I, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingEntries); // Rewrite the original preheader to select between versions of the loop. BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator()); @@ -994,8 +1112,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true); } -/// RemoveFromWorklist - Remove all instances of I from the worklist vector -/// specified. +/// Remove all instances of I from the worklist vector specified. static void RemoveFromWorklist(Instruction *I, std::vector<Instruction*> &Worklist) { @@ -1003,7 +1120,7 @@ static void RemoveFromWorklist(Instruction *I, Worklist.end()); } -/// ReplaceUsesOfWith - When we find that I really equals V, remove I from the +/// When we find that I really equals V, remove I from the /// program, replacing all uses with V and updating the worklist. static void ReplaceUsesOfWith(Instruction *I, Value *V, std::vector<Instruction*> &Worklist, @@ -1025,9 +1142,9 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V, ++NumSimplify; } -// RewriteLoopBodyWithConditionConstant - We know either that the value LIC has -// the value specified by Val in the specified loop, or we know it does NOT have -// that value. Rewrite any uses of LIC or of properties correlated to it.
+/// We know either that the value LIC has the value specified by Val in the +/// specified loop, or we know it does NOT have that value. +/// Rewrite any uses of LIC or of properties correlated to it. void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Constant *Val, bool IsEqual) { @@ -1138,18 +1255,16 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, // domtree here -- instead we force it to do a full recomputation // after the pass is complete -- but we do need to inform it of // new blocks. - if (DT) - DT->addNewBlock(Abort, NewSISucc); + DT->addNewBlock(Abort, NewSISucc); } SimplifyCode(Worklist, L); } -/// SimplifyCode - Okay, now that we have simplified some instructions in the -/// loop, walk over it and constant prop, dce, and fold control flow where -/// possible. Note that this is effectively a very simple loop-structure-aware -/// optimizer. During processing of this loop, L could very well be deleted, so -/// it must not be used. +/// Now that we have simplified some instructions in the loop, walk over it and +/// constant prop, dce, and fold control flow where possible. Note that this is +/// effectively a very simple loop-structure-aware optimizer. During processing +/// of this loop, L could very well be deleted, so it must not be used. /// /// FIXME: When the loop optimizer is more mature, separate this out to a new /// pass. @@ -1207,8 +1322,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { Succ->replaceAllUsesWith(Pred); // Move all of the successor contents from Succ to Pred. - Pred->getInstList().splice(BI, Succ->getInstList(), Succ->begin(), - Succ->end()); + Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(), + Succ->begin(), Succ->end()); LPM->deleteSimpleAnalysisValue(BI, L); BI->eraseFromParent(); RemoveFromWorklist(BI, Worklist); diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp index 3314e1e..41511bc 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp @@ -22,7 +22,7 @@ using namespace llvm; #define DEBUG_TYPE "loweratomic" static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { - IRBuilder<> Builder(CXI->getParent(), CXI); + IRBuilder<> Builder(CXI); Value *Ptr = CXI->getPointerOperand(); Value *Cmp = CXI->getCompareOperand(); Value *Val = CXI->getNewValOperand(); @@ -41,7 +41,7 @@ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { } static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) { - IRBuilder<> Builder(RMWI->getParent(), RMWI); + IRBuilder<> Builder(RMWI); Value *Ptr = RMWI->getPointerOperand(); Value *Val = RMWI->getValOperand(); @@ -120,7 +120,7 @@ namespace { return false; bool Changed = false; for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { - Instruction *Inst = DI++; + Instruction *Inst = &*DI++; if (FenceInst *FI = dyn_cast<FenceInst>(Inst)) Changed |= LowerFenceInst(FI); else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst)) diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 0c47cbd..2ace902 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -139,7 +139,7 @@ static bool lowerExpectIntrinsic(Function &F) { ExpectIntrinsicsHandled++; } - // remove llvm.expect intrinsics. 
+ // Remove llvm.expect intrinsics. for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) { CallInst *CI = dyn_cast<CallInst>(BI++); if (!CI) diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 85012af..0333bf2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -30,7 +31,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Local.h" -#include <list> +#include <algorithm> using namespace llvm; #define DEBUG_TYPE "memcpyopt" @@ -71,9 +72,9 @@ static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, return Offset; } -/// IsPointerOffset - Return true if Ptr1 is provably equal to Ptr2 plus a -/// constant offset, and return that constant offset. For example, Ptr1 might -/// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8. +/// Return true if Ptr1 is provably equal to Ptr2 plus a constant offset, and +/// return that constant offset. For example, Ptr1 might be &A[42], and Ptr2 +/// might be &A[40]. In this case offset would be -8. static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, const DataLayout &DL) { Ptr1 = Ptr1->stripPointerCasts(); @@ -125,7 +126,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, } -/// MemsetRange - Represents a range of memset'd bytes with the ByteVal value. +/// Represents a range of memset'd bytes with the ByteVal value. /// This allows us to analyze stores like: /// store 0 -> P+1 /// store 0 -> P+0 @@ -164,8 +165,8 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { // If any of the stores are a memset, then it is always good to extend the // memset. - for (unsigned i = 0, e = TheStores.size(); i != e; ++i) - if (!isa<StoreInst>(TheStores[i])) + for (Instruction *SI : TheStores) + if (!isa<StoreInst>(SI)) return true; // Assume that the code generator is capable of merging pairs of stores @@ -189,7 +190,7 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { unsigned NumPointerStores = Bytes / MaxIntSize; // Assume the remaining bytes if any are done a byte at a time. - unsigned NumByteStores = Bytes - NumPointerStores * MaxIntSize; + unsigned NumByteStores = Bytes % MaxIntSize; // If we will reduce the # stores (according to this heuristic), do the // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 @@ -200,15 +201,14 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { namespace { class MemsetRanges { - /// Ranges - A sorted list of the memset ranges. We use std::list here - /// because each element is relatively large and expensive to copy. - std::list<MemsetRange> Ranges; - typedef std::list<MemsetRange>::iterator range_iterator; + /// A sorted list of the memset ranges. 
+ SmallVector<MemsetRange, 8> Ranges; + typedef SmallVectorImpl<MemsetRange>::iterator range_iterator; const DataLayout &DL; public: MemsetRanges(const DataLayout &DL) : DL(DL) {} - typedef std::list<MemsetRange>::const_iterator const_iterator; + typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator; const_iterator begin() const { return Ranges.begin(); } const_iterator end() const { return Ranges.end(); } bool empty() const { return Ranges.empty(); } @@ -240,26 +240,20 @@ public: } // end anon namespace -/// addRange - Add a new store to the MemsetRanges data structure. This adds a +/// Add a new store to the MemsetRanges data structure. This adds a /// new range for the specified store at the specified offset, merging into /// existing ranges as appropriate. -/// -/// Do a linear search of the ranges to see if this can be joined and/or to -/// find the insertion point in the list. We keep the ranges sorted for -/// simplicity here. This is a linear search of a linked list, which is ugly, -/// however the number of ranges is limited, so this won't get crazy slow. void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, unsigned Alignment, Instruction *Inst) { int64_t End = Start+Size; - range_iterator I = Ranges.begin(), E = Ranges.end(); - while (I != E && Start > I->End) - ++I; + range_iterator I = std::lower_bound(Ranges.begin(), Ranges.end(), Start, + [](const MemsetRange &LHS, int64_t RHS) { return LHS.End < RHS; }); // We now know that I == E, in which case we didn't find anything to merge // with, or that Start <= I->End. If End < I->Start or I == E, then we need // to insert a new range. Handle this now. - if (I == E || End < I->Start) { + if (I == Ranges.end() || End < I->Start) { MemsetRange &R = *Ranges.insert(I, MemsetRange()); R.Start = Start; R.End = End; @@ -295,7 +289,7 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, if (End > I->End) { I->End = End; range_iterator NextI = I; - while (++NextI != E && End >= NextI->Start) { + while (++NextI != Ranges.end() && End >= NextI->Start) { // Merge the range in. I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end()); if (NextI->End > I->End) @@ -331,9 +325,9 @@ namespace { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<MemoryDependenceAnalysis>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); } @@ -357,7 +351,7 @@ namespace { char MemCpyOpt::ID = 0; } -// createMemCpyOptPass - The public interface to this file... +/// The public interface to this file... FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", @@ -366,14 +360,15 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false) -/// tryMergingIntoMemset - When scanning forward over instructions, we look for -/// some other patterns to fold away. 
In particular, this looks for stores to -/// neighboring locations of memory. If it sees enough consecutive ones, it -/// attempts to merge them together into a memcpy/memset. +/// When scanning forward over instructions, we look for some other patterns to +/// fold away. In particular, this looks for stores to neighboring locations of +/// memory. If it sees enough consecutive ones, it attempts to merge them +/// together into a memcpy/memset. Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Value *StartPtr, Value *ByteVal) { const DataLayout &DL = StartInst->getModule()->getDataLayout(); @@ -384,7 +379,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // are stored. MemsetRanges Ranges(DL); - BasicBlock::iterator BI = StartInst; + BasicBlock::iterator BI(StartInst); for (++BI; !isa<TerminatorInst>(BI); ++BI) { if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) { // If the instruction is readnone, ignore it, otherwise bail out. We @@ -439,14 +434,12 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, // If we create any memsets, we put it right before the first instruction that // isn't part of the memset block. This ensures that the memset is dominated // by any addressing instruction needed by the start of the block. - IRBuilder<> Builder(BI); + IRBuilder<> Builder(&*BI); // Now that we have full information about ranges, loop over the ranges and // emit memset's for anything big enough to be worthwhile. Instruction *AMemSet = nullptr; - for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); - I != E; ++I) { - const MemsetRange &Range = *I; + for (const MemsetRange &Range : Ranges) { if (Range.TheStores.size() == 1) continue; @@ -470,19 +463,17 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); DEBUG(dbgs() << "Replace stores:\n"; - for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) - dbgs() << *Range.TheStores[i] << '\n'; + for (Instruction *SI : Range.TheStores) + dbgs() << *SI << '\n'; dbgs() << "With: " << *AMemSet << '\n'); if (!Range.TheStores.empty()) AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); // Zap all the stores. - for (SmallVectorImpl<Instruction *>::const_iterator - SI = Range.TheStores.begin(), - SE = Range.TheStores.end(); SI != SE; ++SI) { - MD->removeInstruction(*SI); - (*SI)->eraseFromParent(); + for (Instruction *SI : Range.TheStores) { + MD->removeInstruction(SI); + SI->eraseFromParent(); } ++NumMemSetInfer; } @@ -493,6 +484,16 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; + + // Avoid merging nontemporal stores since the resulting + // memcpy/memset would not be able to preserve the nontemporal hint. + // In theory we could teach this pass how to propagate the !nontemporal + // metadata to memset calls. However, that change would force the backend to + // conservatively expand !nontemporal memset calls back to sequences of + // store instructions (effectively undoing the merging).
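  // Illustration (an editorial sketch, not part of this change): with clang's
  // __builtin_nontemporal_store, the four adjacent stores below carry
  // !nontemporal metadata, so the guard that follows keeps them from being
  // merged into a single memset:
  //   void flush4(int *P) {
  //     __builtin_nontemporal_store(0, P);
  //     __builtin_nontemporal_store(0, P + 1);
  //     __builtin_nontemporal_store(0, P + 2);
  //     __builtin_nontemporal_store(0, P + 3);
  //   }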
+ if (SI->getMetadata(LLVMContext::MD_nontemporal)) + return false; + const DataLayout &DL = SI->getModule()->getDataLayout(); // Detect cases where we're performing call slot forwarding, but @@ -509,11 +510,11 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (C) { // Check that nothing touches the dest of the "copy" between // the call and the store. - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); MemoryLocation StoreLoc = MemoryLocation::get(SI); - for (BasicBlock::iterator I = --BasicBlock::iterator(SI), - E = C; I != E; --I) { - if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) { + for (BasicBlock::iterator I = --SI->getIterator(), E = C->getIterator(); + I != E; --I) { + if (AA.getModRefInfo(&*I, StoreLoc) != MRI_NoModRef) { C = nullptr; break; } @@ -554,7 +555,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (Value *ByteVal = isBytewiseValue(SI->getOperand(0))) if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), ByteVal)) { - BBI = I; // Don't invalidate iterator. + BBI = I->getIterator(); // Don't invalidate iterator. return true; } @@ -567,14 +568,14 @@ bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile()) if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(), MSI->getValue())) { - BBI = I; // Don't invalidate iterator. + BBI = I->getIterator(); // Don't invalidate iterator. return true; } return false; } -/// performCallSlotOptzn - takes a memcpy and a call that it depends on, +/// Takes a memcpy and a call that it depends on, /// and checks for the possibility of a call slot optimization by having /// the call write its result directly into the destination of the memcpy. bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, @@ -710,12 +711,12 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // unexpected manner, for example via a global, which we deduce from // the use analysis, we also need to know that it does not sneakily // access dest. We rely on AA to figure this out for us. - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - AliasAnalysis::ModRefResult MR = AA.getModRefInfo(C, cpyDest, srcSize); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + ModRefInfo MR = AA.getModRefInfo(C, cpyDest, srcSize); // If necessary, perform additional analysis. - if (MR != AliasAnalysis::NoModRef) + if (MR != MRI_NoModRef) MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT); - if (MR != AliasAnalysis::NoModRef) + if (MR != MRI_NoModRef) return false; // All the checks have passed, so do the transformation. @@ -749,11 +750,9 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // Update AA metadata // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be // handled here, but combineMetadata doesn't support them yet - unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - }; + unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_invariant_group}; combineMetadata(C, cpy, KnownIDs); // Remove the memcpy. @@ -763,10 +762,8 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, return true; } -/// processMemCpyMemCpyDependence - We've found that the (upward scanning) -/// memory dependence of memcpy 'M' is the memcpy 'MDep'. 
Try to simplify M to -/// copy from MDep's input if we can. -/// +/// We've found that the (upward scanning) memory dependence of memcpy 'M' is +/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can. bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { // We can only transforms memcpy's where the dest of one is the source of the // other. @@ -788,7 +785,7 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) return false; - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); // Verify that the copied-from memory doesn't change in between the two // transfers. For example, in: @@ -802,8 +799,9 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep) { // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. - MemDepResult SourceDep = MD->getPointerDependencyFrom( - MemoryLocation::getForSource(MDep), false, M, M->getParent()); + MemDepResult SourceDep = + MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, + M->getIterator(), M->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; @@ -860,8 +858,9 @@ bool MemCpyOpt::processMemSetMemCpyDependence(MemCpyInst *MemCpy, return false; // Check that there are no other dependencies on the memset destination. - MemDepResult DstDepInfo = MD->getPointerDependencyFrom( - MemoryLocation::getForDest(MemSet), false, MemCpy, MemCpy->getParent()); + MemDepResult DstDepInfo = + MD->getPointerDependencyFrom(MemoryLocation::getForDest(MemSet), false, + MemCpy->getIterator(), MemCpy->getParent()); if (DstDepInfo.getInst() != MemSet) return false; @@ -936,7 +935,7 @@ bool MemCpyOpt::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, return true; } -/// processMemCpy - perform simplification of memcpy's. If we have memcpy A +/// Perform simplification of memcpy's. If we have memcpy A /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite /// B to be a memcpy from X to Z (or potentially a memmove, depending on /// circumstances). This allows later passes to remove the first memcpy @@ -998,8 +997,8 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { } MemoryLocation SrcLoc = MemoryLocation::getForSource(M); - MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true, - M, M->getParent()); + MemDepResult SrcDepInfo = MD->getPointerDependencyFrom( + SrcLoc, true, M->getIterator(), M->getParent()); if (SrcDepInfo.isClobber()) { if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst())) @@ -1037,10 +1036,10 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { return false; } -/// processMemMove - Transforms memmove calls to memcpy calls when the src/dst -/// are guaranteed not to alias. +/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed +/// not to alias. bool MemCpyOpt::processMemMove(MemMoveInst *M) { - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); if (!TLI->has(LibFunc::memmove)) return false; @@ -1053,12 +1052,11 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n"); // If not, then we know we can transform this. 
- Module *Mod = M->getParent()->getParent()->getParent(); Type *ArgTys[3] = { M->getRawDest()->getType(), M->getRawSource()->getType(), M->getLength()->getType() }; - M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, - ArgTys)); + M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(), + Intrinsic::memcpy, ArgTys)); // MemDep may have over conservative information about this instruction, just // conservatively flush it from the cache. @@ -1068,7 +1066,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { return true; } -/// processByValArgument - This is called on every byval argument in call sites. +/// This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout(); // Find out what feeds this byval argument. @@ -1076,8 +1074,8 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType(); uint64_t ByValSize = DL.getTypeAllocSize(ByValTy); MemDepResult DepInfo = MD->getPointerDependencyFrom( - MemoryLocation(ByValArg, ByValSize), true, CS.getInstruction(), - CS.getInstruction()->getParent()); + MemoryLocation(ByValArg, ByValSize), true, + CS.getInstruction()->getIterator(), CS.getInstruction()->getParent()); if (!DepInfo.isClobber()) return false; @@ -1119,9 +1117,9 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. - MemDepResult SourceDep = - MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, - CS.getInstruction(), MDep->getParent()); + MemDepResult SourceDep = MD->getPointerDependencyFrom( + MemoryLocation::getForSource(MDep), false, + CS.getInstruction()->getIterator(), MDep->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; @@ -1140,7 +1138,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { return true; } -/// iterateOnFunction - Executes one iteration of MemCpyOpt. +/// Executes one iteration of MemCpyOpt. bool MemCpyOpt::iterateOnFunction(Function &F) { bool MadeChange = false; @@ -1148,7 +1146,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) { for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { // Avoid invalidating the iterator. - Instruction *I = BI++; + Instruction *I = &*BI++; bool RepeatInstruction = false; @@ -1177,9 +1175,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { return MadeChange; } -// MemCpyOpt::runOnFunction - This is the main transformation entry point for a -// function. -// +/// This is the main transformation entry point for a function. 
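The two MemCpyOpt rewrites above are easier to see in source form. A minimal standalone C++ sketch (illustrative only, not LLVM source; all names are made up):

    #include <cstring>

    // memcpy forwarding (processMemCpyMemCpyDependence): M copies from the
    // destination of an earlier memcpy MDep. If nothing clobbers 'src'
    // between the two calls, M can copy straight from 'src' instead,
    // often leaving 'tmp' dead.
    void forward_copy(char *dst, char *tmp, const char *src) {
      std::memcpy(tmp, src, 64); // MDep: tmp <- src
      std::memcpy(dst, tmp, 64); // M: dst <- tmp; becomes memcpy(dst, src, 64)
    }

    // memmove -> memcpy (processMemMove): 'buf' is a fresh local, so it
    // cannot alias 'dst', and the memmove can be strengthened to memcpy.
    void move_to_copy(char *dst) {
      char buf[16] = {0};
      std::memmove(dst, buf, sizeof buf);
    }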
bool MemCpyOpt::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 643f374..c812d61 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -78,6 +78,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" @@ -91,6 +92,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include <vector> + using namespace llvm; #define DEBUG_TYPE "mldst-motion" @@ -106,7 +108,7 @@ class MergedLoadStoreMotion : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid - explicit MergedLoadStoreMotion(void) + MergedLoadStoreMotion() : FunctionPass(ID), MD(nullptr), MagicCompileTimeControl(250) { initializeMergedLoadStoreMotionPass(*PassRegistry::getPassRegistry()); } @@ -116,10 +118,11 @@ public: private: // This transformation requires dominator postdominator info void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<MemoryDependenceAnalysis>(); - AU.addPreserved<AliasAnalysis>(); } // Helper routines @@ -156,7 +159,7 @@ private: }; char MergedLoadStoreMotion::ID = 0; -} +} // anonymous namespace /// /// \brief createMergedLoadStoreMotionPass - The public interface to this file. @@ -169,7 +172,8 @@ INITIALIZE_PASS_BEGIN(MergedLoadStoreMotion, "mldst-motion", "MergedLoadStoreMotion", false, false) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(MergedLoadStoreMotion, "mldst-motion", "MergedLoadStoreMotion", false, false) @@ -236,12 +240,11 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) { /// being loaded or protect against the load from happening /// it is considered a hoist barrier. /// - bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(const Instruction& Start, const Instruction& End, LoadInst* LI) { MemoryLocation Loc = MemoryLocation::get(LI); - return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::Mod); + return AA->canInstructionRangeModRef(Start, End, Loc, MRI_Mod); } /// @@ -256,7 +259,7 @@ LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1, for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE; ++BBI) { - Instruction *Inst = BBI; + Instruction *Inst = &*BBI; // Only merge and hoist loads when their result is used only in BB if (!isa<LoadInst>(Inst) || Inst->isUsedOutsideOfBlock(BB1)) @@ -293,7 +296,7 @@ void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB, // Intersect optional metadata.
HoistCand->intersectOptionalDataWith(ElseInst); - HoistCand->dropUnknownMetadata(); + HoistCand->dropUnknownNonDebugMetadata(); // Prepend point for instruction insert Instruction *HoistPt = BB->getTerminator(); @@ -363,8 +366,7 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { int NLoads = 0; for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end(); BBI != BBE;) { - - Instruction *I = BBI; + Instruction *I = &*BBI; ++BBI; // Only move non-simple (atomic, volatile) loads. @@ -394,11 +396,10 @@ bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { /// value being stored or protect against the store from /// happening it is considered a sink barrier. /// - bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start, const Instruction &End, MemoryLocation Loc) { - return AA->canInstructionRangeModRef(Start, End, Loc, AliasAnalysis::ModRef); + return AA->canInstructionRangeModRef(Start, End, Loc, MRI_ModRef); } /// @@ -438,23 +439,16 @@ StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1, PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1) { // Create a phi if the values mismatch. - PHINode *NewPN = 0; + PHINode *NewPN = nullptr; Value *Opd1 = S0->getValueOperand(); Value *Opd2 = S1->getValueOperand(); if (Opd1 != Opd2) { NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink", - BB->begin()); + &BB->front()); NewPN->addIncoming(Opd1, S0->getParent()); NewPN->addIncoming(Opd2, S1->getParent()); - if (NewPN->getType()->getScalarType()->isPointerTy()) { - // AA needs to be informed when a PHI-use of the pointer value is added - for (unsigned I = 0, E = NewPN->getNumIncomingValues(); I != E; ++I) { - unsigned J = PHINode::getOperandNumForIncomingValue(I); - AA->addEscapingUse(NewPN->getOperandUse(J)); - } - if (MD) - MD->invalidateCachedPointerInfo(NewPN); - } + if (MD && NewPN->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(NewPN); } return NewPN; } @@ -479,12 +473,12 @@ bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0, BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); // Intersect optional metadata. S0->intersectOptionalDataWith(S1); - S0->dropUnknownMetadata(); + S0->dropUnknownNonDebugMetadata(); // Create the new store to be inserted at the join point. StoreInst *SNew = (StoreInst *)(S0->clone()); Instruction *ANew = A0->clone(); - SNew->insertBefore(InsertPt); + SNew->insertBefore(&*InsertPt); ANew->insertBefore(SNew); assert(S0->getParent() == A0->getParent()); @@ -566,12 +560,13 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { } return MergedStores; } + /// /// \brief Run the transformation for each function /// bool MergedLoadStoreMotion::runOnFunction(Function &F) { MD = getAnalysisIfAvailable<MemoryDependenceAnalysis>(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool Changed = false; DEBUG(dbgs() << "Instruction Merger\n"); @@ -579,7 +574,7 @@ bool MergedLoadStoreMotion::runOnFunction(Function &F) { // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. 
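For reference, the control-flow shape this pass operates on, as a hedged C++ sketch (illustrative only; the pass works on IR, not source):

    // MergedLoadStoreMotion targets an if/else "diamond": the identical
    // loads of *p in both arms can be hoisted above the branch, and the
    // stores to *q can be sunk below the join, with a phi (getPHIOperand)
    // merging the two stored values.
    int diamond(bool c, int *p, int *q) {
      int v;
      if (c) { v = *p; *q = 1; }
      else   { v = *p; *q = 2; }
      return v;
    }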
for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) { - BasicBlock *BB = FI++; + BasicBlock *BB = &*FI++; // Hoist equivalent loads and sink stores // outside diamonds when possible diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp index f42f830..c8f885e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -71,8 +71,8 @@ // // Limitations and TODO items: // -// 1) We only considers n-ary adds for now. This should be extended and -// generalized. +// 1) We only consider n-ary adds and muls for now. This should be extended +// and generalized. // //===----------------------------------------------------------------------===// @@ -110,11 +110,11 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addPreserved<TargetLibraryInfoWrapperPass>(); AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.setPreservesCFG(); @@ -145,12 +145,23 @@ private: unsigned I, Value *LHS, Value *RHS, Type *IndexedType); - // Reassociate Add for better CSE. - Instruction *tryReassociateAdd(BinaryOperator *I); - // A helper function for tryReassociateAdd. LHS and RHS are explicitly passed. - Instruction *tryReassociateAdd(Value *LHS, Value *RHS, Instruction *I); - // Rewrites I to LHS + RHS if LHS is computed already. - Instruction *tryReassociatedAdd(const SCEV *LHS, Value *RHS, Instruction *I); + // Reassociate binary operators for better CSE. + Instruction *tryReassociateBinaryOp(BinaryOperator *I); + + // A helper function for tryReassociateBinaryOp. LHS and RHS are explicitly + // passed. + Instruction *tryReassociateBinaryOp(Value *LHS, Value *RHS, + BinaryOperator *I); + // Rewrites I to (LHS op RHS) if LHS is computed already. + Instruction *tryReassociatedBinaryOp(const SCEV *LHS, Value *RHS, + BinaryOperator *I); + + // Tries to match Op1 and Op2 by using V. + bool matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1, Value *&Op2); + + // Gets SCEV for (LHS op RHS). + const SCEV *getBinarySCEV(BinaryOperator *I, const SCEV *LHS, + const SCEV *RHS); // Returns the closest dominator of \c Dominatee that computes // \c CandidateExpr. Returns null if not found. @@ -161,11 +172,6 @@ private: // GEP's pointer size, i.e., whether Index needs to be sign-extended in order // to be an index of GEP. bool requiresSignExtension(Value *Index, GetElementPtrInst *GEP); - // Returns whether V is known to be non-negative at context \c Ctxt. - bool isKnownNonNegative(Value *V, Instruction *Ctxt); - // Returns whether AO may sign overflow at context \c Ctxt. It computes a - // conservative result -- it answers true when not sure.
- bool maySignOverflow(AddOperator *AO, Instruction *Ctxt); AssumptionCache *AC; const DataLayout *DL; @@ -182,7 +188,7 @@ private: // foo(a + b); // if (p2) // bar(a + b); - DenseMap<const SCEV *, SmallVector<Instruction *, 2>> SeenExprs; + DenseMap<const SCEV *, SmallVector<WeakVH, 2>> SeenExprs; }; } // anonymous namespace @@ -191,7 +197,7 @@ INITIALIZE_PASS_BEGIN(NaryReassociate, "nary-reassociate", "Nary reassociation", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(NaryReassociate, "nary-reassociate", "Nary reassociation", @@ -207,7 +213,7 @@ bool NaryReassociate::runOnFunction(Function &F) { AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); @@ -224,6 +230,7 @@ static bool isPotentiallyNaryReassociable(Instruction *I) { switch (I->getOpcode()) { case Instruction::Add: case Instruction::GetElementPtr: + case Instruction::Mul: return true; default: return false; @@ -239,19 +246,21 @@ bool NaryReassociate::doOneIteration(Function &F) { Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) { BasicBlock *BB = Node->getBlock(); for (auto I = BB->begin(); I != BB->end(); ++I) { - if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(I)) { - const SCEV *OldSCEV = SE->getSCEV(I); - if (Instruction *NewI = tryReassociate(I)) { + if (SE->isSCEVable(I->getType()) && isPotentiallyNaryReassociable(&*I)) { + const SCEV *OldSCEV = SE->getSCEV(&*I); + if (Instruction *NewI = tryReassociate(&*I)) { Changed = true; - SE->forgetValue(I); + SE->forgetValue(&*I); I->replaceAllUsesWith(NewI); - RecursivelyDeleteTriviallyDeadInstructions(I, TLI); - I = NewI; + // If SeenExprs contains I's WeakVH, that entry will be replaced with + // nullptr. + RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI); + I = NewI->getIterator(); } // Add the rewritten instruction to SeenExprs; the original instruction // is deleted. - const SCEV *NewSCEV = SE->getSCEV(I); - SeenExprs[NewSCEV].push_back(I); + const SCEV *NewSCEV = SE->getSCEV(&*I); + SeenExprs[NewSCEV].push_back(WeakVH(&*I)); // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I) // is equivalent to I. However, ScalarEvolution::getSCEV may // weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose @@ -271,7 +280,7 @@ bool NaryReassociate::doOneIteration(Function &F) { // // This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll.
if (NewSCEV != OldSCEV) - SeenExprs[OldSCEV].push_back(I); + SeenExprs[OldSCEV].push_back(WeakVH(&*I)); } } } @@ -281,7 +290,8 @@ bool NaryReassociate::doOneIteration(Function &F) { Instruction *NaryReassociate::tryReassociate(Instruction *I) { switch (I->getOpcode()) { case Instruction::Add: - return tryReassociateAdd(cast<BinaryOperator>(I)); + case Instruction::Mul: + return tryReassociateBinaryOp(cast<BinaryOperator>(I)); case Instruction::GetElementPtr: return tryReassociateGEP(cast<GetElementPtrInst>(I)); default: @@ -352,27 +362,6 @@ bool NaryReassociate::requiresSignExtension(Value *Index, return cast<IntegerType>(Index->getType())->getBitWidth() < PointerSizeInBits; } -bool NaryReassociate::isKnownNonNegative(Value *V, Instruction *Ctxt) { - bool NonNegative, Negative; - // TODO: ComputeSignBits is expensive. Consider caching the results. - ComputeSignBit(V, NonNegative, Negative, *DL, 0, AC, Ctxt, DT); - return NonNegative; -} - -bool NaryReassociate::maySignOverflow(AddOperator *AO, Instruction *Ctxt) { - if (AO->hasNoSignedWrap()) - return false; - - Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1); - // If LHS or RHS has the same sign as the sum, AO doesn't sign overflow. - // TODO: handle the negative case as well. - if (isKnownNonNegative(AO, Ctxt) && - (isKnownNonNegative(LHS, Ctxt) || isKnownNonNegative(RHS, Ctxt))) - return false; - - return true; -} - GetElementPtrInst * NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I, Type *IndexedType) { @@ -381,7 +370,7 @@ NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I, IndexToSplit = SExt->getOperand(0); } else if (ZExtInst *ZExt = dyn_cast<ZExtInst>(IndexToSplit)) { // zext can be treated as sext if the source is non-negative. - if (isKnownNonNegative(ZExt->getOperand(0), GEP)) + if (isKnownNonNegative(ZExt->getOperand(0), *DL, 0, AC, GEP, DT)) IndexToSplit = ZExt->getOperand(0); } @@ -389,8 +378,11 @@ NaryReassociate::tryReassociateGEPAtIndex(GetElementPtrInst *GEP, unsigned I, // If the I-th index needs sext and the underlying add is not equipped with // nsw, we cannot split the add because // sext(LHS + RHS) != sext(LHS) + sext(RHS). - if (requiresSignExtension(IndexToSplit, GEP) && maySignOverflow(AO, GEP)) + if (requiresSignExtension(IndexToSplit, GEP) && + computeOverflowForSignedAdd(AO, *DL, AC, GEP, DT) != + OverflowResult::NeverOverflows) return nullptr; + Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1); // IndexToSplit = LHS + RHS. if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType)) @@ -415,7 +407,7 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( IndexExprs.push_back(SE->getSCEV(*Index)); // Replace the I-th index with LHS. IndexExprs[I] = SE->getSCEV(LHS); - if (isKnownNonNegative(LHS, GEP) && + if (isKnownNonNegative(LHS, *DL, 0, AC, GEP, DT) && DL->getTypeSizeInBits(LHS->getType()) < DL->getTypeSizeInBits(GEP->getOperand(I)->getType())) { // Zero-extend LHS if it is non-negative. 
InstCombine canonicalizes sext to @@ -429,19 +421,20 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( GEP->getSourceElementType(), SE->getSCEV(GEP->getPointerOperand()), IndexExprs, GEP->isInBounds()); - auto *Candidate = findClosestMatchingDominator(CandidateExpr, GEP); + Value *Candidate = findClosestMatchingDominator(CandidateExpr, GEP); if (Candidate == nullptr) return nullptr; - PointerType *TypeOfCandidate = dyn_cast<PointerType>(Candidate->getType()); - // Pretty rare but theoretically possible when a numeric value happens to - // share CandidateExpr. - if (TypeOfCandidate == nullptr) - return nullptr; + IRBuilder<> Builder(GEP); + // Candidate does not necessarily have the same pointer type as GEP. Use + // bitcast or pointer cast to make sure they have the same type, so that the + // later RAUW doesn't complain. + Candidate = Builder.CreateBitOrPointerCast(Candidate, GEP->getType()); + assert(Candidate->getType() == GEP->getType()); // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType) uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType); - Type *ElementType = TypeOfCandidate->getElementType(); + Type *ElementType = GEP->getType()->getElementType(); uint64_t ElementSize = DL->getTypeAllocSize(ElementType); // Another less rare case: because I is not necessarily the last index of the // GEP, the size of the type at the I-th index (IndexedSize) is not @@ -461,8 +454,7 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( return nullptr; // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0]))); - IRBuilder<> Builder(GEP); - Type *IntPtrTy = DL->getIntPtrType(TypeOfCandidate); + Type *IntPtrTy = DL->getIntPtrType(GEP->getType()); if (RHS->getType() != IntPtrTy) RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy); if (IndexedSize != ElementSize) { @@ -476,54 +468,89 @@ GetElementPtrInst *NaryReassociate::tryReassociateGEPAtIndex( return NewGEP; } -Instruction *NaryReassociate::tryReassociateAdd(BinaryOperator *I) { +Instruction *NaryReassociate::tryReassociateBinaryOp(BinaryOperator *I) { Value *LHS = I->getOperand(0), *RHS = I->getOperand(1); - if (auto *NewI = tryReassociateAdd(LHS, RHS, I)) + if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I)) return NewI; - if (auto *NewI = tryReassociateAdd(RHS, LHS, I)) + if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I)) return NewI; return nullptr; } -Instruction *NaryReassociate::tryReassociateAdd(Value *LHS, Value *RHS, - Instruction *I) { +Instruction *NaryReassociate::tryReassociateBinaryOp(Value *LHS, Value *RHS, + BinaryOperator *I) { Value *A = nullptr, *B = nullptr; - // To be conservative, we reassociate I only when it is the only user of A+B. - if (LHS->hasOneUse() && match(LHS, m_Add(m_Value(A), m_Value(B)))) { - // I = (A + B) + RHS - // = (A + RHS) + B or (B + RHS) + A + // To be conservative, we reassociate I only when it is the only user of (A op + // B). 
+ if (LHS->hasOneUse() && matchTernaryOp(I, LHS, A, B)) { + // I = (A op B) op RHS + // = (A op RHS) op B or (B op RHS) op A const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B); const SCEV *RHSExpr = SE->getSCEV(RHS); if (BExpr != RHSExpr) { - if (auto *NewI = tryReassociatedAdd(SE->getAddExpr(AExpr, RHSExpr), B, I)) + if (auto *NewI = + tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I)) return NewI; } if (AExpr != RHSExpr) { - if (auto *NewI = tryReassociatedAdd(SE->getAddExpr(BExpr, RHSExpr), A, I)) + if (auto *NewI = + tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I)) return NewI; } } return nullptr; } -Instruction *NaryReassociate::tryReassociatedAdd(const SCEV *LHSExpr, - Value *RHS, Instruction *I) { - auto Pos = SeenExprs.find(LHSExpr); - // Bail out if LHSExpr is not previously seen. - if (Pos == SeenExprs.end()) - return nullptr; - +Instruction *NaryReassociate::tryReassociatedBinaryOp(const SCEV *LHSExpr, + Value *RHS, + BinaryOperator *I) { // Look for the closest dominator LHS of I that computes LHSExpr, and replace - // I with LHS + RHS. + // I with LHS op RHS. auto *LHS = findClosestMatchingDominator(LHSExpr, I); if (LHS == nullptr) return nullptr; - Instruction *NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I); + Instruction *NewI = nullptr; + switch (I->getOpcode()) { + case Instruction::Add: + NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I); + break; + case Instruction::Mul: + NewI = BinaryOperator::CreateMul(LHS, RHS, "", I); + break; + default: + llvm_unreachable("Unexpected instruction."); + } NewI->takeName(I); return NewI; } +bool NaryReassociate::matchTernaryOp(BinaryOperator *I, Value *V, Value *&Op1, + Value *&Op2) { + switch (I->getOpcode()) { + case Instruction::Add: + return match(V, m_Add(m_Value(Op1), m_Value(Op2))); + case Instruction::Mul: + return match(V, m_Mul(m_Value(Op1), m_Value(Op2))); + default: + llvm_unreachable("Unexpected instruction."); + } + return false; +} + +const SCEV *NaryReassociate::getBinarySCEV(BinaryOperator *I, const SCEV *LHS, + const SCEV *RHS) { + switch (I->getOpcode()) { + case Instruction::Add: + return SE->getAddExpr(LHS, RHS); + case Instruction::Mul: + return SE->getMulExpr(LHS, RHS); + default: + llvm_unreachable("Unexpected instruction."); + } + return nullptr; +} + Instruction * NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr, Instruction *Dominatee) { @@ -537,9 +564,13 @@ NaryReassociate::findClosestMatchingDominator(const SCEV *CandidateExpr, // future instruction either. Therefore, we pop it out of the stack. This // optimization makes the algorithm O(n). while (!Candidates.empty()) { - Instruction *Candidate = Candidates.back(); - if (DT->dominates(Candidate, Dominatee)) - return Candidate; + // Candidates stores WeakVHs, so a candidate can be nullptr if it's removed + // during rewriting. 
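A hedged sketch of the CSE opportunity this reassociation exposes (plain C++; names are made up):

    // If (a + c) is already computed on a dominating path and recorded in
    // SeenExprs, rewriting (a + b) + c as (a + c) + b lets CSE reuse the
    // earlier sum; with this change the same now applies to muls.
    int nary(int a, int b, int c) {
      int t0 = a + c;       // dominating computation of (a + c)
      int t1 = (a + b) + c; // reassociated to (a + c) + b, i.e. t0 + b
      return t0 + t1;
    }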
+ if (Value *Candidate = Candidates.back()) { + Instruction *CandidateInstruction = cast<Instruction>(Candidate); + if (DT->dominates(CandidateInstruction, Dominatee)) + return CandidateInstruction; + } Candidates.pop_back(); } return nullptr; diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 31d7df3..9f26f78 100644 --- a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -154,7 +154,7 @@ bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call, Phi->addIncoming(Call, &CurrBB); Phi->addIncoming(LibCall, LibCallBB); - BB = JoinBB; + BB = JoinBB->getIterator(); return true; } diff --git a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index 366301a..28c610c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -27,7 +27,7 @@ // well defined state for inspection by the collector. In the current // implementation, this is done via the insertion of poll sites at method entry // and the backedge of most loops. We try to avoid inserting more polls than -// are neccessary to ensure a finite period between poll sites. This is not +// are necessary to ensure a finite period between poll sites. This is not // because the poll itself is expensive in the generated code; it's not. Polls // do tend to impact the optimizer itself in negative ways; we'd like to avoid // perturbing the optimization of the method as much as we can. @@ -91,13 +91,15 @@ STATISTIC(FiniteExecution, "Number of loops w/o safepoints finite execution"); using namespace llvm; -// Ignore oppurtunities to avoid placing safepoints on backedges, useful for +// Ignore opportunities to avoid placing safepoints on backedges, useful for // validation static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden, cl::init(false)); -/// If true, do not place backedge safepoints in counted loops. -static cl::opt<bool> SkipCounted("spp-counted", cl::Hidden, cl::init(true)); +/// How narrow does the trip count of a loop have to be to be considered +/// "counted"? Counted loops do not get safepoints at backedges. +static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width", + cl::Hidden, cl::init(32)); // If true, split the backedge of a loop when placing the safepoint, otherwise // split the latch block itself. Both are useful to support for @@ -121,7 +123,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { std::vector<TerminatorInst *> PollLocations; /// True unless we're running spp-no-calls in which case we need to disable - /// the call dependend placement opts. + /// the call-dependent placement opts.
bool CallSafepointsEnabled; ScalarEvolution *SE = nullptr; @@ -142,7 +144,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { } bool runOnFunction(Function &F) override { - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); for (auto I = LI->begin(), E = LI->end(); I != E; I++) { @@ -153,7 +155,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); // We no longer modify the IR at all in this pass. Thus all // analysis are preserved. @@ -190,10 +192,8 @@ static void InsertSafepointPoll(Instruction *InsertBefore, std::vector<CallSite> &ParsePointsNeeded /*rval*/); -static bool isGCLeafFunction(const CallSite &CS); - static bool needsStatepoint(const CallSite &CS) { - if (isGCLeafFunction(CS)) + if (callsGCLeafFunction(CS)) return false; if (CS.isCall()) { CallInst *call = cast<CallInst>(CS.getInstruction()); @@ -206,7 +206,7 @@ static bool needsStatepoint(const CallSite &CS) { return true; } -static Value *ReplaceWithStatepoint(const CallSite &CS, Pass *P); +static Value *ReplaceWithStatepoint(const CallSite &CS); /// Returns true if this loop is known to contain a call safepoint which /// must unconditionally execute on any iteration of the loop which returns @@ -220,7 +220,7 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, // For the moment, we look only for the 'cuts' that consist of a single call // instruction in a block which is dominated by the Header and dominates the // loop latch (Pred) block. Somewhat surprisingly, walking the entire chain - // of such dominating blocks gets substaintially more occurences than just + // of such dominating blocks gets substantially more occurrences than just // checking the Pred and Header blocks themselves. This may be due to the // density of loop exit conditions caused by range and null checks. // TODO: structure this as an analysis pass, cache the result for subloops, @@ -255,18 +255,12 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, /// conservatism in the analysis. static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE, BasicBlock *Pred) { - // Only used when SkipCounted is off - const unsigned upperTripBound = 8192; - // A conservative bound on the loop as a whole. const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L); - if (MaxTrips != SE->getCouldNotCompute()) { - if (SE->getUnsignedRange(MaxTrips).getUnsignedMax().ult(upperTripBound)) - return true; - if (SkipCounted && - SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(32)) - return true; - } + if (MaxTrips != SE->getCouldNotCompute() && + SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN( + CountedLoopTripWidth)) + return true; // If this is a conditional branch to the header with the alternate path // being outside the loop, we can ask questions about the execution frequency @@ -275,13 +269,10 @@ static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE, // This returns an exact expression only. TODO: We really only need an // upper bound here, but SE doesn't expose that. 
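To make the "counted loop" criterion concrete, a hedged sketch assuming the default -spp-counted-loop-trip-width=32 introduced above:

    #include <cstdint>

    // The backedge-taken count of this loop provably fits in 32 bits, so
    // mustBeFiniteCountedLoop returns true and no safepoint poll is placed
    // on the backedge; only the method-entry poll remains.
    void counted(int *a, uint32_t n) {
      for (uint32_t i = 0; i < n; ++i)
        a[i] = 0;
    }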
const SCEV *MaxExec = SE->getExitCount(L, Pred); - if (MaxExec != SE->getCouldNotCompute()) { - if (SE->getUnsignedRange(MaxExec).getUnsignedMax().ult(upperTripBound)) - return true; - if (SkipCounted && - SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(32)) + if (MaxExec != SE->getCouldNotCompute() && + SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN( + CountedLoopTripWidth)) return true; - } } return /* not finite */ false; @@ -432,14 +423,14 @@ static Instruction *findLocationForEntrySafepoint(Function &F, assert(hasNextInstruction(I) && "first check if there is a next instruction!"); if (I->isTerminator()) { - return I->getParent()->getUniqueSuccessor()->begin(); + return &I->getParent()->getUniqueSuccessor()->front(); } else { - return std::next(BasicBlock::iterator(I)); + return &*++I->getIterator(); } }; Instruction *cursor = nullptr; - for (cursor = F.getEntryBlock().begin(); hasNextInstruction(cursor); + for (cursor = &F.getEntryBlock().front(); hasNextInstruction(cursor); cursor = nextInstruction(cursor)) { // We need to ensure a safepoint poll occurs before any 'real' call. The @@ -466,7 +457,7 @@ static Instruction *findLocationForEntrySafepoint(Function &F, static void findCallSafepoints(Function &F, std::vector<CallSite> &Found /*rval*/) { assert(Found.empty() && "must be empty!"); - for (Instruction &I : inst_range(F)) { + for (Instruction &I : instructions(F)) { Instruction *inst = &I; if (isa<CallInst>(inst) || isa<InvokeInst>(inst)) { CallSite CS(inst); @@ -713,7 +704,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) { Invoke->getParent()); } - Value *GCResult = ReplaceWithStatepoint(CS, nullptr); + Value *GCResult = ReplaceWithStatepoint(CS); Results.push_back(GCResult); } assert(Results.size() == ParsePointNeeded.size()); @@ -747,7 +738,7 @@ FunctionPass *llvm::createPlaceSafepointsPass() { INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl, "place-backedge-safepoints-impl", "Place Backedge Safepoints", false, false) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl, @@ -759,31 +750,6 @@ INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints", INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints", false, false) -static bool isGCLeafFunction(const CallSite &CS) { - Instruction *inst = CS.getInstruction(); - if (isa<IntrinsicInst>(inst)) { - // Most LLVM intrinsics are things which can never take a safepoint. - // As a result, we don't need to have the stack parsable at the - // callsite. This is a highly useful optimization since intrinsic - // calls are fairly prevelent, particularly in debug builds. - return true; - } - - // If this function is marked explicitly as a leaf call, we don't need to - // place a safepoint of it. In fact, for correctness we *can't* in many - // cases. Note: Indirect calls return Null for the called function, - // these obviously aren't runtime functions with attributes - // TODO: Support attributes on the call site as well. 
- const Function *F = CS.getCalledFunction(); - bool isLeaf = - F && - F->getFnAttribute("gc-leaf-function").getValueAsString().equals("true"); - if (isLeaf) { - return true; - } - return false; -} - static void InsertSafepointPoll(Instruction *InsertBefore, std::vector<CallSite> &ParsePointsNeeded /*rval*/) { @@ -796,6 +762,7 @@ InsertSafepointPoll(Instruction *InsertBefore, // path call - where we need to insert a safepoint (parsepoint). auto *F = M->getFunction(GCSafepointPollName); + assert(F && "gc.safepoint_poll function is missing"); assert(F->getType()->getElementType() == FunctionType::get(Type::getVoidTy(M->getContext()), false) && "gc.safepoint_poll declared with wrong type"); @@ -864,10 +831,8 @@ InsertSafepointPoll(Instruction *InsertBefore, /// Replaces the given call site (Call or Invoke) with a gc.statepoint /// intrinsic with an empty deoptimization arguments list. This does /// NOT do explicit relocation for GC support. -static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ - Pass *P) { - assert(CS.getInstruction()->getParent()->getParent()->getParent() && - "must be set"); +static Value *ReplaceWithStatepoint(const CallSite &CS /* to replace */) { + assert(CS.getInstruction()->getModule() && "must be set"); // TODO: technically, a pass is not allowed to get functions from within a // function pass since it might trigger a new function addition. Refactor @@ -917,15 +882,10 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ CS.getInstruction()->getContext(), AttributeSet::FunctionIndex, AttrsToRemove); - Value *StatepointTarget = NumPatchBytes == 0 - ? CS.getCalledValue() - : ConstantPointerNull::get(cast<PointerType>( - CS.getCalledValue()->getType())); - if (CS.isCall()) { CallInst *ToReplace = cast<CallInst>(CS.getInstruction()); CallInst *Call = Builder.CreateGCStatepointCall( - ID, NumPatchBytes, StatepointTarget, + ID, NumPatchBytes, CS.getCalledValue(), makeArrayRef(CS.arg_begin(), CS.arg_end()), None, None, "safepoint_token"); Call->setTailCall(ToReplace->isTailCall()); @@ -938,7 +898,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ Token = Call; - // Put the following gc_result and gc_relocate calls immediately after the + // Put the following gc_result and gc_relocate calls immediately after // the old call (which we're about to delete). assert(ToReplace->getNextNode() && "not a terminator, must have next"); Builder.SetInsertPoint(ToReplace->getNextNode()); @@ -951,7 +911,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ // original block. Builder.SetInsertPoint(ToReplace->getParent()); InvokeInst *Invoke = Builder.CreateGCStatepointInvoke( - ID, NumPatchBytes, StatepointTarget, ToReplace->getNormalDest(), + ID, NumPatchBytes, CS.getCalledValue(), ToReplace->getNormalDest(), ToReplace->getUnwindDest(), makeArrayRef(CS.arg_begin(), CS.arg_end()), None, None, "safepoint_token"); @@ -967,7 +927,7 @@ static Value *ReplaceWithStatepoint(const CallSite &CS, /* to replace */ // We'll insert the gc.result into the normal block BasicBlock *NormalDest = ToReplace->getNormalDest(); // Can not insert gc.result in case of phi nodes present.
- // Should have removed this cases prior to runnning this function + // Should have removed these cases prior to running this function assert(!isa<PHINode>(NormalDest->begin())); Instruction *IP = &*(NormalDest->getFirstInsertionPt()); Builder.SetInsertPoint(IP); diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp index d1acf78..fb970c7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -26,6 +26,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -62,7 +64,7 @@ namespace { /// Print out the expression identified in the Ops list. /// static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { - Module *M = I->getParent()->getParent()->getParent(); + Module *M = I->getModule(); dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " " << *Ops[0].Op->getType() << '\t'; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { @@ -82,20 +84,6 @@ namespace { Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {} - /// \brief Sort factors by their Base. - struct BaseSorter { - bool operator()(const Factor &LHS, const Factor &RHS) { - return LHS.Base < RHS.Base; - } - }; - - /// \brief Compare factors for equal bases. - struct BaseEqual { - bool operator()(const Factor &LHS, const Factor &RHS) { - return LHS.Base == RHS.Base; - } - }; - /// \brief Sort factors in descending order by their power. struct PowerDescendingSorter { bool operator()(const Factor &LHS, const Factor &RHS) { @@ -172,6 +160,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addPreserved<GlobalsAAWrapperPass>(); } private: void BuildRankMap(Function &F); @@ -255,27 +244,6 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1, return nullptr; } -static bool isUnmovableInstruction(Instruction *I) { - switch (I->getOpcode()) { - case Instruction::PHI: - case Instruction::LandingPad: - case Instruction::Alloca: - case Instruction::Load: - case Instruction::Invoke: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - return true; - case Instruction::Call: - return !isa<DbgInfoIntrinsic>(I); - default: - return false; - } -} - void Reassociate::BuildRankMap(Function &F) { unsigned i = 2; @@ -295,7 +263,7 @@ void Reassociate::BuildRankMap(Function &F) { // we cannot move. This ensures that the ranks for these instructions are // all different in the block. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) - if (isUnmovableInstruction(I)) + if (mayBeMemoryDependent(*I)) ValueRankMap[&*I] = ++BBRank; } } @@ -913,7 +881,11 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, /// that computes the negative version of the value specified. The negative /// version of the value is returned, and BI is left pointing at the instruction /// that should be processed next by the reassociation pass. -static Value *NegateValue(Value *V, Instruction *BI) { +/// Also add intermediate instructions to the redo list that are modified while +/// pushing the negates through adds. These will be revisited to see if +/// additional opportunities have been exposed.
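A hedged C++ sketch of the subtract split described in the comment above (illustrative only):

    // BreakUpSubtract rewrites x - y as x + (-y), turning the expression
    // into one add tree, (x + (-y)) + z, that rank-based reassociation can
    // reorder; the intermediate negates are pushed onto the redo list.
    int break_sub(int x, int y, int z) {
      return (x - y) + z;
    }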
+static Value *NegateValue(Value *V, Instruction *BI, + SetVector<AssertingVH<Instruction>> &ToRedo) { if (Constant *C = dyn_cast<Constant>(V)) { if (C->getType()->isFPOrFPVectorTy()) { return ConstantExpr::getFNeg(C); @@ -934,8 +906,8 @@ static Value *NegateValue(Value *V, Instruction *BI) { if (BinaryOperator *I = isReassociableOp(V, Instruction::Add, Instruction::FAdd)) { // Push the negates through the add. - I->setOperand(0, NegateValue(I->getOperand(0), BI)); - I->setOperand(1, NegateValue(I->getOperand(1), BI)); + I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo)); + I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo)); if (I->getOpcode() == Instruction::Add) { I->setHasNoUnsignedWrap(false); I->setHasNoSignedWrap(false); @@ -948,6 +920,10 @@ static Value *NegateValue(Value *V, Instruction *BI) { // I->moveBefore(BI); I->setName(I->getName()+".neg"); + + // Add the intermediate negates to the redo list as processing them later + // could expose more reassociating opportunities. + ToRedo.insert(I); return I; } @@ -972,26 +948,28 @@ static Value *NegateValue(Value *V, Instruction *BI) { if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) { InsertPt = II->getNormalDest()->begin(); } else { - InsertPt = InstInput; - ++InsertPt; + InsertPt = ++InstInput->getIterator(); } while (isa<PHINode>(InsertPt)) ++InsertPt; } else { InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin(); } - TheNeg->moveBefore(InsertPt); + TheNeg->moveBefore(&*InsertPt); if (TheNeg->getOpcode() == Instruction::Sub) { TheNeg->setHasNoUnsignedWrap(false); TheNeg->setHasNoSignedWrap(false); } else { TheNeg->andIRFlags(BI); } + ToRedo.insert(TheNeg); return TheNeg; } // Insert a 'neg' instruction that subtracts the value from zero to get the // negation. - return CreateNeg(V, V->getName() + ".neg", BI, BI); + BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI); + ToRedo.insert(NewNeg); + return NewNeg; } /// Return true if we should break up this subtract of X-Y into (X + -Y). @@ -1025,14 +1003,15 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { /// If we have (X-Y), and if either X is an add, or if this is only used by an /// add, transform this into (X+(0-Y)) to promote better reassociation. -static BinaryOperator *BreakUpSubtract(Instruction *Sub) { +static BinaryOperator * +BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) { // Convert a subtract into an add and a neg instruction. This allows sub // instructions to be commuted with other add instructions. // // Calculate the negative value of Operand 1 of the sub instruction, // and set it as the RHS of the add instruction we just made. // - Value *NegVal = NegateValue(Sub->getOperand(1), Sub); + Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo); BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub); Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op. Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op. @@ -1166,7 +1145,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { return nullptr; } - BasicBlock::iterator InsertPt = BO; ++InsertPt; + BasicBlock::iterator InsertPt = ++BO->getIterator(); // If this was just a single multiply, remove the multiply and return the only // remaining operand. 
@@ -1179,7 +1158,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { } if (NeedsNegate) - V = CreateNeg(V, "neg", InsertPt, BO); + V = CreateNeg(V, "neg", &*InsertPt, BO); return V; } @@ -1250,7 +1229,7 @@ static Value *OptimizeAndOrXor(unsigned Opcode, return nullptr; } -/// Helper funciton of CombineXorOpnd(). It creates a bitwise-and +/// Helper function of CombineXorOpnd(). It creates a bitwise-and /// instruction with the given two operands, and return the resulting /// instruction. There are two special cases: 1) if the constant operand is 0, /// it will return NULL. 2) if the constant is ~0, the symbolic operand will @@ -2083,7 +2062,7 @@ void Reassociate::OptimizeInst(Instruction *I) { return; // Don't optimize floating point instructions that don't have unsafe algebra. - if (I->getType()->isFloatingPointTy() && !I->hasUnsafeAlgebra()) + if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra()) return; // Do not reassociate boolean (i1) expressions. We want to preserve the @@ -2099,7 +2078,7 @@ void Reassociate::OptimizeInst(Instruction *I) { // see if we can convert it to X+-Y. if (I->getOpcode() == Instruction::Sub) { if (ShouldBreakUpSubtract(I)) { - Instruction *NI = BreakUpSubtract(I); + Instruction *NI = BreakUpSubtract(I, RedoInsts); RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2110,6 +2089,12 @@ void Reassociate::OptimizeInst(Instruction *I) { (!I->hasOneUse() || !isReassociableOp(I->user_back(), Instruction::Mul))) { Instruction *NI = LowerNegateToMultiply(I); + // If the negate was simplified, revisit the users to see if we can + // reassociate further. + for (User *U : NI->users()) { + if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U)) + RedoInsts.insert(Tmp); + } RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2117,7 +2102,7 @@ void Reassociate::OptimizeInst(Instruction *I) { } } else if (I->getOpcode() == Instruction::FSub) { if (ShouldBreakUpSubtract(I)) { - Instruction *NI = BreakUpSubtract(I); + Instruction *NI = BreakUpSubtract(I, RedoInsts); RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2127,7 +2112,13 @@ void Reassociate::OptimizeInst(Instruction *I) { if (isReassociableOp(I->getOperand(1), Instruction::FMul) && (!I->hasOneUse() || !isReassociableOp(I->user_back(), Instruction::FMul))) { + // If the negate was simplified, revisit the users to see if we can + // reassociate further. Instruction *NI = LowerNegateToMultiply(I); + for (User *U : NI->users()) { + if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U)) + RedoInsts.insert(Tmp); + } RedoInsts.insert(I); MadeChange = true; I = NI; @@ -2142,8 +2133,14 @@ void Reassociate::OptimizeInst(Instruction *I) { // If this is an interior node of a reassociable tree, ignore it until we // get to the root of the tree, to avoid N^2 analysis. unsigned Opcode = BO->getOpcode(); - if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) + if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) { + // During the initial run we will get to the root of the tree. + // But if we get here while we are redoing instructions, there is no + // guarantee that the root will be visited. So Redo later + if (BO->user_back() != BO) + RedoInsts.insert(BO->user_back()); return; + } // If this is an add tree that is used by a sub instruction, ignore it // until we process the subtract. @@ -2250,10 +2247,10 @@ bool Reassociate::runOnFunction(Function &F) { for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { // Optimize every instruction in the basic block. 
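The negate-to-multiply rewrite referred to above, as a hedged sketch (illustrative C++ only):

    // LowerNegateToMultiply turns 0 - x into x * -1 so the negation can
    // join the surrounding multiply tree; users of the new multiply are
    // re-queued via RedoInsts in case more reassociation is exposed.
    int neg_mul(int x, int y) {
      return (0 - x) * y; // becomes (x * -1) * y, reassociable with y
    }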
for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ) - if (isInstructionTriviallyDead(II)) { - EraseInst(II++); + if (isInstructionTriviallyDead(&*II)) { + EraseInst(&*II++); } else { - OptimizeInst(II); + OptimizeInst(&*II); assert(II->getParent() == BI && "Moved to a different block!"); ++II; } diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 1b46727..915f897 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -82,10 +82,9 @@ bool RegToMem::runOnFunction(Function &F) { BasicBlock::iterator I = BBEntry->begin(); while (isa<AllocaInst>(I)) ++I; - CastInst *AllocaInsertionPoint = - new BitCastInst(Constant::getNullValue(Type::getInt32Ty(F.getContext())), - Type::getInt32Ty(F.getContext()), - "reg2mem alloca point", I); + CastInst *AllocaInsertionPoint = new BitCastInst( + Constant::getNullValue(Type::getInt32Ty(F.getContext())), + Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I); // Find the escaped instructions. But don't create stack slots for // allocas in entry block. @@ -95,7 +94,7 @@ bool RegToMem::runOnFunction(Function &F) { for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end(); iib != iie; ++iib) { if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) && - valueEscapes(iib)) { + valueEscapes(&*iib)) { WorkList.push_front(&*iib); } } diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index ae2ae3a..db127c3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -14,12 +14,14 @@ #include "llvm/Pass.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/MapVector.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Dominators.h" @@ -46,10 +48,6 @@ using namespace llvm; -// Print tracing output -static cl::opt<bool> TraceLSP("trace-rewrite-statepoints", cl::Hidden, - cl::init(false)); - // Print the liveset found at the insert location static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden, cl::init(false)); @@ -74,6 +72,12 @@ static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live", cl::location(ClobberNonLive), cl::Hidden); +static cl::opt<bool> UseDeoptBundles("rs4gc-use-deopt-bundles", cl::Hidden, + cl::init(false)); +static cl::opt<bool> + AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info", + cl::Hidden, cl::init(true)); + namespace { struct RewriteStatepointsForGC : public ModulePass { static char ID; // Pass identification, replacement for typeid @@ -88,10 +92,10 @@ struct RewriteStatepointsForGC : public ModulePass { Changed |= runOnFunction(F); if (Changed) { - // stripDereferenceabilityInfo asserts that shouldRewriteStatepointsIn + // stripNonValidAttributes asserts that shouldRewriteStatepointsIn // returns true for at least one function in the module. Since at least // one function changed, we know that the precondition is satisfied. 
- stripDereferenceabilityInfo(M); + stripNonValidAttributes(M); } return Changed; @@ -108,15 +112,16 @@ struct RewriteStatepointsForGC : public ModulePass { /// dereferenceability that are no longer valid/correct after /// RewriteStatepointsForGC has run. This is because semantically, after /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire - /// heap. stripDereferenceabilityInfo (conservatively) restores correctness + /// heap. stripNonValidAttributes (conservatively) restores correctness /// by erasing all attributes in the module that externally imply /// dereferenceability. - /// - void stripDereferenceabilityInfo(Module &M); + /// Similar reasoning also applies to the noalias attributes. gc.statepoint + /// can touch the entire heap including noalias objects. + void stripNonValidAttributes(Module &M); - // Helpers for stripDereferenceabilityInfo - void stripDereferenceabilityInfoFromBody(Function &F); - void stripDereferenceabilityInfoFromPrototype(Function &F); + // Helpers for stripNonValidAttributes + void stripNonValidAttributesFromBody(Function &F); + void stripNonValidAttributesFromPrototype(Function &F); }; } // namespace @@ -160,15 +165,16 @@ struct GCPtrLivenessData { // base relation will remain. Internally, we add a mixture of the two // types, then update all the second type to the first type typedef DenseMap<Value *, Value *> DefiningValueMapTy; -typedef DenseSet<llvm::Value *> StatepointLiveSetTy; -typedef DenseMap<Instruction *, Value *> RematerializedValueMapTy; +typedef DenseSet<Value *> StatepointLiveSetTy; +typedef DenseMap<AssertingVH<Instruction>, AssertingVH<Value>> + RematerializedValueMapTy; struct PartiallyConstructedSafepointRecord { - /// The set of values known to be live accross this safepoint - StatepointLiveSetTy liveset; + /// The set of values known to be live across this safepoint + StatepointLiveSetTy LiveSet; /// Mapping from live pointers to a base-defining-value - DenseMap<llvm::Value *, llvm::Value *> PointerToBase; + DenseMap<Value *, Value *> PointerToBase; /// The *new* gc.statepoint instruction itself. This produces the token /// that normal path gc.relocates and the gc.result are tied to. @@ -179,12 +185,26 @@ struct PartiallyConstructedSafepointRecord { Instruction *UnwindToken; /// Record live values we rematerialize instead of relocating. - /// They are not included into 'liveset' field. + /// They are not included into 'LiveSet' field. /// Maps rematerialized copy to its original value.
RematerializedValueMapTy RematerializedValues; }; } +static ArrayRef<Use> GetDeoptBundleOperands(ImmutableCallSite CS) { + assert(UseDeoptBundles && "Should not be called otherwise!"); + + Optional<OperandBundleUse> DeoptBundle = CS.getOperandBundle("deopt"); + + if (!DeoptBundle.hasValue()) { + assert(AllowStatepointWithNoDeoptInfo && + "Found non-leaf call without deopt info!"); + return None; + } + + return DeoptBundle.getValue().Inputs; +} + /// Compute the live-in set for every basic block in the function static void computeLiveInValues(DominatorTree &DT, Function &F, GCPtrLivenessData &Data); @@ -195,10 +215,10 @@ static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data, StatepointLiveSetTy &out); // TODO: Once we can get to the GCStrategy, this becomes -// Optional<bool> isGCManagedPointer(const Value *V) const override { +// Optional<bool> isGCManagedPointer(const Type *Ty) const override { -static bool isGCPointerType(const Type *T) { - if (const PointerType *PT = dyn_cast<PointerType>(T)) +static bool isGCPointerType(Type *T) { + if (auto *PT = dyn_cast<PointerType>(T)) // For the sake of this example GC, we arbitrarily pick addrspace(1) as our // GC managed heap. We know that a pointer into this heap needs to be // updated and that no other pointer does. @@ -233,9 +253,8 @@ static bool containsGCPtrType(Type *Ty) { if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) return containsGCPtrType(AT->getElementType()); if (StructType *ST = dyn_cast<StructType>(Ty)) - return std::any_of( - ST->subtypes().begin(), ST->subtypes().end(), - [](Type *SubType) { return containsGCPtrType(SubType); }); + return std::any_of(ST->subtypes().begin(), ST->subtypes().end(), + containsGCPtrType); return false; } @@ -247,7 +266,7 @@ static bool isUnhandledGCPointerType(Type *Ty) { } #endif -static bool order_by_name(llvm::Value *a, llvm::Value *b) { +static bool order_by_name(Value *a, Value *b) { if (a->hasName() && b->hasName()) { return -1 == a->getName().compare(b->getName()); } else if (a->hasName() && !b->hasName()) { @@ -260,6 +279,13 @@ static bool order_by_name(llvm::Value *a, llvm::Value *b) { } } +// Return the name of the value suffixed with the provided value, or if the +// value didn't have a name, the default value specified. +static std::string suffixed_name_or(Value *V, StringRef Suffix, + StringRef DefaultName) { + return V->hasName() ? (V->getName() + Suffix).str() : DefaultName.str(); +} + // Conservatively identifies any definitions which might be live at the // given instruction. The analysis is performed immediately before the // given instruction. 
Values defined by that instruction are not considered @@ -269,30 +295,56 @@ static void analyzeParsePointLiveness( const CallSite &CS, PartiallyConstructedSafepointRecord &result) { Instruction *inst = CS.getInstruction(); - StatepointLiveSetTy liveset; - findLiveSetAtInst(inst, OriginalLivenessData, liveset); + StatepointLiveSetTy LiveSet; + findLiveSetAtInst(inst, OriginalLivenessData, LiveSet); if (PrintLiveSet) { // Note: This output is used by several of the test cases - // The order of elemtns in a set is not stable, put them in a vec and sort + // The order of elements in a set is not stable, put them in a vec and sort // by name - SmallVector<Value *, 64> temp; - temp.insert(temp.end(), liveset.begin(), liveset.end()); - std::sort(temp.begin(), temp.end(), order_by_name); + SmallVector<Value *, 64> Temp; + Temp.insert(Temp.end(), LiveSet.begin(), LiveSet.end()); + std::sort(Temp.begin(), Temp.end(), order_by_name); errs() << "Live Variables:\n"; - for (Value *V : temp) { - errs() << " " << V->getName(); // no newline - V->dump(); - } + for (Value *V : Temp) + dbgs() << " " << V->getName() << " " << *V << "\n"; } if (PrintLiveSetSize) { errs() << "Safepoint For: " << CS.getCalledValue()->getName() << "\n"; - errs() << "Number live values: " << liveset.size() << "\n"; + errs() << "Number live values: " << LiveSet.size() << "\n"; + } + result.LiveSet = LiveSet; +} + +static bool isKnownBaseResult(Value *V); +namespace { +/// A single base defining value - An immediate base defining value for an +/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'. +/// For instructions which have multiple pointer [vector] inputs or that +/// transition between vector and scalar types, there is no immediate base +/// defining value. The 'base defining value' for 'Def' is the transitive +/// closure of this relation stopping at the first instruction which has no +/// immediate base defining value. The b.d.v. might itself be a base pointer, +/// but it can also be an arbitrary derived pointer. +struct BaseDefiningValueResult { + /// Contains the value which is the base defining value. + Value * const BDV; + /// True if the base defining value is also known to be an actual base + /// pointer. + const bool IsKnownBase; + BaseDefiningValueResult(Value *BDV, bool IsKnownBase) + : BDV(BDV), IsKnownBase(IsKnownBase) { +#ifndef NDEBUG + // Check consistency between new and old means of checking whether a BDV is + // a base. + bool MustBeBase = isKnownBaseResult(BDV); + assert(!MustBeBase || MustBeBase == IsKnownBase); +#endif } - result.liveset = liveset; +}; } -static Value *findBaseDefiningValue(Value *I); /// Return a base defining value for the 'Index' element of the given vector /// instruction 'I'. If Index is null, returns a BDV for the entire vector /// 'I'. As an optimization, this method will try to determine when the /// element is known to already be a base pointer. If this can be established, the second /// value in the returned pair will be true. Note that either a vector or a /// pointer typed value can be returned. For the former, the /// vector returned is a BDV (and possibly a base) of the entire vector 'I'. /// If the latter, the return pointer is a BDV (or possibly a base) for the /// particular element in 'I'.
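A hedged illustration of base versus derived pointers (plain C++ analogy; the pass actually operates on addrspace(1) GC pointers in IR):

    // 'derived' is an interior pointer into the same object as 'base'.
    // findBaseDefiningValue walks from 'derived' back toward 'base' so
    // that, if both are live across a statepoint, they can be relocated
    // consistently as a (base, derived) pair.
    int *derived_example(int *base) {
      int *derived = base + 16; // assumes the object has at least 17 ints
      return derived;
    }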
-static std::pair<Value *, bool> -findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { +static BaseDefiningValueResult +findBaseDefiningValueOfVector(Value *I) { assert(I->getType()->isVectorTy() && cast<VectorType>(I->getType())->getElementType()->isPointerTy() && "Illegal to ask for the base pointer of a non-pointer type"); @@ -314,7 +366,7 @@ findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { if (isa<Argument>(I)) // An incoming argument to the function is a base pointer - return std::make_pair(I, true); + return BaseDefiningValueResult(I, true); // We shouldn't see the address of a global as a vector value? assert(!isa<GlobalVariable>(I) && @@ -325,7 +377,7 @@ findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { if (isa<UndefValue>(I)) // utterly meaningless, but useful for dealing with partially optimized // code. - return std::make_pair(I, true); + return BaseDefiningValueResult(I, true); // Due to inheritance, this must be _after_ the global variable and undef // checks @@ -333,31 +385,17 @@ findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) && "order of checks wrong!"); assert(Con->isNullValue() && "null is the only case which makes sense"); - return std::make_pair(Con, true); + return BaseDefiningValueResult(Con, true); } if (isa<LoadInst>(I)) - return std::make_pair(I, true); - - // For an insert element, we might be able to look through it if we know - // something about the indexes. - if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(I)) { - if (Index) { - Value *InsertIndex = IEI->getOperand(2); - // This index is inserting the value, look for its BDV - if (InsertIndex == Index) - return std::make_pair(findBaseDefiningValue(IEI->getOperand(1)), false); - // Both constant, and can't be equal per above. This insert is definitely - // not relevant, look back at the rest of the vector and keep trying. - if (isa<ConstantInt>(Index) && isa<ConstantInt>(InsertIndex)) - return findBaseDefiningValueOfVector(IEI->getOperand(0), Index); - } - + return BaseDefiningValueResult(I, true); + + if (isa<InsertElementInst>(I)) // We don't know whether this vector contains entirely base pointers or // not. To be conservatively correct, we treat it as a BDV and will // duplicate code as needed to construct a parallel vector of bases. - return std::make_pair(IEI, false); - } + return BaseDefiningValueResult(I, false); if (isa<ShuffleVectorInst>(I)) // We don't know whether this vector contains entirely base pointers or @@ -365,105 +403,62 @@ findBaseDefiningValueOfVector(Value *I, Value *Index = nullptr) { // duplicate code as needed to construct a parallel vector of bases. // TODO: There are a number of local optimizations which could be applied here // for particular shufflevector patterns. - return std::make_pair(I, false); + return BaseDefiningValueResult(I, false); // A PHI or Select is a base defining value. The outer findBasePointer // algorithm is responsible for constructing a base value for this BDV. assert((isa<SelectInst>(I) || isa<PHINode>(I)) && "unknown vector instruction - no base found for vector element"); - return std::make_pair(I, false); + return BaseDefiningValueResult(I, false); } -static bool isKnownBaseResult(Value *V); - /// Helper function for findBasePointer - Will return a value which either a) -/// defines the base pointer for the input or b) blocks the simple search -/// (i.e.
a PHI or Select of two derived pointers) -static Value *findBaseDefiningValue(Value *I) { +/// defines the base pointer for the input, b) blocks the simple search +/// (i.e. a PHI or Select of two derived pointers), or c) involves a change +/// from pointer to vector type or back. +static BaseDefiningValueResult findBaseDefiningValue(Value *I) { if (I->getType()->isVectorTy()) - return findBaseDefiningValueOfVector(I).first; + return findBaseDefiningValueOfVector(I); assert(I->getType()->isPointerTy() && "Illegal to ask for the base pointer of a non-pointer type"); - // This case is a bit of a hack - it only handles extracts from vectors which - // trivially contain only base pointers or cases where we can directly match - // the index of the original extract element to an insertion into the vector. - // See note inside the function for how to improve this. - if (auto *EEI = dyn_cast<ExtractElementInst>(I)) { - Value *VectorOperand = EEI->getVectorOperand(); - Value *Index = EEI->getIndexOperand(); - std::pair<Value *, bool> pair = - findBaseDefiningValueOfVector(VectorOperand, Index); - Value *VectorBase = pair.first; - if (VectorBase->getType()->isPointerTy()) - // We found a BDV for this specific element with the vector. This is an - // optimization, but in practice it covers most of the useful cases - // created via scalarization. - return VectorBase; - else { - assert(VectorBase->getType()->isVectorTy()); - if (pair.second) - // If the entire vector returned is known to be entirely base pointers, - // then the extractelement is valid base for this value. - return EEI; - else { - // Otherwise, we have an instruction which potentially produces a - // derived pointer and we need findBasePointers to clone code for us - // such that we can create an instruction which produces the - // accompanying base pointer. - // Note: This code is currently rather incomplete. We don't currently - // support the general form of shufflevector of insertelement. - // Conceptually, these are just 'base defining values' of the same - // variety as phi or select instructions. We need to update the - // findBasePointers algorithm to insert new 'base-only' versions of the - // original instructions. This is relative straight forward to do, but - // the case which would motivate the work hasn't shown up in real - // workloads yet. - assert((isa<PHINode>(VectorBase) || isa<SelectInst>(VectorBase)) && - "need to extend findBasePointers for generic vector" - "instruction cases"); - return VectorBase; - } - } - } - if (isa<Argument>(I)) // An incoming argument to the function is a base pointer // We should have never reached here if this argument isn't a gc value - return I; + return BaseDefiningValueResult(I, true); if (isa<GlobalVariable>(I)) // base case - return I; + return BaseDefiningValueResult(I, true); // inlining could possibly introduce a phi node that contains // undef if callee has multiple returns if (isa<UndefValue>(I)) // utterly meaningless, but useful for dealing with // partially optimized code. - return I; + return BaseDefiningValueResult(I, true); // Due to inheritance, this must be _after_ the global variable and undef // checks - if (Constant *Con = dyn_cast<Constant>(I)) { + if (isa<Constant>(I)) { assert(!isa<GlobalVariable>(I) && !isa<UndefValue>(I) && "order of checks wrong!"); - // Note: Finding a constant base for something marked for relocation - // doesn't really make sense.
The most likely case is either a) some - // screwed up the address space usage or b) your validating against - // compiled C++ code w/o the proper separation. The only real exception - // is a null pointer. You could have generic code written to index of - // off a potentially null value and have proven it null. We also use - // null pointers in dead paths of relocation phis (which we might later - // want to find a base pointer for). - assert(isa<ConstantPointerNull>(Con) && - "null is the only case which makes sense"); - return Con; + // Note: Even for frontends which don't have constant references, we can + // see constants appearing after optimizations. A simple example is + // specialization of an address computation on null feeding into a merge + // point where the actual use of the now-constant input is protected by + // another null check. (e.g. test4 in constants.ll) + return BaseDefiningValueResult(I, true); } if (CastInst *CI = dyn_cast<CastInst>(I)) { Value *Def = CI->stripPointerCasts(); + // If stripping pointer casts changes the address space there is an + // addrspacecast in between. + assert(cast<PointerType>(Def->getType())->getAddressSpace() == + cast<PointerType>(CI->getType())->getAddressSpace() && + "unsupported addrspacecast"); // If we find a cast instruction here, it means we've found a cast which is // not simply a pointer cast (i.e. an inttoptr). We don't know how to // handle int->ptr conversion. @@ -472,7 +467,9 @@ static Value *findBaseDefiningValue(Value *I) { } if (isa<LoadInst>(I)) - return I; // The value loaded is an gc base itself + // The value loaded is a gc base itself + return BaseDefiningValueResult(I, true); + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) // The base of this GEP is the base @@ -480,14 +477,11 @@ static Value *findBaseDefiningValue(Value *I) { if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { - case Intrinsic::experimental_gc_result_ptr: default: // fall through to general call handling break; case Intrinsic::experimental_gc_statepoint: - case Intrinsic::experimental_gc_result_float: - case Intrinsic::experimental_gc_result_int: - llvm_unreachable("these don't produce pointers"); + llvm_unreachable("statepoints don't produce pointers"); case Intrinsic::experimental_gc_relocate: { // Rerunning safepoint insertion after safepoints are already // inserted is not supported. It could probably be made to work, @@ -506,17 +500,17 @@ static Value *findBaseDefiningValue(Value *I) { // pointers. This should probably be generalized via attributes to support // both source language and internal functions. if (isa<CallInst>(I) || isa<InvokeInst>(I)) - return I; + return BaseDefiningValueResult(I, true); // I have absolutely no idea how to implement this part yet. It's not - // neccessarily hard, I just haven't really looked at it yet. + // necessarily hard, I just haven't really looked at it yet. assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented"); if (isa<AtomicCmpXchgInst>(I)) // A CAS is effectively an atomic store and load combined under a // predicate. From the perspective of base pointers, we just treat it // like a load. - return I; + return BaseDefiningValueResult(I, true); assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are " "binary ops which don't apply to pointers"); @@ -525,34 +519,41 @@ static Value *findBaseDefiningValue(Value *I) { // stack, but in either case, this is simply a field load.
As a result, // this is a defining definition of the base just like a load is. if (isa<ExtractValueInst>(I)) - return I; + return BaseDefiningValueResult(I, true); // We should never see an insert vector since that would require we be // tracing back a struct value not a pointer value. assert(!isa<InsertValueInst>(I) && "Base pointer for a struct is meaningless"); + // An extractelement produces a base result exactly when its input does. + // We may need to insert a parallel instruction to extract the appropriate + // element out of the base vector corresponding to the input. Given this, + // it's analogous to the phi and select case even though it's not a merge. + if (isa<ExtractElementInst>(I)) + // Note: There are a lot of obvious peephole cases here. These are deliberately + // handled after the main base pointer inference algorithm to make writing + // test cases to exercise that code easier. + return BaseDefiningValueResult(I, false); + // The last two cases here don't return a base pointer. Instead, they - // return a value which dynamically selects from amoung several base + // return a value which dynamically selects from among several base // derived pointers (each with its own base potentially). It's the job of // the caller to resolve these. assert((isa<SelectInst>(I) || isa<PHINode>(I)) && "missing instruction case in findBaseDefiningValue"); - return I; + return BaseDefiningValueResult(I, false); } /// Returns the base defining value for this value. static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) { Value *&Cached = Cache[I]; if (!Cached) { - Cached = findBaseDefiningValue(I); + Cached = findBaseDefiningValue(I).BDV; + DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> " + << Cached->getName() << "\n"); } assert(Cache[I] != nullptr); - - if (TraceLSP) { - dbgs() << "fBDV-cached: " << I->getName() << " -> " << Cached->getName() - << "\n"; - } return Cached; } @@ -572,7 +573,9 @@ static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) { /// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV, /// is it known to be a base pointer? Or do we need to continue searching. static bool isKnownBaseResult(Value *V) { - if (!isa<PHINode>(V) && !isa<SelectInst>(V)) { + if (!isa<PHINode>(V) && !isa<SelectInst>(V) && + !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) && + !isa<ShuffleVectorInst>(V)) { // no recursion possible return true; } @@ -587,17 +590,19 @@ static bool isKnownBaseResult(Value *V) { return false; } -// TODO: find a better name for this namespace { -class PhiState { +/// Models the state of a single base defining value in the findBasePointer +/// algorithm for determining where a new instruction is needed to propagate +/// the base of this BDV.
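The Unknown/Base/Conflict states introduced here form a three-point lattice. What follows is a minimal, self-contained sketch of the meet over that lattice; the names are illustrative only, and the patch's own implementation is MeetBDVStates::pureMeet below.

#include <cassert>

enum class Tag { Unknown, Base, Conflict };
struct LatticeState {
  Tag T;
  const void *BasePtr; // stands in for the Value* base; meaningful only when T == Tag::Base
};

static LatticeState meet(LatticeState A, LatticeState B) {
  if (A.T == Tag::Unknown) return B;  // Unknown is the TOP element
  if (B.T == Tag::Unknown) return A;
  if (A.T == Tag::Conflict) return A; // Conflict is the BOTTOM element
  if (B.T == Tag::Conflict) return B;
  // Both are Base: agreeing bases stay Base, disagreeing bases fall to Conflict.
  return A.BasePtr == B.BasePtr ? A : LatticeState{Tag::Conflict, nullptr};
}

int main() {
  const int X = 0, Y = 0; // two distinct dummy "bases"
  LatticeState BX{Tag::Base, &X}, BY{Tag::Base, &Y}, U{Tag::Unknown, nullptr};
  assert(meet(U, BX).T == Tag::Base);      // TOP met with anything yields that thing
  assert(meet(BX, BY).T == Tag::Conflict); // disagreeing bases yield Conflict
  return 0;
}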
+class BDVState { public: enum Status { Unknown, Base, Conflict }; - PhiState(Status s, Value *b = nullptr) : status(s), base(b) { + BDVState(Status s, Value *b = nullptr) : status(s), base(b) { assert(status != Base || b); } - PhiState(Value *b) : status(Base), base(b) {} - PhiState() : status(Unknown), base(nullptr) {} + explicit BDVState(Value *b) : status(Base), base(b) {} + BDVState() : status(Unknown), base(nullptr) {} Status getStatus() const { return status; } Value *getBase() const { return base; } @@ -606,72 +611,80 @@ public: bool isUnknown() const { return getStatus() == Unknown; } bool isConflict() const { return getStatus() == Conflict; } - bool operator==(const PhiState &other) const { + bool operator==(const BDVState &other) const { return base == other.base && status == other.status; } - bool operator!=(const PhiState &other) const { return !(*this == other); } + bool operator!=(const BDVState &other) const { return !(*this == other); } - void dump() { - errs() << status << " (" << base << " - " - << (base ? base->getName() : "nullptr") << "): "; + LLVM_DUMP_METHOD + void dump() const { print(dbgs()); dbgs() << '\n'; } + + void print(raw_ostream &OS) const { + switch (status) { + case Unknown: + OS << "U"; + break; + case Base: + OS << "B"; + break; + case Conflict: + OS << "C"; + break; + }; + OS << " (" << base << " - " + << (base ? base->getName() : "nullptr") << "): "; } private: Status status; - Value *base; // non null only if status == base + AssertingVH<Value> base; // non null only if status == base }; +} -typedef DenseMap<Value *, PhiState> ConflictStateMapTy; -// Values of type PhiState form a lattice, and this is a helper +#ifndef NDEBUG +static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) { + State.print(OS); + return OS; +} +#endif + +namespace { +// Values of type BDVState form a lattice, and this is a helper // class that implements the meet operation. The meat of the meet -// operation is implemented in MeetPhiStates::pureMeet -class MeetPhiStates { +// operation is implemented in MeetBDVStates::pureMeet +class MeetBDVStates { public: - // phiStates is a mapping from PHINodes and SelectInst's to PhiStates. - explicit MeetPhiStates(const ConflictStateMapTy &phiStates) - : phiStates(phiStates) {} - - // Destructively meet the current result with the base V. V can - // either be a merge instruction (SelectInst / PHINode), in which - // case its status is looked up in the phiStates map; or a regular - // SSA value, in which case it is assumed to be a base. - void meetWith(Value *V) { - PhiState otherState = getStateForBDV(V); - assert((MeetPhiStates::pureMeet(otherState, currentResult) == - MeetPhiStates::pureMeet(currentResult, otherState)) && - "math is wrong: meet does not commute!"); - currentResult = MeetPhiStates::pureMeet(otherState, currentResult); + /// Initializes the currentResult to the TOP state so that it can be met with + /// any other state to produce that state. + MeetBDVStates() {} + + // Destructively meet the current result with the given BDVState + void meetWith(BDVState otherState) { + currentResult = meet(otherState, currentResult); } - PhiState getResult() const { return currentResult; } + BDVState getResult() const { return currentResult; } private: - const ConflictStateMapTy &phiStates; - PhiState currentResult; - - /// Return a phi state for a base defining value. We'll generate a new - /// base state for known bases and expect to find a cached state otherwise
We'll generate a new - /// base state for known bases and expect to find a cached state otherwise - PhiState getStateForBDV(Value *baseValue) { - if (isKnownBaseResult(baseValue)) { - return PhiState(baseValue); - } else { - return lookupFromMap(baseValue); - } - } + BDVState currentResult; - PhiState lookupFromMap(Value *V) { - auto I = phiStates.find(V); - assert(I != phiStates.end() && "lookup failed!"); - return I->second; + /// Perform a meet operation on two elements of the BDVState lattice. + static BDVState meet(BDVState LHS, BDVState RHS) { + assert((pureMeet(LHS, RHS) == pureMeet(RHS, LHS)) && + "math is wrong: meet does not commute!"); + BDVState Result = pureMeet(LHS, RHS); + DEBUG(dbgs() << "meet of " << LHS << " with " << RHS + << " produced " << Result << "\n"); + return Result; } - static PhiState pureMeet(const PhiState &stateA, const PhiState &stateB) { + static BDVState pureMeet(const BDVState &stateA, const BDVState &stateB) { switch (stateA.getStatus()) { - case PhiState::Unknown: + case BDVState::Unknown: return stateB; - case PhiState::Base: + case BDVState::Base: assert(stateA.getBase() && "can't be null"); if (stateB.isUnknown()) return stateA; @@ -681,18 +694,20 @@ private: assert(stateA == stateB && "equality broken!"); return stateA; } - return PhiState(PhiState::Conflict); + return BDVState(BDVState::Conflict); } assert(stateB.isConflict() && "only three states!"); - return PhiState(PhiState::Conflict); + return BDVState(BDVState::Conflict); - case PhiState::Conflict: + case BDVState::Conflict: return stateA; } llvm_unreachable("only three states!"); } }; } + + /// For a given value or instruction, figure out what base ptr it's derived /// from. For gc objects, this is simply itself. On success, returns a value /// which is the base pointer. (This is reliable and can be used for @@ -723,171 +738,252 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // // Note: A simpler form of this would be to add the conflict form of all // PHIs without running the optimistic algorithm. This would be - // analougous to pessimistic data flow and would likely lead to an + // analogous to pessimistic data flow and would likely lead to an // overall worse solution. - ConflictStateMapTy states; - states[def] = PhiState(); - // Recursively fill in all phis & selects reachable from the initial one - // for which we don't already know a definite base value for - // TODO: This should be rewritten with a worklist - bool done = false; - while (!done) { - done = true; - // Since we're adding elements to 'states' as we run, we can't keep - // iterators into the set. 
- SmallVector<Value *, 16> Keys; - Keys.reserve(states.size()); - for (auto Pair : states) { - Value *V = Pair.first; - Keys.push_back(V); - } - for (Value *v : Keys) { - assert(!isKnownBaseResult(v) && "why did it get added?"); - if (PHINode *phi = dyn_cast<PHINode>(v)) { - assert(phi->getNumIncomingValues() > 0 && - "zero input phis are illegal"); - for (Value *InVal : phi->incoming_values()) { - Value *local = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(local) && states.find(local) == states.end()) { - states[local] = PhiState(); - done = false; - } - } - } else if (SelectInst *sel = dyn_cast<SelectInst>(v)) { - Value *local = findBaseOrBDV(sel->getTrueValue(), cache); - if (!isKnownBaseResult(local) && states.find(local) == states.end()) { - states[local] = PhiState(); - done = false; - } - local = findBaseOrBDV(sel->getFalseValue(), cache); - if (!isKnownBaseResult(local) && states.find(local) == states.end()) { - states[local] = PhiState(); - done = false; - } +#ifndef NDEBUG + auto isExpectedBDVType = [](Value *BDV) { + return isa<PHINode>(BDV) || isa<SelectInst>(BDV) || + isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV); + }; +#endif + + // Once populated, will contain a mapping from each potentially non-base BDV + // to a lattice value (described above) which corresponds to that BDV. + // We use the order of insertion (DFS over the def/use graph) to provide a + // stable deterministic ordering for visiting DenseMaps (which are unordered) + // below. This is important for deterministic compilation. + MapVector<Value *, BDVState> States; + + // Recursively fill in all base defining values reachable from the initial + // one for which we don't already know a definite base value for + /* scope */ { + SmallVector<Value*, 16> Worklist; + Worklist.push_back(def); + States.insert(std::make_pair(def, BDVState())); + while (!Worklist.empty()) { + Value *Current = Worklist.pop_back_val(); + assert(!isKnownBaseResult(Current) && "why did it get added?"); + + auto visitIncomingValue = [&](Value *InVal) { + Value *Base = findBaseOrBDV(InVal, cache); + if (isKnownBaseResult(Base)) + // Known bases won't need new instructions introduced and can be + // ignored safely + return; + assert(isExpectedBDVType(Base) && "the only non-base values " + "we see should be base defining values"); + if (States.insert(std::make_pair(Base, BDVState())).second) + Worklist.push_back(Base); + }; + if (PHINode *Phi = dyn_cast<PHINode>(Current)) { + for (Value *InVal : Phi->incoming_values()) + visitIncomingValue(InVal); + } else if (SelectInst *Sel = dyn_cast<SelectInst>(Current)) { + visitIncomingValue(Sel->getTrueValue()); + visitIncomingValue(Sel->getFalseValue()); + } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) { + visitIncomingValue(EE->getVectorOperand()); + } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) { + visitIncomingValue(IE->getOperand(0)); // vector operand + visitIncomingValue(IE->getOperand(1)); // scalar operand + } else { + // There is one known class of instructions we know we don't handle. 
+ assert(isa<ShuffleVectorInst>(Current)); + llvm_unreachable("unimplemented instruction case"); } } } - if (TraceLSP) { - errs() << "States after initialization:\n"; - for (auto Pair : states) { - Instruction *v = cast<Instruction>(Pair.first); - PhiState state = Pair.second; - state.dump(); - v->dump(); - } +#ifndef NDEBUG + DEBUG(dbgs() << "States after initialization:\n"); + for (auto Pair : States) { + DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n"); } +#endif - // TODO: come back and revisit the state transitions around inputs which - // have reached conflict state. The current version seems too conservative. + // Return a phi state for a base defining value. We'll generate a new + // base state for known bases and expect to find a cached state otherwise. + auto getStateForBDV = [&](Value *baseValue) { + if (isKnownBaseResult(baseValue)) + return BDVState(baseValue); + auto I = States.find(baseValue); + assert(I != States.end() && "lookup failed!"); + return I->second; + }; bool progress = true; while (progress) { #ifndef NDEBUG - size_t oldSize = states.size(); + const size_t oldSize = States.size(); #endif progress = false; - // We're only changing keys in this loop, thus safe to keep iterators - for (auto Pair : states) { - MeetPhiStates calculateMeet(states); - Value *v = Pair.first; - assert(!isKnownBaseResult(v) && "why did it get added?"); - if (SelectInst *select = dyn_cast<SelectInst>(v)) { - calculateMeet.meetWith(findBaseOrBDV(select->getTrueValue(), cache)); - calculateMeet.meetWith(findBaseOrBDV(select->getFalseValue(), cache)); - } else - for (Value *Val : cast<PHINode>(v)->incoming_values()) - calculateMeet.meetWith(findBaseOrBDV(Val, cache)); - - PhiState oldState = states[v]; - PhiState newState = calculateMeet.getResult(); + // We're only changing values in this loop, thus safe to keep iterators. + // Since this is computing a fixed point, the order of visit does not + // affect the result. TODO: We could use a worklist here and make this run + // much faster. + for (auto Pair : States) { + Value *BDV = Pair.first; + assert(!isKnownBaseResult(BDV) && "why did it get added?"); + + // Given an input value for the current instruction, return a BDVState + // instance which represents the BDV of that value. + auto getStateForInput = [&](Value *V) mutable { + Value *BDV = findBaseOrBDV(V, cache); + return getStateForBDV(BDV); + }; + + MeetBDVStates calculateMeet; + if (SelectInst *select = dyn_cast<SelectInst>(BDV)) { + calculateMeet.meetWith(getStateForInput(select->getTrueValue())); + calculateMeet.meetWith(getStateForInput(select->getFalseValue())); + } else if (PHINode *Phi = dyn_cast<PHINode>(BDV)) { + for (Value *Val : Phi->incoming_values()) + calculateMeet.meetWith(getStateForInput(Val)); + } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) { + // The 'meet' for an extractelement is slightly trivial, but it's still + // useful in that it drives us to conflict if our input is. + calculateMeet.meetWith(getStateForInput(EE->getVectorOperand())); + } else { + // Given there's an inherent type mismatch between the operands, this will + // *always* produce Conflict.
+ auto *IE = cast<InsertElementInst>(BDV); + calculateMeet.meetWith(getStateForInput(IE->getOperand(0))); + calculateMeet.meetWith(getStateForInput(IE->getOperand(1))); + } + + BDVState oldState = States[BDV]; + BDVState newState = calculateMeet.getResult(); if (oldState != newState) { progress = true; - states[v] = newState; + States[BDV] = newState; } } - assert(oldSize <= states.size()); - assert(oldSize == states.size() || progress); + assert(oldSize == States.size() && + "fixed point shouldn't be adding any new nodes to state"); } - if (TraceLSP) { - errs() << "States after meet iteration:\n"; - for (auto Pair : states) { - Instruction *v = cast<Instruction>(Pair.first); - PhiState state = Pair.second; - state.dump(); - v->dump(); - } +#ifndef NDEBUG + DEBUG(dbgs() << "States after meet iteration:\n"); + for (auto Pair : States) { + DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n"); } - +#endif + // Insert Phis for all conflicts - // We want to keep naming deterministic in the loop that follows, so - // sort the keys before iteration. This is useful in allowing us to - // write stable tests. Note that there is no invalidation issue here. - SmallVector<Value *, 16> Keys; - Keys.reserve(states.size()); - for (auto Pair : states) { - Value *V = Pair.first; - Keys.push_back(V); - } - std::sort(Keys.begin(), Keys.end(), order_by_name); // TODO: adjust naming patterns to avoid this order of iteration dependency - for (Value *V : Keys) { - Instruction *v = cast<Instruction>(V); - PhiState state = states[V]; - assert(!isKnownBaseResult(v) && "why did it get added?"); - assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); - if (!state.isConflict()) + for (auto Pair : States) { + Instruction *I = cast<Instruction>(Pair.first); + BDVState State = Pair.second; + assert(!isKnownBaseResult(I) && "why did it get added?"); + assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); + + // extractelement instructions are a bit special in that we may need to + // insert an extract even when we know an exact base for the instruction. + // The problem is that we need to convert from a vector base to a scalar + // base for the particular index we're interested in. + if (State.isBase() && isa<ExtractElementInst>(I) && + isa<VectorType>(State.getBase()->getType())) { + auto *EE = cast<ExtractElementInst>(I); + // TODO: In many cases, the new instruction is just EE itself. We should + // exploit this, but can't do it here since it would break the invariant + // about the BDV not being known to be a base. + auto *BaseInst = ExtractElementInst::Create(State.getBase(), + EE->getIndexOperand(), + "base_ee", EE); + BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {})); + States[I] = BDVState(BDVState::Base, BaseInst); + } + + // Since we're joining a vector and scalar base, they can never be the + // same. As a result, we should always see insert element having reached + // the conflict state.
+ if (isa<InsertElementInst>(I)) { + assert(State.isConflict()); + } + + if (!State.isConflict()) continue; - if (isa<PHINode>(v)) { - int num_preds = - std::distance(pred_begin(v->getParent()), pred_end(v->getParent())); - assert(num_preds > 0 && "how did we reach here"); - PHINode *phi = PHINode::Create(v->getType(), num_preds, "base_phi", v); - // Add metadata marking this as a base value - auto *const_1 = ConstantInt::get( - Type::getInt32Ty( - v->getParent()->getParent()->getParent()->getContext()), - 1); - auto MDConst = ConstantAsMetadata::get(const_1); - MDNode *md = MDNode::get( - v->getParent()->getParent()->getParent()->getContext(), MDConst); - phi->setMetadata("is_base_value", md); - states[v] = PhiState(PhiState::Conflict, phi); + /// Create and insert a new instruction which will represent the base of + /// the given instruction 'I'. + auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* { + if (isa<PHINode>(I)) { + BasicBlock *BB = I->getParent(); + int NumPreds = std::distance(pred_begin(BB), pred_end(BB)); + assert(NumPreds > 0 && "how did we reach here"); + std::string Name = suffixed_name_or(I, ".base", "base_phi"); + return PHINode::Create(I->getType(), NumPreds, Name, I); + } else if (SelectInst *Sel = dyn_cast<SelectInst>(I)) { + // The undef will be replaced later + UndefValue *Undef = UndefValue::get(Sel->getType()); + std::string Name = suffixed_name_or(I, ".base", "base_select"); + return SelectInst::Create(Sel->getCondition(), Undef, + Undef, Name, Sel); + } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) { + UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType()); + std::string Name = suffixed_name_or(I, ".base", "base_ee"); + return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name, + EE); + } else { + auto *IE = cast<InsertElementInst>(I); + UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType()); + UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType()); + std::string Name = suffixed_name_or(I, ".base", "base_ie"); + return InsertElementInst::Create(VecUndef, ScalarUndef, + IE->getOperand(2), Name, IE); + } + + }; + Instruction *BaseInst = MakeBaseInstPlaceholder(I); + // Add metadata marking this as a base value + BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {})); + States[I] = BDVState(BDVState::Conflict, BaseInst); + } + + // Returns an instruction which produces the base pointer for a given + // instruction. The instruction is assumed to be an input to one of the BDVs + // seen in the inference algorithm above. As such, we must either already + // know its base defining value is a base, or have inserted a new + // instruction to propagate the base of its BDV and have entered that newly + // introduced instruction into the state table. In either case, we are + // assured to be able to determine an instruction which produces its base + // pointer.
+ auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) { + Value *BDV = findBaseOrBDV(Input, cache); + Value *Base = nullptr; + if (isKnownBaseResult(BDV)) { + Base = BDV; } else { - SelectInst *sel = cast<SelectInst>(v); - // The undef will be replaced later - UndefValue *undef = UndefValue::get(sel->getType()); - SelectInst *basesel = SelectInst::Create(sel->getCondition(), undef, - undef, "base_select", sel); - // Add metadata marking this as a base value - auto *const_1 = ConstantInt::get( - Type::getInt32Ty( - v->getParent()->getParent()->getParent()->getContext()), - 1); - auto MDConst = ConstantAsMetadata::get(const_1); - MDNode *md = MDNode::get( - v->getParent()->getParent()->getParent()->getContext(), MDConst); - basesel->setMetadata("is_base_value", md); - states[v] = PhiState(PhiState::Conflict, basesel); + // Either conflict or base. + assert(States.count(BDV)); + Base = States[BDV].getBase(); } - } + assert(Base && "can't be null"); + // The cast is needed since base traversal may strip away bitcasts + if (Base->getType() != Input->getType() && + InsertPt) { + Base = new BitCastInst(Base, Input->getType(), "cast", + InsertPt); + } + return Base; + }; - // Fixup all the inputs of the new PHIs - for (auto Pair : states) { - Instruction *v = cast<Instruction>(Pair.first); - PhiState state = Pair.second; + // Fixup all the inputs of the new PHIs. Visit order needs to be + // deterministic and predictable because we're naming newly created + // instructions. + for (auto Pair : States) { + Instruction *BDV = cast<Instruction>(Pair.first); + BDVState State = Pair.second; - assert(!isKnownBaseResult(v) && "why did it get added?"); - assert(!state.isUnknown() && "Optimistic algorithm didn't complete!"); - if (!state.isConflict()) + assert(!isKnownBaseResult(BDV) && "why did it get added?"); + assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); + if (!State.isConflict()) continue; - if (PHINode *basephi = dyn_cast<PHINode>(state.getBase())) { - PHINode *phi = cast<PHINode>(v); + if (PHINode *basephi = dyn_cast<PHINode>(State.getBase())) { + PHINode *phi = cast<PHINode>(BDV); unsigned NumPHIValues = phi->getNumIncomingValues(); for (unsigned i = 0; i < NumPHIValues; i++) { Value *InVal = phi->getIncomingValue(i); @@ -906,104 +1002,145 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { if (blockIndex != -1) { Value *oldBase = basephi->getIncomingValue(blockIndex); basephi->addIncoming(oldBase, InBB); + #ifndef NDEBUG - Value *base = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(base)) { - // Either conflict or base. - assert(states.count(base)); - base = states[base].getBase(); - assert(base != nullptr && "unknown PhiState!"); - } - - // In essense this assert states: the only way two + Value *Base = getBaseForInput(InVal, nullptr); + // In essence this assert states: the only way two // values incoming from the same basic block may be // different is by being different bitcasts of the same // value. A cleanup that remains TODO is changing // findBaseOrBDV to return an llvm::Value of the correct // type (and still remain pure). This will remove the // need to add bitcasts. 
- assert(base->stripPointerCasts() == oldBase->stripPointerCasts() && + assert(Base->stripPointerCasts() == oldBase->stripPointerCasts() && "sanity -- findBaseOrBDV should be pure!"); #endif continue; } - // Find either the defining value for the PHI or the normal base for - // a non-phi node - Value *base = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(base)) { - // Either conflict or base. - assert(states.count(base)); - base = states[base].getBase(); - assert(base != nullptr && "unknown PhiState!"); - } - assert(base && "can't be null"); - // Must use original input BB since base may not be Instruction - // The cast is needed since base traversal may strip away bitcasts - if (base->getType() != basephi->getType()) { - base = new BitCastInst(base, basephi->getType(), "cast", - InBB->getTerminator()); - } - basephi->addIncoming(base, InBB); + // Find the instruction which produces the base for each input. We may + // need to insert a bitcast in the incoming block. + // TODO: Need to split critical edges if insertion is needed + Value *Base = getBaseForInput(InVal, InBB->getTerminator()); + basephi->addIncoming(Base, InBB); } assert(basephi->getNumIncomingValues() == NumPHIValues); - } else { - SelectInst *basesel = cast<SelectInst>(state.getBase()); - SelectInst *sel = cast<SelectInst>(v); + } else if (SelectInst *BaseSel = dyn_cast<SelectInst>(State.getBase())) { + SelectInst *Sel = cast<SelectInst>(BDV); // Operand 1 & 2 are true, false path respectively. TODO: refactor to // something more safe and less hacky. for (int i = 1; i <= 2; i++) { - Value *InVal = sel->getOperand(i); - // Find either the defining value for the PHI or the normal base for - // a non-phi node - Value *base = findBaseOrBDV(InVal, cache); - if (!isKnownBaseResult(base)) { - // Either conflict or base. - assert(states.count(base)); - base = states[base].getBase(); - assert(base != nullptr && "unknown PhiState!"); - } - assert(base && "can't be null"); - // Must use original input BB since base may not be Instruction - // The cast is needed since base traversal may strip away bitcasts - if (base->getType() != basesel->getType()) { - base = new BitCastInst(base, basesel->getType(), "cast", basesel); - } - basesel->setOperand(i, base); + Value *InVal = Sel->getOperand(i); + // Find the instruction which produces the base for each input. We may + // need to insert a bitcast. + Value *Base = getBaseForInput(InVal, BaseSel); + BaseSel->setOperand(i, Base); } + } else if (auto *BaseEE = dyn_cast<ExtractElementInst>(State.getBase())) { + Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand(); + // Find the instruction which produces the base for each input. We may + // need to insert a bitcast. + Value *Base = getBaseForInput(InVal, BaseEE); + BaseEE->setOperand(0, Base); + } else { + auto *BaseIE = cast<InsertElementInst>(State.getBase()); + auto *BdvIE = cast<InsertElementInst>(BDV); + auto UpdateOperand = [&](int OperandIdx) { + Value *InVal = BdvIE->getOperand(OperandIdx); + Value *Base = getBaseForInput(InVal, BaseIE); + BaseIE->setOperand(OperandIdx, Base); + }; + UpdateOperand(0); // vector operand + UpdateOperand(1); // scalar operand + } + + } + + // Now that we're done with the algorithm, see if we can optimize the + // results slightly by reducing the number of new instructions needed. + // Arguably, this should be integrated into the algorithm above, but + // doing it as a post-process step is easier to reason about for the moment.
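One hypothetical case this clean-up catches, with illustrative IR names that are not from the patch: when every input of a conflicting phi is already a base, the freshly inserted base phi is instruction-identical to the original and can be folded away.

// Original:  %p      = phi i8 addrspace(1)* [ %a, %x ], [ %b, %y ]
//            ; %a and %b are both known bases but differ, so %p is Conflict
// Inserted:  %p.base = phi i8 addrspace(1)* [ %a, %x ], [ %b, %y ]
// Since %p.base is isIdenticalTo(%p), the worklist below replaces all uses of
// %p.base with %p and erases it; SimplifyInstruction similarly collapses any
// inserted instruction that InstSimplify can fold, e.g. a phi whose incoming
// values are all the same.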
+ DenseMap<Value *, Value *> ReverseMap; + SmallPtrSet<Instruction *, 16> NewInsts; + SmallSetVector<AssertingVH<Instruction>, 16> Worklist; + // Note: We need to visit the states in a deterministic order. We use the + // Keys we sorted above for this purpose. Note that we are papering over a + // bigger problem with the algorithm above - its visit order is not + // deterministic. A larger change is needed to fix this. + for (auto Pair : States) { + auto *BDV = Pair.first; + auto State = Pair.second; + Value *Base = State.getBase(); + assert(BDV && Base); + assert(!isKnownBaseResult(BDV) && "why did it get added?"); + assert(isKnownBaseResult(Base) && + "must be something we 'know' is a base pointer"); + if (!State.isConflict()) + continue; + + ReverseMap[Base] = BDV; + if (auto *BaseI = dyn_cast<Instruction>(Base)) { + NewInsts.insert(BaseI); + Worklist.insert(BaseI); + } + } + auto ReplaceBaseInstWith = [&](Value *BDV, Instruction *BaseI, + Value *Replacement) { + // Add users which are new instructions (excluding self references) + for (User *U : BaseI->users()) + if (auto *UI = dyn_cast<Instruction>(U)) + if (NewInsts.count(UI) && UI != BaseI) + Worklist.insert(UI); + // Then do the actual replacement + NewInsts.erase(BaseI); + ReverseMap.erase(BaseI); + BaseI->replaceAllUsesWith(Replacement); + assert(States.count(BDV)); + assert(States[BDV].isConflict() && States[BDV].getBase() == BaseI); + States[BDV] = BDVState(BDVState::Conflict, Replacement); + BaseI->eraseFromParent(); + }; + const DataLayout &DL = cast<Instruction>(def)->getModule()->getDataLayout(); + while (!Worklist.empty()) { + Instruction *BaseI = Worklist.pop_back_val(); + assert(NewInsts.count(BaseI)); + Value *Bdv = ReverseMap[BaseI]; + if (auto *BdvI = dyn_cast<Instruction>(Bdv)) + if (BaseI->isIdenticalTo(BdvI)) { + DEBUG(dbgs() << "Identical Base: " << *BaseI << "\n"); + ReplaceBaseInstWith(Bdv, BaseI, Bdv); + continue; + } + if (Value *V = SimplifyInstruction(BaseI, DL)) { + DEBUG(dbgs() << "Base " << *BaseI << " simplified to " << *V << "\n"); + ReplaceBaseInstWith(Bdv, BaseI, V); + continue; } } // Cache all of our results so we can cheaply reuse them // NOTE: This is actually two caches: one of the base defining value // relation and one of the base pointer relation! FIXME - for (auto item : states) { - Value *v = item.first; - Value *base = item.second.getBase(); - assert(v && base); - assert(!isKnownBaseResult(v) && "why did it get added?"); - - if (TraceLSP) { - std::string fromstr = - cache.count(v) ? (cache[v]->hasName() ? cache[v]->getName() : "") - : "none"; - errs() << "Updating base value cache" - << " for: " << (v->hasName() ? v->getName() : "") - << " from: " << fromstr - << " to: " << (base->hasName() ? base->getName() : "") << "\n"; - } - - assert(isKnownBaseResult(base) && - "must be something we 'know' is a base pointer"); - if (cache.count(v)) { + for (auto Pair : States) { + auto *BDV = Pair.first; + Value *base = Pair.second.getBase(); + assert(BDV && base); + + std::string fromstr = cache.count(BDV) ?
cache[BDV]->getName() : "none"; + DEBUG(dbgs() << "Updating base value cache" + << " for: " << BDV->getName() + << " from: " << fromstr + << " to: " << base->getName() << "\n"); + + if (cache.count(BDV)) { // Once we transition from the BDV relation being stored in the cache to // the base relation being stored, it must be stable - assert((!isKnownBaseResult(cache[v]) || cache[v] == base) && + assert((!isKnownBaseResult(cache[BDV]) || cache[BDV] == base) && "base relation should be stable"); } - cache[v] = base; + cache[BDV] = base; } - assert(cache.find(def) != cache.end()); + assert(cache.count(def)); return cache[def]; } @@ -1024,7 +1161,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &cache) { // pointer was a base pointer. static void findBasePointers(const StatepointLiveSetTy &live, - DenseMap<llvm::Value *, llvm::Value *> &PointerToBase, + DenseMap<Value *, Value *> &PointerToBase, DominatorTree *DT, DefiningValueMapTy &DVCache) { // For the naming of values inserted to be deterministic - which makes for // much cleaner and more stable tests - we need to assign an order to the @@ -1043,7 +1180,7 @@ findBasePointers(const StatepointLiveSetTy &live, // If you see this trip and like to live really dangerously, the code should // be correct, just with idioms the verifier can't handle. You can try - // disabling the verifier at your own substaintial risk. + // disabling the verifier at your own substantial risk. assert(!isa<ConstantPointerNull>(base) && "the relocation code needs adjustment to handle the relocation of " "a null pointer constant without causing false positives in the " @@ -1056,8 +1193,8 @@ findBasePointers(const StatepointLiveSetTy &live, static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, const CallSite &CS, PartiallyConstructedSafepointRecord &result) { - DenseMap<llvm::Value *, llvm::Value *> PointerToBase; - findBasePointers(result.liveset, PointerToBase, &DT, DVCache); + DenseMap<Value *, Value *> PointerToBase; + findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache); if (PrintBasePointers) { // Note: Need to print these in a stable order since this is checked in @@ -1071,8 +1208,11 @@ static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache, std::sort(Temp.begin(), Temp.end(), order_by_name); for (Value *Ptr : Temp) { Value *Base = PointerToBase[Ptr]; - errs() << " derived %" << Ptr->getName() << " base %" << Base->getName() - << "\n"; + errs() << " derived "; + Ptr->printAsOperand(errs(), false); + errs() << " base "; + Base->printAsOperand(errs(), false); + errs() << "\n"; } } @@ -1086,10 +1226,10 @@ static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData, PartiallyConstructedSafepointRecord &result); static void recomputeLiveInValues( - Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate, + Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate, MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) { // TODO-PERF: reuse the original liveness, then simply run the dataflow - // again. The old values are still live and will help it stablize quickly. + // again. The old values are still live and will help it stabilize quickly.
GCPtrLivenessData RevisedLivenessData; computeLiveInValues(DT, F, RevisedLivenessData); for (size_t i = 0; i < records.size(); i++) { @@ -1099,69 +1239,66 @@ static void recomputeLiveInValues( } } -// When inserting gc.relocate calls, we need to ensure there are no uses -// of the original value between the gc.statepoint and the gc.relocate call. -// One case which can arise is a phi node starting one of the successor blocks. -// We also need to be able to insert the gc.relocates only on the path which -// goes through the statepoint. We might need to split an edge to make this -// possible. +// When inserting gc.relocate and gc.result calls, we need to ensure there are +// no uses of the original value / return value between the gc.statepoint and +// the gc.relocate / gc.result call. One case which can arise is a phi node +// starting one of the successor blocks. We also need to be able to insert the +// gc.relocates only on the path which goes through the statepoint. We might +// need to split an edge to make this possible. static BasicBlock * normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent, DominatorTree &DT) { BasicBlock *Ret = BB; - if (!BB->getUniquePredecessor()) { - Ret = SplitBlockPredecessors(BB, InvokeParent, "", nullptr, &DT); - } + if (!BB->getUniquePredecessor()) + Ret = SplitBlockPredecessors(BB, InvokeParent, "", &DT); - // Now that 'ret' has unique predecessor we can safely remove all phi nodes + // Now that 'Ret' has a unique predecessor we can safely remove all phi nodes // from it FoldSingleEntryPHINodes(Ret); - assert(!isa<PHINode>(Ret->begin())); + assert(!isa<PHINode>(Ret->begin()) && + "All PHI nodes should have been removed!"); - // At this point, we can safely insert a gc.relocate as the first instruction - // in Ret if needed. + // At this point, we can safely insert a gc.relocate or gc.result as the first + // instruction in Ret if needed. return Ret; } -static int find_index(ArrayRef<Value *> livevec, Value *val) { - auto itr = std::find(livevec.begin(), livevec.end(), val); - assert(livevec.end() != itr); - size_t index = std::distance(livevec.begin(), itr); - assert(index < livevec.size()); - return index; -} - -// Create new attribute set containing only attributes which can be transfered +// Create new attribute set containing only attributes which can be transferred // from original call to the safepoint. static AttributeSet legalizeCallAttributes(AttributeSet AS) { - AttributeSet ret; + AttributeSet Ret; for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) { - unsigned index = AS.getSlotIndex(Slot); + unsigned Index = AS.getSlotIndex(Slot); - if (index == AttributeSet::ReturnIndex || - index == AttributeSet::FunctionIndex) { + if (Index == AttributeSet::ReturnIndex || + Index == AttributeSet::FunctionIndex) { - for (auto it = AS.begin(Slot), it_end = AS.end(Slot); it != it_end; - ++it) { - Attribute attr = *it; + for (Attribute Attr : make_range(AS.begin(Slot), AS.end(Slot))) { // Do not allow certain attributes - just skip them // Safepoint cannot be read only or read none. - if (attr.hasAttribute(Attribute::ReadNone) || - attr.hasAttribute(Attribute::ReadOnly)) + if (Attr.hasAttribute(Attribute::ReadNone) || + Attr.hasAttribute(Attribute::ReadOnly)) + continue; + + // These attributes control the generation of the gc.statepoint call / + // invoke itself; once the gc.statepoint is in place, they're of no + // use.
+ if (Attr.hasAttribute("statepoint-num-patch-bytes") || + Attr.hasAttribute("statepoint-id")) continue; - ret = ret.addAttributes( - AS.getContext(), index, - AttributeSet::get(AS.getContext(), index, AttrBuilder(attr))); + Ret = Ret.addAttributes( + AS.getContext(), Index, + AttributeSet::get(AS.getContext(), Index, AttrBuilder(Attr))); } } // Just skip parameter attributes for now } - return ret; + return Ret; } /// Helper function to place all gc relocates necessary for the given @@ -1173,225 +1310,290 @@ static AttributeSet legalizeCallAttributes(AttributeSet AS) { /// statepointToken - statepoint instruction to which relocates should be /// bound. /// Builder - LLVM IR builder to be used to construct new calls. -static void CreateGCRelocates(ArrayRef<llvm::Value *> LiveVariables, +static void CreateGCRelocates(ArrayRef<Value *> LiveVariables, const int LiveStart, - ArrayRef<llvm::Value *> BasePtrs, + ArrayRef<Value *> BasePtrs, Instruction *StatepointToken, IRBuilder<> Builder) { - SmallVector<Instruction *, 64> NewDefs; - NewDefs.reserve(LiveVariables.size()); + if (LiveVariables.empty()) + return; - Module *M = StatepointToken->getParent()->getParent()->getParent(); + auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) { + auto ValIt = std::find(LiveVec.begin(), LiveVec.end(), Val); + assert(ValIt != LiveVec.end() && "Val not found in LiveVec!"); + size_t Index = std::distance(LiveVec.begin(), ValIt); + assert(Index < LiveVec.size() && "Bug in std::find?"); + return Index; + }; - for (unsigned i = 0; i < LiveVariables.size(); i++) { - // We generate a (potentially) unique declaration for every pointer type - // combination. This results is some blow up the function declarations in - // the IR, but removes the need for argument bitcasts which shrinks the IR - // greatly and makes it much more readable. - SmallVector<Type *, 1> Types; // one per 'any' type - // All gc_relocate are set to i8 addrspace(1)* type. This could help avoid - // cases where the actual value's type mangling is not supported by llvm. A - // bitcast is added later to convert gc_relocate to the actual value's type. - Types.push_back(Type::getInt8PtrTy(M->getContext(), 1)); - Value *GCRelocateDecl = Intrinsic::getDeclaration( - M, Intrinsic::experimental_gc_relocate, Types); + // All gc_relocate are set to i8 addrspace(1)* type. We originally generated + // unique declarations for each pointer type, but this proved problematic + // because the intrinsic mangling code is incomplete and fragile. Since + // we're moving towards a single unified pointer type anyways, we can just + // cast everything to an i8* of the right address space. A bitcast is added + // later to convert gc_relocate to the actual value's type.
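On hypothetical IR, the single-overload scheme just described produces relocations of roughly this shape; the names and the B/L indices are illustrative only, and the cast back to the original type is emitted later when uses are rewritten.

// Before: %obj has type %T addrspace(1)* and is live across the safepoint.
// After rewriting:
//   %tok       = ... gc.statepoint call replacing the original call site ...
//   %obj.reloc = call coldcc i8 addrspace(1)*
//                  @llvm.experimental.gc.relocate.p1i8(%tok, i32 B, i32 L)
//   %obj.cast  = bitcast i8 addrspace(1)* %obj.reloc to %T addrspace(1)*
// B and L are the LiveStart-relative positions of the base and derived
// pointer in the statepoint's gc argument list, which is exactly what the
// BaseIdx/LiveIdx computation below produces.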
+ Module *M = StatepointToken->getModule(); + auto AS = cast<PointerType>(LiveVariables[0]->getType())->getAddressSpace(); + Type *Types[] = {Type::getInt8PtrTy(M->getContext(), AS)}; + Value *GCRelocateDecl = + Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, Types); + for (unsigned i = 0; i < LiveVariables.size(); i++) { // Generate the gc.relocate call and save the result Value *BaseIdx = - ConstantInt::get(Type::getInt32Ty(M->getContext()), - LiveStart + find_index(LiveVariables, BasePtrs[i])); - Value *LiveIdx = ConstantInt::get( - Type::getInt32Ty(M->getContext()), - LiveStart + find_index(LiveVariables, LiveVariables[i])); + Builder.getInt32(LiveStart + FindIndex(LiveVariables, BasePtrs[i])); + Value *LiveIdx = Builder.getInt32(LiveStart + i); // only specify a debug name if we can give a useful one - Value *Reloc = Builder.CreateCall( + CallInst *Reloc = Builder.CreateCall( GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx}, - LiveVariables[i]->hasName() ? LiveVariables[i]->getName() + ".relocated" - : ""); + suffixed_name_or(LiveVariables[i], ".relocated", "")); // Trick CodeGen into thinking there are lots of free registers at this // fake call. - cast<CallInst>(Reloc)->setCallingConv(CallingConv::Cold); + Reloc->setCallingConv(CallingConv::Cold); + } +} - NewDefs.push_back(cast<Instruction>(Reloc)); +namespace { + +/// This struct is used to defer RAUWs and `eraseFromParent`s. Using this +/// avoids having to worry about keeping around dangling pointers to Values. +class DeferredReplacement { + AssertingVH<Instruction> Old; + AssertingVH<Instruction> New; + +public: + explicit DeferredReplacement(Instruction *Old, Instruction *New) : + Old(Old), New(New) { + assert(Old != New && "Not allowed!"); } - assert(NewDefs.size() == LiveVariables.size() && - "missing or extra redefinition at safepoint"); + + /// Does the task represented by this instance. + void doReplacement() { + Instruction *OldI = Old; + Instruction *NewI = New; + + assert(OldI != NewI && "Disallowed at construction?!"); + + Old = nullptr; + New = nullptr; + + if (NewI) + OldI->replaceAllUsesWith(NewI); + OldI->eraseFromParent(); + } +}; } static void -makeStatepointExplicitImpl(const CallSite &CS, /* to replace */ - const SmallVectorImpl<llvm::Value *> &basePtrs, - const SmallVectorImpl<llvm::Value *> &liveVariables, - Pass *P, - PartiallyConstructedSafepointRecord &result) { - assert(basePtrs.size() == liveVariables.size()); - assert(isStatepoint(CS) && +makeStatepointExplicitImpl(const CallSite CS, /* to replace */ - const SmallVectorImpl<Value *> &BasePtrs, + const SmallVectorImpl<Value *> &BasePtrs, + const SmallVectorImpl<Value *> &LiveVariables, + PartiallyConstructedSafepointRecord &Result, + std::vector<DeferredReplacement> &Replacements) { + assert(BasePtrs.size() == LiveVariables.size()); + assert((UseDeoptBundles || isStatepoint(CS)) && "This method expects to be rewriting a statepoint"); - BasicBlock *BB = CS.getInstruction()->getParent(); - assert(BB); - Function *F = BB->getParent(); - assert(F && "must be set"); - Module *M = F->getParent(); - (void)M; - assert(M && "must be set"); - - // We're not changing the function signature of the statepoint since the gc - // arguments go into the var args section. - Function *gc_statepoint_decl = CS.getCalledFunction(); - // Then go ahead and use the builder to actually do the inserts. We insert // immediately before the previous instruction under the assumption that all // arguments will be available here. We can't insert afterwards since we may // be replacing a terminator.
- Instruction *insertBefore = CS.getInstruction(); - IRBuilder<> Builder(insertBefore); - // Copy all of the arguments from the original statepoint - this includes the - // target, call args, and deopt args - SmallVector<llvm::Value *, 64> args; - args.insert(args.end(), CS.arg_begin(), CS.arg_end()); - // TODO: Clear the 'needs rewrite' flag - - // add all the pointers to be relocated (gc arguments) - // Capture the start of the live variable list for use in the gc_relocates - const int live_start = args.size(); - args.insert(args.end(), liveVariables.begin(), liveVariables.end()); + Instruction *InsertBefore = CS.getInstruction(); + IRBuilder<> Builder(InsertBefore); + + ArrayRef<Value *> GCArgs(LiveVariables); + uint64_t StatepointID = 0xABCDEF00; + uint32_t NumPatchBytes = 0; + uint32_t Flags = uint32_t(StatepointFlags::None); + + ArrayRef<Use> CallArgs; + ArrayRef<Use> DeoptArgs; + ArrayRef<Use> TransitionArgs; + + Value *CallTarget = nullptr; + + if (UseDeoptBundles) { + CallArgs = {CS.arg_begin(), CS.arg_end()}; + DeoptArgs = GetDeoptBundleOperands(CS); + // TODO: we don't fill in TransitionArgs or Flags in this branch, but we + // could have an operand bundle for that too. + AttributeSet OriginalAttrs = CS.getAttributes(); + + Attribute AttrID = OriginalAttrs.getAttribute(AttributeSet::FunctionIndex, + "statepoint-id"); + if (AttrID.isStringAttribute()) + AttrID.getValueAsString().getAsInteger(10, StatepointID); + + Attribute AttrNumPatchBytes = OriginalAttrs.getAttribute( + AttributeSet::FunctionIndex, "statepoint-num-patch-bytes"); + if (AttrNumPatchBytes.isStringAttribute()) + AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes); + + CallTarget = CS.getCalledValue(); + } else { + // This branch will be gone soon, and we will soon only support the + // UseDeoptBundles == true configuration. + Statepoint OldSP(CS); + StatepointID = OldSP.getID(); + NumPatchBytes = OldSP.getNumPatchBytes(); + Flags = OldSP.getFlags(); + + CallArgs = {OldSP.arg_begin(), OldSP.arg_end()}; + DeoptArgs = {OldSP.vm_state_begin(), OldSP.vm_state_end()}; + TransitionArgs = {OldSP.gc_transition_args_begin(), + OldSP.gc_transition_args_end()}; + CallTarget = OldSP.getCalledValue(); + } // Create the statepoint given all the arguments - Instruction *token = nullptr; - AttributeSet return_attributes; + Instruction *Token = nullptr; + AttributeSet ReturnAttrs; if (CS.isCall()) { - CallInst *toReplace = cast<CallInst>(CS.getInstruction()); - CallInst *call = - Builder.CreateCall(gc_statepoint_decl, args, "safepoint_token"); - call->setTailCall(toReplace->isTailCall()); - call->setCallingConv(toReplace->getCallingConv()); + CallInst *ToReplace = cast<CallInst>(CS.getInstruction()); + CallInst *Call = Builder.CreateGCStatepointCall( + StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs, + TransitionArgs, DeoptArgs, GCArgs, "safepoint_token"); + + Call->setTailCall(ToReplace->isTailCall()); + Call->setCallingConv(ToReplace->getCallingConv()); // Currently we will fail on parameter attributes and on certain // function attributes. - AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes()); - // In case if we can handle this set of sttributes - set up function attrs + AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes()); + // In case we can handle this set of attributes - set up function attrs // directly on statepoint and return attrs later for gc_result intrinsic.
-  call->setAttributes(new_attrs.getFnAttributes());
-  return_attributes = new_attrs.getRetAttributes();
+    Call->setAttributes(NewAttrs.getFnAttributes());
+    ReturnAttrs = NewAttrs.getRetAttributes();

-  token = call;
+    Token = Call;

     // Put the following gc_result and gc_relocate calls immediately after
     // the old call (which we're about to delete)
-    BasicBlock::iterator next(toReplace);
-    assert(BB->end() != next && "not a terminator, must have next");
-    next++;
-    Instruction *IP = &*(next);
-    Builder.SetInsertPoint(IP);
-    Builder.SetCurrentDebugLocation(IP->getDebugLoc());
-
+    assert(ToReplace->getNextNode() && "Not a terminator, must have next!");
+    Builder.SetInsertPoint(ToReplace->getNextNode());
+    Builder.SetCurrentDebugLocation(ToReplace->getNextNode()->getDebugLoc());
   } else {
-    InvokeInst *toReplace = cast<InvokeInst>(CS.getInstruction());
+    InvokeInst *ToReplace = cast<InvokeInst>(CS.getInstruction());

     // Insert the new invoke into the old block.  We'll remove the old one in a
     // moment at which point this will become the new terminator for the
     // original block.
-    InvokeInst *invoke = InvokeInst::Create(
-        gc_statepoint_decl, toReplace->getNormalDest(),
-        toReplace->getUnwindDest(), args, "", toReplace->getParent());
-    invoke->setCallingConv(toReplace->getCallingConv());
+    InvokeInst *Invoke = Builder.CreateGCStatepointInvoke(
+        StatepointID, NumPatchBytes, CallTarget, ToReplace->getNormalDest(),
+        ToReplace->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs,
+        GCArgs, "statepoint_token");
+
+    Invoke->setCallingConv(ToReplace->getCallingConv());

     // Currently we will fail on parameter attributes and on certain
     // function attributes.
-    AttributeSet new_attrs = legalizeCallAttributes(toReplace->getAttributes());
-    // In case if we can handle this set of sttributes - set up function attrs
+    AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
+    // In case if we can handle this set of attributes - set up function attrs
     // directly on statepoint and return attrs later for gc_result intrinsic.
-    invoke->setAttributes(new_attrs.getFnAttributes());
-    return_attributes = new_attrs.getRetAttributes();
+    Invoke->setAttributes(NewAttrs.getFnAttributes());
+    ReturnAttrs = NewAttrs.getRetAttributes();

-    token = invoke;
+    Token = Invoke;

     // Generate gc relocates in exceptional path
-    BasicBlock *unwindBlock = toReplace->getUnwindDest();
-    assert(!isa<PHINode>(unwindBlock->begin()) &&
-           unwindBlock->getUniquePredecessor() &&
+    BasicBlock *UnwindBlock = ToReplace->getUnwindDest();
+    assert(!isa<PHINode>(UnwindBlock->begin()) &&
+           UnwindBlock->getUniquePredecessor() &&
            "can't safely insert in this block!");

-    Instruction *IP = &*(unwindBlock->getFirstInsertionPt());
-    Builder.SetInsertPoint(IP);
-    Builder.SetCurrentDebugLocation(toReplace->getDebugLoc());
+    Builder.SetInsertPoint(&*UnwindBlock->getFirstInsertionPt());
+    Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());

-    // Extract second element from landingpad return value. We will attach
-    // exceptional gc relocates to it.
-    const unsigned idx = 1;
-    Instruction *exceptional_token =
-        cast<Instruction>(Builder.CreateExtractValue(
-            unwindBlock->getLandingPadInst(), idx, "relocate_token"));
-    result.UnwindToken = exceptional_token;
+    // Attach exceptional gc relocates to the landingpad.
+    Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst();
+    Result.UnwindToken = ExceptionalToken;

-    // Just throw away return value. We will use the one we got for normal
-    // block.
- (void)CreateGCRelocates(liveVariables, live_start, basePtrs, - exceptional_token, Builder); + const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx(); + CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, ExceptionalToken, + Builder); // Generate gc relocates and returns for normal block - BasicBlock *normalDest = toReplace->getNormalDest(); - assert(!isa<PHINode>(normalDest->begin()) && - normalDest->getUniquePredecessor() && + BasicBlock *NormalDest = ToReplace->getNormalDest(); + assert(!isa<PHINode>(NormalDest->begin()) && + NormalDest->getUniquePredecessor() && "can't safely insert in this block!"); - IP = &*(normalDest->getFirstInsertionPt()); - Builder.SetInsertPoint(IP); + Builder.SetInsertPoint(&*NormalDest->getFirstInsertionPt()); // gc relocates will be generated later as if it were regular call // statepoint } - assert(token); - - // Take the name of the original value call if it had one. - token->takeName(CS.getInstruction()); + assert(Token && "Should be set in one of the above branches!"); + + if (UseDeoptBundles) { + Token->setName("statepoint_token"); + if (!CS.getType()->isVoidTy() && !CS.getInstruction()->use_empty()) { + StringRef Name = + CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : ""; + CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name); + GCResult->setAttributes(CS.getAttributes().getRetAttributes()); + + // We cannot RAUW or delete CS.getInstruction() because it could be in the + // live set of some other safepoint, in which case that safepoint's + // PartiallyConstructedSafepointRecord will hold a raw pointer to this + // llvm::Instruction. Instead, we defer the replacement and deletion to + // after the live sets have been made explicit in the IR, and we no longer + // have raw pointers to worry about. + Replacements.emplace_back(CS.getInstruction(), GCResult); + } else { + Replacements.emplace_back(CS.getInstruction(), nullptr); + } + } else { + assert(!CS.getInstruction()->hasNUsesOrMore(2) && + "only valid use before rewrite is gc.result"); + assert(!CS.getInstruction()->hasOneUse() || + isGCResult(cast<Instruction>(*CS.getInstruction()->user_begin()))); -// The GCResult is already inserted, we just need to find it -#ifndef NDEBUG - Instruction *toReplace = CS.getInstruction(); - assert((toReplace->hasNUses(0) || toReplace->hasNUses(1)) && - "only valid use before rewrite is gc.result"); - assert(!toReplace->hasOneUse() || - isGCResult(cast<Instruction>(*toReplace->user_begin()))); -#endif + // Take the name of the original statepoint token if there was one. + Token->takeName(CS.getInstruction()); - // Update the gc.result of the original statepoint (if any) to use the newly - // inserted statepoint. This is safe to do here since the token can't be - // considered a live reference. - CS.getInstruction()->replaceAllUsesWith(token); + // Update the gc.result of the original statepoint (if any) to use the newly + // inserted statepoint. This is safe to do here since the token can't be + // considered a live reference. 
+ CS.getInstruction()->replaceAllUsesWith(Token); + CS.getInstruction()->eraseFromParent(); + } - result.StatepointToken = token; + Result.StatepointToken = Token; // Second, create a gc.relocate for every live variable - CreateGCRelocates(liveVariables, live_start, basePtrs, token, Builder); + const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx(); + CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, Token, Builder); } namespace { -struct name_ordering { - Value *base; - Value *derived; - bool operator()(name_ordering const &a, name_ordering const &b) { - return -1 == a.derived->getName().compare(b.derived->getName()); +struct NameOrdering { + Value *Base; + Value *Derived; + + bool operator()(NameOrdering const &a, NameOrdering const &b) { + return -1 == a.Derived->getName().compare(b.Derived->getName()); } }; } -static void stablize_order(SmallVectorImpl<Value *> &basevec, - SmallVectorImpl<Value *> &livevec) { - assert(basevec.size() == livevec.size()); - - SmallVector<name_ordering, 64> temp; - for (size_t i = 0; i < basevec.size(); i++) { - name_ordering v; - v.base = basevec[i]; - v.derived = livevec[i]; - temp.push_back(v); - } - std::sort(temp.begin(), temp.end(), name_ordering()); - for (size_t i = 0; i < basevec.size(); i++) { - basevec[i] = temp[i].base; - livevec[i] = temp[i].derived; + +static void StabilizeOrder(SmallVectorImpl<Value *> &BaseVec, + SmallVectorImpl<Value *> &LiveVec) { + assert(BaseVec.size() == LiveVec.size()); + + SmallVector<NameOrdering, 64> Temp; + for (size_t i = 0; i < BaseVec.size(); i++) { + NameOrdering v; + v.Base = BaseVec[i]; + v.Derived = LiveVec[i]; + Temp.push_back(v); + } + + std::sort(Temp.begin(), Temp.end(), NameOrdering()); + for (size_t i = 0; i < BaseVec.size(); i++) { + BaseVec[i] = Temp[i].Base; + LiveVec[i] = Temp[i].Derived; } } @@ -1401,40 +1603,39 @@ static void stablize_order(SmallVectorImpl<Value *> &basevec, // WARNING: Does not do any fixup to adjust users of the original live // values. That's the callers responsibility. static void -makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, Pass *P, - PartiallyConstructedSafepointRecord &result) { - auto liveset = result.liveset; - auto PointerToBase = result.PointerToBase; +makeStatepointExplicit(DominatorTree &DT, const CallSite &CS, + PartiallyConstructedSafepointRecord &Result, + std::vector<DeferredReplacement> &Replacements) { + const auto &LiveSet = Result.LiveSet; + const auto &PointerToBase = Result.PointerToBase; // Convert to vector for efficient cross referencing. - SmallVector<Value *, 64> basevec, livevec; - livevec.reserve(liveset.size()); - basevec.reserve(liveset.size()); - for (Value *L : liveset) { - livevec.push_back(L); - - assert(PointerToBase.find(L) != PointerToBase.end()); - Value *base = PointerToBase[L]; - basevec.push_back(base); + SmallVector<Value *, 64> BaseVec, LiveVec; + LiveVec.reserve(LiveSet.size()); + BaseVec.reserve(LiveSet.size()); + for (Value *L : LiveSet) { + LiveVec.push_back(L); + assert(PointerToBase.count(L)); + Value *Base = PointerToBase.find(L)->second; + BaseVec.push_back(Base); } - assert(livevec.size() == basevec.size()); + assert(LiveVec.size() == BaseVec.size()); // To make the output IR slightly more stable (for use in diffs), ensure a // fixed order of the values in the safepoint (by sorting the value name). // The order is otherwise meaningless. 
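StabilizeOrder above is a keyed co-sort of two parallel arrays. A standalone sketch under the assumption that the derived values' names are plain strings (stabilizeOrder here is illustrative, not the pass's code); sorting an index permutation also avoids depending on a comparator that returns exactly -1:

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Sort BaseVec/LiveVec in lockstep by the name of the derived value so
// repeated runs emit the gc arguments in a deterministic order.
static void stabilizeOrder(std::vector<std::string> &BaseVec,
                           std::vector<std::string> &LiveVec) {
  assert(BaseVec.size() == LiveVec.size());
  std::vector<size_t> Idx(LiveVec.size());
  for (size_t i = 0; i < Idx.size(); i++)
    Idx[i] = i;
  std::sort(Idx.begin(), Idx.end(),
            [&](size_t a, size_t b) { return LiveVec[a] < LiveVec[b]; });
  std::vector<std::string> NewBase, NewLive;
  for (size_t i : Idx) {
    NewBase.push_back(BaseVec[i]);
    NewLive.push_back(LiveVec[i]);
  }
  BaseVec = std::move(NewBase);
  LiveVec = std::move(NewLive);
}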
-  stablize_order(basevec, livevec);
+  StabilizeOrder(BaseVec, LiveVec);

   // Do the actual rewriting and delete the old statepoint
-  makeStatepointExplicitImpl(CS, basevec, livevec, P, result);
-  CS.getInstruction()->eraseFromParent();
+  makeStatepointExplicitImpl(CS, BaseVec, LiveVec, Result, Replacements);
 }

 // Helper function for the relocationViaAlloca.
-// It receives iterator to the statepoint gc relocates and emits store to the
-// assigned
-// location (via allocaMap) for the each one of them.
-// Add visited values into the visitedLiveValues set we will later use them
-// for sanity check.
+//
+// It receives iterator to the statepoint gc relocates and emits a store to the
+// assigned location (via allocaMap) for each one of them.  It adds the
+// visited values into the visitedLiveValues set, which we will later use
+// for sanity checking.
 static void
 insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
                        DenseMap<Value *, Value *> &AllocaMap,
@@ -1459,13 +1660,15 @@ insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
     Value *Alloca = AllocaMap[OriginalValue];

     // Emit store into the related alloca
-    // All gc_relocate are i8 addrspace(1)* typed, and it must be bitcasted to
+    // All gc_relocates are i8 addrspace(1)* typed, and it must be bitcasted to
     // the correct type according to alloca.
-    assert(RelocatedValue->getNextNode() && "Should always have one since it's not a terminator");
+    assert(RelocatedValue->getNextNode() &&
+           "Should always have one since it's not a terminator");
     IRBuilder<> Builder(RelocatedValue->getNextNode());
     Value *CastedRelocatedValue =
-      Builder.CreateBitCast(RelocatedValue, cast<AllocaInst>(Alloca)->getAllocatedType(),
-      RelocatedValue->hasName() ? RelocatedValue->getName() + ".casted" : "");
+      Builder.CreateBitCast(RelocatedValue,
+                            cast<AllocaInst>(Alloca)->getAllocatedType(),
+                            suffixed_name_or(RelocatedValue, ".casted", ""));

     StoreInst *Store = new StoreInst(CastedRelocatedValue, Alloca);
     Store->insertAfter(cast<Instruction>(CastedRelocatedValue));
@@ -1501,10 +1704,10 @@ insertRematerializationStores(
   }
 }

-/// do all the relocation update via allocas and mem2reg
+/// Do all the relocation update via allocas and mem2reg
 static void relocationViaAlloca(
     Function &F, DominatorTree &DT, ArrayRef<Value *> Live,
-    ArrayRef<struct PartiallyConstructedSafepointRecord> Records) {
+    ArrayRef<PartiallyConstructedSafepointRecord> Records) {
 #ifndef NDEBUG
   // record initial number of (static) allocas; we'll check we have the same
   // number when we get done.
@@ -1531,15 +1734,12 @@ static void relocationViaAlloca(
     PromotableAllocas.push_back(Alloca);
   };

-  // emit alloca for each live gc pointer
-  for (unsigned i = 0; i < Live.size(); i++) {
-    emitAllocaFor(Live[i]);
-  }
-
-  // emit allocas for rematerialized values
-  for (size_t i = 0; i < Records.size(); i++) {
-    const struct PartiallyConstructedSafepointRecord &Info = Records[i];
+  // Emit alloca for each live gc pointer
+  for (Value *V : Live)
+    emitAllocaFor(V);

+  // Emit allocas for rematerialized values
+  for (const auto &Info : Records)
     for (auto RematerializedValuePair : Info.RematerializedValues) {
       Value *OriginalValue = RematerializedValuePair.second;
       if (AllocaMap.count(OriginalValue) != 0)
@@ -1548,20 +1748,17 @@ static void relocationViaAlloca(
       emitAllocaFor(OriginalValue);
       ++NumRematerializedValues;
     }
-  }

   // The next two loops are part of the same conceptual operation.
 We need to
   // insert a store to the alloca after the original def and at each
   // redefinition.  We need to insert a load before each use.  These are split
   // into distinct loops for performance reasons.

-  // update gc pointer after each statepoint
-  // either store a relocated value or null (if no relocated value found for
-  // this gc pointer and it is not a gc_result)
-  // this must happen before we update the statepoint with load of alloca
-  // otherwise we lose the link between statepoint and old def
-  for (size_t i = 0; i < Records.size(); i++) {
-    const struct PartiallyConstructedSafepointRecord &Info = Records[i];
+  // Update gc pointer after each statepoint: either store a relocated value or
+  // null (if no relocated value was found for this gc pointer and it is not a
+  // gc_result).  This must happen before we update the statepoint with load of
+  // alloca otherwise we lose the link between statepoint and old def.
+  for (const auto &Info : Records) {
     Value *Statepoint = Info.StatepointToken;

     // This will be used for consistency check
@@ -1582,7 +1779,7 @@ static void relocationViaAlloca(
                            VisitedLiveValues);

     if (ClobberNonLive) {
-      // As a debuging aid, pretend that an unrelocated pointer becomes null at
+      // As a debugging aid, pretend that an unrelocated pointer becomes null at
       // the gc.statepoint.  This will turn some subtle GC problems into
       // slightly easier to debug SEGVs.  Note that on large IR files with
       // lots of gc.statepoints this is extremely costly in both memory and time
@@ -1612,23 +1809,22 @@ static void relocationViaAlloca(
       // Insert the clobbering stores.  These may get intermixed with the
       // gc.results and gc.relocates, but that's fine.
       if (auto II = dyn_cast<InvokeInst>(Statepoint)) {
-        InsertClobbersAt(II->getNormalDest()->getFirstInsertionPt());
-        InsertClobbersAt(II->getUnwindDest()->getFirstInsertionPt());
+        InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt());
+        InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt());
       } else {
-        BasicBlock::iterator Next(cast<CallInst>(Statepoint));
-        Next++;
-        InsertClobbersAt(Next);
+        InsertClobbersAt(cast<Instruction>(Statepoint)->getNextNode());
       }
     }
   }
-  // update use with load allocas and add store for gc_relocated
+
+  // Update use with load allocas and add store for gc_relocated.
   for (auto Pair : AllocaMap) {
     Value *Def = Pair.first;
     Value *Alloca = Pair.second;

-    // we pre-record the uses of allocas so that we dont have to worry about
-    // later update
-    // that change the user information.
+    // We pre-record the uses of allocas so that we don't have to worry about
+    // later updates that change the user information.
+
     SmallVector<Instruction *, 20> Uses;
     // PERF: trade a linear scan for repeated reallocation
     Uses.reserve(std::distance(Def->user_begin(), Def->user_end()));
@@ -1663,9 +1859,9 @@ static void relocationViaAlloca(
       }
     }

-    // emit store for the initial gc value
-    // store must be inserted after load, otherwise store will be in alloca's
-    // use list and an extra load will be inserted before it
+    // Emit store for the initial gc value.  Store must be inserted after load,
+    // otherwise store will be in alloca's use list and an extra load will be
+    // inserted before it.
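relocationViaAlloca as a whole follows the classic demote-then-promote pattern: route each live SSA value through a stack slot so relocated copies can redefine it, then let mem2reg rebuild SSA form with PHIs at the merge points. A compressed, hand-written sketch of that pattern for a single value (demoteAndPromote is an invented helper; it omits the relocation-specific stores the pass emits after each statepoint, and PHI uses would need edge-split handling, elided here):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
using namespace llvm;

// Demote one value to a stack slot and promote it back to SSA.
// 'Uses' must be snapshotted before rewriting, exactly as the pass does.
static void demoteAndPromote(Function &F, DominatorTree &DT, Instruction *Def,
                             ArrayRef<Instruction *> Uses) {
  AllocaInst *Slot = new AllocaInst(Def->getType(), "slot",
                                    &*F.getEntryBlock().getFirstInsertionPt());
  // Store after the definition (the pass also stores the relocated
  // copies after each statepoint).
  (new StoreInst(Def, Slot))->insertAfter(Def);
  // Load immediately before every pre-recorded use.
  for (Instruction *U : Uses) {
    LoadInst *L = new LoadInst(Slot, "", U);
    U->replaceUsesOfWith(Def, L);
  }
  // mem2reg rebuilds SSA form, inserting PHIs where the relocated and
  // original values merge; ArrayRef binds to the single alloca.
  PromoteMemToReg(Slot, DT);
}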
 StoreInst *Store = new StoreInst(Def, Alloca);
     if (Instruction *Inst = dyn_cast<Instruction>(Def)) {
       if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) {
@@ -1688,14 +1884,13 @@ static void relocationViaAlloca(
   assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues &&
          "we must have the same allocas with lives");
   if (!PromotableAllocas.empty()) {
-    // apply mem2reg to promote alloca to SSA
+    // Apply mem2reg to promote alloca to SSA
     PromoteMemToReg(PromotableAllocas, DT);
   }

 #ifndef NDEBUG
-  for (auto I = F.getEntryBlock().begin(), E = F.getEntryBlock().end(); I != E;
-       I++)
-    if (isa<AllocaInst>(*I))
+  for (auto &I : F.getEntryBlock())
+    if (isa<AllocaInst>(I))
       InitialAllocaNum--;
   assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas");
 #endif
@@ -1719,28 +1914,27 @@ static void insertUseHolderAfter(CallSite &CS, const ArrayRef<Value *> Values,
     // No values to hold live, might as well not insert the empty holder
     return;

-  Module *M = CS.getInstruction()->getParent()->getParent()->getParent();
+  Module *M = CS.getInstruction()->getModule();

   // Use a dummy vararg function to actually hold the values live
   Function *Func = cast<Function>(M->getOrInsertFunction(
       "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()), true)));
   if (CS.isCall()) {
     // For call safepoints insert dummy calls right after safepoint
-    BasicBlock::iterator Next(CS.getInstruction());
-    Next++;
-    Holders.push_back(CallInst::Create(Func, Values, "", Next));
+    Holders.push_back(CallInst::Create(Func, Values, "",
+                                       &*++CS.getInstruction()->getIterator()));
     return;
   }
   // For invoke safepoints insert dummy calls both in normal and
   // exceptional destination blocks
   auto *II = cast<InvokeInst>(CS.getInstruction());
   Holders.push_back(CallInst::Create(
-      Func, Values, "", II->getNormalDest()->getFirstInsertionPt()));
+      Func, Values, "", &*II->getNormalDest()->getFirstInsertionPt()));
   Holders.push_back(CallInst::Create(
-      Func, Values, "", II->getUnwindDest()->getFirstInsertionPt()));
+      Func, Values, "", &*II->getUnwindDest()->getFirstInsertionPt()));
 }

 static void findLiveReferences(
-    Function &F, DominatorTree &DT, Pass *P, ArrayRef<CallSite> toUpdate,
+    Function &F, DominatorTree &DT, ArrayRef<CallSite> toUpdate,
     MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
   GCPtrLivenessData OriginalLivenessData;
   computeLiveInValues(DT, F, OriginalLivenessData);
@@ -1751,12 +1945,12 @@ static void findLiveReferences(
   }
 }

-/// Remove any vector of pointers from the liveset by scalarizing them over the
-/// statepoint instruction. Adds the scalarized pieces to the liveset. It
-/// would be preferrable to include the vector in the statepoint itself, but
+/// Remove any vector of pointers from the live set by scalarizing them over the
+/// statepoint instruction.  Adds the scalarized pieces to the live set.  It
+/// would be preferable to include the vector in the statepoint itself, but
 /// the lowering code currently does not handle that.  Extending it would be
 /// slightly non-trivial since it requires a format change.  Given how rare
-/// such cases are (for the moment?) scalarizing is an acceptable comprimise.
+/// such cases are (for the moment?) scalarizing is an acceptable compromise.
 static void splitVectorValues(Instruction *StatepointInst,
                               StatepointLiveSetTy &LiveSet,
                               DenseMap<Value *, Value *>& PointerToBase,
@@ -1887,7 +2081,7 @@ static void splitVectorValues(Instruction *StatepointInst,

 // Helper function for the "rematerializeLiveValues".
 It walks the use chain
 // starting from the "CurrentValue" until it meets "BaseValue".  Only "simple"
 // values are visited (currently GEPs and casts).  Returns true if it
-// sucessfully reached "BaseValue" and false otherwise.
+// successfully reached "BaseValue" and false otherwise.
 // Fills "ChainToBase" array with all visited values.  "BaseValue" is not
 // recorded.
 static bool findRematerializableChainToBasePointer(
@@ -1907,16 +2101,12 @@ static bool findRematerializableChainToBasePointer(
   }

   if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) {
-    Value *Def = CI->stripPointerCasts();
-
-    // This two checks are basically similar. First one is here for the
-    // consistency with findBasePointers logic.
-    assert(!isa<CastInst>(Def) && "not a pointer cast found");
     if (!CI->isNoopCast(CI->getModule()->getDataLayout()))
       return false;

     ChainToBase.push_back(CI);
-    return findRematerializableChainToBasePointer(ChainToBase, Def, BaseValue);
+    return findRematerializableChainToBasePointer(ChainToBase,
+                                                  CI->getOperand(0), BaseValue);
   }

   // Unsupported instruction in the chain
@@ -1957,8 +2147,8 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
   return Cost;
 }

-// From the statepoint liveset pick values that are cheaper to recompute then to
-// relocate. Remove this values from the liveset, rematerialize them after
+// From the statepoint live set pick values that are cheaper to recompute than
+// to relocate.  Remove these values from the live set, rematerialize them after
 // statepoint and record them in "Info" structure.  Note that similar to
 // relocated values we don't do any user adjustments here.
 static void rematerializeLiveValues(CallSite CS,
@@ -1970,10 +2160,10 @@ static void rematerializeLiveValues(CallSite CS,
   // We cannot do this in the following loop due to iterator invalidation.
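findRematerializableChainToBasePointer above is a plain def-chain walk. The same recursion over a toy value graph (Val and chainToBase are invented for this sketch) reads:

#include <vector>

// Toy def-use node: a value is either the base, a GEP, a no-op cast,
// or something the walk cannot see through.
struct Val {
  enum Kind { Base, GEP, NoopCast, Other } K;
  Val *Operand;
};

// Returns true iff Cur reaches Target through GEPs/no-op casts only,
// recording the visited chain (Target itself is not recorded).
static bool chainToBase(std::vector<Val *> &Chain, Val *Cur, Val *Target) {
  if (Cur == Target)
    return true;
  if (Cur->K == Val::GEP || Cur->K == Val::NoopCast) {
    Chain.push_back(Cur);
    return chainToBase(Chain, Cur->Operand, Target);
  }
  return false; // unsupported instruction in the chain
}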
 SmallVector<Value *, 32> LiveValuesToBeDeleted;

-  for (Value *LiveValue: Info.liveset) {
+  for (Value *LiveValue: Info.LiveSet) {
     // For each live pointer find its defining chain
     SmallVector<Instruction *, 3> ChainToBase;
-    assert(Info.PointerToBase.find(LiveValue) != Info.PointerToBase.end());
+    assert(Info.PointerToBase.count(LiveValue));
     bool FoundChain =
       findRematerializableChainToBasePointer(ChainToBase,
                                              LiveValue,
@@ -2059,9 +2249,9 @@ static void rematerializeLiveValues(CallSite CS,
       InvokeInst *Invoke = cast<InvokeInst>(CS.getInstruction());

       Instruction *NormalInsertBefore =
-          Invoke->getNormalDest()->getFirstInsertionPt();
+          &*Invoke->getNormalDest()->getFirstInsertionPt();
       Instruction *UnwindInsertBefore =
-          Invoke->getUnwindDest()->getFirstInsertionPt();
+          &*Invoke->getUnwindDest()->getFirstInsertionPt();

       Instruction *NormalRematerializedValue =
           rematerializeChain(NormalInsertBefore);
@@ -2075,22 +2265,23 @@ static void rematerializeLiveValues(CallSite CS,

   // Remove rematerialized values from the live set
   for (auto LiveValue: LiveValuesToBeDeleted) {
-    Info.liveset.erase(LiveValue);
+    Info.LiveSet.erase(LiveValue);
   }
 }

-static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P,
-                              SmallVectorImpl<CallSite> &toUpdate) {
+static bool insertParsePoints(Function &F, DominatorTree &DT,
+                              TargetTransformInfo &TTI,
+                              SmallVectorImpl<CallSite> &ToUpdate) {
 #ifndef NDEBUG
   // sanity check the input
-  std::set<CallSite> uniqued;
-  uniqued.insert(toUpdate.begin(), toUpdate.end());
-  assert(uniqued.size() == toUpdate.size() && "no duplicates please!");
+  std::set<CallSite> Uniqued;
+  Uniqued.insert(ToUpdate.begin(), ToUpdate.end());
+  assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!");

-  for (size_t i = 0; i < toUpdate.size(); i++) {
-    CallSite &CS = toUpdate[i];
+  for (CallSite CS : ToUpdate) {
     assert(CS.getInstruction()->getParent()->getParent() == &F);
-    assert(isStatepoint(CS) && "expected to already be a deopt statepoint");
+    assert((UseDeoptBundles || isStatepoint(CS)) &&
+           "expected to already be a deopt statepoint");
   }
 #endif

@@ -2098,50 +2289,45 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P,
   // the top of the successor blocks.  See the comment on
   // normalForInvokeSafepoint for exactly what is needed.  Note that this step
   // may restructure the CFG.
-  for (CallSite CS : toUpdate) {
+  for (CallSite CS : ToUpdate) {
     if (!CS.isInvoke())
       continue;
-    InvokeInst *invoke = cast<InvokeInst>(CS.getInstruction());
-    normalizeForInvokeSafepoint(invoke->getNormalDest(), invoke->getParent(),
-                                DT);
-    normalizeForInvokeSafepoint(invoke->getUnwindDest(), invoke->getParent(),
-                                DT);
+    auto *II = cast<InvokeInst>(CS.getInstruction());
+    normalizeForInvokeSafepoint(II->getNormalDest(), II->getParent(), DT);
+    normalizeForInvokeSafepoint(II->getUnwindDest(), II->getParent(), DT);
   }

   // A list of dummy calls added to the IR to keep various values obviously
   // live in the IR.  We'll remove all of these when done.
-  SmallVector<CallInst *, 64> holders;
+  SmallVector<CallInst *, 64> Holders;

   // Insert a dummy call with all of the arguments to the vm_state we'll need
   // for the actual safepoint insertion.  This ensures reference arguments in
   // the deopt argument list are considered live through the safepoint (and
   // thus makes sure they get relocated.)
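The holder trick works because liveness is use-driven: a value with no use at or after the safepoint is dead there, so handing it to a dummy call pins it live until the holders are erased. A toy one-dimensional model of that rule (ToyValue and liveAt are illustrative only):

#include <vector>

// Toy linear liveness: a value is live at P iff it is defined at or
// before P and has some use at or after P.
struct ToyValue {
  int DefPos;
  std::vector<int> UsePositions;
};

static bool liveAt(const ToyValue &V, int P) {
  if (V.DefPos > P)
    return false;
  for (int U : V.UsePositions)
    if (U >= P)
      return true;
  return false;
}

// Pinning: V.UsePositions.push_back(SafepointPos + 1) makes
// liveAt(V, SafepointPos) true, mirroring the "__tmp_use" holder call
// inserted just after the safepoint.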
- for (size_t i = 0; i < toUpdate.size(); i++) { - CallSite &CS = toUpdate[i]; - Statepoint StatepointCS(CS); - + for (CallSite CS : ToUpdate) { SmallVector<Value *, 64> DeoptValues; - for (Use &U : StatepointCS.vm_state_args()) { - Value *Arg = cast<Value>(&U); + + iterator_range<const Use *> DeoptStateRange = + UseDeoptBundles + ? iterator_range<const Use *>(GetDeoptBundleOperands(CS)) + : iterator_range<const Use *>(Statepoint(CS).vm_state_args()); + + for (Value *Arg : DeoptStateRange) { assert(!isUnhandledGCPointerType(Arg->getType()) && "support for FCA unimplemented"); if (isHandledGCPointerType(Arg->getType())) DeoptValues.push_back(Arg); } - insertUseHolderAfter(CS, DeoptValues, holders); - } - SmallVector<struct PartiallyConstructedSafepointRecord, 64> records; - records.reserve(toUpdate.size()); - for (size_t i = 0; i < toUpdate.size(); i++) { - struct PartiallyConstructedSafepointRecord info; - records.push_back(info); + insertUseHolderAfter(CS, DeoptValues, Holders); } - assert(records.size() == toUpdate.size()); - // A) Identify all gc pointers which are staticly live at the given call + SmallVector<PartiallyConstructedSafepointRecord, 64> Records(ToUpdate.size()); + + // A) Identify all gc pointers which are statically live at the given call // site. - findLiveReferences(F, DT, P, toUpdate, records); + findLiveReferences(F, DT, ToUpdate, Records); // B) Find the base pointers for each live pointer /* scope for caching */ { @@ -2150,10 +2336,9 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, // large numbers of duplicate base_phis. DefiningValueMapTy DVCache; - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; - findBasePointers(DT, DVCache, CS, info); + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &info = Records[i]; + findBasePointers(DT, DVCache, ToUpdate[i], info); } } // end of cache scope @@ -2170,63 +2355,75 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, Pass *P, // the base pointers which were identified for that safepoint. We'll then // ask liveness for _every_ base inserted to see what is now live. Then we // remove the dummy calls. - holders.reserve(holders.size() + records.size()); - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; + Holders.reserve(Holders.size() + Records.size()); + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &Info = Records[i]; SmallVector<Value *, 128> Bases; - for (auto Pair : info.PointerToBase) { + for (auto Pair : Info.PointerToBase) Bases.push_back(Pair.second); - } - insertUseHolderAfter(CS, Bases, holders); + + insertUseHolderAfter(ToUpdate[i], Bases, Holders); } // By selecting base pointers, we've effectively inserted new uses. Thus, we // need to rerun liveness. We may *also* have inserted new defs, but that's // not the key issue. 
- recomputeLiveInValues(F, DT, P, toUpdate, records); + recomputeLiveInValues(F, DT, ToUpdate, Records); if (PrintBasePointers) { - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; + for (auto &Info : Records) { errs() << "Base Pairs: (w/Relocation)\n"; - for (auto Pair : info.PointerToBase) { - errs() << " derived %" << Pair.first->getName() << " base %" - << Pair.second->getName() << "\n"; + for (auto Pair : Info.PointerToBase) { + errs() << " derived "; + Pair.first->printAsOperand(errs(), false); + errs() << " base "; + Pair.second->printAsOperand(errs(), false); + errs() << "\n"; } } } - for (size_t i = 0; i < holders.size(); i++) { - holders[i]->eraseFromParent(); - holders[i] = nullptr; - } - holders.clear(); + + // It is possible that non-constant live variables have a constant base. For + // example, a GEP with a variable offset from a global. In this case we can + // remove it from the liveset. We already don't add constants to the liveset + // because we assume they won't move at runtime and the GC doesn't need to be + // informed about them. The same reasoning applies if the base is constant. + // Note that the relocation placement code relies on this filtering for + // correctness as it expects the base to be in the liveset, which isn't true + // if the base is constant. + for (auto &Info : Records) + for (auto &BasePair : Info.PointerToBase) + if (isa<Constant>(BasePair.second)) + Info.LiveSet.erase(BasePair.first); + + for (CallInst *CI : Holders) + CI->eraseFromParent(); + + Holders.clear(); // Do a limited scalarization of any live at safepoint vector values which // contain pointers. This enables this pass to run after vectorization at // the cost of some possible performance loss. TODO: it would be nice to // natively support vectors all the way through the backend so we don't need // to scalarize here. - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - Instruction *statepoint = toUpdate[i].getInstruction(); - splitVectorValues(cast<Instruction>(statepoint), info.liveset, - info.PointerToBase, DT); + for (size_t i = 0; i < Records.size(); i++) { + PartiallyConstructedSafepointRecord &Info = Records[i]; + Instruction *Statepoint = ToUpdate[i].getInstruction(); + splitVectorValues(cast<Instruction>(Statepoint), Info.LiveSet, + Info.PointerToBase, DT); } // In order to reduce live set of statepoint we might choose to rematerialize - // some values instead of relocating them. This is purelly an optimization and + // some values instead of relocating them. This is purely an optimization and // does not influence correctness. - TargetTransformInfo &TTI = - P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + for (size_t i = 0; i < Records.size(); i++) + rematerializeLiveValues(ToUpdate[i], Records[i], TTI); - for (size_t i = 0; i < records.size(); i++) { - struct PartiallyConstructedSafepointRecord &info = records[i]; - CallSite &CS = toUpdate[i]; - - rematerializeLiveValues(CS, info, TTI); - } + // We need this to safely RAUW and delete call or invoke return values that + // may themselves be live over a statepoint. For details, please see usage in + // makeStatepointExplicitImpl. + std::vector<DeferredReplacement> Replacements; // Now run through and replace the existing statepoints with new ones with // the live variables listed. 
 We do not yet update uses of the values being
@@ -2234,61 +2431,77 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
   // survive to the last iteration of this loop.  (By construction, the
   // previous statepoint cannot be a live variable, thus we can and do remove
   // the old statepoint calls as we go.)
-  for (size_t i = 0; i < records.size(); i++) {
-    struct PartiallyConstructedSafepointRecord &info = records[i];
-    CallSite &CS = toUpdate[i];
-    makeStatepointExplicit(DT, CS, P, info);
+  for (size_t i = 0; i < Records.size(); i++)
+    makeStatepointExplicit(DT, ToUpdate[i], Records[i], Replacements);
+
+  ToUpdate.clear(); // prevent accidental use of invalid CallSites
+
+  for (auto &PR : Replacements)
+    PR.doReplacement();
+
+  Replacements.clear();
+
+  for (auto &Info : Records) {
+    // These live sets may contain stale Value pointers, since we replaced calls
+    // with operand bundles with calls wrapped in gc.statepoint, and some of
+    // those calls may have been def'ing live gc pointers.  Clear these out to
+    // avoid accidentally using them.
+    //
+    // TODO: We should create a separate data structure that does not contain
+    // these live sets, and migrate to using that data structure from this point
+    // onward.
+    Info.LiveSet.clear();
+    Info.PointerToBase.clear();
   }
-  toUpdate.clear(); // prevent accident use of invalid CallSites

   // Do all the fixups of the original live variables to their relocated selves
-  SmallVector<Value *, 128> live;
-  for (size_t i = 0; i < records.size(); i++) {
-    struct PartiallyConstructedSafepointRecord &info = records[i];
+  SmallVector<Value *, 128> Live;
+  for (size_t i = 0; i < Records.size(); i++) {
+    PartiallyConstructedSafepointRecord &Info = Records[i];
+
     // We can't simply save the live set from the original insertion.  One of
     // the live values might be the result of a call which needs a safepoint.
     // That Value* no longer exists and we need to use the new gc_result.
-    // Thankfully, the liveset is embedded in the statepoint (and updated), so
+    // Thankfully, the live set is embedded in the statepoint (and updated), so
     // we just grab that.
-    Statepoint statepoint(info.StatepointToken);
-    live.insert(live.end(), statepoint.gc_args_begin(),
-                statepoint.gc_args_end());
+    Statepoint Statepoint(Info.StatepointToken);
+    Live.insert(Live.end(), Statepoint.gc_args_begin(),
+                Statepoint.gc_args_end());
 #ifndef NDEBUG
     // Do some basic sanity checks on our liveness results before performing
     // relocation.  Relocation can and will turn mistakes in liveness results
     // into nonsensical code which is much harder to debug.
 // TODO: It would be nice to test consistency as well
-    assert(DT.isReachableFromEntry(info.StatepointToken->getParent()) &&
+    assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) &&
            "statepoint must be reachable or liveness is meaningless");
-    for (Value *V : statepoint.gc_args()) {
+    for (Value *V : Statepoint.gc_args()) {
       if (!isa<Instruction>(V))
         // Non-instruction values trivially dominate all possible uses
         continue;
-      auto LiveInst = cast<Instruction>(V);
+      auto *LiveInst = cast<Instruction>(V);
       assert(DT.isReachableFromEntry(LiveInst->getParent()) &&
              "unreachable values should never be live");
-      assert(DT.dominates(LiveInst, info.StatepointToken) &&
+      assert(DT.dominates(LiveInst, Info.StatepointToken) &&
              "basic SSA liveness expectation violated by liveness analysis");
     }
 #endif
   }
-  unique_unsorted(live);
+  unique_unsorted(Live);

 #ifndef NDEBUG
   // sanity check
-  for (auto ptr : live) {
-    assert(isGCPointerType(ptr->getType()) && "must be a gc pointer type");
-  }
+  for (auto *Ptr : Live)
+    assert(isGCPointerType(Ptr->getType()) && "must be a gc pointer type");
 #endif

-  relocationViaAlloca(F, DT, live, records);
-  return !records.empty();
+  relocationViaAlloca(F, DT, Live, Records);
+  return !Records.empty();
 }

 // Handles both return values and arguments for Functions and CallSites.
 template <typename AttrHolder>
-static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
-                                   unsigned Index) {
+static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
+                                      unsigned Index) {
   AttrBuilder R;
   if (AH.getDereferenceableBytes(Index))
     R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable,
@@ -2296,6 +2509,8 @@ static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
   if (AH.getDereferenceableOrNullBytes(Index))
     R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull,
                                   AH.getDereferenceableOrNullBytes(Index)));
+  if (AH.doesNotAlias(Index))
+    R.addAttribute(Attribute::NoAlias);

   if (!R.empty())
     AH.setAttributes(AH.getAttributes().removeAttributes(
@@ -2303,25 +2518,25 @@ static void RemoveDerefAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
 }

 void
-RewriteStatepointsForGC::stripDereferenceabilityInfoFromPrototype(Function &F) {
+RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) {
   LLVMContext &Ctx = F.getContext();

   for (Argument &A : F.args())
     if (isa<PointerType>(A.getType()))
-      RemoveDerefAttrAtIndex(Ctx, F, A.getArgNo() + 1);
+      RemoveNonValidAttrAtIndex(Ctx, F, A.getArgNo() + 1);

   if (isa<PointerType>(F.getReturnType()))
-    RemoveDerefAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex);
+    RemoveNonValidAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex);
 }

-void RewriteStatepointsForGC::stripDereferenceabilityInfoFromBody(Function &F) {
+void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
   if (F.empty())
     return;

   LLVMContext &Ctx = F.getContext();
   MDBuilder Builder(Ctx);

-  for (Instruction &I : inst_range(F)) {
+  for (Instruction &I : instructions(F)) {
     if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
       assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
       bool IsImmutableTBAA =
@@ -2344,9 +2559,9 @@ void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
     if (CallSite CS = CallSite(&I)) {
       for (int i = 0, e = CS.arg_size(); i != e; i++)
         if (isa<PointerType>(CS.getArgument(i)->getType()))
-          RemoveDerefAttrAtIndex(Ctx, CS, i + 1);
+          RemoveNonValidAttrAtIndex(Ctx, CS, i + 1);
       if (isa<PointerType>(CS.getType()))
-        RemoveDerefAttrAtIndex(Ctx, CS,
 AttributeSet::ReturnIndex);
+        RemoveNonValidAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex);
     }
   }
 }

@@ -2365,17 +2580,17 @@ static bool shouldRewriteStatepointsIn(Function &F) {
   return false;
 }

-void RewriteStatepointsForGC::stripDereferenceabilityInfo(Module &M) {
+void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) {
 #ifndef NDEBUG
   assert(std::any_of(M.begin(), M.end(), shouldRewriteStatepointsIn) &&
          "precondition!");
 #endif

   for (Function &F : M)
-    stripDereferenceabilityInfoFromPrototype(F);
+    stripNonValidAttributesFromPrototype(F);

   for (Function &F : M)
-    stripDereferenceabilityInfoFromBody(F);
+    stripNonValidAttributesFromBody(F);
 }

 bool RewriteStatepointsForGC::runOnFunction(Function &F) {
@@ -2389,15 +2604,27 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) {
     return false;

   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+  TargetTransformInfo &TTI =
+      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  auto NeedsRewrite = [](Instruction &I) {
+    if (UseDeoptBundles) {
+      if (ImmutableCallSite CS = ImmutableCallSite(&I))
+        return !callsGCLeafFunction(CS);
+      return false;
+    }
+
+    return isStatepoint(I);
+  };

   // Gather all the statepoints which need to be rewritten.  Be careful to only
   // consider those in reachable code since we need to ask dominance queries
   // when rewriting.  We'll delete the unreachable ones in a moment.
   SmallVector<CallSite, 64> ParsePointNeeded;
   bool HasUnreachableStatepoint = false;
-  for (Instruction &I : inst_range(F)) {
+  for (Instruction &I : instructions(F)) {
     // TODO: only the ones with the flag set!
-    if (isStatepoint(I)) {
+    if (NeedsRewrite(I)) {
       if (DT.isReachableFromEntry(I.getParent()))
         ParsePointNeeded.push_back(CallSite(&I));
       else
@@ -2428,7 +2655,38 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F) {
     FoldSingleEntryPHINodes(&BB);
   }

-  MadeChange |= insertParsePoints(F, DT, this, ParsePointNeeded);
+  // Before we start introducing relocations, we want to tweak the IR a bit to
+  // avoid unfortunate code generation effects.  The main example is that we
+  // want to try to make sure the comparison feeding a branch is after any
+  // safepoints.  Otherwise, we end up with a comparison of pre-relocation
+  // values feeding a branch after relocation.  This is semantically correct,
+  // but results in extra register pressure since both the pre-relocation and
+  // post-relocation copies must be available in registers.  For code without
+  // relocations this is handled elsewhere, but teaching the scheduler to
+  // reverse the transform we're about to do would be slightly complex.
+  // Note: This may extend the live range of the inputs to the icmp and thus
+  // increase the liveset of any statepoint we move over.  This is profitable
+  // as long as all statepoints are in rare blocks.  If we had in-register
+  // lowering for live values this would be a much safer transform.
+  auto getConditionInst = [](TerminatorInst *TI) -> Instruction* {
+    if (auto *BI = dyn_cast<BranchInst>(TI))
+      if (BI->isConditional())
+        return dyn_cast<Instruction>(BI->getCondition());
+    // TODO: Extend this to handle switches
+    return nullptr;
+  };
+  for (BasicBlock &BB : F) {
+    TerminatorInst *TI = BB.getTerminator();
+    if (auto *Cond = getConditionInst(TI))
+      // TODO: Handle more than just ICmps here.  We should be able to move
+      // most instructions without side effects or memory access.
+      if (isa<ICmpInst>(Cond) && Cond->hasOneUse()) {
+        MadeChange = true;
+        Cond->moveBefore(TI);
+      }
+  }
+
+  MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded);
   return MadeChange;
 }

@@ -2461,7 +2719,7 @@ static void computeLiveInValues(BasicBlock::reverse_iterator rbegin,
            "support for FCA unimplemented");
     if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
       // The choice to exclude all things constant here is slightly subtle.
-      // There are two idependent reasons:
+      // There are two independent reasons:
       // - We assume that things which are constant (from LLVM's definition)
       //   do not move at runtime.  For example, the address of a global
       //   variable is fixed, even though its contents may not be.
@@ -2599,7 +2857,7 @@ static void computeLiveInValues(DominatorTree &DT, Function &F,
   } // while( !worklist.empty() )

 #ifndef NDEBUG
-  // Sanity check our ouput against SSA properties.  This helps catch any
+  // Sanity check our output against SSA properties.  This helps catch any
   // missing kills during the above iteration.
   for (BasicBlock &BB : F) {
     checkBasicSSA(DT, Data, BB);
@@ -2620,7 +2878,7 @@ static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
   // call result is not live (normal), nor are its arguments
   // (unless they're used again later).  This adjustment is
   // specifically what we need to relocate
-  BasicBlock::reverse_iterator rend(Inst);
+  BasicBlock::reverse_iterator rend(Inst->getIterator());
   computeLiveInValues(BB->rbegin(), rend, LiveOut);
   LiveOut.erase(Inst);
   Out.insert(LiveOut.begin(), LiveOut.end());
@@ -2669,5 +2927,5 @@ static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
     assert(Updated.count(KVPair.first) && "record for non-live value");
 #endif

-  Info.liveset = Updated;
+  Info.LiveSet = Updated;
 }
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
index 4d3a708..2fca803 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/CallSite.h"
@@ -479,6 +480,13 @@ private:
   void visitExtractValueInst(ExtractValueInst &EVI);
   void visitInsertValueInst(InsertValueInst &IVI);
   void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); }
+  void visitFuncletPadInst(FuncletPadInst &FPI) {
+    markAnythingOverdefined(&FPI);
+  }
+  void visitCatchSwitchInst(CatchSwitchInst &CPI) {
+    markAnythingOverdefined(&CPI);
+    visitTerminatorInst(CPI);
+  }

   // Instructions that cannot be folded away.
   void visitStoreInst     (StoreInst &I);
@@ -539,9 +547,9 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
     return;
   }

-  if (isa<InvokeInst>(TI)) {
-    // Invoke instructions successors are always executable.
-    Succs[0] = Succs[1] = true;
+  // Unwinding instructions' successors are always executable.
+  if (TI.isExceptional()) {
+    Succs.assign(TI.getNumSuccessors(), true);
     return;
   }

@@ -605,8 +613,8 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
     return BI->getSuccessor(CI->isZero()) == To;
   }

-  // Invoke instructions successors are always executable.
-  if (isa<InvokeInst>(TI))
+  // Unwinding instructions' successors are always executable.
+ if (TI->isExceptional()) return true; if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { @@ -630,7 +638,7 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { #ifndef NDEBUG dbgs() << "Unknown terminator instruction: " << *TI << '\n'; #endif - llvm_unreachable(nullptr); + llvm_unreachable("SCCP: Don't know how to handle this terminator!"); } // visit Implementations - Something changed in this instruction, either an @@ -1126,7 +1134,7 @@ CallOverdefined: // entry block executable and merge in the actual arguments to the call into // the formal arguments of the function. if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){ - MarkBlockExecutable(F->begin()); + MarkBlockExecutable(&F->front()); // Propagate information from this call site into the callee. CallSite::arg_iterator CAI = CS.arg_begin(); @@ -1135,17 +1143,17 @@ CallOverdefined: // If this argument is byval, and if the function is not readonly, there // will be an implicit copy formed of the input aggregate. if (AI->hasByValAttr() && !F->onlyReadsMemory()) { - markOverdefined(AI); + markOverdefined(&*AI); continue; } if (StructType *STy = dyn_cast<StructType>(AI->getType())) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { LatticeVal CallArg = getStructValueState(*CAI, i); - mergeInValue(getStructValueState(AI, i), AI, CallArg); + mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg); } } else { - mergeInValue(AI, getValueState(*CAI)); + mergeInValue(&*AI, getValueState(*CAI)); } } } @@ -1246,18 +1254,18 @@ void SCCPSolver::Solve() { /// even if X isn't defined. bool SCCPSolver::ResolvedUndefsIn(Function &F) { for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (!BBExecutable.count(BB)) + if (!BBExecutable.count(&*BB)) continue; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + for (Instruction &I : *BB) { // Look for instructions which produce undef values. - if (I->getType()->isVoidTy()) continue; + if (I.getType()->isVoidTy()) continue; - if (StructType *STy = dyn_cast<StructType>(I->getType())) { + if (StructType *STy = dyn_cast<StructType>(I.getType())) { // Only a few things that can be structs matter for undef. // Tracked calls must never be marked overdefined in ResolvedUndefsIn. - if (CallSite CS = CallSite(I)) + if (CallSite CS = CallSite(&I)) if (Function *F = CS.getCalledFunction()) if (MRVFunctionsTracked.count(F)) continue; @@ -1270,14 +1278,14 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // Send the results of everything else to overdefined. We could be // more precise than this but it isn't worth bothering. for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - LatticeVal &LV = getStructValueState(I, i); + LatticeVal &LV = getStructValueState(&I, i); if (LV.isUndefined()) - markOverdefined(LV, I); + markOverdefined(LV, &I); } continue; } - LatticeVal &LV = getValueState(I); + LatticeVal &LV = getValueState(&I); if (!LV.isUndefined()) continue; // extractvalue is safe; check here because the argument is a struct. @@ -1287,24 +1295,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // Compute the operand LatticeVals, for convenience below. // Anything taking a struct is conservatively assumed to require // overdefined markings. 
-      if (I->getOperand(0)->getType()->isStructTy()) {
-        markOverdefined(I);
+      if (I.getOperand(0)->getType()->isStructTy()) {
+        markOverdefined(&I);
         return true;
       }

-      LatticeVal Op0LV = getValueState(I->getOperand(0));
+      LatticeVal Op0LV = getValueState(I.getOperand(0));
       LatticeVal Op1LV;
-      if (I->getNumOperands() == 2) {
-        if (I->getOperand(1)->getType()->isStructTy()) {
-          markOverdefined(I);
+      if (I.getNumOperands() == 2) {
+        if (I.getOperand(1)->getType()->isStructTy()) {
+          markOverdefined(&I);
           return true;
         }

-        Op1LV = getValueState(I->getOperand(1));
+        Op1LV = getValueState(I.getOperand(1));
       }
       // If this is an instruction whose result is defined even if the input is
       // not fully defined, propagate the information.
-      Type *ITy = I->getType();
-      switch (I->getOpcode()) {
+      Type *ITy = I.getType();
+      switch (I.getOpcode()) {
       case Instruction::Add:
       case Instruction::Sub:
       case Instruction::Trunc:
@@ -1318,9 +1326,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
       case Instruction::FRem:
         // Floating-point binary operation: be conservative.
         if (Op0LV.isUndefined() && Op1LV.isUndefined())
-          markForcedConstant(I, Constant::getNullValue(ITy));
+          markForcedConstant(&I, Constant::getNullValue(ITy));
         else
-          markOverdefined(I);
+          markOverdefined(&I);
         return true;
       case Instruction::ZExt:
       case Instruction::SExt:
@@ -1332,7 +1340,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
       case Instruction::SIToFP:
       case Instruction::UIToFP:
         // undef -> 0; some outputs are impossible
-        markForcedConstant(I, Constant::getNullValue(ITy));
+        markForcedConstant(&I, Constant::getNullValue(ITy));
         return true;
       case Instruction::Mul:
       case Instruction::And:
@@ -1341,7 +1349,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
           break;
         // undef * X -> 0.   X could be zero.
         // undef & X -> 0.   X could be zero.
-        markForcedConstant(I, Constant::getNullValue(ITy));
+        markForcedConstant(&I, Constant::getNullValue(ITy));
         return true;

       case Instruction::Or:
@@ -1349,7 +1357,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
         if (Op0LV.isUndefined() && Op1LV.isUndefined())
           break;
         // undef | X -> -1.   X could be -1.
-        markForcedConstant(I, Constant::getAllOnesValue(ITy));
+        markForcedConstant(&I, Constant::getAllOnesValue(ITy));
         return true;

       case Instruction::Xor:
@@ -1357,7 +1365,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
         // necessary, but we try to be nice to people who expect this
         // behavior in simple cases
         if (Op0LV.isUndefined() && Op1LV.isUndefined()) {
-          markForcedConstant(I, Constant::getNullValue(ITy));
+          markForcedConstant(&I, Constant::getNullValue(ITy));
           return true;
         }
         // undef ^ X -> undef
@@ -1373,7 +1381,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
         // undef / X -> 0.   X could be maxint.
         // undef % X -> 0.   X could be 1.
- markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; case Instruction::AShr: @@ -1381,7 +1389,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { if (Op1LV.isUndefined()) break; // undef >>a X -> all ones - markForcedConstant(I, Constant::getAllOnesValue(ITy)); + markForcedConstant(&I, Constant::getAllOnesValue(ITy)); return true; case Instruction::LShr: case Instruction::Shl: @@ -1391,17 +1399,17 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // undef << X -> 0 // undef >> X -> 0 - markForcedConstant(I, Constant::getNullValue(ITy)); + markForcedConstant(&I, Constant::getNullValue(ITy)); return true; case Instruction::Select: - Op1LV = getValueState(I->getOperand(1)); + Op1LV = getValueState(I.getOperand(1)); // undef ? X : Y -> X or Y. There could be commonality between X/Y. if (Op0LV.isUndefined()) { if (!Op1LV.isConstant()) // Pick the constant one if there is any. - Op1LV = getValueState(I->getOperand(2)); + Op1LV = getValueState(I.getOperand(2)); } else if (Op1LV.isUndefined()) { // c ? undef : undef -> undef. No change. - Op1LV = getValueState(I->getOperand(2)); + Op1LV = getValueState(I.getOperand(2)); if (Op1LV.isUndefined()) break; // Otherwise, c ? undef : x -> x. @@ -1410,9 +1418,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { } if (Op1LV.isConstant()) - markForcedConstant(I, Op1LV.getConstant()); + markForcedConstant(&I, Op1LV.getConstant()); else - markOverdefined(I); + markOverdefined(&I); return true; case Instruction::Load: // A load here means one of two things: a load of undef from a global, @@ -1421,9 +1429,9 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { break; case Instruction::ICmp: // X == undef -> undef. Other comparisons get more complicated. - if (cast<ICmpInst>(I)->isEquality()) + if (cast<ICmpInst>(&I)->isEquality()) break; - markOverdefined(I); + markOverdefined(&I); return true; case Instruction::Call: case Instruction::Invoke: { @@ -1432,19 +1440,19 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // 2. It could be constant-foldable. // Because of the way we solve return values, tracked calls must // never be marked overdefined in ResolvedUndefsIn. - if (Function *F = CallSite(I).getCalledFunction()) + if (Function *F = CallSite(&I).getCalledFunction()) if (TrackedRetVals.count(F)) break; // If the call is constant-foldable, we mark it overdefined because // we do not know what return values are valid. - markOverdefined(I); + markOverdefined(&I); return true; } default: // If we don't know what should happen here, conservatively mark it // overdefined. - markOverdefined(I); + markOverdefined(&I); return true; } } @@ -1462,7 +1470,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // false. if (isa<UndefValue>(BI->getCondition())) { BI->setCondition(ConstantInt::getFalse(BI->getContext())); - markEdgeExecutable(BB, TI->getSuccessor(1)); + markEdgeExecutable(&*BB, TI->getSuccessor(1)); return true; } @@ -1484,7 +1492,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // the first constant. 
if (isa<UndefValue>(SI->getCondition())) { SI->setCondition(SI->case_begin().getCaseValue()); - markEdgeExecutable(BB, SI->case_begin().getCaseSuccessor()); + markEdgeExecutable(&*BB, SI->case_begin().getCaseSuccessor()); return true; } @@ -1506,6 +1514,7 @@ namespace { struct SCCP : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } static char ID; // Pass identification, replacement for typeid SCCP() : FunctionPass(ID) { @@ -1541,11 +1550,10 @@ static void DeleteInstructionInBlock(BasicBlock *BB) { Instruction *EndInst = BB->getTerminator(); // Last not to be deleted. while (EndInst != BB->begin()) { // Delete the next to last instruction. - BasicBlock::iterator I = EndInst; - Instruction *Inst = --I; + Instruction *Inst = &*--EndInst->getIterator(); if (!Inst->use_empty()) Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); - if (isa<LandingPadInst>(Inst)) { + if (Inst->isEHPad()) { EndInst = Inst; continue; } @@ -1568,11 +1576,11 @@ bool SCCP::runOnFunction(Function &F) { SCCPSolver Solver(DL, TLI); // Mark the first block of the function as being executable. - Solver.MarkBlockExecutable(F.begin()); + Solver.MarkBlockExecutable(&F.front()); // Mark all arguments to the function as being overdefined. - for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;++AI) - Solver.markAnythingOverdefined(AI); + for (Argument &AI : F.args()) + Solver.markAnythingOverdefined(&AI); // Solve for constants. bool ResolvedUndefs = true; @@ -1589,8 +1597,8 @@ bool SCCP::runOnFunction(Function &F) { // as we cannot modify the CFG of the function. for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - if (!Solver.isBlockExecutable(BB)) { - DeleteInstructionInBlock(BB); + if (!Solver.isBlockExecutable(&*BB)) { + DeleteInstructionInBlock(&*BB); MadeChanges = true; continue; } @@ -1599,7 +1607,7 @@ bool SCCP::runOnFunction(Function &F) { // constants if we have found them to be of constant values. // for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { - Instruction *Inst = BI++; + Instruction *Inst = &*BI++; if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst)) continue; @@ -1713,36 +1721,34 @@ bool IPSCCP::runOnModule(Module &M) { // If this is a strong or ODR definition of this function, then we can // propagate information about its result into callsites of it. if (!F->mayBeOverridden()) - Solver.AddTrackedFunction(F); + Solver.AddTrackedFunction(&*F); // If this function only has direct calls that we can see, we can track its // arguments and return value aggressively, and can assume it is not called // unless we see evidence to the contrary. if (F->hasLocalLinkage()) { - if (AddressIsTaken(F)) - AddressTakenFunctions.insert(F); + if (AddressIsTaken(&*F)) + AddressTakenFunctions.insert(&*F); else { - Solver.AddArgumentTrackedFunction(F); + Solver.AddArgumentTrackedFunction(&*F); continue; } } // Assume the function is called. - Solver.MarkBlockExecutable(F->begin()); + Solver.MarkBlockExecutable(&F->front()); // Assume nothing about the incoming arguments. - for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); - AI != E; ++AI) - Solver.markAnythingOverdefined(AI); + for (Argument &AI : F->args()) + Solver.markAnythingOverdefined(&AI); } // Loop over global variables. We inform the solver about any internal global // variables that do not have their 'addresses taken'. 
If they don't have // their addresses taken, we can propagate constants through them. - for (Module::global_iterator G = M.global_begin(), E = M.global_end(); - G != E; ++G) - if (!G->isConstant() && G->hasLocalLinkage() && !AddressIsTaken(G)) - Solver.TrackValueOfGlobalVariable(G); + for (GlobalVariable &G : M.globals()) + if (!G.isConstant() && G.hasLocalLinkage() && !AddressIsTaken(&G)) + Solver.TrackValueOfGlobalVariable(&G); // Solve for constants. bool ResolvedUndefs = true; @@ -1763,7 +1769,10 @@ bool IPSCCP::runOnModule(Module &M) { SmallVector<BasicBlock*, 512> BlocksToErase; for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { - if (Solver.isBlockExecutable(F->begin())) { + if (F->isDeclaration()) + continue; + + if (Solver.isBlockExecutable(&F->front())) { for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E; ++AI) { if (AI->use_empty() || AI->getType()->isStructTy()) continue; @@ -1771,7 +1780,7 @@ bool IPSCCP::runOnModule(Module &M) { // TODO: Could use getStructLatticeValueFor to find out if the entire // result is a constant and replace it entirely if so. - LatticeVal IV = Solver.getLatticeValueFor(AI); + LatticeVal IV = Solver.getLatticeValueFor(&*AI); if (IV.isOverdefined()) continue; Constant *CST = IV.isConstant() ? @@ -1786,28 +1795,27 @@ bool IPSCCP::runOnModule(Module &M) { } for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { - if (!Solver.isBlockExecutable(BB)) { - DeleteInstructionInBlock(BB); + if (!Solver.isBlockExecutable(&*BB)) { + DeleteInstructionInBlock(&*BB); MadeChanges = true; TerminatorInst *TI = BB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { - BasicBlock *Succ = TI->getSuccessor(i); + for (BasicBlock *Succ : TI->successors()) { if (!Succ->empty() && isa<PHINode>(Succ->begin())) - TI->getSuccessor(i)->removePredecessor(BB); + Succ->removePredecessor(&*BB); } if (!TI->use_empty()) TI->replaceAllUsesWith(UndefValue::get(TI->getType())); TI->eraseFromParent(); - new UnreachableInst(M.getContext(), BB); + new UnreachableInst(M.getContext(), &*BB); if (&*BB != &F->front()) - BlocksToErase.push_back(BB); + BlocksToErase.push_back(&*BB); continue; } for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) { - Instruction *Inst = BI++; + Instruction *Inst = &*BI++; if (Inst->getType()->isVoidTy() || Inst->getType()->isStructTy()) continue; diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp index 947513a..a7361b5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -23,12 +23,12 @@ /// //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/SROA.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/ValueTracking.h" @@ -37,8 +37,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" @@ -53,9 +51,9 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/TimeValue.h" #include 
"llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" #if __cplusplus >= 201103L && !defined(NDEBUG) // We only use this for a debug check in C++11 @@ -63,6 +61,7 @@ #endif using namespace llvm; +using namespace llvm::sroa; #define DEBUG_TYPE "sroa" @@ -77,11 +76,6 @@ STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); STATISTIC(NumDeleted, "Number of instructions deleted"); STATISTIC(NumVectorized, "Number of vectorized aggregates"); -/// Hidden option to force the pass to not use DomTree and mem2reg, instead -/// forming SSA values through the SSAUpdater infrastructure. -static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false), - cl::Hidden); - /// Hidden option to enable randomly shuffling the slices to help uncover /// instability in their order. static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", @@ -205,7 +199,6 @@ template <typename T> struct isPodLike; template <> struct isPodLike<Slice> { static const bool value = true; }; } -namespace { /// \brief Representation of the alloca slices. /// /// This class represents the slices of an alloca which are formed by its @@ -213,7 +206,7 @@ namespace { /// for the slices used and we reflect that in this structure. The uses are /// stored, sorted by increasing beginning offset and with unsplittable slices /// starting at a particular offset before splittable slices. -class AllocaSlices { +class llvm::sroa::AllocaSlices { public: /// \brief Construct the slices of a particular alloca. AllocaSlices(const DataLayout &DL, AllocaInst &AI); @@ -253,281 +246,10 @@ public: std::inplace_merge(Slices.begin(), SliceI, Slices.end()); } - // Forward declare an iterator to befriend it. + // Forward declare the iterator and range accessor for walking the + // partitions. class partition_iterator; - - /// \brief A partition of the slices. - /// - /// An ephemeral representation for a range of slices which can be viewed as - /// a partition of the alloca. This range represents a span of the alloca's - /// memory which cannot be split, and provides access to all of the slices - /// overlapping some part of the partition. - /// - /// Objects of this type are produced by traversing the alloca's slices, but - /// are only ephemeral and not persistent. - class Partition { - private: - friend class AllocaSlices; - friend class AllocaSlices::partition_iterator; - - /// \brief The begining and ending offsets of the alloca for this partition. - uint64_t BeginOffset, EndOffset; - - /// \brief The start end end iterators of this partition. - iterator SI, SJ; - - /// \brief A collection of split slice tails overlapping the partition. - SmallVector<Slice *, 4> SplitTails; - - /// \brief Raw constructor builds an empty partition starting and ending at - /// the given iterator. - Partition(iterator SI) : SI(SI), SJ(SI) {} - - public: - /// \brief The start offset of this partition. - /// - /// All of the contained slices start at or after this offset. - uint64_t beginOffset() const { return BeginOffset; } - - /// \brief The end offset of this partition. - /// - /// All of the contained slices end at or before this offset. - uint64_t endOffset() const { return EndOffset; } - - /// \brief The size of the partition. - /// - /// Note that this can never be zero. 
- uint64_t size() const { - assert(BeginOffset < EndOffset && "Partitions must span some bytes!"); - return EndOffset - BeginOffset; - } - - /// \brief Test whether this partition contains no slices, and merely spans - /// a region occupied by split slices. - bool empty() const { return SI == SJ; } - - /// \name Iterate slices that start within the partition. - /// These may be splittable or unsplittable. They have a begin offset >= the - /// partition begin offset. - /// @{ - // FIXME: We should probably define a "concat_iterator" helper and use that - // to stitch together pointee_iterators over the split tails and the - // contiguous iterators of the partition. That would give a much nicer - // interface here. We could then additionally expose filtered iterators for - // split, unsplit, and unsplittable splices based on the usage patterns. - iterator begin() const { return SI; } - iterator end() const { return SJ; } - /// @} - - /// \brief Get the sequence of split slice tails. - /// - /// These tails are of slices which start before this partition but are - /// split and overlap into the partition. We accumulate these while forming - /// partitions. - ArrayRef<Slice *> splitSliceTails() const { return SplitTails; } - }; - - /// \brief An iterator over partitions of the alloca's slices. - /// - /// This iterator implements the core algorithm for partitioning the alloca's - /// slices. It is a forward iterator as we don't support backtracking for - /// efficiency reasons, and re-use a single storage area to maintain the - /// current set of split slices. - /// - /// It is templated on the slice iterator type to use so that it can operate - /// with either const or non-const slice iterators. - class partition_iterator - : public iterator_facade_base<partition_iterator, - std::forward_iterator_tag, Partition> { - friend class AllocaSlices; - - /// \brief Most of the state for walking the partitions is held in a class - /// with a nice interface for examining them. - Partition P; - - /// \brief We need to keep the end of the slices to know when to stop. - AllocaSlices::iterator SE; - - /// \brief We also need to keep track of the maximum split end offset seen. - /// FIXME: Do we really? - uint64_t MaxSplitSliceEndOffset; - - /// \brief Sets the partition to be empty at given iterator, and sets the - /// end iterator. - partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) - : P(SI), SE(SE), MaxSplitSliceEndOffset(0) { - // If not already at the end, advance our state to form the initial - // partition. - if (SI != SE) - advance(); - } - - /// \brief Advance the iterator to the next partition. - /// - /// Requires that the iterator not be at the end of the slices. - void advance() { - assert((P.SI != SE || !P.SplitTails.empty()) && - "Cannot advance past the end of the slices!"); - - // Clear out any split uses which have ended. - if (!P.SplitTails.empty()) { - if (P.EndOffset >= MaxSplitSliceEndOffset) { - // If we've finished all splits, this is easy. - P.SplitTails.clear(); - MaxSplitSliceEndOffset = 0; - } else { - // Remove the uses which have ended in the prior partition. This - // cannot change the max split slice end because we just checked that - // the prior partition ended prior to that max. 
- P.SplitTails.erase( - std::remove_if( - P.SplitTails.begin(), P.SplitTails.end(), - [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), - P.SplitTails.end()); - assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(), - [&](Slice *S) { - return S->endOffset() == MaxSplitSliceEndOffset; - }) && - "Could not find the current max split slice offset!"); - assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(), - [&](Slice *S) { - return S->endOffset() <= MaxSplitSliceEndOffset; - }) && - "Max split slice end offset is not actually the max!"); - } - } - - // If P.SI is already at the end, then we've cleared the split tail and - // now have an end iterator. - if (P.SI == SE) { - assert(P.SplitTails.empty() && "Failed to clear the split slices!"); - return; - } - - // If we had a non-empty partition previously, set up the state for - // subsequent partitions. - if (P.SI != P.SJ) { - // Accumulate all the splittable slices which started in the old - // partition into the split list. - for (Slice &S : P) - if (S.isSplittable() && S.endOffset() > P.EndOffset) { - P.SplitTails.push_back(&S); - MaxSplitSliceEndOffset = - std::max(S.endOffset(), MaxSplitSliceEndOffset); - } - - // Start from the end of the previous partition. - P.SI = P.SJ; - - // If P.SI is now at the end, we at most have a tail of split slices. - if (P.SI == SE) { - P.BeginOffset = P.EndOffset; - P.EndOffset = MaxSplitSliceEndOffset; - return; - } - - // If the we have split slices and the next slice is after a gap and is - // not splittable immediately form an empty partition for the split - // slices up until the next slice begins. - if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset && - !P.SI->isSplittable()) { - P.BeginOffset = P.EndOffset; - P.EndOffset = P.SI->beginOffset(); - return; - } - } - - // OK, we need to consume new slices. Set the end offset based on the - // current slice, and step SJ past it. The beginning offset of the - // parttion is the beginning offset of the next slice unless we have - // pre-existing split slices that are continuing, in which case we begin - // at the prior end offset. - P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset; - P.EndOffset = P.SI->endOffset(); - ++P.SJ; - - // There are two strategies to form a partition based on whether the - // partition starts with an unsplittable slice or a splittable slice. - if (!P.SI->isSplittable()) { - // When we're forming an unsplittable region, it must always start at - // the first slice and will extend through its end. - assert(P.BeginOffset == P.SI->beginOffset()); - - // Form a partition including all of the overlapping slices with this - // unsplittable slice. - while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { - if (!P.SJ->isSplittable()) - P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); - ++P.SJ; - } - - // We have a partition across a set of overlapping unsplittable - // partitions. - return; - } - - // If we're starting with a splittable slice, then we need to form - // a synthetic partition spanning it and any other overlapping splittable - // splices. - assert(P.SI->isSplittable() && "Forming a splittable partition!"); - - // Collect all of the overlapping splittable slices. - while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset && - P.SJ->isSplittable()) { - P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); - ++P.SJ; - } - - // Back upiP.EndOffset if we ended the span early when encountering an - // unsplittable slice. 
This synthesizes the early end offset of - // a partition spanning only splittable slices. - if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { - assert(!P.SJ->isSplittable()); - P.EndOffset = P.SJ->beginOffset(); - } - } - - public: - bool operator==(const partition_iterator &RHS) const { - assert(SE == RHS.SE && - "End iterators don't match between compared partition iterators!"); - - // The observed positions of partitions is marked by the P.SI iterator and - // the emptyness of the split slices. The latter is only relevant when - // P.SI == SE, as the end iterator will additionally have an empty split - // slices list, but the prior may have the same P.SI and a tail of split - // slices. - if (P.SI == RHS.P.SI && - P.SplitTails.empty() == RHS.P.SplitTails.empty()) { - assert(P.SJ == RHS.P.SJ && - "Same set of slices formed two different sized partitions!"); - assert(P.SplitTails.size() == RHS.P.SplitTails.size() && - "Same slice position with differently sized non-empty split " - "slice tails!"); - return true; - } - return false; - } - - partition_iterator &operator++() { - advance(); - return *this; - } - - Partition &operator*() { return P; } - }; - - /// \brief A forward range over the partitions of the alloca's slices. - /// - /// This accesses an iterator range over the partitions of the alloca's - /// slices. It computes these partitions on the fly based on the overlapping - /// offsets of the slices and the ability to split them. It will visit "empty" - /// partitions to cover regions of the alloca only accessed via split - /// slices. - iterator_range<partition_iterator> partitions() { - return make_range(partition_iterator(begin(), end()), - partition_iterator(end(), end())); - } + iterator_range<partition_iterator> partitions(); /// \brief Access the dead users for this alloca. ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; } @@ -595,6 +317,280 @@ private: /// the alloca. SmallVector<Use *, 8> DeadOperands; }; + +/// \brief A partition of the slices. +/// +/// An ephemeral representation for a range of slices which can be viewed as +/// a partition of the alloca. This range represents a span of the alloca's +/// memory which cannot be split, and provides access to all of the slices +/// overlapping some part of the partition. +/// +/// Objects of this type are produced by traversing the alloca's slices, but +/// are only ephemeral and not persistent. +class llvm::sroa::Partition { +private: + friend class AllocaSlices; + friend class AllocaSlices::partition_iterator; + + typedef AllocaSlices::iterator iterator; + + /// \brief The beginning and ending offsets of the alloca for this + /// partition. + uint64_t BeginOffset, EndOffset; + + /// \brief The start end end iterators of this partition. + iterator SI, SJ; + + /// \brief A collection of split slice tails overlapping the partition. + SmallVector<Slice *, 4> SplitTails; + + /// \brief Raw constructor builds an empty partition starting and ending at + /// the given iterator. + Partition(iterator SI) : SI(SI), SJ(SI) {} + +public: + /// \brief The start offset of this partition. + /// + /// All of the contained slices start at or after this offset. + uint64_t beginOffset() const { return BeginOffset; } + + /// \brief The end offset of this partition. + /// + /// All of the contained slices end at or before this offset. + uint64_t endOffset() const { return EndOffset; } + + /// \brief The size of the partition. + /// + /// Note that this can never be zero. 
+ uint64_t size() const { + assert(BeginOffset < EndOffset && "Partitions must span some bytes!"); + return EndOffset - BeginOffset; + } + + /// \brief Test whether this partition contains no slices, and merely spans + /// a region occupied by split slices. + bool empty() const { return SI == SJ; } + + /// \name Iterate slices that start within the partition. + /// These may be splittable or unsplittable. They have a begin offset >= the + /// partition begin offset. + /// @{ + // FIXME: We should probably define a "concat_iterator" helper and use that + // to stitch together pointee_iterators over the split tails and the + // contiguous iterators of the partition. That would give a much nicer + // interface here. We could then additionally expose filtered iterators for + // split, unsplit, and unsplittable splices based on the usage patterns. + iterator begin() const { return SI; } + iterator end() const { return SJ; } + /// @} + + /// \brief Get the sequence of split slice tails. + /// + /// These tails are of slices which start before this partition but are + /// split and overlap into the partition. We accumulate these while forming + /// partitions. + ArrayRef<Slice *> splitSliceTails() const { return SplitTails; } +}; + +/// \brief An iterator over partitions of the alloca's slices. +/// +/// This iterator implements the core algorithm for partitioning the alloca's +/// slices. It is a forward iterator as we don't support backtracking for +/// efficiency reasons, and re-use a single storage area to maintain the +/// current set of split slices. +/// +/// It is templated on the slice iterator type to use so that it can operate +/// with either const or non-const slice iterators. +class AllocaSlices::partition_iterator + : public iterator_facade_base<partition_iterator, std::forward_iterator_tag, + Partition> { + friend class AllocaSlices; + + /// \brief Most of the state for walking the partitions is held in a class + /// with a nice interface for examining them. + Partition P; + + /// \brief We need to keep the end of the slices to know when to stop. + AllocaSlices::iterator SE; + + /// \brief We also need to keep track of the maximum split end offset seen. + /// FIXME: Do we really? + uint64_t MaxSplitSliceEndOffset; + + /// \brief Sets the partition to be empty at given iterator, and sets the + /// end iterator. + partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) + : P(SI), SE(SE), MaxSplitSliceEndOffset(0) { + // If not already at the end, advance our state to form the initial + // partition. + if (SI != SE) + advance(); + } + + /// \brief Advance the iterator to the next partition. + /// + /// Requires that the iterator not be at the end of the slices. + void advance() { + assert((P.SI != SE || !P.SplitTails.empty()) && + "Cannot advance past the end of the slices!"); + + // Clear out any split uses which have ended. + if (!P.SplitTails.empty()) { + if (P.EndOffset >= MaxSplitSliceEndOffset) { + // If we've finished all splits, this is easy. + P.SplitTails.clear(); + MaxSplitSliceEndOffset = 0; + } else { + // Remove the uses which have ended in the prior partition. This + // cannot change the max split slice end because we just checked that + // the prior partition ended prior to that max. 
+ P.SplitTails.erase( + std::remove_if( + P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { return S->endOffset() <= P.EndOffset; }), + P.SplitTails.end()); + assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { + return S->endOffset() == MaxSplitSliceEndOffset; + }) && + "Could not find the current max split slice offset!"); + assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(), + [&](Slice *S) { + return S->endOffset() <= MaxSplitSliceEndOffset; + }) && + "Max split slice end offset is not actually the max!"); + } + } + + // If P.SI is already at the end, then we've cleared the split tail and + // now have an end iterator. + if (P.SI == SE) { + assert(P.SplitTails.empty() && "Failed to clear the split slices!"); + return; + } + + // If we had a non-empty partition previously, set up the state for + // subsequent partitions. + if (P.SI != P.SJ) { + // Accumulate all the splittable slices which started in the old + // partition into the split list. + for (Slice &S : P) + if (S.isSplittable() && S.endOffset() > P.EndOffset) { + P.SplitTails.push_back(&S); + MaxSplitSliceEndOffset = + std::max(S.endOffset(), MaxSplitSliceEndOffset); + } + + // Start from the end of the previous partition. + P.SI = P.SJ; + + // If P.SI is now at the end, we at most have a tail of split slices. + if (P.SI == SE) { + P.BeginOffset = P.EndOffset; + P.EndOffset = MaxSplitSliceEndOffset; + return; + } + + // If the we have split slices and the next slice is after a gap and is + // not splittable immediately form an empty partition for the split + // slices up until the next slice begins. + if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset && + !P.SI->isSplittable()) { + P.BeginOffset = P.EndOffset; + P.EndOffset = P.SI->beginOffset(); + return; + } + } + + // OK, we need to consume new slices. Set the end offset based on the + // current slice, and step SJ past it. The beginning offset of the + // partition is the beginning offset of the next slice unless we have + // pre-existing split slices that are continuing, in which case we begin + // at the prior end offset. + P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset; + P.EndOffset = P.SI->endOffset(); + ++P.SJ; + + // There are two strategies to form a partition based on whether the + // partition starts with an unsplittable slice or a splittable slice. + if (!P.SI->isSplittable()) { + // When we're forming an unsplittable region, it must always start at + // the first slice and will extend through its end. + assert(P.BeginOffset == P.SI->beginOffset()); + + // Form a partition including all of the overlapping slices with this + // unsplittable slice. + while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { + if (!P.SJ->isSplittable()) + P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); + ++P.SJ; + } + + // We have a partition across a set of overlapping unsplittable + // partitions. + return; + } + + // If we're starting with a splittable slice, then we need to form + // a synthetic partition spanning it and any other overlapping splittable + // splices. + assert(P.SI->isSplittable() && "Forming a splittable partition!"); + + // Collect all of the overlapping splittable slices. + while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset && + P.SJ->isSplittable()) { + P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset()); + ++P.SJ; + } + + // Back upiP.EndOffset if we ended the span early when encountering an + // unsplittable slice. 
This synthesizes the early end offset of + // a partition spanning only splittable slices. + if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) { + assert(!P.SJ->isSplittable()); + P.EndOffset = P.SJ->beginOffset(); + } + } + +public: + bool operator==(const partition_iterator &RHS) const { + assert(SE == RHS.SE && + "End iterators don't match between compared partition iterators!"); + + // The observed positions of partitions is marked by the P.SI iterator and + // the emptiness of the split slices. The latter is only relevant when + // P.SI == SE, as the end iterator will additionally have an empty split + // slices list, but the prior may have the same P.SI and a tail of split + // slices. + if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) { + assert(P.SJ == RHS.P.SJ && + "Same set of slices formed two different sized partitions!"); + assert(P.SplitTails.size() == RHS.P.SplitTails.size() && + "Same slice position with differently sized non-empty split " + "slice tails!"); + return true; + } + return false; + } + + partition_iterator &operator++() { + advance(); + return *this; + } + + Partition &operator*() { return P; } +}; + +/// \brief A forward range over the partitions of the alloca's slices. +/// +/// This accesses an iterator range over the partitions of the alloca's +/// slices. It computes these partitions on the fly based on the overlapping +/// offsets of the slices and the ability to split them. It will visit "empty" +/// partitions to cover regions of the alloca only accessed via split +/// slices. +iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() { + return make_range(partition_iterator(begin(), end()), + partition_iterator(end(), end())); } static Value *foldSelectInst(SelectInst &SI) { @@ -1072,217 +1068,6 @@ LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); } #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -namespace { -/// \brief Implementation of LoadAndStorePromoter for promoting allocas. -/// -/// This subclass of LoadAndStorePromoter adds overrides to handle promoting -/// the loads and stores of an alloca instruction, as well as updating its -/// debug information. This is used when a domtree is unavailable and thus -/// mem2reg in its full form can't be used to handle promotion of allocas to -/// scalar values. -class AllocaPromoter : public LoadAndStorePromoter { - AllocaInst &AI; - DIBuilder &DIB; - - SmallVector<DbgDeclareInst *, 4> DDIs; - SmallVector<DbgValueInst *, 4> DVIs; - -public: - AllocaPromoter(ArrayRef<const Instruction *> Insts, - SSAUpdater &S, - AllocaInst &AI, DIBuilder &DIB) - : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} - - void run(const SmallVectorImpl<Instruction *> &Insts) { - // Retain the debug information attached to the alloca for use when - // rewriting loads and stores. - if (auto *L = LocalAsMetadata::getIfExists(&AI)) { - if (auto *DINode = MetadataAsValue::getIfExists(AI.getContext(), L)) { - for (User *U : DINode->users()) - if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) - DDIs.push_back(DDI); - else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) - DVIs.push_back(DVI); - } - } - - LoadAndStorePromoter::run(Insts); - - // While we have the debug information, clear it off of the alloca. The - // caller takes care of deleting the alloca. 
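partition_iterator only hand-writes operator==, pre-increment, and operator*; iterator_facade_base synthesizes the rest of the forward-iterator surface (operator!=, post-increment, operator->). A minimal standalone use of the same facade, assuming LLVM's ADT headers are available; the iterator itself is invented for illustration:

#include "llvm/ADT/iterator.h"
#include <cassert>

// Counts from N down to 1; only ==, pre-++ and * are written by hand.
class countdown_iterator
    : public llvm::iterator_facade_base<countdown_iterator,
                                        std::forward_iterator_tag, unsigned> {
  unsigned N;

public:
  explicit countdown_iterator(unsigned N) : N(N) {}
  bool operator==(const countdown_iterator &RHS) const { return N == RHS.N; }
  countdown_iterator &operator++() { --N; return *this; }
  unsigned &operator*() { return N; }
};

int main() {
  countdown_iterator I(3), E(0);
  unsigned Sum = 0;
  for (; I != E; ++I) // operator!= comes from the facade
    Sum += *I;
  assert(Sum == 3 + 2 + 1);
  return 0;
}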
- while (!DDIs.empty()) - DDIs.pop_back_val()->eraseFromParent(); - while (!DVIs.empty()) - DVIs.pop_back_val()->eraseFromParent(); - } - - bool - isInstInList(Instruction *I, - const SmallVectorImpl<Instruction *> &Insts) const override { - Value *Ptr; - if (LoadInst *LI = dyn_cast<LoadInst>(I)) - Ptr = LI->getOperand(0); - else - Ptr = cast<StoreInst>(I)->getPointerOperand(); - - // Only used to detect cycles, which will be rare and quickly found as - // we're walking up a chain of defs rather than down through uses. - SmallPtrSet<Value *, 4> Visited; - - do { - if (Ptr == &AI) - return true; - - if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr)) - Ptr = BCI->getOperand(0); - else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) - Ptr = GEPI->getPointerOperand(); - else - return false; - - } while (Visited.insert(Ptr).second); - - return false; - } - - void updateDebugInfo(Instruction *Inst) const override { - for (DbgDeclareInst *DDI : DDIs) - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) - ConvertDebugDeclareToDebugValue(DDI, SI, DIB); - else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) - ConvertDebugDeclareToDebugValue(DDI, LI, DIB); - for (DbgValueInst *DVI : DVIs) { - Value *Arg = nullptr; - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - // If an argument is zero extended then use argument directly. The ZExt - // may be zapped by an optimization pass in future. - if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) - Arg = dyn_cast<Argument>(ZExt->getOperand(0)); - else if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) - Arg = dyn_cast<Argument>(SExt->getOperand(0)); - if (!Arg) - Arg = SI->getValueOperand(); - } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { - Arg = LI->getPointerOperand(); - } else { - continue; - } - DIB.insertDbgValueIntrinsic(Arg, 0, DVI->getVariable(), - DVI->getExpression(), DVI->getDebugLoc(), - Inst); - } - } -}; -} // end anon namespace - -namespace { -/// \brief An optimization pass providing Scalar Replacement of Aggregates. -/// -/// This pass takes allocations which can be completely analyzed (that is, they -/// don't escape) and tries to turn them into scalar SSA values. There are -/// a few steps to this process. -/// -/// 1) It takes allocations of aggregates and analyzes the ways in which they -/// are used to try to split them into smaller allocations, ideally of -/// a single scalar data type. It will split up memcpy and memset accesses -/// as necessary and try to isolate individual scalar accesses. -/// 2) It will transform accesses into forms which are suitable for SSA value -/// promotion. This can be replacing a memset with a scalar store of an -/// integer value, or it can involve speculating operations on a PHI or -/// select to be a PHI or select of the results. -/// 3) Finally, this will try to detect a pattern of accesses which map cleanly -/// onto insert and extract operations on a vector value, and convert them to -/// this form. By doing so, it will enable promotion of vector aggregates to -/// SSA vector values. -class SROA : public FunctionPass { - const bool RequiresDomTree; - - LLVMContext *C; - DominatorTree *DT; - AssumptionCache *AC; - - /// \brief Worklist of alloca instructions to simplify. - /// - /// Each alloca in the function is added to this. Each new alloca formed gets - /// added to it as well to recursively simplify unless that alloca can be - /// directly promoted. 
Finally, each time we rewrite a use of an alloca other - /// the one being actively rewritten, we add it back onto the list if not - /// already present to ensure it is re-visited. - SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> Worklist; - - /// \brief A collection of instructions to delete. - /// We try to batch deletions to simplify code and make things a bit more - /// efficient. - SetVector<Instruction *, SmallVector<Instruction *, 8>> DeadInsts; - - /// \brief Post-promotion worklist. - /// - /// Sometimes we discover an alloca which has a high probability of becoming - /// viable for SROA after a round of promotion takes place. In those cases, - /// the alloca is enqueued here for re-processing. - /// - /// Note that we have to be very careful to clear allocas out of this list in - /// the event they are deleted. - SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> PostPromotionWorklist; - - /// \brief A collection of alloca instructions we can directly promote. - std::vector<AllocaInst *> PromotableAllocas; - - /// \brief A worklist of PHIs to speculate prior to promoting allocas. - /// - /// All of these PHIs have been checked for the safety of speculation and by - /// being speculated will allow promoting allocas currently in the promotable - /// queue. - SetVector<PHINode *, SmallVector<PHINode *, 2>> SpeculatablePHIs; - - /// \brief A worklist of select instructions to speculate prior to promoting - /// allocas. - /// - /// All of these select instructions have been checked for the safety of - /// speculation and by being speculated will allow promoting allocas - /// currently in the promotable queue. - SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects; - -public: - SROA(bool RequiresDomTree = true) - : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr), - DT(nullptr) { - initializeSROAPass(*PassRegistry::getPassRegistry()); - } - bool runOnFunction(Function &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - - const char *getPassName() const override { return "SROA"; } - static char ID; - -private: - friend class PHIOrSelectSpeculator; - friend class AllocaSliceRewriter; - - bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS); - AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, - AllocaSlices::Partition &P); - bool splitAlloca(AllocaInst &AI, AllocaSlices &AS); - bool runOnAlloca(AllocaInst &AI); - void clobberUse(Use &U); - void deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas); - bool promoteAllocas(Function &F); -}; -} - -char SROA::ID = 0; - -FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { - return new SROA(RequiresDomTree); -} - -INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false, - false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false, - false) - /// Walk the range of a partitioning looking for a common type to cover this /// sequence of slices. static Type *findCommonType(AllocaSlices::const_iterator B, @@ -1373,7 +1158,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) { // Ensure that there are no instructions between the PHI and the load that // could store. 
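The worklists deleted above (and re-homed in the SROA class from the new header) are SetVectors precisely so that re-enqueueing an alloca already on the list is a no-op while iteration order stays deterministic. A standalone sketch of that property:

#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

int main() {
  llvm::SetVector<int, llvm::SmallVector<int, 16>> Worklist;
  Worklist.insert(1);
  Worklist.insert(2);
  Worklist.insert(1);           // duplicate: ignored, order preserved
  assert(Worklist.size() == 2);
  assert(Worklist.front() == 1); // deterministic insertion order
  while (!Worklist.empty())
    Worklist.pop_back();         // drain without ever seeing duplicates
  return 0;
}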
- for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) + for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI) if (BBI->mayWriteToMemory()) return false; @@ -1934,10 +1719,10 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, /// \brief Test whether the given slice use can be promoted to a vector. /// -/// This function is called to test each entry in a partioning which is slated +/// This function is called to test each entry in a partition which is slated /// for a single slice. -static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P, - const Slice &S, VectorType *Ty, +static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, + VectorType *Ty, uint64_t ElementSize, const DataLayout &DL) { // First validate the slice offsets. @@ -2012,8 +1797,7 @@ static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P, /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. -static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P, - const DataLayout &DL) { +static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // Collect the candidate types for vector-based promotion. Also track whether // we have different element types. SmallVector<VectorType *, 4> CandidateTys; @@ -2130,7 +1914,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, uint64_t RelEnd = S.endOffset() - AllocBeginOffset; // We can't reasonably handle cases where the load or store extends past - // the end of the aloca's type and into its padding. + // the end of the alloca's type and into its padding. if (RelEnd > Size) return false; @@ -2199,7 +1983,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S, /// This is a quick test to check whether we can rewrite the integer loads and /// stores to a particular alloca into wider loads and stores and be able to /// promote the resulting alloca. -static bool isIntegerWideningViable(AllocaSlices::Partition &P, Type *AllocaTy, +static bool isIntegerWideningViable(Partition &P, Type *AllocaTy, const DataLayout &DL) { uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy); // Don't create integer types larger than the maximum bitwidth. @@ -2368,14 +2152,14 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, return V; } -namespace { /// \brief Visitor to rewrite instructions using p particular slice of an alloca /// to use a new alloca. /// /// Also implements the rewriting to vector-based accesses when the partition /// passes the isVectorPromotionViable predicate. Most of the rewriting logic /// lives here. -class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { +class llvm::sroa::AllocaSliceRewriter + : public InstVisitor<AllocaSliceRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. 
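The first hunk above is the patch's pervasive iterator change applied to a clobber scan: the ilist iterator is now built explicitly from the instruction (BasicBlock::iterator BBI(PN)) instead of converting implicitly from a pointer, and the loop walks forward until it reaches the load. Its shape as a hedged standalone helper (the name is hypothetical):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch: true if no instruction strictly between PN and LI in PN's block
// may write to memory. Assumes LI follows PN in the same block.
static bool nothingClobbersBetween(PHINode &PN, LoadInst *LI) {
  for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
    if (BBI->mayWriteToMemory())
      return false;
  return true;
}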
friend class llvm::InstVisitor<AllocaSliceRewriter, bool>; typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base; @@ -2583,9 +2367,19 @@ private: V = convertValue(DL, IRB, V, IntTy); assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; - if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) - V = extractInteger(DL, IRB, V, cast<IntegerType>(LI.getType()), Offset, - "extract"); + if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) { + IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8); + V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract"); + } + // It is possible that the extracted type is not the load type. This + // happens if there is a load past the end of the alloca, and as + // a consequence the slice is narrower but still a candidate for integer + // lowering. To handle this case, we just zero extend the extracted + // integer. + assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 && + "Can only handle an extract for an overly wide load"); + if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8) + V = IRB.CreateZExt(V, LI.getType()); return V; } @@ -2648,7 +2442,7 @@ private: DL.getTypeStoreSizeInBits(LI.getType()) && "Non-byte-multiple bit width"); // Move the insertion point just past the load so that we can refer to it. - IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI))); + IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI))); // Create a placeholder value with the same type as LI to use as the // basis for the new value. This allows us to replace the uses of LI with // the computed value, and then replace the placeholder with LI, leaving @@ -3126,7 +2920,7 @@ private: // dominate the PHI. IRBuilderTy PtrBuilder(IRB); if (isa<PHINode>(OldPtr)) - PtrBuilder.SetInsertPoint(OldPtr->getParent()->getFirstInsertionPt()); + PtrBuilder.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt()); else PtrBuilder.SetInsertPoint(OldPtr); PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc()); @@ -3169,7 +2963,6 @@ private: return true; } }; -} namespace { /// \brief Visitor to rewrite aggregate loads and stores as scalar. @@ -3181,8 +2974,6 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { // Befriend the base class so it can delegate to private visit methods. friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; - const DataLayout &DL; - /// Queue of pointer uses to analyze and potentially rewrite. SmallVector<Use *, 8> Queue; @@ -3194,8 +2985,6 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { Use *U; public: - AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {} - /// Rewrite loads and stores through a pointer and all pointers derived from /// it. bool rewrite(Instruction &I) { @@ -3711,7 +3500,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { return true; }), Stores.end()); - // Now we have to go *back* through all te stores, because a later store may + // Now we have to go *back* through all the stores, because a later store may // have caused an earlier store's load to become unsplittable and if it is // unsplittable for the later store, then we can't rely on it being split in // the earlier store either. 
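The rewriteIntegerLoad change above is one of the few functional fixes in this diff rather than API churn. When a load reads past the end of the alloca, the slice — and hence the extracted integer — is narrower than the load's type, so the extracted value is zero-extended up to it rather than left as garbage high bytes. In plain C++ terms:

#include <cassert>
#include <cstdint>

int main() {
  // The alloca's integer covers 4 bytes; the old load wanted 8.
  uint32_t Extracted = 0xDEADBEEFu;          // slice-sized extract
  uint64_t V = static_cast<uint64_t>(Extracted); // zext i32 -> i64
  assert(V == UINT64_C(0x00000000DEADBEEF)); // high bytes forced to zero
  return 0;
}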
@@ -3773,7 +3562,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { "Cannot represent alloca access size using 64-bit integers!"); Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand()); - IRB.SetInsertPoint(BasicBlock::iterator(LI)); + IRB.SetInsertPoint(LI); DEBUG(dbgs() << " Splitting load: " << *LI << "\n"); @@ -3825,7 +3614,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { } Value *StoreBasePtr = SI->getPointerOperand(); - IRB.SetInsertPoint(BasicBlock::iterator(SI)); + IRB.SetInsertPoint(SI); DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n"); @@ -3914,7 +3703,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { if (SplitLoads) { PLoad = (*SplitLoads)[Idx]; } else { - IRB.SetInsertPoint(BasicBlock::iterator(LI)); + IRB.SetInsertPoint(LI); PLoad = IRB.CreateAlignedLoad( getAdjustedPtr(IRB, DL, LoadBasePtr, APInt(DL.getPointerSizeInBits(), PartOffset), @@ -3924,7 +3713,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { } // And store this partition. - IRB.SetInsertPoint(BasicBlock::iterator(SI)); + IRB.SetInsertPoint(SI); StoreInst *PStore = IRB.CreateAlignedStore( PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr, APInt(DL.getPointerSizeInBits(), PartOffset), @@ -3972,7 +3761,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // Mark the original store as dead now that we've split it up and kill its // slice. Note that we leave the original load in place unless this store - // was its ownly use. It may in turn be split up if it is an alloca load + // was its only use. It may in turn be split up if it is an alloca load // for some other alloca, but it may be a normal load. This may introduce // redundant loads, but where those can be merged the rest of the optimizer // should handle the merging, and this uncovers SSA splits which is more @@ -4024,7 +3813,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { /// at enabling promotion and if it was successful queues the alloca to be /// promoted. AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, - AllocaSlices::Partition &P) { + Partition &P) { // Try to compute a friendly type for this partition of the alloca. This // won't always succeed, in which case we fall back to a legal integer type // or an i8 array of an appropriate size. @@ -4230,12 +4019,11 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca); // Migrate debug information from the old alloca to the new alloca(s) - // and the individial partitions. + // and the individual partitions. if (DbgDeclareInst *DbgDecl = FindAllocaDbgDeclare(&AI)) { auto *Var = DbgDecl->getVariable(); auto *Expr = DbgDecl->getExpression(); - DIBuilder DIB(*AI.getParent()->getParent()->getParent(), - /*AllowUnresolved*/ false); + DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false); bool IsSplit = Pieces.size() > 1; for (auto Piece : Pieces) { // Create a piece expression describing the new partition or reuse AI's @@ -4308,7 +4096,7 @@ bool SROA::runOnAlloca(AllocaInst &AI) { // First, split any FCA loads and stores touching this alloca to promote // better splitting and promotion opportunities. - AggLoadStoreRewriter AggRewriter(DL); + AggLoadStoreRewriter AggRewriter; Changed |= AggRewriter.rewrite(AI); // Build the slices using a recursive instruction-visiting builder. 
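Among the cleanups above, the DIBuilder construction swaps a three-step parent walk for the convenience accessor; the two are equivalent:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static Module *moduleOf(Instruction &I) {
  Module *Long = I.getParent()->getParent()->getParent(); // BB -> F -> M
  Module *Short = I.getModule();                          // same module
  (void)Long;
  return Short;
}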
@@ -4388,107 +4176,29 @@ void SROA::deleteDeadInstructions( } } -static void enqueueUsersInWorklist(Instruction &I, - SmallVectorImpl<Instruction *> &Worklist, - SmallPtrSetImpl<Instruction *> &Visited) { - for (User *U : I.users()) - if (Visited.insert(cast<Instruction>(U)).second) - Worklist.push_back(cast<Instruction>(U)); -} - /// \brief Promote the allocas, using the best available technique. /// /// This attempts to promote whatever allocas have been identified as viable in /// the PromotableAllocas list. If that list is empty, there is nothing to do. -/// If there is a domtree available, we attempt to promote using the full power -/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is -/// based on the SSAUpdater utilities. This function returns whether any -/// promotion occurred. +/// This function returns whether any promotion occurred. bool SROA::promoteAllocas(Function &F) { if (PromotableAllocas.empty()) return false; NumPromoted += PromotableAllocas.size(); - if (DT && !ForceSSAUpdater) { - DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); - PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC); - PromotableAllocas.clear(); - return true; - } - - DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); - SSAUpdater SSA; - DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false); - SmallVector<Instruction *, 64> Insts; - - // We need a worklist to walk the uses of each alloca. - SmallVector<Instruction *, 8> Worklist; - SmallPtrSet<Instruction *, 8> Visited; - SmallVector<Instruction *, 32> DeadInsts; - - for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { - AllocaInst *AI = PromotableAllocas[Idx]; - Insts.clear(); - Worklist.clear(); - Visited.clear(); - - enqueueUsersInWorklist(*AI, Worklist, Visited); - - while (!Worklist.empty()) { - Instruction *I = Worklist.pop_back_val(); - - // FIXME: Currently the SSAUpdater infrastructure doesn't reason about - // lifetime intrinsics and so we strip them (and the bitcasts+GEPs - // leading to them) here. Eventually it should use them to optimize the - // scalar values produced. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - assert(II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end); - II->eraseFromParent(); - continue; - } - - // Push the loads and stores we find onto the list. SROA will already - // have validated that all loads and stores are viable candidates for - // promotion. - if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - assert(LI->getType() == AI->getAllocatedType()); - Insts.push_back(LI); - continue; - } - if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - assert(SI->getValueOperand()->getType() == AI->getAllocatedType()); - Insts.push_back(SI); - continue; - } - - // For everything else, we know that only no-op bitcasts and GEPs will - // make it this far, just recurse through them and recall them for later - // removal. 
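With the SSAUpdater fallback deleted, promoteAllocas always goes through the mem2reg utility. A minimal invocation mirroring the remaining call, against the 3.8-era signature (the AliasSetTracker slot is simply left null, as in the patch):

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <vector>
using namespace llvm;

static void promoteSketch(std::vector<AllocaInst *> &Allocas,
                          DominatorTree &DT, AssumptionCache &AC) {
  if (Allocas.empty())
    return;
  // Requires every alloca to satisfy isAllocaPromotable(); SROA has
  // already vetted its PromotableAllocas list before reaching this point.
  PromoteMemToReg(Allocas, DT, /*AliasSetTracker=*/nullptr, &AC);
  Allocas.clear();
}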
- DeadInsts.push_back(I); - enqueueUsersInWorklist(*I, Worklist, Visited); - } - AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); - while (!DeadInsts.empty()) - DeadInsts.pop_back_val()->eraseFromParent(); - AI->eraseFromParent(); - } - + DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); + PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC); PromotableAllocas.clear(); return true; } -bool SROA::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - +PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, + AssumptionCache &RunAC) { DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); C = &F.getContext(); - DominatorTreeWrapperPass *DTWP = - getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + DT = &RunDT; + AC = &RunAC; BasicBlock &EntryBB = F.getEntryBlock(); for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); @@ -4527,12 +4237,55 @@ bool SROA::runOnFunction(Function &F) { PostPromotionWorklist.clear(); } while (!Worklist.empty()); - return Changed; + // FIXME: Even when promoting allocas we should preserve some abstract set of + // CFG-specific analyses. + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); } -void SROA::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<AssumptionCacheTracker>(); - if (RequiresDomTree) - AU.addRequired<DominatorTreeWrapperPass>(); - AU.setPreservesCFG(); +PreservedAnalyses SROA::run(Function &F, AnalysisManager<Function> *AM) { + return runImpl(F, AM->getResult<DominatorTreeAnalysis>(F), + AM->getResult<AssumptionAnalysis>(F)); } + +/// A legacy pass for the legacy pass manager that wraps the \c SROA pass. +/// +/// This is in the llvm namespace purely to allow it to be a friend of the \c +/// SROA pass. +class llvm::sroa::SROALegacyPass : public FunctionPass { + /// The SROA implementation. 
+ SROA Impl; + +public: + SROALegacyPass() : FunctionPass(ID) { + initializeSROALegacyPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipOptnoneFunction(F)) + return false; + + auto PA = Impl.runImpl( + F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F)); + return !PA.areAllPreserved(); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.setPreservesCFG(); + } + + const char *getPassName() const override { return "SROA"; } + static char ID; +}; + +char SROALegacyPass::ID = 0; + +FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); } + +INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa", + "Scalar Replacement Of Aggregates", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates", + false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp b/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp deleted file mode 100644 index c8dfa54..0000000 --- a/contrib/llvm/lib/Transforms/Scalar/SampleProfile.cpp +++ /dev/null @@ -1,777 +0,0 @@ -//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the SampleProfileLoader transformation. This pass -// reads a profile file generated by a sampling profiler (e.g. Linux Perf - -// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the -// profile information in the given profile. -// -// This pass generates branch weight annotations on the IR: -// -// - prof: Represents branch weights. This annotation is added to branches -// to indicate the weights of each edge coming out of the branch. -// The weight of each edge is the weight of the target block for -// that edge. The weight of a block B is computed as the maximum -// number of samples found in B. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/PostDominators.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/InstIterator.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/MDBuilder.h" -#include "llvm/IR/Metadata.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/ProfileData/SampleProfReader.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include <cctype> - -using namespace llvm; -using namespace sampleprof; - -#define DEBUG_TYPE "sample-profile" - -// Command line option to specify the file to read samples from. This is -// mainly used for debugging. 
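Stepping back before the SampleProfile removal: the SROALegacyPass defined above is this commit's recurring porting idiom, where a new-pass-manager class owns the logic and a thin legacy shell forwards to it. A skeleton of the pattern with hypothetical names:

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
using namespace llvm;

class ExamplePass { // new-PM style: the logic lives here
public:
  PreservedAnalyses run(Function &F, AnalysisManager<Function> *AM) {
    bool Changed = false; // ... transform F here ...
    return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }
};

struct ExampleLegacyPass : FunctionPass { // thin legacy shell
  static char ID;
  ExamplePass Impl;
  ExampleLegacyPass() : FunctionPass(ID) {}
  bool runOnFunction(Function &F) override {
    // Translate PreservedAnalyses back into the legacy bool protocol.
    return !Impl.run(F, /*AM=*/nullptr).areAllPreserved();
  }
};
char ExampleLegacyPass::ID = 0;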
-static cl::opt<std::string> SampleProfileFile( - "sample-profile-file", cl::init(""), cl::value_desc("filename"), - cl::desc("Profile file loaded by -sample-profile"), cl::Hidden); -static cl::opt<unsigned> SampleProfileMaxPropagateIterations( - "sample-profile-max-propagate-iterations", cl::init(100), - cl::desc("Maximum number of iterations to go through when propagating " - "sample block/edge weights through the CFG.")); - -namespace { -typedef DenseMap<BasicBlock *, unsigned> BlockWeightMap; -typedef DenseMap<BasicBlock *, BasicBlock *> EquivalenceClassMap; -typedef std::pair<BasicBlock *, BasicBlock *> Edge; -typedef DenseMap<Edge, unsigned> EdgeWeightMap; -typedef DenseMap<BasicBlock *, SmallVector<BasicBlock *, 8>> BlockEdgeMap; - -/// \brief Sample profile pass. -/// -/// This pass reads profile data from the file specified by -/// -sample-profile-file and annotates every affected function with the -/// profile information found in that file. -class SampleProfileLoader : public FunctionPass { -public: - // Class identification, replacement for typeinfo - static char ID; - - SampleProfileLoader(StringRef Name = SampleProfileFile) - : FunctionPass(ID), DT(nullptr), PDT(nullptr), LI(nullptr), Ctx(nullptr), - Reader(), Samples(nullptr), Filename(Name), ProfileIsValid(false) { - initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry()); - } - - bool doInitialization(Module &M) override; - - void dump() { Reader->dump(); } - - const char *getPassName() const override { return "Sample profile pass"; } - - bool runOnFunction(Function &F) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<PostDominatorTree>(); - } - -protected: - unsigned getFunctionLoc(Function &F); - bool emitAnnotations(Function &F); - unsigned getInstWeight(Instruction &I); - unsigned getBlockWeight(BasicBlock *BB); - void printEdgeWeight(raw_ostream &OS, Edge E); - void printBlockWeight(raw_ostream &OS, BasicBlock *BB); - void printBlockEquivalence(raw_ostream &OS, BasicBlock *BB); - bool computeBlockWeights(Function &F); - void findEquivalenceClasses(Function &F); - void findEquivalencesFor(BasicBlock *BB1, - SmallVector<BasicBlock *, 8> Descendants, - DominatorTreeBase<BasicBlock> *DomTree); - void propagateWeights(Function &F); - unsigned visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); - void buildEdges(Function &F); - bool propagateThroughEdges(Function &F); - - /// \brief Line number for the function header. Used to compute absolute - /// line numbers from the relative line numbers found in the profile. - unsigned HeaderLineno; - - /// \brief Map basic blocks to their computed weights. - /// - /// The weight of a basic block is defined to be the maximum - /// of all the instruction weights in that block. - BlockWeightMap BlockWeights; - - /// \brief Map edges to their computed weights. - /// - /// Edge weights are computed by propagating basic block weights in - /// SampleProfile::propagateWeights. - EdgeWeightMap EdgeWeights; - - /// \brief Set of visited blocks during propagation. - SmallPtrSet<BasicBlock *, 128> VisitedBlocks; - - /// \brief Set of visited edges during propagation. - SmallSet<Edge, 128> VisitedEdges; - - /// \brief Equivalence classes for block weights. - /// - /// Two blocks BB1 and BB2 are in the same equivalence class if they - /// dominate and post-dominate each other, and they are in the same loop - /// nest. 
When this happens, the two blocks are guaranteed to execute - /// the same number of times. - EquivalenceClassMap EquivalenceClass; - - /// \brief Dominance, post-dominance and loop information. - DominatorTree *DT; - PostDominatorTree *PDT; - LoopInfo *LI; - - /// \brief Predecessors for each basic block in the CFG. - BlockEdgeMap Predecessors; - - /// \brief Successors for each basic block in the CFG. - BlockEdgeMap Successors; - - /// \brief LLVM context holding the debug data we need. - LLVMContext *Ctx; - - /// \brief Profile reader object. - std::unique_ptr<SampleProfileReader> Reader; - - /// \brief Samples collected for the body of this function. - FunctionSamples *Samples; - - /// \brief Name of the profile file to load. - StringRef Filename; - - /// \brief Flag indicating whether the profile input loaded successfully. - bool ProfileIsValid; -}; -} - -/// \brief Print the weight of edge \p E on stream \p OS. -/// -/// \param OS Stream to emit the output to. -/// \param E Edge to print. -void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) { - OS << "weight[" << E.first->getName() << "->" << E.second->getName() - << "]: " << EdgeWeights[E] << "\n"; -} - -/// \brief Print the equivalence class of block \p BB on stream \p OS. -/// -/// \param OS Stream to emit the output to. -/// \param BB Block to print. -void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS, - BasicBlock *BB) { - BasicBlock *Equiv = EquivalenceClass[BB]; - OS << "equivalence[" << BB->getName() - << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n"; -} - -/// \brief Print the weight of block \p BB on stream \p OS. -/// -/// \param OS Stream to emit the output to. -/// \param BB Block to print. -void SampleProfileLoader::printBlockWeight(raw_ostream &OS, BasicBlock *BB) { - OS << "weight[" << BB->getName() << "]: " << BlockWeights[BB] << "\n"; -} - -/// \brief Get the weight for an instruction. -/// -/// The "weight" of an instruction \p Inst is the number of samples -/// collected on that instruction at runtime. To retrieve it, we -/// need to compute the line number of \p Inst relative to the start of its -/// function. We use HeaderLineno to compute the offset. We then -/// look up the samples collected for \p Inst using BodySamples. -/// -/// \param Inst Instruction to query. -/// -/// \returns The profiled weight of I. -unsigned SampleProfileLoader::getInstWeight(Instruction &Inst) { - DebugLoc DLoc = Inst.getDebugLoc(); - if (!DLoc) - return 0; - - unsigned Lineno = DLoc.getLine(); - if (Lineno < HeaderLineno) - return 0; - - const DILocation *DIL = DLoc; - int LOffset = Lineno - HeaderLineno; - unsigned Discriminator = DIL->getDiscriminator(); - unsigned Weight = Samples->samplesAt(LOffset, Discriminator); - DEBUG(dbgs() << " " << Lineno << "." << Discriminator << ":" << Inst - << " (line offset: " << LOffset << "." << Discriminator - << " - weight: " << Weight << ")\n"); - return Weight; -} - -/// \brief Compute the weight of a basic block. -/// -/// The weight of basic block \p BB is the maximum weight of all the -/// instructions in BB. The weight of \p BB is computed and cached in -/// the BlockWeights map. -/// -/// \param BB The basic block to query. -/// -/// \returns The computed weight of BB. -unsigned SampleProfileLoader::getBlockWeight(BasicBlock *BB) { - // If we've computed BB's weight before, return it. 
- std::pair<BlockWeightMap::iterator, bool> Entry = - BlockWeights.insert(std::make_pair(BB, 0)); - if (!Entry.second) - return Entry.first->second; - - // Otherwise, compute and cache BB's weight. - unsigned Weight = 0; - for (auto &I : BB->getInstList()) { - unsigned InstWeight = getInstWeight(I); - if (InstWeight > Weight) - Weight = InstWeight; - } - Entry.first->second = Weight; - return Weight; -} - -/// \brief Compute and store the weights of every basic block. -/// -/// This populates the BlockWeights map by computing -/// the weights of every basic block in the CFG. -/// -/// \param F The function to query. -bool SampleProfileLoader::computeBlockWeights(Function &F) { - bool Changed = false; - DEBUG(dbgs() << "Block weights\n"); - for (auto &BB : F) { - unsigned Weight = getBlockWeight(&BB); - Changed |= (Weight > 0); - DEBUG(printBlockWeight(dbgs(), &BB)); - } - - return Changed; -} - -/// \brief Find equivalence classes for the given block. -/// -/// This finds all the blocks that are guaranteed to execute the same -/// number of times as \p BB1. To do this, it traverses all the -/// descendants of \p BB1 in the dominator or post-dominator tree. -/// -/// A block BB2 will be in the same equivalence class as \p BB1 if -/// the following holds: -/// -/// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2 -/// is a descendant of \p BB1 in the dominator tree, then BB2 should -/// dominate BB1 in the post-dominator tree. -/// -/// 2- Both BB2 and \p BB1 must be in the same loop. -/// -/// For every block BB2 that meets those two requirements, we set BB2's -/// equivalence class to \p BB1. -/// -/// \param BB1 Block to check. -/// \param Descendants Descendants of \p BB1 in either the dom or pdom tree. -/// \param DomTree Opposite dominator tree. If \p Descendants is filled -/// with blocks from \p BB1's dominator tree, then -/// this is the post-dominator tree, and vice versa. -void SampleProfileLoader::findEquivalencesFor( - BasicBlock *BB1, SmallVector<BasicBlock *, 8> Descendants, - DominatorTreeBase<BasicBlock> *DomTree) { - for (auto *BB2 : Descendants) { - bool IsDomParent = DomTree->dominates(BB2, BB1); - bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2); - if (BB1 != BB2 && VisitedBlocks.insert(BB2).second && IsDomParent && - IsInSameLoop) { - EquivalenceClass[BB2] = BB1; - - // If BB2 is heavier than BB1, make BB2 have the same weight - // as BB1. - // - // Note that we don't worry about the opposite situation here - // (when BB2 is lighter than BB1). We will deal with this - // during the propagation phase. Right now, we just want to - // make sure that BB1 has the largest weight of all the - // members of its equivalence set. - unsigned &BB1Weight = BlockWeights[BB1]; - unsigned &BB2Weight = BlockWeights[BB2]; - BB1Weight = std::max(BB1Weight, BB2Weight); - } - } -} - -/// \brief Find equivalence classes. -/// -/// Since samples may be missing from blocks, we can fill in the gaps by setting -/// the weights of all the blocks in the same equivalence class to the same -/// weight. To compute the concept of equivalence, we use dominance and loop -/// information. Two blocks B1 and B2 are in the same equivalence class if B1 -/// dominates B2, B2 post-dominates B1 and both are in the same loop. -/// -/// \param F The function to query. 
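As a side note on the equivalence test described above: the three conditions reduce to one small predicate. The following is a minimal sketch, not part of this patch; the helper name is hypothetical, and it assumes valid DominatorTree, PostDominatorTree and LoopInfo analyses for the function.

static bool inSameEquivalenceClass(BasicBlock *B1, BasicBlock *B2,
                                   const DominatorTree &DT,
                                   const PostDominatorTree &PDT,
                                   const LoopInfo &LI) {
  // B1 and B2 run the same number of times when control cannot reach one
  // without also reaching the other, and neither block sits in a deeper
  // loop: B1 dominates B2, B2 post-dominates B1, and both are in the same
  // loop nest.
  return DT.dominates(B1, B2) && PDT.dominates(B2, B1) &&
         LI.getLoopFor(B1) == LI.getLoopFor(B2);
}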
-void SampleProfileLoader::findEquivalenceClasses(Function &F) { - SmallVector<BasicBlock *, 8> DominatedBBs; - DEBUG(dbgs() << "\nBlock equivalence classes\n"); - // Find equivalence sets based on dominance and post-dominance information. - for (auto &BB : F) { - BasicBlock *BB1 = &BB; - - // Compute BB1's equivalence class once. - if (EquivalenceClass.count(BB1)) { - DEBUG(printBlockEquivalence(dbgs(), BB1)); - continue; - } - - // By default, blocks are in their own equivalence class. - EquivalenceClass[BB1] = BB1; - - // Traverse all the blocks dominated by BB1. We are looking for - // every basic block BB2 such that: - // - // 1- BB1 dominates BB2. - // 2- BB2 post-dominates BB1. - // 3- BB1 and BB2 are in the same loop nest. - // - // If all those conditions hold, it means that BB2 is executed - // as many times as BB1, so they are placed in the same equivalence - // class by making BB2's equivalence class be BB1. - DominatedBBs.clear(); - DT->getDescendants(BB1, DominatedBBs); - findEquivalencesFor(BB1, DominatedBBs, PDT->DT); - - // Repeat the same logic for all the blocks post-dominated by BB1. - // We are looking for every basic block BB2 such that: - // - // 1- BB1 post-dominates BB2. - // 2- BB2 dominates BB1. - // 3- BB1 and BB2 are in the same loop nest. - // - // If all those conditions hold, BB2's equivalence class is BB1. - DominatedBBs.clear(); - PDT->getDescendants(BB1, DominatedBBs); - findEquivalencesFor(BB1, DominatedBBs, DT); - - DEBUG(printBlockEquivalence(dbgs(), BB1)); - } - - // Assign weights to equivalence classes. - // - // All the basic blocks in the same equivalence class will execute - // the same number of times. Since we know that the head block in - // each equivalence class has the largest weight, assign that weight - // to all the blocks in that equivalence class. - DEBUG(dbgs() << "\nAssign the same weight to all blocks in the same class\n"); - for (auto &BI : F) { - BasicBlock *BB = &BI; - BasicBlock *EquivBB = EquivalenceClass[BB]; - if (BB != EquivBB) - BlockWeights[BB] = BlockWeights[EquivBB]; - DEBUG(printBlockWeight(dbgs(), BB)); - } -} - -/// \brief Visit the given edge to decide if it has a valid weight. -/// -/// If \p E has not been visited before, we copy it to \p UnknownEdge -/// and increment the count of unknown edges. -/// -/// \param E Edge to visit. -/// \param NumUnknownEdges Current number of unknown edges. -/// \param UnknownEdge Set if E has not been visited before. -/// -/// \returns E's weight, if known. Otherwise, returns 0. -unsigned SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges, - Edge *UnknownEdge) { - if (!VisitedEdges.count(E)) { - (*NumUnknownEdges)++; - *UnknownEdge = E; - return 0; - } - - return EdgeWeights[E]; -} - -/// \brief Propagate weights through incoming/outgoing edges. -/// -/// If the weight of a basic block is known, and there is only one edge -/// with an unknown weight, we can calculate the weight of that edge. -/// -/// Similarly, if all the edges have a known count, we can calculate the -/// count of the basic block, if needed. -/// -/// \param F Function to process. -/// -/// \returns True if new weights were assigned to edges or blocks. -bool SampleProfileLoader::propagateThroughEdges(Function &F) { - bool Changed = false; - DEBUG(dbgs() << "\nPropagation through edges\n"); - for (auto &BI : F) { - BasicBlock *BB = &BI; - - // Visit all the predecessor and successor edges to determine - // which ones have a weight assigned already.
Note that it doesn't - // matter that we only keep track of a single unknown edge. The - // only case we are interested in handling is when only a single - // edge is unknown (see setEdgeOrBlockWeight). - for (unsigned i = 0; i < 2; i++) { - unsigned TotalWeight = 0; - unsigned NumUnknownEdges = 0; - Edge UnknownEdge, SelfReferentialEdge; - - if (i == 0) { - // First, visit all predecessor edges. - for (auto *Pred : Predecessors[BB]) { - Edge E = std::make_pair(Pred, BB); - TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge); - if (E.first == E.second) - SelfReferentialEdge = E; - } - } else { - // On the second round, visit all successor edges. - for (auto *Succ : Successors[BB]) { - Edge E = std::make_pair(BB, Succ); - TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge); - } - } - - // After visiting all the edges, there are three cases that we - // can handle immediately: - // - // - All the edge weights are known (i.e., NumUnknownEdges == 0). - // In this case, we simply check that the sum of all the edges - // is the same as BB's weight. If not, we change BB's weight - // to match. Additionally, if BB had not been visited before, - // we mark it visited. - // - // - Only one edge is unknown and BB has already been visited. - // In this case, we can compute the weight of the edge by - // subtracting the sum of the known edge weights from the - // block weight. If the known edges weigh more than BB, the - // weight of the last remaining edge is set to zero. - // - // - There exists a self-referential edge and the weight of BB is - // known. In this case, the weight of this edge can be derived - // from BB's weight. We add up all the other known edges and set - // the weight on the self-referential edge as we did in the - // previous case. - // - // In any other case, we must continue iterating. Eventually, - // all edges will get a weight, or iteration will stop when - // it reaches SampleProfileMaxPropagateIterations. - if (NumUnknownEdges <= 1) { - unsigned &BBWeight = BlockWeights[BB]; - if (NumUnknownEdges == 0) { - // If we already know the weight of all edges, the weight of the - // basic block can be computed. It should be no larger than the sum - // of all edge weights. - if (TotalWeight > BBWeight) { - BBWeight = TotalWeight; - Changed = true; - DEBUG(dbgs() << "All edge weights for " << BB->getName() - << " known. Set weight for block: "; - printBlockWeight(dbgs(), BB);); - } - if (VisitedBlocks.insert(BB).second) - Changed = true; - } else if (NumUnknownEdges == 1 && VisitedBlocks.count(BB)) { - // If there is a single unknown edge and the block has been - // visited, then we can compute E's weight. - if (BBWeight >= TotalWeight) - EdgeWeights[UnknownEdge] = BBWeight - TotalWeight; - else - EdgeWeights[UnknownEdge] = 0; - VisitedEdges.insert(UnknownEdge); - Changed = true; - DEBUG(dbgs() << "Set weight for edge: "; - printEdgeWeight(dbgs(), UnknownEdge)); - } - } else if (SelfReferentialEdge.first && VisitedBlocks.count(BB)) { - unsigned &BBWeight = BlockWeights[BB]; - // We have a self-referential edge and the weight of BB is known. - if (BBWeight >= TotalWeight) - EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight; - else - EdgeWeights[SelfReferentialEdge] = 0; - VisitedEdges.insert(SelfReferentialEdge); - Changed = true; - DEBUG(dbgs() << "Set self-referential edge weight to: "; - printEdgeWeight(dbgs(), SelfReferentialEdge)); - } - } - } - - return Changed; -} - -/// \brief Build in/out edge lists for each basic block in the CFG. -/// -/// We are interested in unique edges.
If a block B1 has multiple -/// edges to another block B2, we only add a single B1->B2 edge. -void SampleProfileLoader::buildEdges(Function &F) { - for (auto &BI : F) { - BasicBlock *B1 = &BI; - - // Add predecessors for B1. - SmallPtrSet<BasicBlock *, 16> Visited; - if (!Predecessors[B1].empty()) - llvm_unreachable("Found a stale predecessors list in a basic block."); - for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) { - BasicBlock *B2 = *PI; - if (Visited.insert(B2).second) - Predecessors[B1].push_back(B2); - } - - // Add successors for B1. - Visited.clear(); - if (!Successors[B1].empty()) - llvm_unreachable("Found a stale successors list in a basic block."); - for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) { - BasicBlock *B2 = *SI; - if (Visited.insert(B2).second) - Successors[B1].push_back(B2); - } - } -} - -/// \brief Propagate weights into edges -/// -/// The following rules are applied to every block BB in the CFG: -/// -/// - If BB has a single predecessor/successor, then the weight -/// of that edge is the weight of the block. -/// -/// - If all incoming or outgoing edges are known except one, and the -/// weight of the block is already known, the weight of the unknown -/// edge will be the weight of the block minus the sum of all the known -/// edges. If the sum of all the known edges is larger than BB's weight, -/// we set the unknown edge weight to zero. -/// -/// - If there is a self-referential edge, and the weight of the block is -/// known, the weight for that edge is set to the weight of the block -/// minus the weight of the other incoming edges to that block (if -/// known). -void SampleProfileLoader::propagateWeights(Function &F) { - bool Changed = true; - unsigned i = 0; - - // Add an entry count to the function using the samples gathered - // at the function entry. - F.setEntryCount(Samples->getHeadSamples()); - - // Before propagation starts, build, for each block, a list of - // unique predecessors and successors. This is necessary to handle - // identical edges in multiway branches. Since we visit all blocks and all - // edges of the CFG, it is cleaner to build these lists once at the start - // of the pass. - buildEdges(F); - - // Propagate until we converge or we go past the iteration limit. - while (Changed && i++ < SampleProfileMaxPropagateIterations) { - Changed = propagateThroughEdges(F); - } - - // Generate MD_prof metadata for every branch instruction using the - // edge weights computed during propagation. - DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n"); - MDBuilder MDB(F.getContext()); - for (auto &BI : F) { - BasicBlock *BB = &BI; - TerminatorInst *TI = BB->getTerminator(); - if (TI->getNumSuccessors() == 1) - continue; - if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI)) - continue; - - DEBUG(dbgs() << "\nGetting weights for branch at line " - << TI->getDebugLoc().getLine() << ".\n"); - SmallVector<unsigned, 4> Weights; - bool AllWeightsZero = true; - for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) { - BasicBlock *Succ = TI->getSuccessor(I); - Edge E = std::make_pair(BB, Succ); - unsigned Weight = EdgeWeights[E]; - DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E)); - Weights.push_back(Weight); - if (Weight != 0) - AllWeightsZero = false; - } - - // Only set weights if there is at least one non-zero weight. - // In any other case, let the analyzer set weights. - if (!AllWeightsZero) { - DEBUG(dbgs() << "SUCCESS. 
Found non-zero weights.\n"); - TI->setMetadata(llvm::LLVMContext::MD_prof, - MDB.createBranchWeights(Weights)); - } else { - DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n"); - } - } -} - -/// \brief Get the line number for the function header. -/// -/// This looks up function \p F in the current compilation unit and -/// retrieves the line number where the function is defined. This is -/// line 0 for all the samples read from the profile file. Every line -/// number is relative to this line. -/// -/// \param F Function object to query. -/// -/// \returns the line number where \p F is defined. If it returns 0, -/// it means that there is no debug information available for \p F. -unsigned SampleProfileLoader::getFunctionLoc(Function &F) { - if (DISubprogram *S = getDISubprogram(&F)) - return S->getLine(); - - // If we could not find the start of \p F, emit a diagnostic to inform the - // user about the missed opportunity. - F.getContext().diagnose(DiagnosticInfoSampleProfile( - "No debug information found in function " + F.getName() + - ": Function profile not used", - DS_Warning)); - return 0; -} - -/// \brief Generate branch weight metadata for all branches in \p F. -/// -/// Branch weights are computed out of instruction samples using a -/// propagation heuristic. Propagation proceeds in 3 phases: -/// -/// 1- Assignment of block weights. All the basic blocks in the function -/// are initially assigned the same weight as their most frequently -/// executed instruction. -/// -/// 2- Creation of equivalence classes. Since samples may be missing from -/// blocks, we can fill in the gaps by setting the weights of all the -/// blocks in the same equivalence class to the same weight. To compute -/// the concept of equivalence, we use dominance and loop information. -/// Two blocks B1 and B2 are in the same equivalence class if B1 -/// dominates B2, B2 post-dominates B1 and both are in the same loop. -/// -/// 3- Propagation of block weights into edges. This uses a simple -/// propagation heuristic. The following rules are applied to every -/// block BB in the CFG: -/// -/// - If BB has a single predecessor/successor, then the weight -/// of that edge is the weight of the block. -/// -/// - If all the edges are known except one, and the weight of the -/// block is already known, the weight of the unknown edge will -/// be the weight of the block minus the sum of all the known -/// edges. If the sum of all the known edges is larger than BB's weight, -/// we set the unknown edge weight to zero. -/// -/// - If there is a self-referential edge, and the weight of the block is -/// known, the weight for that edge is set to the weight of the block -/// minus the weight of the other incoming edges to that block (if -/// known). -/// -/// Since this propagation is not guaranteed to terminate for every CFG, we -/// only allow it to proceed for a limited number of iterations (controlled -/// by -sample-profile-max-propagate-iterations). -/// -/// FIXME: Try to replace this propagation heuristic with a scheme -/// that is guaranteed to terminate. A work-list approach similar to -/// the standard value propagation algorithm used by SSA-CCP might -/// work here. -/// -/// Once all the branch weights are computed, we emit the MD_prof -/// metadata on BB using the computed values for each of its branches. -/// -/// \param F The function to query. -/// -/// \returns true if \p F was modified. Returns false otherwise.
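To make the second rule of phase 3 concrete before the implementation below (the numbers are illustrative, not taken from the patch): if a block's weight is 100 and two of its three incoming edges are known to carry 40 and 25, the remaining edge gets 100 - (40 + 25) = 35; if the known edges already sum past the block weight, the unknown edge is clamped to zero. A minimal standalone sketch of that computation, with a hypothetical helper name:

#include "llvm/ADT/ArrayRef.h"

// Illustration of the single-unknown-edge rule used during propagation;
// not the patch's code. KnownEdgeWeights holds the weights of every edge
// of the block except the one being solved for.
static unsigned solveUnknownEdgeWeight(unsigned BlockWeight,
                                       llvm::ArrayRef<unsigned> KnownEdgeWeights) {
  unsigned Total = 0;
  for (unsigned W : KnownEdgeWeights)
    Total += W;
  // Sampling noise can make the known edges outweigh the block; clamp at 0.
  return BlockWeight >= Total ? BlockWeight - Total : 0;
}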
-bool SampleProfileLoader::emitAnnotations(Function &F) { - bool Changed = false; - - // Initialize invariants used during computation and propagation. - HeaderLineno = getFunctionLoc(F); - if (HeaderLineno == 0) - return false; - - DEBUG(dbgs() << "Line number for the first instruction in " << F.getName() - << ": " << HeaderLineno << "\n"); - - // Compute basic block weights. - Changed |= computeBlockWeights(F); - - if (Changed) { - // Find equivalence classes. - findEquivalenceClasses(F); - - // Propagate weights to all edges. - propagateWeights(F); - } - - return Changed; -} - -char SampleProfileLoader::ID = 0; -INITIALIZE_PASS_BEGIN(SampleProfileLoader, "sample-profile", - "Sample Profile loader", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(PostDominatorTree) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AddDiscriminators) -INITIALIZE_PASS_END(SampleProfileLoader, "sample-profile", - "Sample Profile loader", false, false) - -bool SampleProfileLoader::doInitialization(Module &M) { - auto ReaderOrErr = SampleProfileReader::create(Filename, M.getContext()); - if (std::error_code EC = ReaderOrErr.getError()) { - std::string Msg = "Could not open profile: " + EC.message(); - M.getContext().diagnose(DiagnosticInfoSampleProfile(Filename.data(), Msg)); - return false; - } - Reader = std::move(ReaderOrErr.get()); - ProfileIsValid = (Reader->read() == sampleprof_error::success); - return true; -} - -FunctionPass *llvm::createSampleProfileLoaderPass() { - return new SampleProfileLoader(SampleProfileFile); -} - -FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) { - return new SampleProfileLoader(Name); -} - -bool SampleProfileLoader::runOnFunction(Function &F) { - if (!ProfileIsValid) - return false; - - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - PDT = &getAnalysis<PostDominatorTree>(); - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - Ctx = &F.getParent()->getContext(); - Samples = Reader->getSamplesFor(F); - if (!Samples->empty()) - return emitAnnotations(F); - return false; -} diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index d5d3605..52d477c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -16,7 +16,10 @@ #include "llvm/Transforms/Scalar.h" #include "llvm-c/Initialization.h" #include "llvm-c/Transforms/Scalar.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/ScopedNoAliasAA.h" +#include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" @@ -27,10 +30,9 @@ using namespace llvm; /// initializeScalarOptsPasses - Initialize all passes linked into the /// ScalarOpts library. 
void llvm::initializeScalarOpts(PassRegistry &Registry) { - initializeADCEPass(Registry); + initializeADCELegacyPassPass(Registry); initializeBDCEPass(Registry); initializeAlignmentFromAssumptionsPass(Registry); - initializeSampleProfileLoaderPass(Registry); initializeConstantHoistingPass(Registry); initializeConstantPropagationPass(Registry); initializeCorrelatedValuePropagationPass(Registry); @@ -66,7 +68,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeRewriteStatepointsForGCPass(Registry); initializeSCCPPass(Registry); initializeIPSCCPPass(Registry); - initializeSROAPass(Registry); + initializeSROALegacyPassPass(Registry); initializeSROA_DTPass(Registry); initializeSROA_SSAUpPass(Registry); initializeCFGSimplifyPassPass(Registry); @@ -81,6 +83,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializePlaceSafepointsPass(Registry); initializeFloat2IntPass(Registry); initializeLoopDistributePass(Registry); + initializeLoopLoadEliminationPass(Registry); } void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { @@ -225,15 +228,15 @@ void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) { } void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createTypeBasedAliasAnalysisPass()); + unwrap(PM)->add(createTypeBasedAAWrapperPass()); } void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createScopedNoAliasAAPass()); + unwrap(PM)->add(createScopedNoAliasAAWrapperPass()); } void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createBasicAliasAnalysisPass()); + unwrap(PM)->add(createBasicAAWrapperPass()); } void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) { diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp index d955da7..114d22d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -60,6 +60,7 @@ STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion"); STATISTIC(NumConverted, "Number of aggregates converted to scalar"); namespace { +#define SROA SROA_ struct SROA : public FunctionPass { SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT) : FunctionPass(ID), HasDomTree(hasDT) { @@ -382,8 +383,8 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // Create and insert the integer alloca. NewTy = IntegerType::get(AI->getContext(), BitWidth); } - AllocaInst *NewAI = new AllocaInst(NewTy, nullptr, "", - AI->getParent()->begin()); + AllocaInst *NewAI = + new AllocaInst(NewTy, nullptr, "", &AI->getParent()->front()); ConvertUsesToScalar(AI, NewAI, 0, nullptr); return NewAI; } @@ -1195,7 +1196,7 @@ static bool isSafePHIToSpeculate(PHINode *PN) { // Ensure that there are no instructions between the PHI and the load that // could store. 
- for (BasicBlock::iterator BBI = PN; &*BBI != LI; ++BBI) + for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI) if (BBI->mayWriteToMemory()) return false; diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 0493003..054bacd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -253,10 +253,10 @@ bool Scalarizer::doInitialization(Module &M) { } bool Scalarizer::runOnFunction(Function &F) { - for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) { - BasicBlock *BB = BBI; - for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) { - Instruction *I = II; + assert(Gathered.empty() && Scattered.empty()); + for (BasicBlock &BB : F) { + for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) { + Instruction *I = &*II; bool Done = visit(I); ++II; if (Done && I->getType()->isVoidTy()) @@ -285,7 +285,7 @@ Scatterer Scalarizer::scatter(Instruction *Point, Value *V) { } // In the fallback case, just put the scattered before Point and // keep the result local to Point. - return Scatterer(Point->getParent(), Point, V); + return Scatterer(Point->getParent(), Point->getIterator(), V); } // Replace Op with the gathered form of the components in CV. Defer the @@ -377,7 +377,7 @@ bool Scalarizer::splitBinary(Instruction &I, const Splitter &Split) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(I.getParent(), &I); + IRBuilder<> Builder(&I); Scatterer Op0 = scatter(&I, I.getOperand(0)); Scatterer Op1 = scatter(&I, I.getOperand(1)); assert(Op0.size() == NumElems && "Mismatched binary operation"); @@ -397,7 +397,7 @@ bool Scalarizer::visitSelectInst(SelectInst &SI) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(SI.getParent(), &SI); + IRBuilder<> Builder(&SI); Scatterer Op1 = scatter(&SI, SI.getOperand(1)); Scatterer Op2 = scatter(&SI, SI.getOperand(2)); assert(Op1.size() == NumElems && "Mismatched select"); @@ -438,7 +438,7 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) { if (!VT) return false; - IRBuilder<> Builder(GEPI.getParent(), &GEPI); + IRBuilder<> Builder(&GEPI); unsigned NumElems = VT->getNumElements(); unsigned NumIndices = GEPI.getNumIndices(); @@ -472,7 +472,7 @@ bool Scalarizer::visitCastInst(CastInst &CI) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(CI.getParent(), &CI); + IRBuilder<> Builder(&CI); Scatterer Op0 = scatter(&CI, CI.getOperand(0)); assert(Op0.size() == NumElems && "Mismatched cast"); ValueVector Res; @@ -492,7 +492,7 @@ bool Scalarizer::visitBitCastInst(BitCastInst &BCI) { unsigned DstNumElems = DstVT->getNumElements(); unsigned SrcNumElems = SrcVT->getNumElements(); - IRBuilder<> Builder(BCI.getParent(), &BCI); + IRBuilder<> Builder(&BCI); Scatterer Op0 = scatter(&BCI, BCI.getOperand(0)); ValueVector Res; Res.resize(DstNumElems); @@ -569,7 +569,7 @@ bool Scalarizer::visitPHINode(PHINode &PHI) { return false; unsigned NumElems = VT->getNumElements(); - IRBuilder<> Builder(PHI.getParent(), &PHI); + IRBuilder<> Builder(&PHI); ValueVector Res; Res.resize(NumElems); @@ -600,7 +600,7 @@ bool Scalarizer::visitLoadInst(LoadInst &LI) { return false; unsigned NumElems = Layout.VecTy->getNumElements(); - IRBuilder<> Builder(LI.getParent(), &LI); + IRBuilder<> Builder(&LI); Scatterer Ptr = scatter(&LI, LI.getPointerOperand()); ValueVector Res; Res.resize(NumElems); @@ -625,7 +625,7 @@ bool 
Scalarizer::visitStoreInst(StoreInst &SI) { return false; unsigned NumElems = Layout.VecTy->getNumElements(); - IRBuilder<> Builder(SI.getParent(), &SI); + IRBuilder<> Builder(&SI); Scatterer Ptr = scatter(&SI, SI.getPointerOperand()); Scatterer Val = scatter(&SI, FullValue); @@ -642,7 +642,9 @@ bool Scalarizer::visitStoreInst(StoreInst &SI) { // Delete the instructions that we scalarized. If a full vector result // is still needed, recreate it using InsertElements. bool Scalarizer::finish() { - if (Gathered.empty()) + // The presence of data in Gathered or Scattered indicates changes + // made to the Function. + if (Gathered.empty() && Scattered.empty()) return false; for (GatherList::iterator GMI = Gathered.begin(), GME = Gathered.end(); GMI != GME; ++GMI) { @@ -655,7 +657,7 @@ bool Scalarizer::finish() { Value *Res = UndefValue::get(Ty); BasicBlock *BB = Op->getParent(); unsigned Count = Ty->getVectorNumElements(); - IRBuilder<> Builder(BB, Op); + IRBuilder<> Builder(Op); if (isa<PHINode>(Op)) Builder.SetInsertPoint(BB, BB->getFirstInsertionPt()); for (unsigned I = 0; I < Count; ++I) diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 4a87531..86a10d2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -156,6 +156,10 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -164,6 +168,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Operator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" @@ -174,6 +179,7 @@ #include "llvm/IR/IRBuilder.h" using namespace llvm; +using namespace llvm::PatternMatch; static cl::opt<bool> DisableSeparateConstOffsetFromGEP( "disable-separate-const-offset-from-gep", cl::init(false), @@ -319,8 +325,11 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); AU.setPreservesCFG(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } bool doInitialization(Module &M) override { @@ -373,15 +382,42 @@ private: /// /// Verified in @i32_add in split-gep.ll bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP); + /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow. + /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting + /// the constant offset. After extraction, it becomes desirable to reunite the + /// distributed sexts. For example, + /// + /// &a[sext(i +nsw (j +nsw 5))] + /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)] + /// => constant extraction &a[sext(i) + sext(j)] + 5 + /// => reunite &a[sext(i +nsw j)] + 5 + bool reuniteExts(Function &F); + /// A helper that reunites sexts in an instruction. + bool reuniteExts(Instruction *I); + /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
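The nsw requirement in the comment above is essential: without a no-signed-wrap guarantee, sext(a + b) and sext(a) + sext(b) disagree whenever a + b wraps. A small plain-C++ illustration, not patch code, assuming the usual two's-complement narrowing of mainstream targets:

#include <cassert>
#include <cstdint>

int main() {
  // In i8, 100 + 50 wraps to -106, so sext(100 + 50) != sext(100) + sext(50).
  // When the add is known not to sign-overflow, the two forms agree, which is
  // what lets reuniteExts fold sext(a) + sext(b) back into sext(a + b).
  int8_t A = 100, B = 50;
  int16_t Distributed = int16_t(A) + int16_t(B); // 150
  int16_t Reunited = int16_t(int8_t(A + B));     // -106 after i8 wraparound
  assert(Distributed != Reunited);
  return 0;
}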
+ Instruction *findClosestMatchingDominator(const SCEV *Key, + Instruction *Dominatee); /// Verify F is free of dead code. void verifyNoDeadCode(Function &F); + bool hasMoreThanOneUseInLoop(Value *V, Loop *L); + // Swap the index operands of two GEPs. + void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second); + // Check if it is safe to swap the operands of two GEPs. + bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second, + Loop *CurLoop); + const DataLayout *DL; - const DominatorTree *DT; + DominatorTree *DT; + ScalarEvolution *SE; const TargetMachine *TM; + + LoopInfo *LI; + TargetLibraryInfo *TLI; /// Whether to lower a GEP with multiple indices into arithmetic operations or /// multiple GEPs with a single index. bool LowerGEP; + DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs; }; } // anonymous namespace @@ -391,7 +427,10 @@ INITIALIZE_PASS_BEGIN( "Split GEPs to a variadic base and a constant offset for better CSE", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END( SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", "Split GEPs to a variadic base and a constant offset for better CSE", false, @@ -734,6 +773,13 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( Type *I8PtrTy = Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace()); Value *ResultPtr = Variadic->getOperand(0); + Loop *L = LI->getLoopFor(Variadic->getParent()); + // The base is a swap candidate only if it is loop invariant and not used + // more than once in the loop. + bool isSwapCandidate = + L && L->isLoopInvariant(ResultPtr) && + !hasMoreThanOneUseInLoop(ResultPtr, L); + Value *FirstResult = nullptr; + if (ResultPtr->getType() != I8PtrTy) ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); @@ -762,6 +808,8 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( // Create an ugly GEP with a single index for each index. ResultPtr = Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep"); + if (FirstResult == nullptr) + FirstResult = ResultPtr; } } @@ -770,7 +818,17 @@ void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset); ResultPtr = Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep"); - } + } else + isSwapCandidate = false; + + // If we created a GEP with a constant index, and the base is loop invariant, + // then swap the first GEP with it, so LICM can later move the constant GEP + // out of the loop. + GetElementPtrInst *FirstGEP = dyn_cast<GetElementPtrInst>(FirstResult); + GetElementPtrInst *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr); + if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L)) + swapGEPOperand(FirstGEP, SecondGEP); + if (ResultPtr->getType() != Variadic->getType()) ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType()); @@ -891,13 +949,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // Clear the inbounds attribute because the new index may be off-bound.
// e.g., // - // b = add i64 a, 5 - // addr = gep inbounds float* p, i64 b + // b = add i64 a, 5 + // addr = gep inbounds float, float* p, i64 b // // is transformed to: // - // addr2 = gep float* p, i64 a - // addr = gep float* addr2, i64 5 + // addr2 = gep float, float* p, i64 a ; inbounds removed + // addr = gep inbounds float, float* addr2, i64 5 // // If a is -4, although the old index b is in bounds, the new index a is // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the @@ -907,6 +965,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // // TODO(jingyue): do some range analysis to keep as many inbounds as // possible. GEPs with inbounds are more friendly to alias analysis. + bool GEPWasInBounds = GEP->isInBounds(); GEP->setIsInBounds(false); // Lowers a GEP to either GEPs with a single index or arithmetic operations. @@ -968,6 +1027,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP); + // Inherit the inbounds attribute of the original GEP. + cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds); } else { // Unlikely but possible. For example, // #pragma pack(1) @@ -990,6 +1051,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { Type::getInt8Ty(GEP->getContext()), NewGEP, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep", GEP); + // Inherit the inbounds attribute of the original GEP. + cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds); if (GEP->getType() != I8PtrTy) NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP); } @@ -1008,24 +1071,96 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { return false; DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); bool Changed = false; for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { - for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) { - if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) { + for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE;) + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++)) Changed |= splitGEP(GEP); - } - // No need to split GEP ConstantExprs because all its indices are constant - // already. - } + // No need to split GEP ConstantExprs because all its indices are constant + // already. } + Changed |= reuniteExts(F); + if (VerifyNoDeadCode) verifyNoDeadCode(F); return Changed; } +Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator( + const SCEV *Key, Instruction *Dominatee) { + auto Pos = DominatingExprs.find(Key); + if (Pos == DominatingExprs.end()) + return nullptr; + + auto &Candidates = Pos->second; + // Because we process the basic blocks in pre-order of the dominator tree, a + // candidate that doesn't dominate the current instruction won't dominate any + // future instruction either. Therefore, we pop it out of the stack. This + // optimization makes the algorithm O(n). 
+ while (!Candidates.empty()) { + Instruction *Candidate = Candidates.back(); + if (DT->dominates(Candidate, Dominatee)) + return Candidate; + Candidates.pop_back(); + } + return nullptr; +} + +bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) { + if (!SE->isSCEVable(I->getType())) + return false; + + // Dom: LHS+RHS + // I: sext(LHS)+sext(RHS) + // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom). + // TODO: handle zext + Value *LHS = nullptr, *RHS = nullptr; + if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) || + match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) { + if (LHS->getType() == RHS->getType()) { + const SCEV *Key = + SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS)); + if (auto *Dom = findClosestMatchingDominator(Key, I)) { + Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I); + NewSExt->takeName(I); + I->replaceAllUsesWith(NewSExt); + RecursivelyDeleteTriviallyDeadInstructions(I); + return true; + } + } + } + + // Add I to DominatingExprs if it's an add/sub that can't sign overflow. + if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) || + match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) { + if (isKnownNotFullPoison(I)) { + const SCEV *Key = + SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS)); + DominatingExprs[Key].push_back(I); + } + } + return false; +} + +bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) { + bool Changed = false; + DominatingExprs.clear(); + for (auto Node = GraphTraits<DominatorTree *>::nodes_begin(DT); + Node != GraphTraits<DominatorTree *>::nodes_end(DT); ++Node) { + BasicBlock *BB = Node->getBlock(); + for (auto I = BB->begin(); I != BB->end(); ) { + Instruction *Cur = &*I++; + Changed |= reuniteExts(Cur); + } + } + return Changed; +} + void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) { for (auto &B : F) { for (auto &I : B) { @@ -1038,3 +1173,93 @@ void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) { } } } + +bool SeparateConstOffsetFromGEP::isLegalToSwapOperand( + GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) { + if (!FirstGEP || !FirstGEP->hasOneUse()) + return false; + + if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent()) + return false; + + if (FirstGEP == SecondGEP) + return false; + + unsigned FirstNum = FirstGEP->getNumOperands(); + unsigned SecondNum = SecondGEP->getNumOperands(); + // Give up if the number of operands is not 2. + if (FirstNum != SecondNum || FirstNum != 2) + return false; + + Value *FirstBase = FirstGEP->getOperand(0); + Value *SecondBase = SecondGEP->getOperand(0); + Value *FirstOffset = FirstGEP->getOperand(1); + // Give up if the index of the first GEP is loop invariant. + if (CurLoop->isLoopInvariant(FirstOffset)) + return false; + + // Give up if the bases don't have the same type. + if (FirstBase->getType() != SecondBase->getType()) + return false; + + Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset); + + // Check if the second operand of the first GEP has a constant coefficient. + // For example, in the following code, we won't gain anything by + // hoisting the second GEP out because the second GEP can be folded away. + // %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256 + // %67 = shl i64 %scevgep.sum.ur159, 2 + // %uglygep160 = getelementptr i8* %65, i64 %67 + // %uglygep161 = getelementptr i8* %uglygep160, i64 -1024 + + // Skip a constant shift instruction which may be generated by splitting GEPs.
+ if (FirstOffsetDef && FirstOffsetDef->isShift() && + isa<ConstantInt>(FirstOffsetDef->getOperand(1))) + FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0)); + + // Give up if FirstOffsetDef is an Add or Sub with a constant, + // because it may not be profitable at all due to constant folding. + if (FirstOffsetDef) + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) { + unsigned opc = BO->getOpcode(); + if ((opc == Instruction::Add || opc == Instruction::Sub) && + (isa<ConstantInt>(BO->getOperand(0)) || + isa<ConstantInt>(BO->getOperand(1)))) + return false; + } + return true; +} + +bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) { + int UsesInLoop = 0; + for (User *U : V->users()) { + if (Instruction *User = dyn_cast<Instruction>(U)) + if (L->contains(User)) + if (++UsesInLoop > 1) + return true; + } + return false; +} + +void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First, + GetElementPtrInst *Second) { + Value *Offset1 = First->getOperand(1); + Value *Offset2 = Second->getOperand(1); + First->setOperand(1, Offset2); + Second->setOperand(1, Offset1); + + // We changed p+o+c to p+c+o; p+c may not be inbounds anymore. + const DataLayout &DAL = First->getModule()->getDataLayout(); + APInt Offset(DAL.getPointerSizeInBits( + cast<PointerType>(First->getType())->getAddressSpace()), + 0); + Value *NewBase = + First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset); + uint64_t ObjectSize; + if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) || + Offset.ugt(ObjectSize)) { + First->setIsInBounds(false); + Second->setIsInBounds(false); + } else + First->setIsInBounds(true); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 231411a..63c8836 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" @@ -67,15 +68,14 @@ static bool mergeEmptyReturnBlocks(Function &F) { // single PHI node that is the operand to the return. if (Ret != &BB.front()) { // Check for something else in the block. - BasicBlock::iterator I = Ret; + BasicBlock::iterator I(Ret); --I; // Skip over debug info. while (isa<DbgInfoIntrinsic>(I) && I != BB.begin()) --I; if (!isa<DbgInfoIntrinsic>(I) && - (!isa<PHINode>(I) || I != BB.begin() || - Ret->getNumOperands() == 0 || - Ret->getOperand(0) != I)) + (!isa<PHINode>(I) || I != BB.begin() || Ret->getNumOperands() == 0 || + Ret->getOperand(0) != &*I)) continue; } @@ -136,7 +136,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // Loop over all of the basic blocks and remove them if they are unneeded.
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(BBIt++, TTI, BonusInstThreshold, AC)) { + if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC)) { LocalChange = true; ++NumSimpl; } @@ -217,6 +217,7 @@ struct CFGSimplifyPass : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } }; } diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp index f49f4ea..64109b2 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp @@ -48,7 +48,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); FunctionPass::getAnalysisUsage(AU); - AU.addRequired<AliasAnalysis>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); @@ -66,7 +66,7 @@ char Sinking::ID = 0; INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false) FunctionPass *llvm::createSinkingPass() { return new Sinking(); } @@ -99,7 +99,7 @@ bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, bool Sinking::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - AA = &getAnalysis<AliasAnalysis>(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool MadeChange, EverMadeChange = false; @@ -119,7 +119,7 @@ bool Sinking::runOnFunction(Function &F) { bool Sinking::ProcessBlock(BasicBlock &BB) { // Can't sink anything out of a block that has less than two successors. - if (BB.getTerminator()->getNumSuccessors() <= 1 || BB.empty()) return false; + if (BB.getTerminator()->getNumSuccessors() <= 1) return false; // Don't bother sinking code out of unreachable blocks. In addition to being // unprofitable, it can also lead to infinite looping, because in an @@ -134,7 +134,7 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { bool ProcessedBegin = false; SmallPtrSet<Instruction *, 8> Stores; do { - Instruction *Inst = I; // The instruction to sink. + Instruction *Inst = &*I; // The instruction to sink. // Predecrement I (if it's not begin) so that it isn't invalidated by // sinking. @@ -165,14 +165,16 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, if (LoadInst *L = dyn_cast<LoadInst>(Inst)) { MemoryLocation Loc = MemoryLocation::get(L); for (Instruction *S : Stores) - if (AA->getModRefInfo(S, Loc) & AliasAnalysis::Mod) + if (AA->getModRefInfo(S, Loc) & MRI_Mod) return false; } - if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst)) + if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst) || Inst->isEHPad() || + Inst->mayThrow()) return false; - // Convergent operations can only be moved to control equivalent blocks. + // Convergent operations cannot be made control-dependent on additional + // values. 
if (auto CS = CallSite(Inst)) { if (CS.hasFnAttr(Attribute::Convergent)) return false; @@ -193,6 +195,11 @@ bool Sinking::IsAcceptableTarget(Instruction *Inst, if (Inst->getParent() == SuccToSinkTo) return false; + // It's never legal to sink an instruction into a block which terminates in an + // EH-pad. + if (SuccToSinkTo->getTerminator()->isExceptional()) + return false; + // If the block has multiple predecessors, this would introduce computation // on different code paths. We could split the critical edge, but for now we // just punt. @@ -278,6 +285,6 @@ bool Sinking::SinkInstruction(Instruction *Inst, dbgs() << ")\n"); // Move the instruction. - Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt()); + Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt()); return true; } diff --git a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index ff3f00a..147d615 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -227,7 +227,7 @@ bool SpeculativeExecution::considerHoistingFromTo(BasicBlock &FromBlock, // changes the list that I is iterating through. auto Current = I; ++I; - if (!NotHoisted.count(Current)) { + if (!NotHoisted.count(&*Current)) { Current->moveBefore(ToBlock.getTerminator()); } } diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 6d9d417..1faa65e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -131,7 +131,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); // We do not modify the shape of the CFG. AU.setPreservesCFG(); @@ -212,7 +212,7 @@ char StraightLineStrengthReduce::ID = 0; INITIALIZE_PASS_BEGIN(StraightLineStrengthReduce, "slsr", "Straight line strength reduction", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(StraightLineStrengthReduce, "slsr", "Straight line strength reduction", false, false) @@ -234,6 +234,7 @@ bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis, Basis.CandidateKind == C.CandidateKind); } +// TODO: use TTI->getGEPCost. static bool isGEPFoldable(GetElementPtrInst *GEP, const TargetTransformInfo *TTI, const DataLayout *DL) { @@ -523,7 +524,7 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( continue; const SCEV *OrigIndexExpr = IndexExprs[I - 1]; - IndexExprs[I - 1] = SE->getConstant(OrigIndexExpr->getType(), 0); + IndexExprs[I - 1] = SE->getZero(OrigIndexExpr->getType()); // The base of this candidate is GEP's base plus the offsets of all // indices except this current one. @@ -689,7 +690,7 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) { TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); // Traverse the dominator tree in the depth-first order. 
This order makes sure // all bases of a candidate are in Candidates when we process it. for (auto node = GraphTraits<DominatorTree *>::nodes_begin(DT); diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 4f23e20..662513c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -358,13 +358,9 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { BasicBlock *BB = N->getNodeAs<BasicBlock>(); BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { - BasicBlock *Succ = Term->getSuccessor(i); - - if (Visited.count(Succ)) { + for (BasicBlock *Succ : Term->successors()) + if (Visited.count(Succ)) Loops[Succ] = BB; - } - } } } @@ -903,14 +899,14 @@ void StructurizeCFG::rebuildSSA() { continue; } - if (DT->dominates(II, User)) + if (DT->dominates(&*II, User)) continue; if (!Initialized) { Value *Undef = UndefValue::get(II->getType()); Updater.Initialize(II->getType(), ""); Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); - Updater.AddAvailableValue(BB, II); + Updater.AddAvailableValue(BB, &*II); Initialized = true; } Updater.RewriteUseAfterInsertions(U); diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index c7de2e2..0e0b00d 100644 --- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -54,6 +54,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InlineCost.h" @@ -136,6 +137,7 @@ FunctionPass *llvm::createTailCallEliminationPass() { void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } /// \brief Scan the specified function for alloca instructions. @@ -195,8 +197,8 @@ struct AllocaDerivedValueTracker { case Instruction::Call: case Instruction::Invoke: { CallSite CS(I); - bool IsNocapture = !CS.isCallee(U) && - CS.doesNotCapture(CS.getArgumentNo(U)); + bool IsNocapture = + CS.isDataOperand(U) && CS.doesNotCapture(CS.getDataOperandNo(U)); callUsesLocalStack(CS, IsNocapture); if (IsNocapture) { // If the alloca-derived argument is passed in as nocapture, then it @@ -302,7 +304,9 @@ bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) { if (!CI || CI->isTailCall()) continue; - if (CI->doesNotAccessMemory()) { + bool IsNoTail = CI->isNoTailCall(); + + if (!IsNoTail && CI->doesNotAccessMemory()) { // A call to a readnone function whose arguments are all things computed // outside this function can be marked tail. Even if you stored the // alloca address into a global, a readnone function can't load the @@ -330,7 +334,7 @@ bool TailCallElim::markTails(Function &F, bool &AllCallsAreTailCalls) { } } - if (Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) { + if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) { DeferredTails.push_back(CI); } else { AllCallsAreTailCalls = false; @@ -404,7 +408,7 @@ bool TailCallElim::runTRE(Function &F) { // Until this is resolved, disable this transformation if that would ever // happen. This bug is PR962. 
for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) { - BasicBlock *BB = BBI++; // FoldReturnAndProcessPred may delete BB. + BasicBlock *BB = &*BBI++; // FoldReturnAndProcessPred may delete BB. if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs, !CanTRETailMarkedCall); @@ -574,7 +578,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, // Scan backwards from the return, checking to see if there is a tail call in // this block. If so, set CI to it. CallInst *CI = nullptr; - BasicBlock::iterator BBI = TI; + BasicBlock::iterator BBI(TI); while (true) { CI = dyn_cast<CallInst>(BBI); if (CI && CI->getCalledFunction() == F) @@ -595,9 +599,8 @@ TailCallElim::FindTRECandidate(Instruction *TI, // and disable this xform in this case, because the code generator will // lower the call to fabs into inline code. if (BB == &F->getEntryBlock() && - FirstNonDbg(BB->front()) == CI && - FirstNonDbg(std::next(BB->begin())) == TI && - CI->getCalledFunction() && + FirstNonDbg(BB->front().getIterator()) == CI && + FirstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() && !TTI->isLoweredToCall(CI->getCalledFunction())) { // A single-block function with just a call and a return. Check that // the arguments match. @@ -636,19 +639,19 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, // tail call if all of the instructions between the call and the return are // movable to above the call itself, leaving the call next to the return. // Check that this is the case now. - BasicBlock::iterator BBI = CI; + BasicBlock::iterator BBI(CI); for (++BBI; &*BBI != Ret; ++BBI) { - if (CanMoveAboveCall(BBI, CI)) continue; + if (CanMoveAboveCall(&*BBI, CI)) continue; // If we can't move the instruction above the call, it might be because it // is an associative and commutative operation that could be transformed // using accumulator recursion elimination. Check to see if this is the // case, and if so, remember the initial accumulator value for later. if ((AccumulatorRecursionEliminationInitVal = - CanTransformAccumulatorRecursion(BBI, CI))) { + CanTransformAccumulatorRecursion(&*BBI, CI))) { // Yes, this is accumulator recursion. Remember which instruction // accumulates. - AccumulatorRecursionInstr = BBI; + AccumulatorRecursionInstr = &*BBI; } else { return false; // Otherwise, we cannot eliminate the tail recursion! } @@ -698,19 +701,19 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, NEBI = NewEntry->begin(); OEBI != E; ) if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++)) if (isa<ConstantInt>(AI->getArraySize())) - AI->moveBefore(NEBI); + AI->moveBefore(&*NEBI); // Now that we have created a new block, which jumps to the entry // block, insert a PHI node for each argument of the function. // For now, we initialize each PHI to only have the real arguments // which are passed in. - Instruction *InsertPos = OldEntry->begin(); + Instruction *InsertPos = &OldEntry->front(); for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) { PHINode *PN = PHINode::Create(I->getType(), 2, I->getName() + ".tr", InsertPos); I->replaceAllUsesWith(PN); // Everyone use the PHI node now! 
- PN->addIncoming(I, NewEntry); + PN->addIncoming(&*I, NewEntry); ArgumentPHIs.push_back(PN); } } @@ -739,10 +742,9 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, Instruction *AccRecInstr = AccumulatorRecursionInstr; // Start by inserting a new PHI node for the accumulator. pred_iterator PB = pred_begin(OldEntry), PE = pred_end(OldEntry); - PHINode *AccPN = - PHINode::Create(AccumulatorRecursionEliminationInitVal->getType(), - std::distance(PB, PE) + 1, - "accumulator.tr", OldEntry->begin()); + PHINode *AccPN = PHINode::Create( + AccumulatorRecursionEliminationInitVal->getType(), + std::distance(PB, PE) + 1, "accumulator.tr", &OldEntry->front()); // Loop over all of the predecessors of the tail recursion block. For the // real entry into the function we seed the PHI with the initial value, |