diff options
Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp')
-rw-r--r-- | contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 439 |
1 files changed, 259 insertions, 180 deletions
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 91af4a1..c7f9122 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -12,6 +12,7 @@ // counts of loops easily. //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/LoopUnrollPass.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" @@ -19,11 +20,10 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/LoopUnrollAnalyzer.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" @@ -32,6 +32,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/UnrollLoop.h" #include <climits> @@ -45,16 +46,14 @@ static cl::opt<unsigned> UnrollThreshold("unroll-threshold", cl::Hidden, cl::desc("The baseline cost threshold for loop unrolling")); -static cl::opt<unsigned> UnrollPercentDynamicCostSavedThreshold( - "unroll-percent-dynamic-cost-saved-threshold", cl::init(50), cl::Hidden, - cl::desc("The percentage of estimated dynamic cost which must be saved by " - "unrolling to allow unrolling up to the max threshold.")); - -static cl::opt<unsigned> UnrollDynamicCostSavingsDiscount( - "unroll-dynamic-cost-savings-discount", cl::init(100), cl::Hidden, - cl::desc("This is the amount discounted from the total unroll cost when " - "the unrolled form has a high dynamic cost savings (triggered by " - "the '-unroll-perecent-dynamic-cost-saved-threshold' flag).")); +static cl::opt<unsigned> UnrollMaxPercentThresholdBoost( + "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden, + cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied " + "to the threshold when aggressively unrolling a loop due to the " + "dynamic cost savings. If completely unrolling a loop will reduce " + "the total runtime from X to Y, we boost the loop unroll " + "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, " + "X/Y). This limit avoids excessive code bloat.")); static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze( "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden, @@ -90,43 +89,59 @@ static cl::opt<bool> UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden, cl::desc("Unroll loops with run-time trip counts")); +static cl::opt<unsigned> UnrollMaxUpperBound( + "unroll-max-upperbound", cl::init(8), cl::Hidden, + cl::desc( + "The max of trip count upper bound that is considered in unrolling")); + static cl::opt<unsigned> PragmaUnrollThreshold( "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden, cl::desc("Unrolled size limit for loops with an unroll(full) or " "unroll_count pragma.")); +static cl::opt<unsigned> FlatLoopTripCountThreshold( + "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden, + cl::desc("If the runtime tripcount for the loop is lower than the " + "threshold, the loop is considered as flat and will be less " + "aggressively unrolled.")); + +static cl::opt<bool> + UnrollAllowPeeling("unroll-allow-peeling", cl::Hidden, + cl::desc("Allows loops to be peeled when the dynamic " + "trip count is known to be low.")); + /// A magic value for use with the Threshold parameter to indicate /// that the loop unroll should be performed regardless of how much /// code expansion would result. static const unsigned NoThreshold = UINT_MAX; -/// Default unroll count for loops with run-time trip count if -/// -unroll-count is not set -static const unsigned DefaultUnrollRuntimeCount = 8; - /// Gather the various unrolling parameters based on the defaults, compiler /// flags, TTI overrides and user specified parameters. static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold, Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, - Optional<bool> UserRuntime) { + Optional<bool> UserRuntime, Optional<bool> UserUpperBound) { TargetTransformInfo::UnrollingPreferences UP; // Set up the defaults UP.Threshold = 150; - UP.PercentDynamicCostSavedThreshold = 50; - UP.DynamicCostSavingsDiscount = 100; + UP.MaxPercentThresholdBoost = 400; UP.OptSizeThreshold = 0; UP.PartialThreshold = UP.Threshold; UP.PartialOptSizeThreshold = 0; UP.Count = 0; + UP.PeelCount = 0; + UP.DefaultUnrollRuntimeCount = 8; UP.MaxCount = UINT_MAX; UP.FullUnrollMaxCount = UINT_MAX; + UP.BEInsns = 2; UP.Partial = false; UP.Runtime = false; UP.AllowRemainder = true; UP.AllowExpensiveTripCount = false; UP.Force = false; + UP.UpperBound = false; + UP.AllowPeeling = false; // Override with any target specific settings TTI.getUnrollingPreferences(L, UP); @@ -142,11 +157,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.Threshold = UnrollThreshold; UP.PartialThreshold = UnrollThreshold; } - if (UnrollPercentDynamicCostSavedThreshold.getNumOccurrences() > 0) - UP.PercentDynamicCostSavedThreshold = - UnrollPercentDynamicCostSavedThreshold; - if (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0) - UP.DynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount; + if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0) + UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost; if (UnrollMaxCount.getNumOccurrences() > 0) UP.MaxCount = UnrollMaxCount; if (UnrollFullMaxCount.getNumOccurrences() > 0) @@ -157,6 +169,10 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.AllowRemainder = UnrollAllowRemainder; if (UnrollRuntime.getNumOccurrences() > 0) UP.Runtime = UnrollRuntime; + if (UnrollMaxUpperBound == 0) + UP.UpperBound = false; + if (UnrollAllowPeeling.getNumOccurrences() > 0) + UP.AllowPeeling = UnrollAllowPeeling; // Apply user values provided by argument if (UserThreshold.hasValue()) { @@ -169,6 +185,8 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.Partial = *UserAllowPartial; if (UserRuntime.hasValue()) UP.Runtime = *UserRuntime; + if (UserUpperBound.hasValue()) + UP.UpperBound = *UserUpperBound; return UP; } @@ -210,11 +228,11 @@ struct UnrolledInstStateKeyInfo { namespace { struct EstimatedUnrollCost { /// \brief The estimated cost after unrolling. - int UnrolledCost; + unsigned UnrolledCost; /// \brief The estimated dynamic cost of executing the instructions in the /// rolled form. - int RolledDynamicCost; + unsigned RolledDynamicCost; }; } @@ -234,7 +252,7 @@ struct EstimatedUnrollCost { static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, const TargetTransformInfo &TTI, - int MaxUnrolledLoopSize) { + unsigned MaxUnrolledLoopSize) { // We want to be able to scale offsets by the trip count and add more offsets // to them without checking for overflows, and we already don't want to // analyze *massive* trip counts, so we force the max to be reasonably small. @@ -258,14 +276,14 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // The estimated cost of the unrolled form of the loop. We try to estimate // this by simplifying as much as we can while computing the estimate. - int UnrolledCost = 0; + unsigned UnrolledCost = 0; // We also track the estimated dynamic (that is, actually executed) cost in // the rolled form. This helps identify cases when the savings from unrolling // aren't just exposing dead control flows, but actual reduced dynamic // instructions due to the simplifications which we expect to occur after // unrolling. - int RolledDynamicCost = 0; + unsigned RolledDynamicCost = 0; // We track the simplification of each instruction in each iteration. We use // this to recursively merge costs into the unrolled cost on-demand so that @@ -412,6 +430,9 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, // it. We don't change the actual IR, just count optimization // opportunities. for (Instruction &I : *BB) { + if (isa<DbgInfoIntrinsic>(I)) + continue; + // Track this instruction's expected baseline cost when executing the // rolled loop form. RolledDynamicCost += TTI.getUserCost(&I); @@ -429,16 +450,16 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, if (IsFree) continue; - // If the instruction might have a side-effect recursively account for - // the cost of it and all the instructions leading up to it. - if (I.mayHaveSideEffects()) - AddCostRecursively(I, Iteration); - // Can't properly model a cost of a call. // FIXME: With a proper cost model we should be able to do it. if(isa<CallInst>(&I)) return None; + // If the instruction might have a side-effect recursively account for + // the cost of it and all the instructions leading up to it. + if (I.mayHaveSideEffects()) + AddCostRecursively(I, Iteration); + // If unrolled body turns out to be too big, bail out. if (UnrolledCost > MaxUnrolledLoopSize) { DEBUG(dbgs() << " Exceeded threshold.. exiting.\n" @@ -529,7 +550,7 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent, const TargetTransformInfo &TTI, - AssumptionCache *AC) { + AssumptionCache *AC, unsigned BEInsns) { SmallPtrSet<const Value *, 32> EphValues; CodeMetrics::collectEphemeralValues(L, AC, EphValues); @@ -548,7 +569,7 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, // that each loop has at least three instructions (likely a conditional // branch, a comparison feeding that branch, and some kind of loop increment // feeding that comparison instruction). - LoopSize = std::max(LoopSize, 3u); + LoopSize = std::max(LoopSize, BEInsns + 1); return LoopSize; } @@ -635,70 +656,38 @@ static void SetLoopAlreadyUnrolled(Loop *L) { L->setLoopID(NewLoopID); } -static bool canUnrollCompletely(Loop *L, unsigned Threshold, - unsigned PercentDynamicCostSavedThreshold, - unsigned DynamicCostSavingsDiscount, - uint64_t UnrolledCost, - uint64_t RolledDynamicCost) { - if (Threshold == NoThreshold) { - DEBUG(dbgs() << " Can fully unroll, because no threshold is set.\n"); - return true; - } - - if (UnrolledCost <= Threshold) { - DEBUG(dbgs() << " Can fully unroll, because unrolled cost: " - << UnrolledCost << "<" << Threshold << "\n"); - return true; - } - - assert(UnrolledCost && "UnrolledCost can't be 0 at this point."); - assert(RolledDynamicCost >= UnrolledCost && - "Cannot have a higher unrolled cost than a rolled cost!"); - - // Compute the percentage of the dynamic cost in the rolled form that is - // saved when unrolled. If unrolling dramatically reduces the estimated - // dynamic cost of the loop, we use a higher threshold to allow more - // unrolling. - unsigned PercentDynamicCostSaved = - (uint64_t)(RolledDynamicCost - UnrolledCost) * 100ull / RolledDynamicCost; - - if (PercentDynamicCostSaved >= PercentDynamicCostSavedThreshold && - (int64_t)UnrolledCost - (int64_t)DynamicCostSavingsDiscount <= - (int64_t)Threshold) { - DEBUG(dbgs() << " Can fully unroll, because unrolling will reduce the " - "expected dynamic cost by " - << PercentDynamicCostSaved << "% (threshold: " - << PercentDynamicCostSavedThreshold << "%)\n" - << " and the unrolled cost (" << UnrolledCost - << ") is less than the max threshold (" - << DynamicCostSavingsDiscount << ").\n"); - return true; - } +// Computes the boosting factor for complete unrolling. +// If fully unrolling the loop would save a lot of RolledDynamicCost, it would +// be beneficial to fully unroll the loop even if unrolledcost is large. We +// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust +// the unroll threshold. +static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost, + unsigned MaxPercentThresholdBoost) { + if (Cost.RolledDynamicCost >= UINT_MAX / 100) + return 100; + else if (Cost.UnrolledCost != 0) + // The boosting factor is RolledDynamicCost / UnrolledCost + return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost, + MaxPercentThresholdBoost); + else + return MaxPercentThresholdBoost; +} - DEBUG(dbgs() << " Too large to fully unroll:\n"); - DEBUG(dbgs() << " Threshold: " << Threshold << "\n"); - DEBUG(dbgs() << " Max threshold: " << DynamicCostSavingsDiscount << "\n"); - DEBUG(dbgs() << " Percent cost saved threshold: " - << PercentDynamicCostSavedThreshold << "%\n"); - DEBUG(dbgs() << " Unrolled cost: " << UnrolledCost << "\n"); - DEBUG(dbgs() << " Rolled dynamic cost: " << RolledDynamicCost << "\n"); - DEBUG(dbgs() << " Percent cost saved: " << PercentDynamicCostSaved - << "\n"); - return false; +// Returns loop size estimation for unrolled loop. +static uint64_t getUnrolledLoopSize( + unsigned LoopSize, + TargetTransformInfo::UnrollingPreferences &UP) { + assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!"); + return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns; } // Returns true if unroll count was set explicitly. // Calculates unroll count and writes it to UP.Count. -static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, - DominatorTree &DT, LoopInfo *LI, - ScalarEvolution *SE, unsigned TripCount, - unsigned TripMultiple, unsigned LoopSize, - TargetTransformInfo::UnrollingPreferences &UP) { - // BEInsns represents number of instructions optimized when "back edge" - // becomes "fall through" in unrolled loop. - // For now we count a conditional branch on a backedge and a comparison - // feeding it. - unsigned BEInsns = 2; +static bool computeUnrollCount( + Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, + ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount, + unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize, + TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { // Check for explicit Count. // 1st priority is unroll count set by "unroll-count" option. bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0; @@ -706,8 +695,7 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, UP.Count = UnrollCount; UP.AllowExpensiveTripCount = true; UP.Force = true; - if (UP.AllowRemainder && - (LoopSize - BEInsns) * UP.Count + BEInsns < UP.Threshold) + if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) return true; } @@ -719,13 +707,13 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, UP.AllowExpensiveTripCount = true; UP.Force = true; if (UP.AllowRemainder && - (LoopSize - BEInsns) * UP.Count + BEInsns < PragmaUnrollThreshold) + getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold) return true; } bool PragmaFullUnroll = HasUnrollFullPragma(L); if (PragmaFullUnroll && TripCount != 0) { UP.Count = TripCount; - if ((LoopSize - BEInsns) * UP.Count + BEInsns < PragmaUnrollThreshold) + if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold) return false; } @@ -733,11 +721,6 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll || PragmaEnableUnroll || UserUnrollCount; - uint64_t UnrolledSize; - DebugLoc LoopLoc = L->getStartLoc(); - Function *F = L->getHeader()->getParent(); - LLVMContext &Ctx = F->getContext(); - if (ExplicitUnroll && TripCount != 0) { // If the loop has an unrolling pragma, we want to be more aggressive with // unrolling limits. Set thresholds to at least the PragmaThreshold value @@ -748,38 +731,48 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, } // 3rd priority is full unroll count. - // Full unroll make sense only when TripCount could be staticaly calculated. + // Full unroll makes sense only when TripCount or its upper bound could be + // statically calculated. // Also we need to check if we exceed FullUnrollMaxCount. - if (TripCount && TripCount <= UP.FullUnrollMaxCount) { + // If using the upper bound to unroll, TripMultiple should be set to 1 because + // we do not know when loop may exit. + // MaxTripCount and ExactTripCount cannot both be non zero since we only + // compute the former when the latter is zero. + unsigned ExactTripCount = TripCount; + assert((ExactTripCount == 0 || MaxTripCount == 0) && + "ExtractTripCound and MaxTripCount cannot both be non zero."); + unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount; + UP.Count = FullUnrollTripCount; + if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { // When computing the unrolled size, note that BEInsns are not replicated // like the rest of the loop body. - UnrolledSize = (uint64_t)(LoopSize - BEInsns) * TripCount + BEInsns; - if (canUnrollCompletely(L, UP.Threshold, 100, UP.DynamicCostSavingsDiscount, - UnrolledSize, UnrolledSize)) { - UP.Count = TripCount; + if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) { + UseUpperBound = (MaxTripCount == FullUnrollTripCount); + TripCount = FullUnrollTripCount; + TripMultiple = UP.UpperBound ? 1 : TripMultiple; return ExplicitUnroll; } else { // The loop isn't that small, but we still can fully unroll it if that // helps to remove a significant number of instructions. // To check that, run additional analysis on the loop. if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( - L, TripCount, DT, *SE, TTI, - UP.Threshold + UP.DynamicCostSavingsDiscount)) - if (canUnrollCompletely(L, UP.Threshold, - UP.PercentDynamicCostSavedThreshold, - UP.DynamicCostSavingsDiscount, - Cost->UnrolledCost, Cost->RolledDynamicCost)) { - UP.Count = TripCount; + L, FullUnrollTripCount, DT, *SE, TTI, + UP.Threshold * UP.MaxPercentThresholdBoost / 100)) { + unsigned Boost = + getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); + if (Cost->UnrolledCost < UP.Threshold * Boost / 100) { + UseUpperBound = (MaxTripCount == FullUnrollTripCount); + TripCount = FullUnrollTripCount; + TripMultiple = UP.UpperBound ? 1 : TripMultiple; return ExplicitUnroll; } + } } } // 4rd priority is partial unrolling. // Try partial unroll only when TripCount could be staticaly calculated. if (TripCount) { - if (UP.Count == 0) - UP.Count = TripCount; UP.Partial |= ExplicitUnroll; if (!UP.Partial) { DEBUG(dbgs() << " will not try to unroll partially because " @@ -787,12 +780,14 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, UP.Count = 0; return false; } + if (UP.Count == 0) + UP.Count = TripCount; if (UP.PartialThreshold != NoThreshold) { // Reduce unroll count to be modulo of TripCount for partial unrolling. - UnrolledSize = (uint64_t)(LoopSize - BEInsns) * UP.Count + BEInsns; - if (UnrolledSize > UP.PartialThreshold) - UP.Count = (std::max(UP.PartialThreshold, 3u) - BEInsns) / - (LoopSize - BEInsns); + if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold) + UP.Count = + (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) / + (LoopSize - UP.BEInsns); if (UP.Count > UP.MaxCount) UP.Count = UP.MaxCount; while (UP.Count != 0 && TripCount % UP.Count != 0) @@ -802,19 +797,18 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, // largest power-of-two factor that satisfies the threshold limit. // As we'll create fixup loop, do the type of unrolling only if // remainder loop is allowed. - UP.Count = DefaultUnrollRuntimeCount; - UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns; - while (UP.Count != 0 && UnrolledSize > UP.PartialThreshold) { + UP.Count = UP.DefaultUnrollRuntimeCount; + while (UP.Count != 0 && + getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold) UP.Count >>= 1; - UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns; - } } if (UP.Count < 2) { if (PragmaEnableUnroll) - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to unroll loop as directed by unroll(enable) pragma " - "because unrolled size is too large."); + ORE->emit( + OptimizationRemarkMissed(DEBUG_TYPE, "UnrollAsDirectedTooLarge", + L->getStartLoc(), L->getHeader()) + << "Unable to unroll loop as directed by unroll(enable) pragma " + "because unrolled size is too large."); UP.Count = 0; } } else { @@ -822,26 +816,48 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, } if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && UP.Count != TripCount) - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll pragma because " - "unrolled size is too large."); + ORE->emit( + OptimizationRemarkMissed(DEBUG_TYPE, "FullUnrollAsDirectedTooLarge", + L->getStartLoc(), L->getHeader()) + << "Unable to fully unroll loop as directed by unroll pragma because " + "unrolled size is too large."); return ExplicitUnroll; } assert(TripCount == 0 && "All cases when TripCount is constant should be covered here."); if (PragmaFullUnroll) - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - "Unable to fully unroll loop as directed by unroll(full) pragma " - "because loop has a runtime trip count."); + ORE->emit( + OptimizationRemarkMissed(DEBUG_TYPE, + "CantFullUnrollAsDirectedRuntimeTripCount", + L->getStartLoc(), L->getHeader()) + << "Unable to fully unroll loop as directed by unroll(full) pragma " + "because loop has a runtime trip count."); + + // 5th priority is loop peeling + computePeelCount(L, LoopSize, UP); + if (UP.PeelCount) { + UP.Runtime = false; + UP.Count = 1; + return ExplicitUnroll; + } - // 5th priority is runtime unrolling. + // 6th priority is runtime unrolling. // Don't unroll a runtime trip count loop when it is disabled. if (HasRuntimeUnrollDisablePragma(L)) { UP.Count = 0; return false; } + + // Check if the runtime trip count is too small when profile is available. + if (L->getHeader()->getParent()->getEntryCount()) { + if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) { + if (*ProfileTripCount < FlatLoopTripCountThreshold) + return false; + else + UP.AllowExpensiveTripCount = true; + } + } + // Reduce count based on the type of unrolling and the threshold values. UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount; if (!UP.Runtime) { @@ -851,15 +867,13 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, return false; } if (UP.Count == 0) - UP.Count = DefaultUnrollRuntimeCount; - UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns; + UP.Count = UP.DefaultUnrollRuntimeCount; // Reduce unroll count to be the largest power-of-two factor of // the original count which satisfies the threshold limit. - while (UP.Count != 0 && UnrolledSize > UP.PartialThreshold) { + while (UP.Count != 0 && + getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold) UP.Count >>= 1; - UnrolledSize = (LoopSize - BEInsns) * UP.Count + BEInsns; - } #ifndef NDEBUG unsigned OrigCount = UP.Count; @@ -874,16 +888,19 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, "multiple, " << TripMultiple << ". Reducing unroll count from " << OrigCount << " to " << UP.Count << ".\n"); + using namespace ore; if (PragmaCount > 0 && !UP.AllowRemainder) - emitOptimizationRemarkMissed( - Ctx, DEBUG_TYPE, *F, LoopLoc, - Twine("Unable to unroll loop the number of times directed by " - "unroll_count pragma because remainder loop is restricted " - "(that could architecture specific or because the loop " - "contains a convergent instruction) and so must have an unroll " - "count that divides the loop trip multiple of ") + - Twine(TripMultiple) + ". Unrolling instead " + Twine(UP.Count) + - " time(s)."); + ORE->emit( + OptimizationRemarkMissed(DEBUG_TYPE, + "DifferentUnrollCountFromDirected", + L->getStartLoc(), L->getHeader()) + << "Unable to unroll loop the number of times directed by " + "unroll_count pragma because remainder loop is restricted " + "(that could architecture specific or because the loop " + "contains a convergent instruction) and so must have an unroll " + "count that divides the loop trip multiple of " + << NV("TripMultiple", TripMultiple) << ". Unrolling instead " + << NV("UnrollCount", UP.Count) << " time(s)."); } if (UP.Count > UP.MaxCount) @@ -896,22 +913,34 @@ static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution *SE, const TargetTransformInfo &TTI, - AssumptionCache &AC, bool PreserveLCSSA, + AssumptionCache &AC, OptimizationRemarkEmitter &ORE, + bool PreserveLCSSA, Optional<unsigned> ProvidedCount, Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial, - Optional<bool> ProvidedRuntime) { + Optional<bool> ProvidedRuntime, + Optional<bool> ProvidedUpperBound) { DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName() << "] Loop %" << L->getHeader()->getName() << "\n"); - if (HasUnrollDisablePragma(L)) { + if (HasUnrollDisablePragma(L)) + return false; + if (!L->isLoopSimplifyForm()) { + DEBUG( + dbgs() << " Not unrolling loop which is not in loop-simplify form.\n"); return false; } unsigned NumInlineCandidates; bool NotDuplicatable; bool Convergent; + TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( + L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, + ProvidedRuntime, ProvidedUpperBound); + // Exit early if unrolling is disabled. + if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0)) + return false; unsigned LoopSize = ApproximateLoopSize( - L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC); + L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); if (NotDuplicatable) { DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" @@ -922,14 +951,10 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); return false; } - if (!L->isLoopSimplifyForm()) { - DEBUG( - dbgs() << " Not unrolling loop which is not in loop-simplify form.\n"); - return false; - } // Find trip count and trip multiple if count is not available unsigned TripCount = 0; + unsigned MaxTripCount = 0; unsigned TripMultiple = 1; // If there are multiple exiting blocks but one of them is the latch, use the // latch for the trip count estimation. Otherwise insist on a single exiting @@ -942,10 +967,6 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock); } - TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, - ProvidedRuntime); - // If the loop contains a convergent operation, the prelude we'd add // to do the first few instructions before we hit the unrolled loop // is unsafe -- it adds a control-flow dependency to the convergent @@ -961,8 +982,31 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, if (Convergent) UP.AllowRemainder = false; - bool IsCountSetExplicitly = computeUnrollCount(L, TTI, DT, LI, SE, TripCount, - TripMultiple, LoopSize, UP); + // Try to find the trip count upper bound if we cannot find the exact trip + // count. + bool MaxOrZero = false; + if (!TripCount) { + MaxTripCount = SE->getSmallConstantMaxTripCount(L); + MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L); + // We can unroll by the upper bound amount if it's generally allowed or if + // we know that the loop is executed either the upper bound or zero times. + // (MaxOrZero unrolling keeps only the first loop test, so the number of + // loop tests remains the same compared to the non-unrolled version, whereas + // the generic upper bound unrolling keeps all but the last loop test so the + // number of loop tests goes up which may end up being worse on targets with + // constriained branch predictor resources so is controlled by an option.) + // In addition we only unroll small upper bounds. + if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) { + MaxTripCount = 0; + } + } + + // computeUnrollCount() decides whether it is beneficial to use upper bound to + // fully unroll the loop. + bool UseUpperBound = false; + bool IsCountSetExplicitly = + computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount, + TripMultiple, LoopSize, UP, UseUpperBound); if (!UP.Count) return false; // Unroll factor (Count) must be less or equal to TripCount. @@ -971,14 +1015,18 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, // Unroll the loop. if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime, - UP.AllowExpensiveTripCount, TripMultiple, LI, SE, &DT, &AC, + UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero, + TripMultiple, UP.PeelCount, LI, SE, &DT, &AC, &ORE, PreserveLCSSA)) return false; // If loop has an unroll count pragma or unrolled by explicitly set count // mark loop as unrolled to prevent unrolling beyond that requested. - if (IsCountSetExplicitly) + // If the loop was peeled, we already "used up" the profile information + // we had, so we don't want to unroll or peel again. + if (IsCountSetExplicitly || UP.PeelCount) SetLoopAlreadyUnrolled(L); + return true; } @@ -988,10 +1036,11 @@ public: static char ID; // Pass ID, replacement for typeid LoopUnroll(Optional<unsigned> Threshold = None, Optional<unsigned> Count = None, - Optional<bool> AllowPartial = None, Optional<bool> Runtime = None) + Optional<bool> AllowPartial = None, Optional<bool> Runtime = None, + Optional<bool> UpperBound = None) : LoopPass(ID), ProvidedCount(std::move(Count)), ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), - ProvidedRuntime(Runtime) { + ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) { initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -999,6 +1048,7 @@ public: Optional<unsigned> ProvidedThreshold; Optional<bool> ProvidedAllowPartial; Optional<bool> ProvidedRuntime; + Optional<bool> ProvidedUpperBound; bool runOnLoop(Loop *L, LPPassManager &) override { if (skipLoop(L)) @@ -1012,11 +1062,16 @@ public: const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + // For the old PM, we can't use OptimizationRemarkEmitter as an analysis + // pass. Function analyses need to be preserved across loop transformations + // but ORE cannot be preserved (see comment before the pass definition). + OptimizationRemarkEmitter ORE(&F); bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); - return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, PreserveLCSSA, ProvidedCount, - ProvidedThreshold, ProvidedAllowPartial, - ProvidedRuntime); + return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, + ProvidedCount, ProvidedThreshold, + ProvidedAllowPartial, ProvidedRuntime, + ProvidedUpperBound); } /// This transformation requires natural loop information & requires that @@ -1040,7 +1095,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, - int Runtime) { + int Runtime, int UpperBound) { // TODO: It would make more sense for this function to take the optionals // directly, but that's dangerous since it would silently break out of tree // callers. @@ -1048,9 +1103,33 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, Count == -1 ? None : Optional<unsigned>(Count), AllowPartial == -1 ? None : Optional<bool>(AllowPartial), - Runtime == -1 ? None : Optional<bool>(Runtime)); + Runtime == -1 ? None : Optional<bool>(Runtime), + UpperBound == -1 ? None : Optional<bool>(UpperBound)); } Pass *llvm::createSimpleLoopUnrollPass() { - return llvm::createLoopUnrollPass(-1, -1, 0, 0); + return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0); +} + +PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + const auto &FAM = + AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); + Function *F = L.getHeader()->getParent(); + + auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F); + // FIXME: This should probably be optional rather than required. + if (!ORE) + report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not " + "cached at a higher level"); + + bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE, + /*PreserveLCSSA*/ true, ProvidedCount, + ProvidedThreshold, ProvidedAllowPartial, + ProvidedRuntime, ProvidedUpperBound); + + if (!Changed) + return PreservedAnalyses::all(); + return getLoopPassPreservedAnalyses(); } |