Diffstat (limited to 'contrib/llvm/lib/CodeGen/MachineScheduler.cpp')
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineScheduler.cpp  736
1 file changed, 438 insertions(+), 298 deletions(-)
diff --git a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp index bcee15c..d921e29 100644 --- a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp @@ -23,13 +23,13 @@ #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/ScheduleDFS.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include <queue> using namespace llvm; @@ -65,14 +65,20 @@ static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden, static bool ViewMISchedDAGs = false; #endif // NDEBUG +/// Avoid quadratic complexity in unusually large basic blocks by limiting the +/// size of the ready lists. +static cl::opt<unsigned> ReadyListLimit("misched-limit", cl::Hidden, + cl::desc("Limit ready list to N instructions"), cl::init(256)); + static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden, cl::desc("Enable register pressure scheduling."), cl::init(true)); static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden, cl::desc("Enable cyclic critical path analysis."), cl::init(true)); -static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden, - cl::desc("Enable load clustering."), cl::init(true)); +static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden, + cl::desc("Enable memop clustering."), + cl::init(true)); // Experimental heuristics static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden, @@ -219,6 +225,11 @@ static cl::opt<bool> EnableMachineSched( cl::desc("Enable the machine instruction scheduling pass."), cl::init(true), cl::Hidden); +static cl::opt<bool> EnablePostRAMachineSched( + "enable-post-misched", + cl::desc("Enable the post-ra machine instruction scheduling pass."), + cl::init(true), cl::Hidden); + /// Forward declare the standard machine scheduler. This will be used as the /// default scheduler if the target does not set a default. static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C); @@ -314,6 +325,9 @@ ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() { /// design would be to split blocks at scheduling boundaries, but LLVM has a /// general bias against block splitting purely for implementation simplicity. 
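Both scheduler-enable options above follow the same override convention, visible in the runOnMachineFunction hunks below: a value given explicitly on the command line (detected via getNumOccurrences()) beats the subtarget's default answer. A minimal standalone sketch of that tri-state pattern; Flag, Subtarget, and shouldRunPostRA are illustrative stand-ins, not LLVM's cl::opt API:

    #include <iostream>

    // Stand-in for a command-line flag that remembers whether the user set
    // it explicitly (llvm::cl::opt exposes this via getNumOccurrences()).
    struct Flag {
      bool Value = true;
      unsigned Occurrences = 0; // nonzero only if the flag was passed
    };

    struct Subtarget {
      bool EnablePostRAScheduler = false;
    };

    // Mirrors the gating added to PostMachineScheduler::runOnMachineFunction:
    // an explicit command-line setting overrides the subtarget default.
    bool shouldRunPostRA(const Flag &EnablePostRAMachineSched,
                         const Subtarget &ST) {
      if (EnablePostRAMachineSched.Occurrences) // user said something,
        return EnablePostRAMachineSched.Value;  // so obey it
      return ST.EnablePostRAScheduler;          // otherwise ask the target
    }

    int main() {
      Flag F; // not given on the command line
      Subtarget ST;
      ST.EnablePostRAScheduler = true;
      std::cout << shouldRunPostRA(F, ST) << '\n'; // 1: target default wins
      F.Occurrences = 1;
      F.Value = false; // user passed -enable-post-misched=false
      std::cout << shouldRunPostRA(F, ST) << '\n'; // 0: explicit flag wins
    }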
bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { + if (skipFunction(*mf.getFunction())) + return false; + if (EnableMachineSched.getNumOccurrences()) { if (!EnableMachineSched) return false; @@ -349,10 +363,13 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { } bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { - if (skipOptnoneFunction(*mf.getFunction())) + if (skipFunction(*mf.getFunction())) return false; - if (!mf.getSubtarget().enablePostRAScheduler()) { + if (EnablePostRAMachineSched.getNumOccurrences()) { + if (!EnablePostRAMachineSched) + return false; + } else if (!mf.getSubtarget().enablePostRAScheduler()) { DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n"); return false; } @@ -389,7 +406,7 @@ static bool isSchedBoundary(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB, MachineFunction *MF, const TargetInstrInfo *TII) { - return MI->isCall() || TII->isSchedulingBoundary(MI, MBB, *MF); + return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF); } /// Main driver for both MachineScheduler and PostMachineScheduler. @@ -427,7 +444,6 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, // // MBB::size() uses instr_iterator to count. Here we need a bundle to count // as a single instruction. - unsigned RemainingInstrs = std::distance(MBB->begin(), MBB->end()); for(MachineBasicBlock::iterator RegionEnd = MBB->end(); RegionEnd != MBB->begin(); RegionEnd = Scheduler.begin()) { @@ -435,15 +451,13 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, if (RegionEnd != MBB->end() || isSchedBoundary(&*std::prev(RegionEnd), &*MBB, MF, TII)) { --RegionEnd; - // Count the boundary instruction. - --RemainingInstrs; } // The next region starts above the previous region. Look backward in the // instruction stream until we find the nearest boundary. unsigned NumRegionInstrs = 0; MachineBasicBlock::iterator I = RegionEnd; - for(;I != MBB->begin(); --I, --RemainingInstrs) { + for (;I != MBB->begin(); --I) { if (isSchedBoundary(&*std::prev(I), &*MBB, MF, TII)) break; if (!I->isDebugValue()) @@ -466,8 +480,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, << "\n From: " << *I << " To: "; if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; else dbgs() << "End"; - dbgs() << " RegionInstrs: " << NumRegionInstrs - << " Remaining: " << RemainingInstrs << "\n"); + dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); if (DumpCriticalPathLength) { errs() << MF->getName(); errs() << ":BB# " << MBB->getNumber(); @@ -485,7 +498,6 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, // scheduler for the top of it's scheduled region. RegionEnd = Scheduler.begin(); } - assert(RemainingInstrs == 0 && "Instruction count mismatch!"); Scheduler.finishBlock(); // FIXME: Ideally, no further passes should rely on kill flags. However, // thumb2 size reduction is currently an exception, so the PostMIScheduler @@ -640,7 +652,7 @@ void ScheduleDAGMI::moveInstruction( // Update LiveIntervals if (LIS) - LIS->handleMove(MI, /*UpdateFlags=*/true); + LIS->handleMove(*MI, /*UpdateFlags=*/true); // Recede RegionBegin if an instruction moves above the first. 
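The scheduleRegions loop above walks the block backward, carving it into scheduling regions delimited by calls and other boundary instructions; the patch drops the redundant RemainingInstrs counter and keeps only the per-region count. A standalone sketch of that walk, with Instr and isSchedBoundary as simplified stand-ins:

    #include <cstddef>
    #include <iostream>
    #include <string>
    #include <vector>

    // Simplified stand-in for a machine instruction.
    struct Instr {
      std::string Name;
      bool IsCall = false;
      bool IsDebugValue = false;
    };

    static bool isSchedBoundary(const Instr &I) { return I.IsCall; }

    int main() {
      std::vector<Instr> MBB = {{"a"}, {"b"}, {"call", true},
                                {"dbg", false, true}, {"c"}, {"d"}};
      std::size_t RegionEnd = MBB.size();
      while (RegionEnd != 0) {
        // A boundary instruction forms a region by itself; don't schedule it.
        if (isSchedBoundary(MBB[RegionEnd - 1])) {
          --RegionEnd;
          continue;
        }
        // Grow the region upward to the nearest boundary; debug values do
        // not count toward the region size.
        std::size_t I = RegionEnd;
        unsigned NumRegionInstrs = 0;
        for (; I != 0 && !isSchedBoundary(MBB[I - 1]); --I)
          if (!MBB[I - 1].IsDebugValue)
            ++NumRegionInstrs;
        std::cout << "schedule region [" << I << ", " << RegionEnd
                  << ") with " << NumRegionInstrs << " instrs\n";
        RegionEnd = I; // the next region ends where this one began
      }
    }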
if (RegionBegin == InsertPos) @@ -704,8 +716,7 @@ void ScheduleDAGMI::schedule() { CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom); else moveInstruction(MI, CurrentTop); - } - else { + } else { assert(SU->isBottomReady() && "node still has unscheduled dependencies"); MachineBasicBlock::iterator priorII = priorNonDebug(CurrentBottom, CurrentTop); @@ -869,13 +880,19 @@ void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb, SUPressureDiffs.clear(); ShouldTrackPressure = SchedImpl->shouldTrackPressure(); + ShouldTrackLaneMasks = SchedImpl->shouldTrackLaneMasks(); + + assert((!ShouldTrackLaneMasks || ShouldTrackPressure) && + "ShouldTrackLaneMasks requires ShouldTrackPressure"); } // Setup the register pressure trackers for the top scheduled top and bottom // scheduled regions. void ScheduleDAGMILive::initRegPressure() { - TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin); - BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd); + TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, + ShouldTrackLaneMasks, false); + BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd, + ShouldTrackLaneMasks, false); // Close the RPTracker to finalize live ins. RPTracker.closeRegion(); @@ -905,7 +922,7 @@ void ScheduleDAGMILive::initRegPressure() { // Account for liveness generated by the region boundary. if (LiveRegionEnd != RegionEnd) { - SmallVector<unsigned, 8> LiveUses; + SmallVector<RegisterMaskPair, 8> LiveUses; BotRPTracker.recede(&LiveUses); updatePressureDiffs(LiveUses); } @@ -969,47 +986,74 @@ updateScheduledPressure(const SUnit *SU, /// Update the PressureDiff array for liveness after scheduling this /// instruction. -void ScheduleDAGMILive::updatePressureDiffs(ArrayRef<unsigned> LiveUses) { - for (unsigned LUIdx = 0, LUEnd = LiveUses.size(); LUIdx != LUEnd; ++LUIdx) { +void ScheduleDAGMILive::updatePressureDiffs( + ArrayRef<RegisterMaskPair> LiveUses) { + for (const RegisterMaskPair &P : LiveUses) { + unsigned Reg = P.RegUnit; /// FIXME: Currently assuming single-use physregs. - unsigned Reg = LiveUses[LUIdx]; - DEBUG(dbgs() << " LiveReg: " << PrintVRegOrUnit(Reg, TRI) << "\n"); if (!TRI->isVirtualRegister(Reg)) continue; - // This may be called before CurrentBottom has been initialized. However, - // BotRPTracker must have a valid position. We want the value live into the - // instruction or live out of the block, so ask for the previous - // instruction's live-out. - const LiveInterval &LI = LIS->getInterval(Reg); - VNInfo *VNI; - MachineBasicBlock::const_iterator I = - nextIfDebug(BotRPTracker.getPos(), BB->end()); - if (I == BB->end()) - VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB)); - else { - LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(I)); - VNI = LRQ.valueIn(); - } - // RegisterPressureTracker guarantees that readsReg is true for LiveUses. - assert(VNI && "No live value at use."); - for (const VReg2SUnit &V2SU - : make_range(VRegUses.find(Reg), VRegUses.end())) { - SUnit *SU = V2SU.SU; - // If this use comes before the reaching def, it cannot be a last use, so - // descrease its pressure change. 
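initRegPressure now seeds both trackers with ShouldTrackLaneMasks, and live uses come back as RegisterMaskPair (a register unit plus the lane mask that is live) instead of a bare register. The point of lane masks is that a virtual register should count toward pressure while any of its subregister lanes is live. A toy tracker illustrating just that zero-to-nonzero transition rule, not LLVM's RegPressureTracker interface:

    #include <cstdint>
    #include <iostream>
    #include <map>

    // Toy subregister-liveness tracker: a vreg contributes to pressure while
    // *any* of its lanes is live; lane masks tell us when that flips.
    struct ToyTracker {
      std::map<unsigned, std::uint8_t> LiveLanes; // Reg -> live-lane mask
      unsigned Pressure = 0;

      void addLanes(unsigned Reg, std::uint8_t Mask) {
        std::uint8_t &Lanes = LiveLanes[Reg];
        if (Lanes == 0 && Mask != 0)
          ++Pressure; // register just became live
        Lanes |= Mask;
      }
      void removeLanes(unsigned Reg, std::uint8_t Mask) {
        std::uint8_t &Lanes = LiveLanes[Reg];
        std::uint8_t New = Lanes & ~Mask;
        if (Lanes != 0 && New == 0)
          --Pressure; // last live lane died
        Lanes = New;
      }
    };

    int main() {
      ToyTracker T;
      T.addLanes(/*Reg=*/42, /*sub0*/ 0x1);   // pressure 0 -> 1
      T.addLanes(42, /*sub1*/ 0x2);           // still 1: already live
      T.removeLanes(42, 0x1);                 // still 1: sub1 remains
      T.removeLanes(42, 0x2);                 // 1 -> 0
      std::cout << "pressure = " << T.Pressure << '\n'; // 0
    }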
- if (!SU->isScheduled && SU != &ExitSU) { - LiveQueryResult LRQ - = LI.Query(LIS->getInstructionIndex(SU->getInstr())); - if (LRQ.valueIn() == VNI) { - PressureDiff &PDiff = getPressureDiff(SU); - PDiff.addPressureChange(Reg, true, &MRI); - DEBUG( - dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") " - << *SU->getInstr(); - dbgs() << " to "; - PDiff.dump(*TRI); - ); + if (ShouldTrackLaneMasks) { + // If the register has just become live then other uses won't change + // this fact anymore => decrement pressure. + // If the register has just become dead then other uses make it come + // back to life => increment pressure. + bool Decrement = P.LaneMask != 0; + + for (const VReg2SUnit &V2SU + : make_range(VRegUses.find(Reg), VRegUses.end())) { + SUnit &SU = *V2SU.SU; + if (SU.isScheduled || &SU == &ExitSU) + continue; + + PressureDiff &PDiff = getPressureDiff(&SU); + PDiff.addPressureChange(Reg, Decrement, &MRI); + DEBUG( + dbgs() << " UpdateRegP: SU(" << SU.NodeNum << ") " + << PrintReg(Reg, TRI) << ':' << PrintLaneMask(P.LaneMask) + << ' ' << *SU.getInstr(); + dbgs() << " to "; + PDiff.dump(*TRI); + ); + } + } else { + assert(P.LaneMask != 0); + DEBUG(dbgs() << " LiveReg: " << PrintVRegOrUnit(Reg, TRI) << "\n"); + // This may be called before CurrentBottom has been initialized. However, + // BotRPTracker must have a valid position. We want the value live into the + // instruction or live out of the block, so ask for the previous + // instruction's live-out. + const LiveInterval &LI = LIS->getInterval(Reg); + VNInfo *VNI; + MachineBasicBlock::const_iterator I = + nextIfDebug(BotRPTracker.getPos(), BB->end()); + if (I == BB->end()) + VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB)); + else { + LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*I)); + VNI = LRQ.valueIn(); + } + // RegisterPressureTracker guarantees that readsReg is true for LiveUses. + assert(VNI && "No live value at use."); + for (const VReg2SUnit &V2SU + : make_range(VRegUses.find(Reg), VRegUses.end())) { + SUnit *SU = V2SU.SU; + // If this use comes before the reaching def, it cannot be a last use, + // so decrease its pressure change. + if (!SU->isScheduled && SU != &ExitSU) { + LiveQueryResult LRQ = + LI.Query(LIS->getInstructionIndex(*SU->getInstr())); + if (LRQ.valueIn() == VNI) { + PressureDiff &PDiff = getPressureDiff(SU); + PDiff.addPressureChange(Reg, true, &MRI); + DEBUG( + dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") " + << *SU->getInstr(); + dbgs() << " to "; + PDiff.dump(*TRI); + ); + } } } } @@ -1057,11 +1101,6 @@ void ScheduleDAGMILive::schedule() { // Initialize ready queues now that the DAG and priority data are finalized. initQueues(TopRoots, BotRoots); - if (ShouldTrackPressure) { - assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); - TopRPTracker.setPos(CurrentTop); - } - bool IsTopNode = false; while (true) { DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n"); @@ -1111,14 +1150,14 @@ void ScheduleDAGMILive::buildDAGWithRegPressure() { // Initialize the register pressure tracker used by buildSchedGraph. RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd, - /*TrackUntiedDefs=*/true); + ShouldTrackLaneMasks, /*TrackUntiedDefs=*/true); // Account for liveness generate by the region boundary. if (LiveRegionEnd != RegionEnd) RPTracker.recede(); // Build the DAG, and compute current register pressure. 
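The new lane-mask branch of updatePressureDiffs reasons as its comment says: if receding just made the register live (LaneMask != 0), no earlier use can be its last use, so their cached diffs decrement; if it just went dead, an earlier use would revive it, so they increment. A toy fan-out over the users, using a std::multimap in place of the DAG's VRegUses map; all names here are illustrative:

    #include <iostream>
    #include <map>

    struct SU { int Num; bool Scheduled = false; int PressureDiff = 0; };

    // Toy version of the lane-mask branch of updatePressureDiffs: when
    // receding changes a vreg's liveness, walk every not-yet-scheduled user
    // of that vreg and adjust its cached pressure diff.
    static void updatePressureDiffs(std::multimap<unsigned, SU *> &VRegUses,
                                    unsigned Reg, bool BecameLive) {
      // Became live: users can no longer be a "last use" (decrement).
      // Became dead: a user would bring it back to life (increment).
      int Delta = BecameLive ? -1 : +1;
      auto Range = VRegUses.equal_range(Reg);
      for (auto I = Range.first; I != Range.second; ++I) {
        if (I->second->Scheduled)
          continue;
        I->second->PressureDiff += Delta;
        std::cout << "UpdateRegP: SU(" << I->second->Num << ") diff "
                  << I->second->PressureDiff << '\n';
      }
    }

    int main() {
      SU A{0}, B{1, /*Scheduled=*/true}, C{2};
      std::multimap<unsigned, SU *> VRegUses{{7, &A}, {7, &B}, {7, &C}};
      updatePressureDiffs(VRegUses, /*Reg=*/7, /*BecameLive=*/true);
    }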
- buildSchedGraph(AA, &RPTracker, &SUPressureDiffs); + buildSchedGraph(AA, &RPTracker, &SUPressureDiffs, LIS, ShouldTrackLaneMasks); // Initialize top/bottom trackers after computing region pressure. initRegPressure(); @@ -1167,10 +1206,8 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() { unsigned MaxCyclicLatency = 0; // Visit each live out vreg def to find def/use pairs that cross iterations. - ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs; - for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end(); - RI != RE; ++RI) { - unsigned Reg = *RI; + for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) { + unsigned Reg = P.RegUnit; if (!TRI->isVirtualRegister(Reg)) continue; const LiveInterval &LI = LIS->getInterval(Reg); @@ -1193,8 +1230,7 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() { continue; // Only consider uses of the phi. - LiveQueryResult LRQ = - LI.Query(LIS->getInstructionIndex(SU->getInstr())); + LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*SU->getInstr())); if (!LRQ.valueIn()->isPHIDef()) continue; @@ -1209,8 +1245,7 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() { if (LiveInHeight > LiveOutHeight) { if (LiveInHeight - LiveOutHeight < CyclicLatency) CyclicLatency = LiveInHeight - LiveOutHeight; - } - else + } else CyclicLatency = 0; DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU(" @@ -1223,6 +1258,17 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() { return MaxCyclicLatency; } +/// Release ExitSU predecessors and setup scheduler queues. Re-position +/// the Top RP tracker in case the region beginning has changed. +void ScheduleDAGMILive::initQueues(ArrayRef<SUnit*> TopRoots, + ArrayRef<SUnit*> BotRoots) { + ScheduleDAGMI::initQueues(TopRoots, BotRoots); + if (ShouldTrackPressure) { + assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); + TopRPTracker.setPos(CurrentTop); + } +} + /// Move an instruction and update register pressure. void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { // Move the instruction to its new location in the instruction stream. @@ -1239,7 +1285,18 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { if (ShouldTrackPressure) { // Update top scheduled pressure. - TopRPTracker.advance(); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false); + if (ShouldTrackLaneMasks) { + // Adjust liveness and add missing dead+read-undef flags. + SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); + RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI); + } else { + // Adjust for missing dead-def flags. + RegOpers.detectDeadDefs(*MI, *LIS); + } + + TopRPTracker.advance(RegOpers); assert(TopRPTracker.getPos() == CurrentTop && "out of sync"); DEBUG( dbgs() << "Top Pressure:\n"; @@ -1248,8 +1305,7 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure); } - } - else { + } else { assert(SU->isBottomReady() && "node still has unscheduled dependencies"); MachineBasicBlock::iterator priorII = priorNonDebug(CurrentBottom, CurrentTop); @@ -1264,9 +1320,20 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { CurrentBottom = MI; } if (ShouldTrackPressure) { - // Update bottom scheduled pressure. 
- SmallVector<unsigned, 8> LiveUses; - BotRPTracker.recede(&LiveUses); + RegisterOperands RegOpers; + RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false); + if (ShouldTrackLaneMasks) { + // Adjust liveness and add missing dead+read-undef flags. + SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); + RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI); + } else { + // Adjust for missing dead-def flags. + RegOpers.detectDeadDefs(*MI, *LIS); + } + + BotRPTracker.recedeSkipDebugValues(); + SmallVector<RegisterMaskPair, 8> LiveUses; + BotRPTracker.recede(RegOpers, &LiveUses); assert(BotRPTracker.getPos() == CurrentBottom && "out of sync"); DEBUG( dbgs() << "Bottom Pressure:\n"; @@ -1280,64 +1347,81 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) { } //===----------------------------------------------------------------------===// -// LoadClusterMutation - DAG post-processing to cluster loads. +// BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores. //===----------------------------------------------------------------------===// namespace { /// \brief Post-process the DAG to create cluster edges between neighboring -/// loads. -class LoadClusterMutation : public ScheduleDAGMutation { - struct LoadInfo { +/// loads or between neighboring stores. +class BaseMemOpClusterMutation : public ScheduleDAGMutation { + struct MemOpInfo { SUnit *SU; unsigned BaseReg; - unsigned Offset; - LoadInfo(SUnit *su, unsigned reg, unsigned ofs) - : SU(su), BaseReg(reg), Offset(ofs) {} + int64_t Offset; + MemOpInfo(SUnit *su, unsigned reg, int64_t ofs) + : SU(su), BaseReg(reg), Offset(ofs) {} - bool operator<(const LoadInfo &RHS) const { + bool operator<(const MemOpInfo&RHS) const { return std::tie(BaseReg, Offset) < std::tie(RHS.BaseReg, RHS.Offset); } }; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; + bool IsLoad; + public: - LoadClusterMutation(const TargetInstrInfo *tii, - const TargetRegisterInfo *tri) - : TII(tii), TRI(tri) {} + BaseMemOpClusterMutation(const TargetInstrInfo *tii, + const TargetRegisterInfo *tri, bool IsLoad) + : TII(tii), TRI(tri), IsLoad(IsLoad) {} + + void apply(ScheduleDAGInstrs *DAGInstrs) override; - void apply(ScheduleDAGMI *DAG) override; protected: - void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG); + void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG); +}; + +class StoreClusterMutation : public BaseMemOpClusterMutation { +public: + StoreClusterMutation(const TargetInstrInfo *tii, + const TargetRegisterInfo *tri) + : BaseMemOpClusterMutation(tii, tri, false) {} +}; + +class LoadClusterMutation : public BaseMemOpClusterMutation { +public: + LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri) + : BaseMemOpClusterMutation(tii, tri, true) {} }; } // anonymous -void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads, - ScheduleDAGMI *DAG) { - SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords; - for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) { - SUnit *SU = Loads[Idx]; +void BaseMemOpClusterMutation::clusterNeighboringMemOps( + ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG) { + SmallVector<MemOpInfo, 32> MemOpRecords; + for (unsigned Idx = 0, End = MemOps.size(); Idx != End; ++Idx) { + SUnit *SU = MemOps[Idx]; unsigned BaseReg; - unsigned Offset; - if (TII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI)) - LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset)); + int64_t Offset; + if 
(TII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseReg, Offset, TRI)) + MemOpRecords.push_back(MemOpInfo(SU, BaseReg, Offset)); } - if (LoadRecords.size() < 2) + if (MemOpRecords.size() < 2) return; - std::sort(LoadRecords.begin(), LoadRecords.end()); + + std::sort(MemOpRecords.begin(), MemOpRecords.end()); unsigned ClusterLength = 1; - for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) { - if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) { + for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) { + if (MemOpRecords[Idx].BaseReg != MemOpRecords[Idx+1].BaseReg) { ClusterLength = 1; continue; } - SUnit *SUa = LoadRecords[Idx].SU; - SUnit *SUb = LoadRecords[Idx+1].SU; - if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength) - && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) { - - DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU(" + SUnit *SUa = MemOpRecords[Idx].SU; + SUnit *SUb = MemOpRecords[Idx+1].SU; + if (TII->shouldClusterMemOps(*SUa->getInstr(), *SUb->getInstr(), + ClusterLength) && + DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) { + DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU(" << SUb->NodeNum << ")\n"); // Copy successor edges from SUa to SUb. Interleaving computation // dependent on SUa can prevent load combining due to register reuse. @@ -1351,22 +1435,26 @@ void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads, DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial)); } ++ClusterLength; - } - else + } else ClusterLength = 1; } } /// \brief Callback from DAG postProcessing to create cluster edges for loads. -void LoadClusterMutation::apply(ScheduleDAGMI *DAG) { +void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) { + + ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); + // Map DAG NodeNum to store chain ID. DenseMap<unsigned, unsigned> StoreChainIDs; - // Map each store chain to a set of dependent loads. + // Map each store chain to a set of dependent MemOps. SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents; for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) { SUnit *SU = &DAG->SUnits[Idx]; - if (!SU->getInstr()->mayLoad()) + if ((IsLoad && !SU->getInstr()->mayLoad()) || + (!IsLoad && !SU->getInstr()->mayStore())) continue; + unsigned ChainPredID = DAG->SUnits.size(); for (SUnit::const_pred_iterator PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) { @@ -1376,7 +1464,7 @@ void LoadClusterMutation::apply(ScheduleDAGMI *DAG) { } } // Check if this chain-like pred has been seen - // before. ChainPredID==MaxNodeID for loads at the top of the schedule. + // before. ChainPredID==MaxNodeID at the top of the schedule. unsigned NumChains = StoreChainDependents.size(); std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result = StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains)); @@ -1384,9 +1472,10 @@ void LoadClusterMutation::apply(ScheduleDAGMI *DAG) { StoreChainDependents.resize(NumChains + 1); StoreChainDependents[Result.first->second].push_back(SU); } + // Iterate over the store chains. 
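clusterNeighboringMemOps works by sorting the collected records so that ops sharing a base register become adjacent, then making one linear pass that links neighbors with cluster edges. A standalone sketch of that sort-then-scan shape; it omits the TII->shouldClusterMemOps target hook and the DAG edge insertion, and MemOpInfo here is a simplified stand-in:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <tuple>
    #include <vector>

    // Stand-in for MemOpInfo: a memory op with its base register and offset.
    struct MemOpInfo {
      int Id; // stands in for the SUnit
      unsigned BaseReg;
      std::int64_t Offset;
      bool operator<(const MemOpInfo &RHS) const {
        // Same lexicographic order as the patch: base register, then offset.
        return std::tie(BaseReg, Offset) < std::tie(RHS.BaseReg, RHS.Offset);
      }
    };

    int main() {
      std::vector<MemOpInfo> Records = {
          {0, 5, 16}, {1, 7, 0}, {2, 5, 0}, {3, 5, 8}, {4, 7, 8}};
      std::sort(Records.begin(), Records.end());
      // After sorting, ops sharing a base register are adjacent, so one
      // linear pass can propose cluster edges between neighbors.
      unsigned ClusterLength = 1;
      for (std::size_t I = 0; I + 1 < Records.size(); ++I) {
        if (Records[I].BaseReg != Records[I + 1].BaseReg) {
          ClusterLength = 1; // a new base register starts a new cluster
          continue;
        }
        std::cout << "cluster SU(" << Records[I].Id << ") - SU("
                  << Records[I + 1].Id << "), length " << ++ClusterLength
                  << '\n';
      }
    }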
for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx) - clusterNeighboringLoads(StoreChainDependents[Idx], DAG); + clusterNeighboringMemOps(StoreChainDependents[Idx], DAG); } //===----------------------------------------------------------------------===// @@ -1403,7 +1492,7 @@ public: MacroFusion(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) : TII(TII), TRI(TRI) {} - void apply(ScheduleDAGMI *DAG) override; + void apply(ScheduleDAGInstrs *DAGInstrs) override; }; } // anonymous @@ -1423,7 +1512,9 @@ static bool HasDataDep(const TargetRegisterInfo &TRI, const MachineInstr &MI, /// \brief Callback from DAG postProcessing to create cluster edges to encourage /// fused operations. -void MacroFusion::apply(ScheduleDAGMI *DAG) { +void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) { + ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); + // For now, assume targets can only fuse with the branch. SUnit &ExitSU = DAG->ExitSU; MachineInstr *Branch = ExitSU.getInstr(); @@ -1439,7 +1530,7 @@ void MacroFusion::apply(ScheduleDAGMI *DAG) { if (!HasDataDep(TRI, *Branch, *Pred)) continue; - if (!TII.shouldScheduleAdjacent(Pred, Branch)) + if (!TII.shouldScheduleAdjacent(*Pred, *Branch)) continue; // Create a single weak edge from SU to ExitSU. The only effect is to cause @@ -1474,7 +1565,7 @@ class CopyConstrain : public ScheduleDAGMutation { public: CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {} - void apply(ScheduleDAGMI *DAG) override; + void apply(ScheduleDAGInstrs *DAGInstrs) override; protected: void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG); @@ -1505,12 +1596,14 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) { MachineInstr *Copy = CopySU->getInstr(); // Check for pure vreg copies. - unsigned SrcReg = Copy->getOperand(1).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) + const MachineOperand &SrcOp = Copy->getOperand(1); + unsigned SrcReg = SrcOp.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || !SrcOp.readsReg()) return; - unsigned DstReg = Copy->getOperand(0).getReg(); - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) + const MachineOperand &DstOp = Copy->getOperand(0); + unsigned DstReg = DstOp.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(DstReg) || DstOp.isDead()) return; // Check if either the dest or source is local. If it's live across a back @@ -1627,15 +1720,16 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) { /// \brief Callback from DAG postProcessing to create weak edges to encourage /// copy elimination. -void CopyConstrain::apply(ScheduleDAGMI *DAG) { +void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) { + ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); assert(DAG->hasVRegLiveness() && "Expect VRegs with LiveIntervals"); MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end()); if (FirstPos == DAG->end()) return; - RegionBeginIdx = DAG->getLIS()->getInstructionIndex(&*FirstPos); + RegionBeginIdx = DAG->getLIS()->getInstructionIndex(*FirstPos); RegionEndIdx = DAG->getLIS()->getInstructionIndex( - &*priorNonDebug(DAG->end(), DAG->begin())); + *priorNonDebug(DAG->end(), DAG->begin())); for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) { SUnit *SU = &DAG->SUnits[Idx]; @@ -1862,7 +1956,8 @@ void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) { // Check for interlocks first. 
For the purpose of other heuristics, an // instruction that cannot issue appears as if it's not in the ReadyQueue. bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0; - if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU)) + if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU) || + Available.size() >= ReadyListLimit) Pending.push(SU); else Available.push(SU); @@ -1905,8 +2000,7 @@ void SchedBoundary::bumpCycle(unsigned NextCycle) { if (!HazardRec->isEnabled()) { // Bypass HazardRec virtual calls. CurrCycle = NextCycle; - } - else { + } else { // Bypass getHazardType calls in case of long latency. for (; CurrCycle != NextCycle; ++CurrCycle) { if (isTop()) @@ -2074,8 +2168,7 @@ void SchedBoundary::bumpNode(SUnit *SU) { // If we stall for any reason, bump the cycle. if (NextCycle > CurrCycle) { bumpCycle(NextCycle); - } - else { + } else { // After updating ZoneCritResIdx and ExpectedLatency, check if we're // resource limited. If a stall occurred, bumpCycle does this. unsigned LFactor = SchedModel->getLatencyFactor(); @@ -2119,11 +2212,13 @@ void SchedBoundary::releasePending() { if (checkHazard(SU)) continue; + if (Available.size() >= ReadyListLimit) + break; + Available.push(SU); Pending.remove(Pending.begin()+i); --i; --e; } - DEBUG(if (!Pending.empty()) Pending.dump()); CheckPending = false; } @@ -2163,6 +2258,10 @@ SUnit *SchedBoundary::pickOnlyChoice() { bumpCycle(CurrCycle + 1); releasePending(); } + + DEBUG(Pending.dump()); + DEBUG(Available.dump()); + if (Available.size() == 1) return *Available.begin(); return nullptr; @@ -2177,8 +2276,7 @@ void SchedBoundary::dumpScheduledState() { if (ZoneCritResIdx) { ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx); ResCount = getResourceCount(ZoneCritResIdx); - } - else { + } else { ResFactor = SchedModel->getMicroOpFactor(); ResCount = RetiredMOps * SchedModel->getMicroOpFactor(); } @@ -2218,8 +2316,7 @@ initResourceDelta(const ScheduleDAGMI *DAG, /// Set the CandPolicy given a scheduling zone given the current resources and /// latencies inside and outside the zone. 
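The releaseNode and releasePending hunks above are where the new ReadyListLimit takes effect: a node that cannot issue yet, or that would push Available past the limit, waits in Pending, and releasePending migrates nodes back once there is room. A toy model of the two queues, assuming a small limit of 4 for demonstration (the cl::opt default is 256) and omitting the hazard and cycle checks:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Toy model of the two ready lists: Available is what the picker scans
    // on every pick (so its size bounds the per-pick cost), Pending holds
    // the overflow.
    struct Boundary {
      std::vector<int> Available, Pending;
      static constexpr std::size_t ReadyListLimit = 4;

      void releaseNode(int SU, bool Stalled) {
        // A node goes to Pending if it cannot issue yet *or* Available is
        // full; releasePending() later migrates nodes when there is room.
        if (Stalled || Available.size() >= ReadyListLimit)
          Pending.push_back(SU);
        else
          Available.push_back(SU);
      }
      void releasePending() {
        while (!Pending.empty() && Available.size() < ReadyListLimit) {
          Available.push_back(Pending.back());
          Pending.pop_back();
        }
      }
    };

    int main() {
      Boundary B;
      for (int SU = 0; SU < 6; ++SU)
        B.releaseNode(SU, /*Stalled=*/false);
      std::cout << B.Available.size() << " available, "
                << B.Pending.size() << " pending\n"; // 4 available, 2 pending
    }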
-void GenericSchedulerBase::setPolicy(CandPolicy &Policy, - bool IsPostRA, +void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone, SchedBoundary *OtherZone) { // Apply preemptive heuristics based on the total latency and resources @@ -2295,7 +2392,8 @@ const char *GenericSchedulerBase::getReasonStr( GenericSchedulerBase::CandReason Reason) { switch (Reason) { case NoCand: return "NOCAND "; - case PhysRegCopy: return "PREG-COPY"; + case Only1: return "ONLY1 "; + case PhysRegCopy: return "PREG-COPY "; case RegExcess: return "REG-EXCESS"; case RegCritical: return "REG-CRIT "; case Stall: return "STALL "; @@ -2381,7 +2479,6 @@ static bool tryLess(int TryVal, int CandVal, Cand.Reason = Reason; return true; } - Cand.setRepeat(Reason); return false; } @@ -2398,7 +2495,6 @@ static bool tryGreater(int TryVal, int CandVal, Cand.Reason = Reason; return true; } - Cand.setRepeat(Reason); return false; } @@ -2414,8 +2510,7 @@ static bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(), TryCand, Cand, GenericSchedulerBase::TopPathReduce)) return true; - } - else { + } else { if (Cand.SU->getHeight() > Zone.getScheduledLatency()) { if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(), TryCand, Cand, GenericSchedulerBase::BotHeightReduce)) @@ -2428,10 +2523,13 @@ static bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, return false; } -static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand, - bool IsTop) { +static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) { DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ") - << GenericSchedulerBase::getReasonStr(Cand.Reason) << '\n'); + << GenericSchedulerBase::getReasonStr(Reason) << '\n'); +} + +static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) { + tracePick(Cand.Reason, Cand.AtTop); } void GenericScheduler::initialize(ScheduleDAGMI *dag) { @@ -2460,6 +2558,8 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) { DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer( Itin, DAG); } + TopCand.SU = nullptr; + BotCand.SU = nullptr; } /// Initialize the per-region scheduling policy. @@ -2487,8 +2587,7 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, RegionPolicy.OnlyBottomUp = true; // Allow the subtarget to override default policy. - MF.getSubtarget().overrideSchedPolicy(RegionPolicy, Begin, End, - NumRegionInstrs); + MF.getSubtarget().overrideSchedPolicy(RegionPolicy, NumRegionInstrs); // After subtarget overrides, apply command line options. if (!EnableRegPressure) @@ -2582,19 +2681,25 @@ static bool tryPressure(const PressureChange &TryP, GenericSchedulerBase::CandReason Reason, const TargetRegisterInfo *TRI, const MachineFunction &MF) { - unsigned TryPSet = TryP.getPSetOrMax(); - unsigned CandPSet = CandP.getPSetOrMax(); - // If both candidates affect the same set, go with the smallest increase. - if (TryPSet == CandPSet) { - return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand, - Reason); - } // If one candidate decreases and the other increases, go with it. // Invalid candidates have UnitInc==0. if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand, Reason)) { return true; } + // Do not compare the magnitude of pressure changes between top and bottom + // boundary. 
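tryLess and tryGreater are the tie-break primitives the whole heuristic chain is built from: a strict win stamps TryCand with the deciding reason, a strict loss may upgrade Cand's reason, and a tie returns false so the next heuristic runs. The patch removes the old Cand.setRepeat(Reason) bookkeeping on the tie path. A skeleton of tryLess with a toy reason enum (lower value means more important):

    #include <iostream>

    enum CandReason { NoCand, PhysRegCopy, RegExcess, Stall, NodeOrder };

    struct SchedCandidate { int SU; CandReason Reason; };

    // Returns true when the comparison decided the pick (the caller then
    // stops evaluating further heuristics); TryCand wins only when its
    // Reason gets stamped.
    static bool tryLess(int TryVal, int CandVal, SchedCandidate &TryCand,
                        SchedCandidate &Cand, CandReason Reason) {
      if (TryVal < CandVal) {
        TryCand.Reason = Reason;
        return true;
      }
      if (TryVal > CandVal) {
        if (Cand.Reason > Reason)
          Cand.Reason = Reason; // record the stronger winning reason
        return true;
      }
      return false; // tie: fall through to the next heuristic
    }

    int main() {
      SchedCandidate Cand{1, NodeOrder}, TryCand{2, NoCand};
      if (tryLess(/*TryVal=*/0, /*CandVal=*/2, TryCand, Cand, Stall))
        std::cout << "decided: TryCand "
                  << (TryCand.Reason != NoCand ? "wins" : "loses") << '\n';
    }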
+ if (Cand.AtTop != TryCand.AtTop) + return false; + + // If both candidates affect the same set in the same boundary, go with the + // smallest increase. + unsigned TryPSet = TryP.getPSetOrMax(); + unsigned CandPSet = CandP.getPSetOrMax(); + if (TryPSet == CandPSet) { + return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand, + Reason); + } int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) : std::numeric_limits<int>::max(); @@ -2640,64 +2745,64 @@ static int biasPhysRegCopy(const SUnit *SU, bool isTop) { return 0; } -/// Apply a set of heursitics to a new candidate. Heuristics are currently -/// hierarchical. This may be more efficient than a graduated cost model because -/// we don't need to evaluate all aspects of the model for each node in the -/// queue. But it's really done to make the heuristics easier to debug and -/// statistically analyze. -/// -/// \param Cand provides the policy and current best candidate. -/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. -/// \param Zone describes the scheduled zone that we are extending. -/// \param RPTracker describes reg pressure within the scheduled zone. -/// \param TempTracker is a scratch pressure tracker to reuse in queries. -void GenericScheduler::tryCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand, - SchedBoundary &Zone, - const RegPressureTracker &RPTracker, - RegPressureTracker &TempTracker) { - +void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU, + bool AtTop, + const RegPressureTracker &RPTracker, + RegPressureTracker &TempTracker) { + Cand.SU = SU; + Cand.AtTop = AtTop; if (DAG->isTrackingPressure()) { - // Always initialize TryCand's RPDelta. - if (Zone.isTop()) { + if (AtTop) { TempTracker.getMaxDownwardPressureDelta( - TryCand.SU->getInstr(), - TryCand.RPDelta, + Cand.SU->getInstr(), + Cand.RPDelta, DAG->getRegionCriticalPSets(), DAG->getRegPressure().MaxSetPressure); - } - else { + } else { if (VerifyScheduling) { TempTracker.getMaxUpwardPressureDelta( - TryCand.SU->getInstr(), - &DAG->getPressureDiff(TryCand.SU), - TryCand.RPDelta, + Cand.SU->getInstr(), + &DAG->getPressureDiff(Cand.SU), + Cand.RPDelta, DAG->getRegionCriticalPSets(), DAG->getRegPressure().MaxSetPressure); - } - else { + } else { RPTracker.getUpwardPressureDelta( - TryCand.SU->getInstr(), - DAG->getPressureDiff(TryCand.SU), - TryCand.RPDelta, + Cand.SU->getInstr(), + DAG->getPressureDiff(Cand.SU), + Cand.RPDelta, DAG->getRegionCriticalPSets(), DAG->getRegPressure().MaxSetPressure); } } } - DEBUG(if (TryCand.RPDelta.Excess.isValid()) - dbgs() << " Try SU(" << TryCand.SU->NodeNum << ") " - << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet()) - << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n"); + DEBUG(if (Cand.RPDelta.Excess.isValid()) + dbgs() << " Try SU(" << Cand.SU->NodeNum << ") " + << TRI->getRegPressureSetName(Cand.RPDelta.Excess.getPSet()) + << ":" << Cand.RPDelta.Excess.getUnitInc() << "\n"); +} +/// Apply a set of heursitics to a new candidate. Heuristics are currently +/// hierarchical. This may be more efficient than a graduated cost model because +/// we don't need to evaluate all aspects of the model for each node in the +/// queue. But it's really done to make the heuristics easier to debug and +/// statistically analyze. +/// +/// \param Cand provides the policy and current best candidate. +/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. 
+/// \param Zone describes the scheduled zone that we are extending, or nullptr +// if Cand is from a different zone than TryCand. +void GenericScheduler::tryCandidate(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary *Zone) { // Initialize the candidate if needed. if (!Cand.isValid()) { TryCand.Reason = NodeOrder; return; } - if (tryGreater(biasPhysRegCopy(TryCand.SU, Zone.isTop()), - biasPhysRegCopy(Cand.SU, Zone.isTop()), + if (tryGreater(biasPhysRegCopy(TryCand.SU, TryCand.AtTop), + biasPhysRegCopy(Cand.SU, Cand.AtTop), TryCand, Cand, PhysRegCopy)) return; @@ -2715,17 +2820,26 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand, DAG->MF)) return; - // For loops that are acyclic path limited, aggressively schedule for latency. - // This can result in very long dependence chains scheduled in sequence, so - // once every cycle (when CurrMOps == 0), switch to normal heuristics. - if (Rem.IsAcyclicLatencyLimited && !Zone.getCurrMOps() - && tryLatency(TryCand, Cand, Zone)) - return; + // We only compare a subset of features when comparing nodes between + // Top and Bottom boundary. Some properties are simply incomparable, in many + // other instances we should only override the other boundary if something + // is a clear good pick on one boundary. Skip heuristics that are more + // "tie-breaking" in nature. + bool SameBoundary = Zone != nullptr; + if (SameBoundary) { + // For loops that are acyclic path limited, aggressively schedule for + // latency. This can result in very long dependence chains scheduled in + // sequence, so once every cycle (when CurrMOps == 0), switch to normal + // heuristics. + if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && + tryLatency(TryCand, Cand, *Zone)) + return; - // Prioritize instructions that read unbuffered resources by stall cycles. - if (tryLess(Zone.getLatencyStallCycles(TryCand.SU), - Zone.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) - return; + // Prioritize instructions that read unbuffered resources by stall cycles. + if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), + Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) + return; + } // Keep clustered nodes together to encourage downstream peephole // optimizations which may reduce resource requirements. @@ -2733,18 +2847,23 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand, // This is a best effort to set things up for a post-RA pass. Optimizations // like generating loads of multiple registers should ideally be done within // the scheduler pass by combining the loads during DAG postprocessing. - const SUnit *NextClusterSU = - Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); - if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU, + const SUnit *CandNextClusterSU = + Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); + const SUnit *TryCandNextClusterSU = + TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred(); + if (tryGreater(TryCand.SU == TryCandNextClusterSU, + Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster)) return; - // Weak edges are for clustering and other constraints. - if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()), - getWeakLeft(Cand.SU, Zone.isTop()), - TryCand, Cand, Weak)) { - return; + if (SameBoundary) { + // Weak edges are for clustering and other constraints. 
+ if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), + getWeakLeft(Cand.SU, Cand.AtTop), + TryCand, Cand, Weak)) + return; } + // Avoid increasing the max pressure of the entire region. if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, @@ -2752,34 +2871,35 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand, DAG->MF)) return; - // Avoid critical resource consumption and balance the schedule. - TryCand.initResourceDelta(DAG, SchedModel); - if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, - TryCand, Cand, ResourceReduce)) - return; - if (tryGreater(TryCand.ResDelta.DemandedResources, - Cand.ResDelta.DemandedResources, - TryCand, Cand, ResourceDemand)) - return; + if (SameBoundary) { + // Avoid critical resource consumption and balance the schedule. + TryCand.initResourceDelta(DAG, SchedModel); + if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, + TryCand, Cand, ResourceReduce)) + return; + if (tryGreater(TryCand.ResDelta.DemandedResources, + Cand.ResDelta.DemandedResources, + TryCand, Cand, ResourceDemand)) + return; - // Avoid serializing long latency dependence chains. - // For acyclic path limited loops, latency was already checked above. - if (!RegionPolicy.DisableLatencyHeuristic && Cand.Policy.ReduceLatency && - !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone)) { - return; - } + // Avoid serializing long latency dependence chains. + // For acyclic path limited loops, latency was already checked above. + if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && + !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone)) + return; - // Prefer immediate defs/users of the last scheduled instruction. This is a - // local pressure avoidance strategy that also makes the machine code - // readable. - if (tryGreater(Zone.isNextSU(TryCand.SU), Zone.isNextSU(Cand.SU), - TryCand, Cand, NextDefUse)) - return; + // Prefer immediate defs/users of the last scheduled instruction. This is a + // local pressure avoidance strategy that also makes the machine code + // readable. + if (tryGreater(Zone->isNextSU(TryCand.SU), Zone->isNextSU(Cand.SU), + TryCand, Cand, NextDefUse)) + return; - // Fall through to original instruction order. - if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) - || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { - TryCand.Reason = NodeOrder; + // Fall through to original instruction order. + if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) + || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { + TryCand.Reason = NodeOrder; + } } } @@ -2789,20 +2909,20 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand, /// DAG building. To adjust for the current scheduling location we need to /// maintain the number of vreg uses remaining to be top-scheduled. void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone, + const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand) { - ReadyQueue &Q = Zone.Available; - - DEBUG(Q.dump()); - // getMaxPressureDelta temporarily modifies the tracker. 
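The reworked tryCandidate runs zone-independent checks unconditionally but guards boundary-local tie-breakers (stalls, weak edges, resource balance, node order) behind SameBoundary, since those values are incomparable when Cand and TryCand come from opposite ends of the region. A skeleton of that gating; the checks shown are placeholders, not the real heuristics:

    #include <iostream>

    enum CandReason { NoCand, RegExcess, Cluster, Stall, NodeOrder };

    struct Cand {
      int SU = -1;
      bool AtTop = false;
      CandReason Reason = NoCand;
      int Stalls = 0;
    };

    struct Zone { bool Top = true; };

    // Zone is nullptr when Cand and TryCand come from different boundaries;
    // boundary-local tie-breakers only run when SameBoundary holds.
    static void tryCandidate(Cand &C, Cand &Try, Zone *Z) {
      if (C.Reason == NoCand) { // the first candidate always wins
        Try.Reason = NodeOrder;
        return;
      }
      // ... zone-independent heuristics (pressure excess, clustering) ...
      bool SameBoundary = Z != nullptr;
      if (SameBoundary) {
        // Stall cycles are measured against one zone's clock and cannot be
        // compared across Top and Bot.
        if (Try.Stalls < C.Stalls) {
          Try.Reason = Stall;
          return;
        }
      }
    }

    int main() {
      Cand Bot{1, false, NodeOrder, 2};
      Cand Top{2, true, NoCand, 0};
      tryCandidate(Bot, Top, nullptr); // cross-zone: stall check is skipped
      std::cout << (Top.Reason == NoCand ? "keep Bot" : "take Top") << '\n';
    }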
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); + ReadyQueue &Q = Zone.Available; for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { - SchedCandidate TryCand(Cand.Policy); - TryCand.SU = *I; - tryCandidate(Cand, TryCand, Zone, RPTracker, TempTracker); + SchedCandidate TryCand(ZonePolicy); + initCandidate(TryCand, *I, Zone.isTop(), RPTracker, TempTracker); + // Pass SchedBoundary only when comparing nodes from the same boundary. + SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; + tryCandidate(Cand, TryCand, ZoneArg); if (TryCand.Reason != NoCand) { // Initialize resource delta if needed in case future heuristics query it. if (TryCand.ResDelta == SchedResourceDelta()) @@ -2819,57 +2939,77 @@ SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) { // efficient, but also provides the best heuristics for CriticalPSets. if (SUnit *SU = Bot.pickOnlyChoice()) { IsTopNode = false; - DEBUG(dbgs() << "Pick Bot ONLY1\n"); + tracePick(Only1, false); return SU; } if (SUnit *SU = Top.pickOnlyChoice()) { IsTopNode = true; - DEBUG(dbgs() << "Pick Top ONLY1\n"); + tracePick(Only1, true); return SU; } - CandPolicy NoPolicy; - SchedCandidate BotCand(NoPolicy); - SchedCandidate TopCand(NoPolicy); // Set the bottom-up policy based on the state of the current bottom zone and // the instructions outside the zone, including the top zone. - setPolicy(BotCand.Policy, /*IsPostRA=*/false, Bot, &Top); + CandPolicy BotPolicy; + setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); // Set the top-down policy based on the state of the current top zone and // the instructions outside the zone, including the bottom zone. - setPolicy(TopCand.Policy, /*IsPostRA=*/false, Top, &Bot); - - // Prefer bottom scheduling when heuristics are silent. - pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); - assert(BotCand.Reason != NoCand && "failed to find the first candidate"); - - // If either Q has a single candidate that provides the least increase in - // Excess pressure, we can immediately schedule from that Q. - // - // RegionCriticalPSets summarizes the pressure within the scheduled region and - // affects picking from either Q. If scheduling in one direction must - // increase pressure for one of the excess PSets, then schedule in that - // direction first to provide more freedom in the other direction. - if ((BotCand.Reason == RegExcess && !BotCand.isRepeat(RegExcess)) - || (BotCand.Reason == RegCritical - && !BotCand.isRepeat(RegCritical))) - { - IsTopNode = false; - tracePick(BotCand, IsTopNode); - return BotCand.SU; + CandPolicy TopPolicy; + setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); + + // See if BotCand is still valid (because we previously scheduled from Top). + DEBUG(dbgs() << "Picking from Bot:\n"); + if (!BotCand.isValid() || BotCand.SU->isScheduled || + BotCand.Policy != BotPolicy) { + BotCand.reset(CandPolicy()); + pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); + assert(BotCand.Reason != NoCand && "failed to find the first candidate"); + } else { + DEBUG(traceCandidate(BotCand)); +#ifndef NDEBUG + if (VerifyScheduling) { + SchedCandidate TCand; + TCand.reset(CandPolicy()); + pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand); + assert(TCand.SU == BotCand.SU && + "Last pick result should correspond to re-picking right now"); + } +#endif } + // Check if the top Q has a better candidate. 
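pickNodeBidirectional now caches BotCand and TopCand across picks: scanning a ready queue is linear, so the previous winner is reused unless it was scheduled in the meantime or the zone policy changed, with VerifyScheduling re-picking to assert the cache is sound. A toy version of the cache-or-rescan decision; pickFromQueue and its lowest-number rule are purely illustrative:

    #include <iostream>
    #include <vector>

    struct SU { int Num; bool Scheduled = false; };

    struct Candidate {
      SU *S = nullptr;
      int Policy = 0;
      bool isValid() const { return S != nullptr; }
    };

    // Reuse the previous winner; only re-scan when it was invalidated.
    static SU *pickFromQueue(std::vector<SU *> &Q, int Policy,
                             Candidate &Cached) {
      if (Cached.isValid() && !Cached.S->Scheduled && Cached.Policy == Policy)
        return Cached.S; // cache hit: skip the scan
      Cached = Candidate();
      for (SU *S : Q) // full scan; lowest Num wins in this toy
        if (!S->Scheduled && (!Cached.isValid() || S->Num < Cached.S->Num))
          Cached = {S, Policy};
      return Cached.S;
    }

    int main() {
      SU A{3}, B{1}, C{2};
      std::vector<SU *> Q{&A, &B, &C};
      Candidate BotCand;
      std::cout << pickFromQueue(Q, 0, BotCand)->Num << '\n'; // 1 (scan)
      std::cout << pickFromQueue(Q, 0, BotCand)->Num << '\n'; // 1 (cached)
      B.Scheduled = true; // invalidates the cached candidate
      std::cout << pickFromQueue(Q, 0, BotCand)->Num << '\n'; // 2 (re-scan)
    }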
- pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); - assert(TopCand.Reason != NoCand && "failed to find the first candidate"); + DEBUG(dbgs() << "Picking from Top:\n"); + if (!TopCand.isValid() || TopCand.SU->isScheduled || + TopCand.Policy != TopPolicy) { + TopCand.reset(CandPolicy()); + pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); + assert(TopCand.Reason != NoCand && "failed to find the first candidate"); + } else { + DEBUG(traceCandidate(TopCand)); +#ifndef NDEBUG + if (VerifyScheduling) { + SchedCandidate TCand; + TCand.reset(CandPolicy()); + pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand); + assert(TCand.SU == TopCand.SU && + "Last pick result should correspond to re-picking right now"); + } +#endif + } - // Choose the queue with the most important (lowest enum) reason. - if (TopCand.Reason < BotCand.Reason) { - IsTopNode = true; - tracePick(TopCand, IsTopNode); - return TopCand.SU; + // Pick best from BotCand and TopCand. + assert(BotCand.isValid()); + assert(TopCand.isValid()); + SchedCandidate Cand = BotCand; + TopCand.Reason = NoCand; + tryCandidate(Cand, TopCand, nullptr); + if (TopCand.Reason != NoCand) { + Cand.setBest(TopCand); + DEBUG(traceCandidate(Cand)); } - // Otherwise prefer the bottom candidate, in node order if all else failed. - IsTopNode = false; - tracePick(BotCand, IsTopNode); - return BotCand.SU; + + IsTopNode = Cand.AtTop; + tracePick(Cand); + return Cand.SU; } /// Pick the best node to balance the schedule. Implements MachineSchedStrategy. @@ -2885,27 +3025,25 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) { SU = Top.pickOnlyChoice(); if (!SU) { CandPolicy NoPolicy; - SchedCandidate TopCand(NoPolicy); - pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); + TopCand.reset(NoPolicy); + pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); assert(TopCand.Reason != NoCand && "failed to find a candidate"); - tracePick(TopCand, true); + tracePick(TopCand); SU = TopCand.SU; } IsTopNode = true; - } - else if (RegionPolicy.OnlyBottomUp) { + } else if (RegionPolicy.OnlyBottomUp) { SU = Bot.pickOnlyChoice(); if (!SU) { CandPolicy NoPolicy; - SchedCandidate BotCand(NoPolicy); - pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); + BotCand.reset(NoPolicy); + pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand); assert(BotCand.Reason != NoCand && "failed to find a candidate"); - tracePick(BotCand, false); + tracePick(BotCand); SU = BotCand.SU; } IsTopNode = false; - } - else { + } else { SU = pickNodeBidirectional(IsTopNode); } } while (SU->isScheduled); @@ -2957,8 +3095,7 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { Top.bumpNode(SU); if (SU->hasPhysRegUses) reschedulePhysRegCopies(SU, true); - } - else { + } else { SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle()); Bot.bumpNode(SU); if (SU->hasPhysRegDefs) @@ -2976,8 +3113,12 @@ static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C) { // data and pass it to later mutations. Have a single mutation that gathers // the interesting nodes in one pass. 
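The final step above replaces the old reason-enum comparison: the bottom winner is taken as the baseline, the top winner challenges it through the cross-zone tryCandidate (Zone == nullptr), and IsTopNode simply follows whichever candidate survives. A toy rendition of that control flow; the smaller-NodeNum rule stands in for the real comparison:

    #include <iostream>

    struct Cand {
      int SU = -1;
      bool AtTop = false;
      int Reason = 0; // 0 == NoCand
    };

    int main() {
      Cand BotCand{7, false, /*Reason=*/2};
      Cand TopCand{9, true, /*Reason=*/1};
      Cand Best = BotCand; // bottom is the baseline pick
      TopCand.Reason = 0;  // reset: a cross-zone win must be re-earned
      // Stand-in for tryCandidate(Best, TopCand, nullptr); here the top
      // node wins iff it has the smaller NodeNum, purely for illustration.
      if (TopCand.SU < Best.SU) {
        TopCand.Reason = 3;
        Best = TopCand; // Cand.setBest(TopCand)
      }
      bool IsTopNode = Best.AtTop;
      std::cout << "pick SU(" << Best.SU << ") from "
                << (IsTopNode ? "Top" : "Bot") << '\n';
    }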
DAG->addMutation(make_unique<CopyConstrain>(DAG->TII, DAG->TRI)); - if (EnableLoadCluster && DAG->TII->enableClusterLoads()) - DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI)); + if (EnableMemOpCluster) { + if (DAG->TII->enableClusterLoads()) + DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI)); + if (DAG->TII->enableClusterStores()) + DAG->addMutation(make_unique<StoreClusterMutation>(DAG->TII, DAG->TRI)); + } if (EnableMacroFusion) DAG->addMutation(make_unique<MacroFusion>(*DAG->TII, *DAG->TRI)); return DAG; @@ -3065,12 +3206,10 @@ void PostGenericScheduler::tryCandidate(SchedCandidate &Cand, void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) { ReadyQueue &Q = Top.Available; - - DEBUG(Q.dump()); - for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { SchedCandidate TryCand(Cand.Policy); TryCand.SU = *I; + TryCand.AtTop = true; TryCand.initResourceDelta(DAG, SchedModel); tryCandidate(Cand, TryCand); if (TryCand.Reason != NoCand) { @@ -3089,7 +3228,9 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) { SUnit *SU; do { SU = Top.pickOnlyChoice(); - if (!SU) { + if (SU) { + tracePick(Only1, true); + } else { CandPolicy NoPolicy; SchedCandidate TopCand(NoPolicy); // Set the top-down policy based on the state of the current top zone and @@ -3097,7 +3238,7 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) { setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr); pickNodeFromQueue(TopCand); assert(TopCand.Reason != NoCand && "failed to find a candidate"); - tracePick(TopCand, true); + tracePick(TopCand); SU = TopCand.SU; } } while (SU->isScheduled); @@ -3285,8 +3426,7 @@ public: TopQ.pop(); } while (SU->isScheduled); IsTopNode = true; - } - else { + } else { do { if (BottomQ.empty()) return nullptr; SU = BottomQ.top(); |