diff options
Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 255 |
1 files changed, 179 insertions, 76 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index da86bbf..34886c4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -12,9 +12,9 @@ // //===----------------------------------------------------------------------===// +#include "SIMachineScheduler.h" #include "AMDGPU.h" #include "SIInstrInfo.h" -#include "SIMachineScheduler.h" #include "SIRegisterInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -38,7 +38,7 @@ using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" // This scheduler implements a different scheduling algorithm than // GenericScheduler. @@ -539,21 +539,30 @@ void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { Preds.push_back(Pred); assert(none_of(Succs, - [=](SIScheduleBlock *S) { return PredID == S->getID(); }) && + [=](std::pair<SIScheduleBlock*, + SIScheduleBlockLinkKind> S) { + return PredID == S.first->getID(); + }) && "Loop in the Block Graph!"); } -void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) { +void SIScheduleBlock::addSucc(SIScheduleBlock *Succ, + SIScheduleBlockLinkKind Kind) { unsigned SuccID = Succ->getID(); // Check if not already predecessor. - for (SIScheduleBlock* S : Succs) { - if (SuccID == S->getID()) + for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) { + if (SuccID == S.first->getID()) { + if (S.second == SIScheduleBlockLinkKind::NoData && + Kind == SIScheduleBlockLinkKind::Data) + S.second = Kind; return; + } } if (Succ->isHighLatencyBlock()) ++NumHighLatencySuccessors; - Succs.push_back(Succ); + Succs.push_back(std::make_pair(Succ, Kind)); + assert(none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!"); @@ -573,8 +582,10 @@ void SIScheduleBlock::printDebug(bool full) { } dbgs() << "\nSuccessors:\n"; - for (SIScheduleBlock* S : Succs) { - S->printDebug(false); + for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) { + if (S.second == SIScheduleBlockLinkKind::Data) + dbgs() << "(Data Dep) "; + S.first->printDebug(false); } if (Scheduled) { @@ -651,11 +662,21 @@ void SIScheduleBlockCreator::colorHighLatenciesAlone() { } } +static bool +hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) { + for (const auto &PredDep : SU.Preds) { + if (PredDep.getSUnit() == &FromSU && + PredDep.getKind() == llvm::SDep::Data) + return true; + } + return false; +} + void SIScheduleBlockCreator::colorHighLatenciesGroups() { unsigned DAGSize = DAG->SUnits.size(); unsigned NumHighLatencies = 0; unsigned GroupSize; - unsigned Color = NextReservedID; + int Color = NextReservedID; unsigned Count = 0; std::set<unsigned> FormingGroup; @@ -675,35 +696,102 @@ void SIScheduleBlockCreator::colorHighLatenciesGroups() { else GroupSize = 4; - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[i]; - if (DAG->IsHighLatencySU[SU->NodeNum]) { + for (unsigned SUNum : DAG->TopDownIndex2SU) { + const SUnit &SU = DAG->SUnits[SUNum]; + if (DAG->IsHighLatencySU[SU.NodeNum]) { unsigned CompatibleGroup = true; - unsigned ProposedColor = Color; + int ProposedColor = Color; + std::vector<int> AdditionalElements; + + // We don't want to put in the same block + // two high latency instructions that depend + // on each other. + // One way would be to check canAddEdge + // in both directions, but that currently is not + // enough because there the high latency order is + // enforced (via links). + // Instead, look at the dependencies between the + // high latency instructions and deduce if it is + // a data dependency or not. for (unsigned j : FormingGroup) { - // TODO: Currently CompatibleGroup will always be false, - // because the graph enforces the load order. This - // can be fixed, but as keeping the load order is often - // good for performance that causes a performance hit (both - // the default scheduler and this scheduler). - // When this scheduler determines a good load order, - // this can be fixed. - if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) || - !DAG->canAddEdge(&DAG->SUnits[j], SU)) + bool HasSubGraph; + std::vector<int> SubGraph; + // By construction (topological order), if SU and + // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary + // in the parent graph of SU. +#ifndef NDEBUG + SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], + HasSubGraph); + assert(!HasSubGraph); +#endif + SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, + HasSubGraph); + if (!HasSubGraph) + continue; // No dependencies between each other + else if (SubGraph.size() > 5) { + // Too many elements would be required to be added to the block. CompatibleGroup = false; + break; + } + else { + // Check the type of dependency + for (unsigned k : SubGraph) { + // If in the path to join the two instructions, + // there is another high latency instruction, + // or instructions colored for another block + // abort the merge. + if (DAG->IsHighLatencySU[k] || + (CurrentColoring[k] != ProposedColor && + CurrentColoring[k] != 0)) { + CompatibleGroup = false; + break; + } + // If one of the SU in the subgraph depends on the result of SU j, + // there'll be a data dependency. + if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) { + CompatibleGroup = false; + break; + } + } + if (!CompatibleGroup) + break; + // Same check for the SU + if (hasDataDependencyPred(SU, DAG->SUnits[j])) { + CompatibleGroup = false; + break; + } + // Add all the required instructions to the block + // These cannot live in another block (because they + // depend (order dependency) on one of the + // instruction in the block, and are required for the + // high latency instruction we add. + AdditionalElements.insert(AdditionalElements.end(), + SubGraph.begin(), SubGraph.end()); + } + } + if (CompatibleGroup) { + FormingGroup.insert(SU.NodeNum); + for (unsigned j : AdditionalElements) + CurrentColoring[j] = ProposedColor; + CurrentColoring[SU.NodeNum] = ProposedColor; + ++Count; } - if (!CompatibleGroup || ++Count == GroupSize) { + // Found one incompatible instruction, + // or has filled a big enough group. + // -> start a new one. + if (!CompatibleGroup) { FormingGroup.clear(); Color = ++NextReservedID; - if (!CompatibleGroup) { - ProposedColor = Color; - FormingGroup.insert(SU->NodeNum); - } + ProposedColor = Color; + FormingGroup.insert(SU.NodeNum); + CurrentColoring[SU.NodeNum] = ProposedColor; + Count = 0; + } else if (Count == GroupSize) { + FormingGroup.clear(); + Color = ++NextReservedID; + ProposedColor = Color; Count = 0; - } else { - FormingGroup.insert(SU->NodeNum); } - CurrentColoring[SU->NodeNum] = ProposedColor; } } } @@ -835,6 +923,17 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { unsigned DAGSize = DAG->SUnits.size(); std::vector<int> PendingColoring = CurrentColoring; + assert(DAGSize >= 1 && + CurrentBottomUpReservedDependencyColoring.size() == DAGSize && + CurrentTopDownReservedDependencyColoring.size() == DAGSize); + // If there is no reserved block at all, do nothing. We don't want + // everything in one block. + if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(), + CurrentBottomUpReservedDependencyColoring.end()) == 0 && + *std::max_element(CurrentTopDownReservedDependencyColoring.begin(), + CurrentTopDownReservedDependencyColoring.end()) == 0) + return; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { SUnit *SU = &DAG->SUnits[SUNum]; std::set<unsigned> SUColors; @@ -856,6 +955,9 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { SUColors.insert(CurrentColoring[Succ->NodeNum]); SUColorsPending.insert(PendingColoring[Succ->NodeNum]); } + // If there is only one child/parent block, and that block + // is not among the ones we are removing in this path, then + // merge the instruction to that block if (SUColors.size() == 1 && SUColorsPending.size() == 1) PendingColoring[SU->NodeNum] = *SUColors.begin(); else // TODO: Attribute new colors depending on color @@ -974,12 +1076,7 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { for (unsigned SUNum : DAG->BottomUpIndex2SU) { SUnit *SU = &DAG->SUnits[SUNum]; unsigned color = CurrentColoring[SU->NodeNum]; - std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color); - if (Pos != ColorCount.end()) { - ++ColorCount[color]; - } else { - ColorCount[color] = 1; - } + ++ColorCount[color]; } for (unsigned SUNum : DAG->BottomUpIndex2SU) { @@ -1087,7 +1184,8 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) continue; if (Node2CurrentBlock[Succ->NodeNum] != SUID) - CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]); + CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]], + SuccDep.isCtrl() ? NoData : Data); } for (SDep& PredDep : SU->Preds) { SUnit *Pred = PredDep.getSUnit(); @@ -1281,10 +1379,8 @@ void SIScheduleBlockCreator::fillStats() { Block->Height = 0; else { unsigned Height = 0; - for (SIScheduleBlock *Succ : Block->getSuccs()) { - if (Height < Succ->Height + 1) - Height = Succ->Height + 1; - } + for (const auto &Succ : Block->getSuccs()) + Height = std::min(Height, Succ.first->Height + 1); Block->Height = Height; } } @@ -1331,13 +1427,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, continue; int PredID = BlocksStruct.TopDownIndex2Block[topoInd]; - std::map<unsigned, unsigned>::iterator RegPos = - LiveOutRegsNumUsages[PredID].find(Reg); - if (RegPos != LiveOutRegsNumUsages[PredID].end()) { - ++LiveOutRegsNumUsages[PredID][Reg]; - } else { - LiveOutRegsNumUsages[PredID][Reg] = 1; - } + ++LiveOutRegsNumUsages[PredID][Reg]; } } @@ -1361,6 +1451,24 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, std::set<unsigned> InRegs = DAG->getInRegs(); addLiveRegs(InRegs); + // Increase LiveOutRegsNumUsages for blocks + // producing registers consumed in another + // scheduling region. + for (unsigned Reg : DAG->getOutRegs()) { + for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { + // Do reverse traversal + int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i]; + SIScheduleBlock *Block = Blocks[ID]; + const std::set<unsigned> &OutRegs = Block->getOutRegs(); + + if (OutRegs.find(Reg) == OutRegs.end()) + continue; + + ++LiveOutRegsNumUsages[ID][Reg]; + break; + } + } + // Fill LiveRegsConsumers for regs that were already // defined before scheduling. for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { @@ -1377,12 +1485,8 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, } } - if (!Found) { - if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end()) - LiveRegsConsumers[Reg] = 1; - else - ++LiveRegsConsumers[Reg]; - } + if (!Found) + ++LiveRegsConsumers[Reg]; } } @@ -1403,6 +1507,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, for (SIScheduleBlock* Block : BlocksScheduled) { dbgs() << ' ' << Block->getID(); } + dbgs() << '\n'; ); } @@ -1464,8 +1569,8 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { VregCurrentUsage, SregCurrentUsage); if (VregCurrentUsage > maxVregUsage) maxVregUsage = VregCurrentUsage; - if (VregCurrentUsage > maxSregUsage) - maxSregUsage = VregCurrentUsage; + if (SregCurrentUsage > maxSregUsage) + maxSregUsage = SregCurrentUsage; DEBUG( dbgs() << "Picking New Blocks\n"; dbgs() << "Available: "; @@ -1556,17 +1661,13 @@ void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, } void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { - for (SIScheduleBlock* Block : Parent->getSuccs()) { - --BlockNumPredsLeft[Block->getID()]; - if (BlockNumPredsLeft[Block->getID()] == 0) { - ReadyBlocks.push_back(Block); - } - // TODO: Improve check. When the dependency between the high latency - // instructions and the instructions of the other blocks are WAR or WAW - // there will be no wait triggered. We would like these cases to not - // update LastPosHighLatencyParentScheduled. - if (Parent->isHighLatencyBlock()) - LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled; + for (const auto &Block : Parent->getSuccs()) { + if (--BlockNumPredsLeft[Block.first->getID()] == 0) + ReadyBlocks.push_back(Block.first); + + if (Parent->isHighLatencyBlock() && + Block.second == SIScheduleBlockLinkKind::Data) + LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled; } } @@ -1578,12 +1679,10 @@ void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { LiveOutRegsNumUsages[Block->getID()].begin(), E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { std::pair<unsigned, unsigned> RegP = *RegI; - if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end()) - LiveRegsConsumers[RegP.first] = RegP.second; - else { - assert(LiveRegsConsumers[RegP.first] == 0); - LiveRegsConsumers[RegP.first] += RegP.second; - } + // We produce this register, thus it must not be previously alive. + assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || + LiveRegsConsumers[RegP.first] == 0); + LiveRegsConsumers[RegP.first] += RegP.second; } if (LastPosHighLatencyParentScheduled[Block->getID()] > (unsigned)LastPosWaitedHighLatency) @@ -1825,7 +1924,9 @@ void SIScheduleDAGMI::schedule() // if VGPR usage is extremely high, try other good performing variants // which could lead to lower VGPR usage if (Best.MaxVGPRUsage > 180) { - std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { + static const std::pair<SISchedulerBlockCreatorVariant, + SISchedulerBlockSchedulerVariant> + Variants[] = { { LatenciesAlone, BlockRegUsageLatency }, // { LatenciesAlone, BlockRegUsage }, { LatenciesGrouped, BlockLatencyRegUsage }, @@ -1844,7 +1945,9 @@ void SIScheduleDAGMI::schedule() // if VGPR usage is still extremely high, we may spill. Try other variants // which are less performing, but that could lead to lower VGPR usage. if (Best.MaxVGPRUsage > 200) { - std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = { + static const std::pair<SISchedulerBlockCreatorVariant, + SISchedulerBlockSchedulerVariant> + Variants[] = { // { LatenciesAlone, BlockRegUsageLatency }, { LatenciesAlone, BlockRegUsage }, // { LatenciesGrouped, BlockLatencyRegUsage }, |