1 files changed, 179 insertions, 76 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index da86bbf..34886c4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -12,9 +12,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "SIMachineScheduler.h"
 #include "AMDGPU.h"
 #include "SIInstrInfo.h"
-#include "SIMachineScheduler.h"
 #include "SIRegisterInfo.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
@@ -38,7 +38,7 @@
 
 using namespace llvm;
 
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
 
 // This scheduler implements a different scheduling algorithm than
 // GenericScheduler.
@@ -539,21 +539,30 @@ void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
   Preds.push_back(Pred);
 
   assert(none_of(Succs,
-                 [=](SIScheduleBlock *S) { return PredID == S->getID(); }) &&
+                 [=](std::pair<SIScheduleBlock*,
+                     SIScheduleBlockLinkKind> S) {
+                   return PredID == S.first->getID();
+                    }) &&
          "Loop in the Block Graph!");
 }
 
-void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
+void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
+                              SIScheduleBlockLinkKind Kind) {
   unsigned SuccID = Succ->getID();
 
   // Check if not already predecessor.
-  for (SIScheduleBlock* S : Succs) {
-    if (SuccID == S->getID())
+  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
+    if (SuccID == S.first->getID()) {
+      if (S.second == SIScheduleBlockLinkKind::NoData &&
+          Kind == SIScheduleBlockLinkKind::Data)
+        S.second = Kind;
       return;
+    }
   }
   if (Succ->isHighLatencyBlock())
     ++NumHighLatencySuccessors;
-  Succs.push_back(Succ);
+  Succs.push_back(std::make_pair(Succ, Kind));
+
   assert(none_of(Preds,
                  [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
          "Loop in the Block Graph!");
@@ -573,8 +582,10 @@ void SIScheduleBlock::printDebug(bool full) {
   }
 
   dbgs() << "\nSuccessors:\n";
-  for (SIScheduleBlock* S : Succs) {
-    S->printDebug(false);
+  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
+    if (S.second == SIScheduleBlockLinkKind::Data)
+      dbgs() << "(Data Dep) ";
+    S.first->printDebug(false);
   }
 
   if (Scheduled) {
@@ -651,11 +662,21 @@ void SIScheduleBlockCreator::colorHighLatenciesAlone() {
   }
 }
 
+static bool
+hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
+  for (const auto &PredDep : SU.Preds) {
+    if (PredDep.getSUnit() == &FromSU &&
+        PredDep.getKind() == llvm::SDep::Data)
+      return true;
+  }
+  return false;
+}
+
 void SIScheduleBlockCreator::colorHighLatenciesGroups() {
   unsigned DAGSize = DAG->SUnits.size();
   unsigned NumHighLatencies = 0;
   unsigned GroupSize;
-  unsigned Color = NextReservedID;
+  int Color = NextReservedID;
   unsigned Count = 0;
   std::set<unsigned> FormingGroup;
 
@@ -675,35 +696,102 @@ void SIScheduleBlockCreator::colorHighLatenciesGroups() {
   else
     GroupSize = 4;
 
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    SUnit *SU = &DAG->SUnits[i];
-    if (DAG->IsHighLatencySU[SU->NodeNum]) {
+  for (unsigned SUNum : DAG->TopDownIndex2SU) {
+    const SUnit &SU = DAG->SUnits[SUNum];
+    if (DAG->IsHighLatencySU[SU.NodeNum]) {
       unsigned CompatibleGroup = true;
-      unsigned ProposedColor = Color;
+      int ProposedColor = Color;
+      std::vector<int> AdditionalElements;
+
+      // We don't want to put in the same block
+      // two high latency instructions that depend
+      // on each other.
+      // One way would be to check canAddEdge
+      // in both directions, but that currently is not
+      // enough because there the high latency order is
+      // enforced (via links).
+      // Instead, look at the dependencies between the
+      // high latency instructions and deduce if it is
+      // a data dependency or not.
       for (unsigned j : FormingGroup) {
-        // TODO: Currently CompatibleGroup will always be false,
-        // because the graph enforces the load order. This
-        // can be fixed, but as keeping the load order is often
-        // good for performance that causes a performance hit (both
-        // the default scheduler and this scheduler).
-        // When this scheduler determines a good load order,
-        // this can be fixed.
-        if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
-            !DAG->canAddEdge(&DAG->SUnits[j], SU))
+        bool HasSubGraph;
+        std::vector<int> SubGraph;
+        // By construction (topological order), if SU and
+        // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
+        // in the parent graph of SU.
+#ifndef NDEBUG
+        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
+                                               HasSubGraph);
+        assert(!HasSubGraph);
+#endif
+        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
+                                               HasSubGraph);
+        if (!HasSubGraph)
+          continue; // No dependencies between each other
+        else if (SubGraph.size() > 5) {
+          // Too many elements would be required to be added to the block.
           CompatibleGroup = false;
+          break;
+        }
+        else {
+          // Check the type of dependency
+          for (unsigned k : SubGraph) {
+            // If in the path to join the two instructions,
+            // there is another high latency instruction,
+            // or instructions colored for another block
+            // abort the merge.
+            if (DAG->IsHighLatencySU[k] ||
+                (CurrentColoring[k] != ProposedColor &&
+                 CurrentColoring[k] != 0)) {
+              CompatibleGroup = false;
+              break;
+            }
+            // If one of the SU in the subgraph depends on the result of SU j,
+            // there'll be a data dependency.
+            if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
+              CompatibleGroup = false;
+              break;
+            }
+          }
+          if (!CompatibleGroup)
+            break;
+          // Same check for the SU
+          if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
+            CompatibleGroup = false;
+            break;
+          }
+          // Add all the required instructions to the block
+          // These cannot live in another block (because they
+          // depend (order dependency) on one of the
+          // instruction in the block, and are required for the
+          // high latency instruction we add.
+          AdditionalElements.insert(AdditionalElements.end(),
+                                    SubGraph.begin(), SubGraph.end());
+        }
+      }
+      if (CompatibleGroup) {
+        FormingGroup.insert(SU.NodeNum);
+        for (unsigned j : AdditionalElements)
+          CurrentColoring[j] = ProposedColor;
+        CurrentColoring[SU.NodeNum] = ProposedColor;
+        ++Count;
       }
-      if (!CompatibleGroup || ++Count == GroupSize) {
+      // Found one incompatible instruction,
+      // or has filled a big enough group.
+      // -> start a new one.
+      if (!CompatibleGroup) {
         FormingGroup.clear();
         Color = ++NextReservedID;
-        if (!CompatibleGroup) {
-          ProposedColor = Color;
-          FormingGroup.insert(SU->NodeNum);
-        }
+        ProposedColor = Color;
+        FormingGroup.insert(SU.NodeNum);
+        CurrentColoring[SU.NodeNum] = ProposedColor;
+        Count = 0;
+      } else if (Count == GroupSize) {
+        FormingGroup.clear();
+        Color = ++NextReservedID;
+        ProposedColor = Color;
         Count = 0;
-      } else {
-        FormingGroup.insert(SU->NodeNum);
       }
-      CurrentColoring[SU->NodeNum] = ProposedColor;
     }
   }
 }
@@ -835,6 +923,17 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
   unsigned DAGSize = DAG->SUnits.size();
   std::vector<int> PendingColoring = CurrentColoring;
 
+  assert(DAGSize >= 1 &&
+         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
+         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
+  // If there is no reserved block at all, do nothing. We don't want
+  // everything in one block.
+  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
+                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
+      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
+                        CurrentTopDownReservedDependencyColoring.end()) == 0)
+    return;
+
   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
     SUnit *SU = &DAG->SUnits[SUNum];
     std::set<unsigned> SUColors;
@@ -856,6 +955,9 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
         SUColors.insert(CurrentColoring[Succ->NodeNum]);
       SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
     }
+    // If there is only one child/parent block, and that block
+    // is not among the ones we are removing in this path, then
+    // merge the instruction to that block
     if (SUColors.size() == 1 && SUColorsPending.size() == 1)
       PendingColoring[SU->NodeNum] = *SUColors.begin();
     else // TODO: Attribute new colors depending on color
@@ -974,12 +1076,7 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
     SUnit *SU = &DAG->SUnits[SUNum];
     unsigned color = CurrentColoring[SU->NodeNum];
-    std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
-      if (Pos != ColorCount.end()) {
-        ++ColorCount[color];
-      } else {
-        ColorCount[color] = 1;
-      }
+     ++ColorCount[color];
   }
 
   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
@@ -1087,7 +1184,8 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
         continue;
       if (Node2CurrentBlock[Succ->NodeNum] != SUID)
-        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
+        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
+                                     SuccDep.isCtrl() ? NoData : Data);
     }
     for (SDep& PredDep : SU->Preds) {
       SUnit *Pred = PredDep.getSUnit();
@@ -1281,10 +1379,8 @@ void SIScheduleBlockCreator::fillStats() {
       Block->Height = 0;
     else {
       unsigned Height = 0;
-      for (SIScheduleBlock *Succ : Block->getSuccs()) {
-        if (Height < Succ->Height + 1)
-          Height = Succ->Height + 1;
-      }
+      for (const auto &Succ : Block->getSuccs())
+        Height = std::min(Height, Succ.first->Height + 1);
       Block->Height = Height;
     }
   }
@@ -1331,13 +1427,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
         continue;
 
       int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
-      std::map<unsigned, unsigned>::iterator RegPos =
-        LiveOutRegsNumUsages[PredID].find(Reg);
-      if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
-        ++LiveOutRegsNumUsages[PredID][Reg];
-      } else {
-        LiveOutRegsNumUsages[PredID][Reg] = 1;
-      }
+      ++LiveOutRegsNumUsages[PredID][Reg];
     }
   }
 
@@ -1361,6 +1451,24 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
   std::set<unsigned> InRegs = DAG->getInRegs();
   addLiveRegs(InRegs);
 
+  // Increase LiveOutRegsNumUsages for blocks
+  // producing registers consumed in another
+  // scheduling region.
+  for (unsigned Reg : DAG->getOutRegs()) {
+    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+      // Do reverse traversal
+      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
+      SIScheduleBlock *Block = Blocks[ID];
+      const std::set<unsigned> &OutRegs = Block->getOutRegs();
+
+      if (OutRegs.find(Reg) == OutRegs.end())
+        continue;
+
+      ++LiveOutRegsNumUsages[ID][Reg];
+      break;
+    }
+  }
+
   // Fill LiveRegsConsumers for regs that were already
   // defined before scheduling.
   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
@@ -1377,12 +1485,8 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
         }
       }
 
-      if (!Found) {
-        if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
-          LiveRegsConsumers[Reg] = 1;
-        else
-          ++LiveRegsConsumers[Reg];
-      }
+      if (!Found)
+        ++LiveRegsConsumers[Reg];
     }
   }
 
@@ -1403,6 +1507,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
     for (SIScheduleBlock* Block : BlocksScheduled) {
       dbgs() << ' ' << Block->getID();
     }
+    dbgs() << '\n';
   );
 }
 
@@ -1464,8 +1569,8 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
                         VregCurrentUsage, SregCurrentUsage);
   if (VregCurrentUsage > maxVregUsage)
     maxVregUsage = VregCurrentUsage;
-  if (VregCurrentUsage > maxSregUsage)
-    maxSregUsage = VregCurrentUsage;
+  if (SregCurrentUsage > maxSregUsage)
+    maxSregUsage = SregCurrentUsage;
   DEBUG(
     dbgs() << "Picking New Blocks\n";
     dbgs() << "Available: ";
@@ -1556,17 +1661,13 @@ void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
 }
 
 void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
-  for (SIScheduleBlock* Block : Parent->getSuccs()) {
-    --BlockNumPredsLeft[Block->getID()];
-    if (BlockNumPredsLeft[Block->getID()] == 0) {
-      ReadyBlocks.push_back(Block);
-    }
-    // TODO: Improve check. When the dependency between the high latency
-    // instructions and the instructions of the other blocks are WAR or WAW
-    // there will be no wait triggered. We would like these cases to not
-    // update LastPosHighLatencyParentScheduled.
-    if (Parent->isHighLatencyBlock())
-      LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
+  for (const auto &Block : Parent->getSuccs()) {
+    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
+      ReadyBlocks.push_back(Block.first);
+
+    if (Parent->isHighLatencyBlock() &&
+        Block.second == SIScheduleBlockLinkKind::Data)
+      LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
   }
 }
 
@@ -1578,12 +1679,10 @@ void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
        LiveOutRegsNumUsages[Block->getID()].begin(),
        E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
     std::pair<unsigned, unsigned> RegP = *RegI;
-    if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
-      LiveRegsConsumers[RegP.first] = RegP.second;
-    else {
-      assert(LiveRegsConsumers[RegP.first] == 0);
-      LiveRegsConsumers[RegP.first] += RegP.second;
-    }
+    // We produce this register, thus it must not be previously alive.
+    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
+           LiveRegsConsumers[RegP.first] == 0);
+    LiveRegsConsumers[RegP.first] += RegP.second;
   }
   if (LastPosHighLatencyParentScheduled[Block->getID()] >
         (unsigned)LastPosWaitedHighLatency)
@@ -1825,7 +1924,9 @@ void SIScheduleDAGMI::schedule()
   // if VGPR usage is extremely high, try other good performing variants
   // which could lead to lower VGPR usage
   if (Best.MaxVGPRUsage > 180) {
-    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+    static const std::pair<SISchedulerBlockCreatorVariant,
+                           SISchedulerBlockSchedulerVariant>
+        Variants[] = {
       { LatenciesAlone, BlockRegUsageLatency },
 //      { LatenciesAlone, BlockRegUsage },
       { LatenciesGrouped, BlockLatencyRegUsage },
@@ -1844,7 +1945,9 @@ void SIScheduleDAGMI::schedule()
   // if VGPR usage is still extremely high, we may spill. Try other variants
   // which are less performing, but that could lead to lower VGPR usage.
   if (Best.MaxVGPRUsage > 200) {
-    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+    static const std::pair<SISchedulerBlockCreatorVariant,
+                           SISchedulerBlockSchedulerVariant>
+        Variants[] = {
 //      { LatenciesAlone, BlockRegUsageLatency },
       { LatenciesAlone, BlockRegUsage },
 //      { LatenciesGrouped, BlockLatencyRegUsage },