1 files changed, 129 insertions, 76 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index ecd46b9..a7c8166 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -20,17 +20,13 @@
 
 using namespace llvm;
 
-static cl::opt<bool> EnableSpillSGPRToVGPR(
-  "amdgpu-spill-sgpr-to-vgpr",
-  cl::desc("Enable spilling VGPRs to SGPRs"),
-  cl::ReallyHidden,
-  cl::init(true));
-
 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : AMDGPUMachineFunction(MF),
     TIDReg(AMDGPU::NoRegister),
-    ScratchRSrcReg(AMDGPU::NoRegister),
-    ScratchWaveOffsetReg(AMDGPU::NoRegister),
+    ScratchRSrcReg(AMDGPU::PRIVATE_RSRC_REG),
+    ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG),
+    FrameOffsetReg(AMDGPU::FP_REG),
+    StackPtrOffsetReg(AMDGPU::SP_REG),
     PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
     DispatchPtrUserSGPR(AMDGPU::NoRegister),
     QueuePtrUserSGPR(AMDGPU::NoRegister),
@@ -46,14 +42,17 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
     WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
     PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
+    WorkItemIDXVGPR(AMDGPU::NoRegister),
+    WorkItemIDYVGPR(AMDGPU::NoRegister),
+    WorkItemIDZVGPR(AMDGPU::NoRegister),
     PSInputAddr(0),
+    PSInputEnable(0),
     ReturnsVoid(true),
     FlatWorkGroupSizes(0, 0),
     WavesPerEU(0, 0),
     DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
     DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
     LDSWaveSpillSize(0),
-    PSInputEna(0),
     NumUserSGPRs(0),
     NumSystemSGPRs(0),
     HasSpilledSGPRs(false),
@@ -78,44 +77,83 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     WorkItemIDX(false),
     WorkItemIDY(false),
     WorkItemIDZ(false),
-    PrivateMemoryInputPtr(false) {
+    ImplicitBufferPtr(false) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   const Function *F = MF.getFunction();
+  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
+  WavesPerEU = ST.getWavesPerEU(*F);
 
-  PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
+  if (!isEntryFunction()) {
+    // Non-entry functions have no special inputs for now, other registers
+    // required for scratch access.
+    ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+    ScratchWaveOffsetReg = AMDGPU::SGPR4;
+    FrameOffsetReg = AMDGPU::SGPR5;
+    StackPtrOffsetReg = AMDGPU::SGPR32;
 
-  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+    // FIXME: Not really a system SGPR.
+    PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg;
+  }
 
-  if (!AMDGPU::isShader(F->getCallingConv())) {
-    KernargSegmentPtr = true;
+  CallingConv::ID CC = F->getCallingConv();
+  if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
+    KernargSegmentPtr = !F->arg_empty();
     WorkGroupIDX = true;
     WorkItemIDX = true;
+  } else if (CC == CallingConv::AMDGPU_PS) {
+    PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
   }
 
-  if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue())
+  if (ST.debuggerEmitPrologue()) {
+    // Enable everything.
+    WorkGroupIDX = true;
     WorkGroupIDY = true;
-
-  if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue())
     WorkGroupIDZ = true;
-
-  if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue())
+    WorkItemIDX = true;
     WorkItemIDY = true;
-
-  if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue())
     WorkItemIDZ = true;
+  } else {
+    if (F->hasFnAttribute("amdgpu-work-group-id-x"))
+      WorkGroupIDX = true;
 
-  // X, XY, and XYZ are the only supported combinations, so make sure Y is
-  // enabled if Z is.
-  if (WorkItemIDZ)
-    WorkItemIDY = true;
+    if (F->hasFnAttribute("amdgpu-work-group-id-y"))
+      WorkGroupIDY = true;
+
+    if (F->hasFnAttribute("amdgpu-work-group-id-z"))
+      WorkGroupIDZ = true;
+
+    if (F->hasFnAttribute("amdgpu-work-item-id-x"))
+      WorkItemIDX = true;
+
+    if (F->hasFnAttribute("amdgpu-work-item-id-y"))
+      WorkItemIDY = true;
 
+    if (F->hasFnAttribute("amdgpu-work-item-id-z"))
+      WorkItemIDZ = true;
+  }
+
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
   bool MaySpill = ST.isVGPRSpillingEnabled(*F);
   bool HasStackObjects = FrameInfo.hasStackObjects();
 
-  if (HasStackObjects || MaySpill)
-    PrivateSegmentWaveByteOffset = true;
+  if (isEntryFunction()) {
+    // X, XY, and XYZ are the only supported combinations, so make sure Y is
+    // enabled if Z is.
+    if (WorkItemIDZ)
+      WorkItemIDY = true;
+
+    if (HasStackObjects || MaySpill) {
+      PrivateSegmentWaveByteOffset = true;
 
-  if (ST.isAmdCodeObjectV2(MF)) {
+      // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
+      if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
+          (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
+        PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5;
+    }
+  }
+
+  bool IsCOV2 = ST.isAmdCodeObjectV2(MF);
+  if (IsCOV2) {
     if (HasStackObjects || MaySpill)
       PrivateSegmentBuffer = true;
 
@@ -129,18 +167,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
       DispatchID = true;
   } else if (ST.isMesaGfxShader(MF)) {
     if (HasStackObjects || MaySpill)
-      PrivateMemoryInputPtr = true;
+      ImplicitBufferPtr = true;
   }
 
-  // We don't need to worry about accessing spills with flat instructions.
-  // TODO: On VI where we must use flat for global, we should be able to omit
-  // this if it is never used for generic access.
-  if (HasStackObjects && ST.getGeneration() >= SISubtarget::SEA_ISLANDS &&
-      ST.isAmdHsaOS())
-    FlatScratchInit = true;
+  if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr"))
+    KernargSegmentPtr = true;
 
-  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
-  WavesPerEU = ST.getWavesPerEU(*F);
+  if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) {
+    // TODO: This could be refined a lot. The attribute is a poor way of
+    // detecting calls that may require it before argument lowering.
+    if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch"))
+      FlatScratchInit = true;
+  }
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -186,52 +224,67 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
   return FlatScratchInitUserSGPR;
 }
 
-unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) {
-  PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg(
+unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
+  ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg(
     getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
   NumUserSGPRs += 2;
-  return PrivateMemoryPtrUserSGPR;
+  return ImplicitBufferPtrUserSGPR;
 }
 
-SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
-                                                       MachineFunction *MF,
-                                                       unsigned FrameIndex,
-                                                       unsigned SubIdx) {
-  if (!EnableSpillSGPRToVGPR)
-    return SpilledReg();
-
-  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
-
-  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
-  MachineRegisterInfo &MRI = MF->getRegInfo();
-  int64_t Offset = FrameInfo.getObjectOffset(FrameIndex);
-  Offset += SubIdx * 4;
+/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
+bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
+                                                    int FI) {
+  std::vector<SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI];
 
-  unsigned LaneVGPRIdx = Offset / (64 * 4);
-  unsigned Lane = (Offset / 4) % 64;
+  // This has already been allocated.
+  if (!SpillLanes.empty())
+    return true;
 
-  struct SpilledReg Spill;
-  Spill.Lane = Lane;
-
-  if (!LaneVGPRs.count(LaneVGPRIdx)) {
-    unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass,
-                                                *MF);
-
-    if (LaneVGPR == AMDGPU::NoRegister)
-      // We have no VGPRs left for spilling SGPRs.
-      return Spill;
-
-    LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
-
-    // Add this register as live-in to all blocks to avoid machine verifer
-    // complaining about use of an undefined physical register.
-    for (MachineFunction::iterator BI = MF->begin(), BE = MF->end();
-         BI != BE; ++BI) {
-      BI->addLiveIn(LaneVGPR);
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  unsigned WaveSize = ST.getWavefrontSize();
+
+  unsigned Size = FrameInfo.getObjectSize(FI);
+  assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size");
+  assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
+
+  int NumLanes = Size / 4;
+
+  // Make sure to handle the case where a wide SGPR spill may span between two
+  // VGPRs.
+  for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
+    unsigned LaneVGPR;
+    unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);
+
+    if (VGPRIndex == 0) {
+      LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
+      if (LaneVGPR == AMDGPU::NoRegister) {
+        // We have no VGPRs left for spilling SGPRs. Reset because we won't
+        // partially spill the SGPR to VGPRs.
+        SGPRToVGPRSpills.erase(FI);
+        NumVGPRSpillLanes -= I;
+        return false;
+      }
+
+      SpillVGPRs.push_back(LaneVGPR);
+
+      // Add this register as live-in to all blocks to avoid machine verifer
+      // complaining about use of an undefined physical register.
+      for (MachineBasicBlock &BB : MF)
+        BB.addLiveIn(LaneVGPR);
+    } else {
+      LaneVGPR = SpillVGPRs.back();
     }
+
+    SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
   }
 
-  Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
-  return Spill;
+  return true;
+}
+
+void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
+  for (auto &R : SGPRToVGPRSpills)
+    MFI.RemoveStackObject(R.first);
 }