1 files changed, 437 insertions, 208 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 126f624..ee1d5da 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -52,6 +52,7 @@
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -61,24 +62,24 @@
 
 using namespace llvm;
 
-namespace {
+#define DEBUG_TYPE "si-lower-control-flow"
 
-class SILowerControlFlowPass : public MachineFunctionPass {
+namespace {
 
+class SILowerControlFlow : public MachineFunctionPass {
 private:
   static const unsigned SkipThreshold = 12;
 
-  static char ID;
   const SIRegisterInfo *TRI;
   const SIInstrInfo *TII;
 
   bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
 
   void Skip(MachineInstr &From, MachineOperand &To);
-  void SkipIfDead(MachineInstr &MI);
+  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
 
   void If(MachineInstr &MI);
-  void Else(MachineInstr &MI);
+  void Else(MachineInstr &MI, bool ExecModified);
   void Break(MachineInstr &MI);
   void IfBreak(MachineInstr &MI);
   void ElseBreak(MachineInstr &MI);
@@ -88,56 +89,118 @@ private:
   void Kill(MachineInstr &MI);
   void Branch(MachineInstr &MI);
 
-  void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
-  void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
-  void IndirectSrc(MachineInstr &MI);
-  void IndirectDst(MachineInstr &MI);
+  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  std::pair<MachineBasicBlock *, MachineBasicBlock *>
+  splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+
+  void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
+                               const MachineRegisterInfo &MRI,
+                               const MachineInstr &MI,
+                               MachineBasicBlock &LoopBB,
+                               MachineBasicBlock &RemainderBB,
+                               unsigned SaveReg,
+                               const MachineOperand &IdxReg);
+
+  void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
+                              MachineInstr *MovRel,
+                              const MachineOperand &IdxReg,
+                              int Offset);
+
+  bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
+  std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
+                                                       int Offset) const;
+  bool indirectSrc(MachineInstr &MI);
+  bool indirectDst(MachineInstr &MI);
 
 public:
-  SILowerControlFlowPass(TargetMachine &tm) :
+  static char ID;
+
+  SILowerControlFlow() :
     MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   const char *getPassName() const override {
-    return "SI Lower control flow instructions";
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
+    return "SI Lower control flow pseudo instructions";
   }
 };
 
 } // End anonymous namespace
 
-char SILowerControlFlowPass::ID = 0;
+char SILowerControlFlow::ID = 0;
+
+INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
+                "SI lower control flow", false, false)
 
-FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
-  return new SILowerControlFlowPass(tm);
+char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
+
+
+FunctionPass *llvm::createSILowerControlFlowPass() {
+  return new SILowerControlFlow();
 }
 
-bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
-                                        MachineBasicBlock *To) {
+static bool opcodeEmitsNoInsts(unsigned Opc) {
+  switch (Opc) {
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::BUNDLE:
+  case TargetOpcode::CFI_INSTRUCTION:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::GC_LABEL:
+  case TargetOpcode::DBG_VALUE:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
+                                    MachineBasicBlock *To) {
+  if (From->succ_empty())
+    return false;
 
   unsigned NumInstr = 0;
+  MachineFunction *MF = From->getParent();
 
-  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
-       MBB = *MBB->succ_begin()) {
+  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
+       MBBI != End && MBBI != ToI; ++MBBI) {
+    MachineBasicBlock &MBB = *MBBI;
 
-    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          NumInstr < SkipThreshold && I != E; ++I) {
+      if (opcodeEmitsNoInsts(I->getOpcode()))
+        continue;
+
+      // When a uniform loop is inside non-uniform control flow, the branch
+      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
+      // when EXEC = 0. We should skip the loop lest it becomes infinite.
+      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
+          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
+        return true;
+
+      if (I->isInlineAsm()) {
+        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+        const char *AsmStr = I->getOperand(0).getSymbolName();
+
+        // inlineasm length estimate is number of bytes assuming the longest
+        // instruction.
+        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
+        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
+      } else {
+        ++NumInstr;
+      }
 
-      if (I->isBundle() || !I->isBundled())
-        if (++NumInstr >= SkipThreshold)
-          return true;
+      if (NumInstr >= SkipThreshold)
+        return true;
     }
   }
 
   return false;
 }
 
-void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
+void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
 
   if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
     return;
@@ -147,40 +210,44 @@ void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
     .addOperand(To);
 }
 
-void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
-
+bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction *MF = MBB.getParent();
 
-  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
-      ShaderType::PIXEL ||
+  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
       !shouldSkip(&MBB, &MBB.getParent()->back()))
-    return;
+    return false;
+
+  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
+  MBB.addSuccessor(SkipBB);
 
-  MachineBasicBlock::iterator Insert = &MI;
-  ++Insert;
+  const DebugLoc &DL = MI.getDebugLoc();
 
   // If the exec mask is non-zero, skip the next two instructions
-  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-    .addImm(3);
+  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addMBB(&NextBB);
+
+  MachineBasicBlock::iterator Insert = SkipBB->begin();
 
   // Exec mask is zero: Export to NULL target...
-  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
-          .addImm(0)
-          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
-          .addImm(0)
-          .addImm(1)
-          .addImm(1)
-          .addReg(AMDGPU::VGPR0)
-          .addReg(AMDGPU::VGPR0)
-          .addReg(AMDGPU::VGPR0)
-          .addReg(AMDGPU::VGPR0);
-
-  // ... and terminate wavefront
-  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
+    .addImm(0)
+    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+    .addImm(0)
+    .addImm(1)
+    .addImm(1)
+    .addReg(AMDGPU::VGPR0, RegState::Undef)
+    .addReg(AMDGPU::VGPR0, RegState::Undef)
+    .addReg(AMDGPU::VGPR0, RegState::Undef)
+    .addReg(AMDGPU::VGPR0, RegState::Undef);
+
+  // ... and terminate wavefront.
+  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+
+  return true;
 }
 
-void SILowerControlFlowPass::If(MachineInstr &MI) {
+void SILowerControlFlow::If(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
   unsigned Reg = MI.getOperand(0).getReg();
@@ -195,10 +262,15 @@ void SILowerControlFlowPass::If(MachineInstr &MI) {
 
   Skip(MI, MI.getOperand(2));
 
+  // Insert a pseudo terminator to help keep the verifier happy.
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    .addOperand(MI.getOperand(2))
+    .addReg(Reg);
+
   MI.eraseFromParent();
 }
 
-void SILowerControlFlowPass::Else(MachineInstr &MI) {
+void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
   unsigned Dst = MI.getOperand(0).getReg();
@@ -208,22 +280,36 @@ void SILowerControlFlowPass::Else(MachineInstr &MI) {
           TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
           .addReg(Src); // Saved EXEC
 
+  if (ExecModified) {
+    // Adjust the saved exec to account for the modifications during the flow
+    // block that contains the ELSE. This can happen when WQM mode is switched
+    // off.
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
+            .addReg(AMDGPU::EXEC)
+            .addReg(Dst);
+  }
+
   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
           .addReg(AMDGPU::EXEC)
           .addReg(Dst);
 
   Skip(MI, MI.getOperand(2));
 
+  // Insert a pseudo terminator to help keep the verifier happy.
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    .addOperand(MI.getOperand(2))
+    .addReg(Dst);
+
   MI.eraseFromParent();
 }
 
-void SILowerControlFlowPass::Break(MachineInstr &MI) {
+void SILowerControlFlow::Break(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
 
   unsigned Dst = MI.getOperand(0).getReg();
   unsigned Src = MI.getOperand(1).getReg();
- 
+
   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
           .addReg(AMDGPU::EXEC)
           .addReg(Src);
@@ -231,14 +317,14 @@ void SILowerControlFlowPass::Break(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
-void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
+void SILowerControlFlow::IfBreak(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
 
   unsigned Dst = MI.getOperand(0).getReg();
   unsigned Vcc = MI.getOperand(1).getReg();
   unsigned Src = MI.getOperand(2).getReg();
- 
+
   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
           .addReg(Vcc)
           .addReg(Src);
@@ -246,14 +332,14 @@ void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
-void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
+void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
 
   unsigned Dst = MI.getOperand(0).getReg();
   unsigned Saved = MI.getOperand(1).getReg();
   unsigned Src = MI.getOperand(2).getReg();
- 
+
   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
           .addReg(Saved)
           .addReg(Src);
@@ -261,7 +347,7 @@ void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
-void SILowerControlFlowPass::Loop(MachineInstr &MI) {
+void SILowerControlFlow::Loop(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
   unsigned Src = MI.getOperand(0).getReg();
@@ -276,7 +362,7 @@ void SILowerControlFlowPass::Loop(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
-void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
+void SILowerControlFlow::EndCf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
   unsigned Reg = MI.getOperand(0).getReg();
@@ -289,24 +375,24 @@ void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
-void SILowerControlFlowPass::Branch(MachineInstr &MI) {
-  if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
+void SILowerControlFlow::Branch(MachineInstr &MI) {
+  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
+  if (MBB == MI.getParent()->getNextNode())
     MI.eraseFromParent();
 
   // If these aren't equal, this is probably an infinite loop.
 }
 
-void SILowerControlFlowPass::Kill(MachineInstr &MI) {
+void SILowerControlFlow::Kill(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
   const MachineOperand &Op = MI.getOperand(0);
 
 #ifndef NDEBUG
-  const SIMachineFunctionInfo *MFI
-    = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
   // Kill is only allowed in pixel / geometry shaders.
-  assert(MFI->getShaderType() == ShaderType::PIXEL ||
-         MFI->getShaderType() == ShaderType::GEOMETRY);
+  assert(CallConv == CallingConv::AMDGPU_PS ||
+         CallConv == CallingConv::AMDGPU_GS);
 #endif
 
   // Clear this thread from the exec mask if the operand is negative
@@ -325,94 +411,209 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
-void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
+// All currently live registers must remain so in the remainder block.
+void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
+                                                 const MachineRegisterInfo &MRI,
+                                                 const MachineInstr &MI,
+                                                 MachineBasicBlock &LoopBB,
+                                                 MachineBasicBlock &RemainderBB,
+                                                 unsigned SaveReg,
+                                                 const MachineOperand &IdxReg) {
+  // Add reg defined in loop body.
+  RemainderLiveRegs.addReg(SaveReg);
+
+  if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
+    if (!Val->isUndef()) {
+      RemainderLiveRegs.addReg(Val->getReg());
+      LoopBB.addLiveIn(Val->getReg());
+    }
+  }
+
+  for (unsigned Reg : RemainderLiveRegs) {
+    if (MRI.isAllocatable(Reg))
+      RemainderBB.addLiveIn(Reg);
+  }
+
+  const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
+  if (!Src->isUndef())
+    LoopBB.addLiveIn(Src->getReg());
+
+  if (!IdxReg.isUndef())
+    LoopBB.addLiveIn(IdxReg.getReg());
+  LoopBB.sortUniqueLiveIns();
+}
+
+void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
+                                                DebugLoc DL,
+                                                MachineInstr *MovRel,
+                                                const MachineOperand &IdxReg,
+                                                int Offset) {
+  MachineBasicBlock::iterator I = LoopBB.begin();
+
+  // Read the next variant into VCC (lower 32 bits) <- also loop target
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
+    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
+
+  // Move index from VCC into M0
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+    .addReg(AMDGPU::VCC_LO);
+
+  // Compare the just read M0 value to all possible Idx values
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
+    .addReg(AMDGPU::M0)
+    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
+
+  // Update EXEC, save the original EXEC value to VCC
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+    .addReg(AMDGPU::VCC);
+
+  if (Offset != 0) {
+    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+      .addReg(AMDGPU::M0)
+      .addImm(Offset);
+  }
+
+  // Do the actual move
+  LoopBB.insert(I, MovRel);
+
+  // Update EXEC, switch all done bits to 0 and all todo bits to 1
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addReg(AMDGPU::VCC);
+
+  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addMBB(&LoopBB);
+}
+
+MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
+  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
+  MachineFunction *MF = MBB.getParent();
+
+  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
+
+  MF->insert(MBBI, SkipBB);
+
+  return SkipBB;
+}
+
+std::pair<MachineBasicBlock *, MachineBasicBlock *>
+SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I) {
+  MachineFunction *MF = MBB.getParent();
 
+  // To insert the loop we need to split the block. Move everything after this
+  // point to a new block, and insert a new empty block between the two.
+  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
+
+  MF->insert(MBBI, LoopBB);
+  MF->insert(MBBI, RemainderBB);
+
+  // Move the rest of the block into a new block.
+  RemainderBB->transferSuccessors(&MBB);
+  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+
+  MBB.addSuccessor(LoopBB);
+
+  return std::make_pair(LoopBB, RemainderBB);
+}
+
+// Returns true if a new block was inserted.
+bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
-  MachineBasicBlock::iterator I = MI;
+  MachineBasicBlock::iterator I(&MI);
 
-  unsigned Save = MI.getOperand(1).getReg();
-  unsigned Idx = MI.getOperand(3).getReg();
+  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
 
-  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
-    if (Offset) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
-              .addReg(Idx)
-              .addImm(Offset);
+  if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
+    if (Offset != 0) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
+        .addImm(Offset);
     } else {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
-              .addReg(Idx);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
     }
+
     MBB.insert(I, MovRel);
-  } else {
+    MI.eraseFromParent();
+    return false;
+  }
 
-    assert(AMDGPU::SReg_64RegClass.contains(Save));
-    assert(AMDGPU::VGPR_32RegClass.contains(Idx));
+  MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+  SaveOp->setIsDead(false);
+  unsigned Save = SaveOp->getReg();
 
-    // Save the EXEC mask
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
-            .addReg(AMDGPU::EXEC);
+  // Reading from a VGPR requires looping over all workitems in the wavefront.
+  assert(AMDGPU::SReg_64RegClass.contains(Save) &&
+         AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));
 
-    // Read the next variant into VCC (lower 32 bits) <- also loop target
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
-            AMDGPU::VCC_LO)
-            .addReg(Idx);
+  // Save the EXEC mask
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+    .addReg(AMDGPU::EXEC);
 
-    // Move index from VCC into M0
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
-            .addReg(AMDGPU::VCC_LO);
+  LivePhysRegs RemainderLiveRegs(TRI);
 
-    // Compare the just read M0 value to all possible Idx values
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
-      .addReg(AMDGPU::M0)
-      .addReg(Idx);
+  RemainderLiveRegs.addLiveOuts(MBB);
 
-    // Update EXEC, save the original EXEC value to VCC
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
-            .addReg(AMDGPU::VCC);
+  MachineBasicBlock *LoopBB;
+  MachineBasicBlock *RemainderBB;
 
-    if (Offset) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
-              .addReg(AMDGPU::M0)
-              .addImm(Offset);
-    }
-    // Do the actual move
-    MBB.insert(I, MovRel);
+  std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);
 
-    // Update EXEC, switch all done bits to 0 and all todo bits to 1
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
-            .addReg(AMDGPU::EXEC)
-            .addReg(AMDGPU::VCC);
+  for (const MachineInstr &Inst : reverse(*RemainderBB))
+    RemainderLiveRegs.stepBackward(Inst);
 
-    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-      .addImm(-7);
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  LoopBB->addSuccessor(RemainderBB);
+  LoopBB->addSuccessor(LoopBB);
 
-    // Restore EXEC
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-            .addReg(Save);
+  splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
+                          *RemainderBB, Save, *Idx);
+
+  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);
+
+  MachineBasicBlock::iterator First = RemainderBB->begin();
+  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+    .addReg(Save);
 
-  }
   MI.eraseFromParent();
+  return true;
 }
 
-/// \param @VecReg The register which holds element zero of the vector
-///                 being addressed into.
-/// \param[out] @Reg The base register to use in the indirect addressing instruction.
-/// \param[in,out] @Offset As an input, this is the constant offset part of the
-//                         indirect Index. e.g. v0 = v[VecReg + Offset]
-//                         As an output, this is a constant value that needs
-//                         to be added to the value stored in M0.
-void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg,
-                                                         unsigned &Reg,
-                                                         int &Offset) {
+/// \param @VecReg The register which holds element zero of the vector being
+///                 addressed into.
+//
+/// \param[in] @Idx The index operand from the movrel instruction. This must be
+// a register, but may be NoRegister.
+///
+/// \param[in] @Offset As an input, this is the constant offset part of the
+// indirect Index. e.g. v0 = v[VecReg + Offset] As an output, this is a constant
+// value that needs to be added to the value stored in M0.
+std::pair<unsigned, int>
+SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
   unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
   if (!SubReg)
     SubReg = VecReg;
 
+  const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
   const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
-  int RegIdx = TRI->getHWRegIndex(SubReg) + Offset;
+  int NumElts = SuperRC->getSize() / RC->getSize();
+
+  int BaseRegIdx = TRI->getHWRegIndex(SubReg);
+
+  // Skip out of bounds offsets, or else we would end up using an undefined
+  // register.
+  if (Offset >= NumElts)
+    return std::make_pair(RC->getRegister(BaseRegIdx), Offset);
 
+  int RegIdx = BaseRegIdx + Offset;
   if (RegIdx < 0) {
     Offset = RegIdx;
     RegIdx = 0;
@@ -420,77 +621,102 @@ void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg,
     Offset = 0;
   }
 
-  Reg = RC->getRegister(RegIdx);
+  unsigned Reg = RC->getRegister(RegIdx);
+  return std::make_pair(Reg, Offset);
 }
 
-void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
+  const DebugLoc &DL = MI.getDebugLoc();
 
   unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Vec = MI.getOperand(2).getReg();
-  int Off = MI.getOperand(4).getImm();
+  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
+  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
   unsigned Reg;
 
-  computeIndirectRegAndOffset(Vec, Reg, Off);
+  std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);
+
+  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+  if (Idx->getReg() == AMDGPU::NoRegister) {
+    // Only had a constant offset, copy the register directly.
+    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+      .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
+    MI.eraseFromParent();
+    return false;
+  }
 
   MachineInstr *MovRel =
     BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
-            .addReg(Reg)
-            .addReg(Vec, RegState::Implicit);
+    .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
+    .addReg(SrcVec->getReg(), RegState::Implicit);
 
-  LoadM0(MI, MovRel, Off);
+  return loadM0(MI, MovRel, Offset);
 }
 
-void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
+  const DebugLoc &DL = MI.getDebugLoc();
 
   unsigned Dst = MI.getOperand(0).getReg();
-  int Off = MI.getOperand(4).getImm();
-  unsigned Val = MI.getOperand(5).getReg();
+  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
   unsigned Reg;
 
-  computeIndirectRegAndOffset(Dst, Reg, Off);
+  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+  std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);
 
-  MachineInstr *MovRel = 
-    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
-            .addReg(Reg, RegState::Define)
-            .addReg(Val)
-            .addReg(Dst, RegState::Implicit);
+  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+  if (Idx->getReg() == AMDGPU::NoRegister) {
+    // Only had a constant offset, copy the register directly.
+    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
+      .addOperand(*Val);
+    MI.eraseFromParent();
+    return false;
+  }
+
+  MachineInstr *MovRel =
+    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
+    .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
+    .addReg(Dst, RegState::Implicit);
 
-  LoadM0(MI, MovRel, Off);
+  return loadM0(MI, MovRel, Offset);
 }
 
-bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
-  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
-  TRI =
-      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  TII = ST.getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   bool HaveKill = false;
-  bool NeedWQM = false;
   bool NeedFlat = false;
   unsigned Depth = 0;
 
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-       BI != BE; ++BI) {
+  MachineFunction::iterator NextBB;
 
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; BI = NextBB) {
+    NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
+
+    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
     MachineBasicBlock::iterator I, Next;
+    bool ExecModified = false;
+
     for (I = MBB.begin(); I != MBB.end(); I = Next) {
       Next = std::next(I);
 
       MachineInstr &MI = *I;
-      if (TII->isWQM(MI) || TII->isDS(MI))
-        NeedWQM = true;
 
       // Flat uses m0 in case it needs to access LDS.
       if (TII->isFLAT(MI))
         NeedFlat = true;
 
+      if (I->modifiesRegister(AMDGPU::EXEC, TRI))
+        ExecModified = true;
+
       switch (MI.getOpcode()) {
         default: break;
         case AMDGPU::SI_IF:
@@ -499,7 +725,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
           break;
 
         case AMDGPU::SI_ELSE:
-          Else(MI);
+          Else(MI, ExecModified);
           break;
 
         case AMDGPU::SI_BREAK:
@@ -521,16 +747,20 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
 
         case AMDGPU::SI_END_CF:
           if (--Depth == 0 && HaveKill) {
-            SkipIfDead(MI);
             HaveKill = false;
+            // TODO: Insert skip if exec is 0?
           }
+
           EndCf(MI);
           break;
 
-        case AMDGPU::SI_KILL:
-          if (Depth == 0)
-            SkipIfDead(MI);
-          else
+        case AMDGPU::SI_KILL_TERMINATOR:
+          if (Depth == 0) {
+            if (skipIfDead(MI, *NextBB)) {
+              NextBB = std::next(BI);
+              BE = MF.end();
+            }
+          } else
             HaveKill = true;
           Kill(MI);
           break;
@@ -544,7 +774,15 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
         case AMDGPU::SI_INDIRECT_SRC_V4:
         case AMDGPU::SI_INDIRECT_SRC_V8:
         case AMDGPU::SI_INDIRECT_SRC_V16:
-          IndirectSrc(MI);
+          if (indirectSrc(MI)) {
+            // The block was split at this point. We can safely skip the middle
+            // inserted block to the following which contains the rest of this
+            // block's instructions.
+            NextBB = std::next(BI);
+            BE = MF.end();
+            Next = MBB.end();
+          }
+
           break;
 
         case AMDGPU::SI_INDIRECT_DST_V1:
@@ -552,55 +790,46 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
         case AMDGPU::SI_INDIRECT_DST_V4:
         case AMDGPU::SI_INDIRECT_DST_V8:
         case AMDGPU::SI_INDIRECT_DST_V16:
-          IndirectDst(MI);
+          if (indirectDst(MI)) {
+            // The block was split at this point. We can safely skip the middle
+            // inserted block to the following which contains the rest of this
+            // block's instructions.
+            NextBB = std::next(BI);
+            BE = MF.end();
+            Next = MBB.end();
+          }
+
           break;
+
+        case AMDGPU::SI_RETURN: {
+          assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+          // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
+          // because external bytecode will be appended at the end.
+          if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+            // SI_RETURN is not the last instruction. Add an empty block at
+            // the end and jump there.
+            if (!EmptyMBBAtEnd) {
+              EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+              MF.insert(MF.end(), EmptyMBBAtEnd);
+            }
+
+            MBB.addSuccessor(EmptyMBBAtEnd);
+            BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+                    .addMBB(EmptyMBBAtEnd);
+            I->eraseFromParent();
+          }
+          break;
+        }
       }
     }
   }
 
-  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
-    MachineBasicBlock &MBB = MF.front();
-    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
-  }
-
-  // FIXME: This seems inappropriate to do here.
   if (NeedFlat && MFI->IsKernel) {
-    // Insert the prologue initializing the SGPRs pointing to the scratch space
-    // for flat accesses.
-    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
-
     // TODO: What to use with function calls?
-
-    // FIXME: This is reporting stack size that is used in a scratch buffer
-    // rather than registers as well.
-    uint64_t StackSizeBytes = FrameInfo->getStackSize();
-
-    int IndirectBegin
-      = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
-    // Convert register index to 256-byte unit.
-    uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
-
-    assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
-           "Stack limits should be smaller than 16-bits");
-
-    // Initialize the flat scratch register pair.
-    // TODO: Can we use one s_mov_b64 here?
-
-    // Offset is in units of 256-bytes.
-    MachineBasicBlock &MBB = MF.front();
-    DebugLoc NoDL;
-    MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
-    const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);
-
-    assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));
-
-    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
-      .addImm(StackOffset);
-
-    // Documentation says size is "per-thread scratch size in bytes"
-    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
-      .addImm(StackSizeBytes);
+    // We will need to Initialize the flat scratch register pair.
+    if (NeedFlat)
+      MFI->setHasFlatInstructions(true);
   }
 
   return true;