Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp')
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp  645
1 files changed, 437 insertions, 208 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 126f624..ee1d5da 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -52,6 +52,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -61,24 +62,24 @@
using namespace llvm;
-namespace {
+#define DEBUG_TYPE "si-lower-control-flow"
-class SILowerControlFlowPass : public MachineFunctionPass {
+namespace {
+class SILowerControlFlow : public MachineFunctionPass {
private:
static const unsigned SkipThreshold = 12;
- static char ID;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
void Skip(MachineInstr &From, MachineOperand &To);
- void SkipIfDead(MachineInstr &MI);
+ bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
void If(MachineInstr &MI);
- void Else(MachineInstr &MI);
+ void Else(MachineInstr &MI, bool ExecModified);
void Break(MachineInstr &MI);
void IfBreak(MachineInstr &MI);
void ElseBreak(MachineInstr &MI);
@@ -88,56 +89,118 @@ private:
void Kill(MachineInstr &MI);
void Branch(MachineInstr &MI);
- void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
- void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
- void IndirectSrc(MachineInstr &MI);
- void IndirectDst(MachineInstr &MI);
+ MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ std::pair<MachineBasicBlock *, MachineBasicBlock *>
+ splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+
+ void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
+ const MachineRegisterInfo &MRI,
+ const MachineInstr &MI,
+ MachineBasicBlock &LoopBB,
+ MachineBasicBlock &RemainderBB,
+ unsigned SaveReg,
+ const MachineOperand &IdxReg);
+
+ void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
+ MachineInstr *MovRel,
+ const MachineOperand &IdxReg,
+ int Offset);
+
+ bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
+ std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
+ int Offset) const;
+ bool indirectSrc(MachineInstr &MI);
+ bool indirectDst(MachineInstr &MI);
public:
- SILowerControlFlowPass(TargetMachine &tm) :
+ static char ID;
+
+ SILowerControlFlow() :
MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
- return "SI Lower control flow instructions";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
+ return "SI Lower control flow pseudo instructions";
}
};
} // End anonymous namespace
-char SILowerControlFlowPass::ID = 0;
+char SILowerControlFlow::ID = 0;
+
+INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
+ "SI lower control flow", false, false)
-FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
- return new SILowerControlFlowPass(tm);
+char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
+
+
+FunctionPass *llvm::createSILowerControlFlowPass() {
+ return new SILowerControlFlow();
}
-bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
- MachineBasicBlock *To) {
+static bool opcodeEmitsNoInsts(unsigned Opc) {
+ switch (Opc) {
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::BUNDLE:
+ case TargetOpcode::CFI_INSTRUCTION:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::GC_LABEL:
+ case TargetOpcode::DBG_VALUE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
+ MachineBasicBlock *To) {
+ if (From->succ_empty())
+ return false;
unsigned NumInstr = 0;
+ MachineFunction *MF = From->getParent();
- for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
- MBB = *MBB->succ_begin()) {
+ for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
+ MBBI != End && MBBI != ToI; ++MBBI) {
+ MachineBasicBlock &MBB = *MBBI;
- for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
NumInstr < SkipThreshold && I != E; ++I) {
+ if (opcodeEmitsNoInsts(I->getOpcode()))
+ continue;
+
+ // When a uniform loop is inside non-uniform control flow, the branch
+ // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
+ // when EXEC = 0. We should skip the loop lest it becomes infinite.
+ if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
+ I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
+ return true;
+
+ if (I->isInlineAsm()) {
+ const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+ const char *AsmStr = I->getOperand(0).getSymbolName();
+
+ // inlineasm length estimate is number of bytes assuming the longest
+ // instruction.
+ uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
+ NumInstr += MaxAsmSize / MAI->getMaxInstLength();
+ } else {
+ ++NumInstr;
+ }
- if (I->isBundle() || !I->isBundled())
- if (++NumInstr >= SkipThreshold)
- return true;
+ if (NumInstr >= SkipThreshold)
+ return true;
}
}
return false;
}
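
The rewritten shouldSkip walks blocks in layout order and only counts instructions that will actually be emitted: zero-size opcodes are ignored, inline asm is charged its worst-case encoding size divided by the maximum instruction length, and a uniform loop branch (S_CBRANCH_VCCNZ/VCCZ) forces a skip outright, since such a branch is never taken when EXEC = 0. A minimal standalone sketch of just the counting part, with a stand-in Inst struct and an assumed 8-byte maximum instruction length (neither is the real LLVM API):

    #include <cstdint>
    #include <vector>

    // Simplified stand-in for a machine instruction; not the real MachineInstr.
    struct Inst {
      bool EmitsNothing;     // DBG_VALUE, IMPLICIT_DEF, labels, ...
      bool IsInlineAsm;
      uint64_t MaxAsmBytes;  // worst-case encoding size of the asm blob
    };

    // Count emitted instructions along a path until the threshold is reached.
    static bool worthSkipping(const std::vector<Inst> &Path,
                              unsigned SkipThreshold = 12,
                              unsigned MaxInstLength = 8) {
      unsigned NumInstr = 0;
      for (const Inst &I : Path) {
        if (I.EmitsNothing)
          continue;  // zero-size opcodes never count against the threshold
        NumInstr += I.IsInlineAsm ? I.MaxAsmBytes / MaxInstLength : 1;
        if (NumInstr >= SkipThreshold)
          return true;  // long enough that an exec-based skip branch pays off
      }
      return false;
    }
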
-void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
+void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
return;
@@ -147,40 +210,44 @@ void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
.addOperand(To);
}
-void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
-
+bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
- if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
- ShaderType::PIXEL ||
+ if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
!shouldSkip(&MBB, &MBB.getParent()->back()))
- return;
+ return false;
+
+ MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
+ MBB.addSuccessor(SkipBB);
- MachineBasicBlock::iterator Insert = &MI;
- ++Insert;
+ const DebugLoc &DL = MI.getDebugLoc();
// If the exec mask is non-zero, skip the next two instructions
- BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addImm(3);
+ BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(&NextBB);
+
+ MachineBasicBlock::iterator Insert = SkipBB->begin();
// Exec mask is zero: Export to NULL target...
- BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
- .addImm(0)
- .addImm(0x09) // V_008DFC_SQ_EXP_NULL
- .addImm(0)
- .addImm(1)
- .addImm(1)
- .addReg(AMDGPU::VGPR0)
- .addReg(AMDGPU::VGPR0)
- .addReg(AMDGPU::VGPR0)
- .addReg(AMDGPU::VGPR0);
-
- // ... and terminate wavefront
- BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
+ .addImm(0)
+ .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+ .addImm(0)
+ .addImm(1)
+ .addImm(1)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef);
+
+ // ... and terminate wavefront.
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+
+ return true;
}
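
The old SkipIfDead jumped over a hard-coded instruction count (addImm(3)); the new skipIfDead materializes the skipped code as a real SkipBB, so the branch has a proper block target and the CFG stays verifiable. As a structural sketch only (instruction spellings approximated, not taken from an assembler):

    MBB:     ...
             s_cbranch_execnz NextBB  ; some lanes still live: jump over SkipBB
    SkipBB:  exp null ... done vm     ; every lane dead: export to NULL target
             s_endpgm                 ; ... and terminate the wavefront
    NextBB:  ...
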
-void SILowerControlFlowPass::If(MachineInstr &MI) {
+void SILowerControlFlow::If(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Reg = MI.getOperand(0).getReg();
@@ -195,10 +262,15 @@ void SILowerControlFlowPass::If(MachineInstr &MI) {
Skip(MI, MI.getOperand(2));
+ // Insert a pseudo terminator to help keep the verifier happy.
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addOperand(MI.getOperand(2))
+ .addReg(Reg);
+
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Else(MachineInstr &MI) {
+void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
@@ -208,22 +280,36 @@ void SILowerControlFlowPass::Else(MachineInstr &MI) {
TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
.addReg(Src); // Saved EXEC
+ if (ExecModified) {
+ // Adjust the saved exec to account for the modifications during the flow
+ // block that contains the ELSE. This can happen when WQM mode is switched
+ // off.
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
+ .addReg(AMDGPU::EXEC)
+ .addReg(Dst);
+ }
+
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
.addReg(AMDGPU::EXEC)
.addReg(Dst);
Skip(MI, MI.getOperand(2));
+ // Insert a pseudo terminator to help keep the verifier happy.
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addOperand(MI.getOperand(2))
+ .addReg(Dst);
+
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Break(MachineInstr &MI) {
+void SILowerControlFlow::Break(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Src = MI.getOperand(1).getReg();
-
+
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(AMDGPU::EXEC)
.addReg(Src);
@@ -231,14 +317,14 @@ void SILowerControlFlowPass::Break(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
+void SILowerControlFlow::IfBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Vcc = MI.getOperand(1).getReg();
unsigned Src = MI.getOperand(2).getReg();
-
+
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(Vcc)
.addReg(Src);
@@ -246,14 +332,14 @@ void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
+void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
unsigned Saved = MI.getOperand(1).getReg();
unsigned Src = MI.getOperand(2).getReg();
-
+
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(Saved)
.addReg(Src);
@@ -261,7 +347,7 @@ void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Loop(MachineInstr &MI) {
+void SILowerControlFlow::Loop(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Src = MI.getOperand(0).getReg();
@@ -276,7 +362,7 @@ void SILowerControlFlowPass::Loop(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
+void SILowerControlFlow::EndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
unsigned Reg = MI.getOperand(0).getReg();
@@ -289,24 +375,24 @@ void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::Branch(MachineInstr &MI) {
- if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
+void SILowerControlFlow::Branch(MachineInstr &MI) {
+ MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
+ if (MBB == MI.getParent()->getNextNode())
MI.eraseFromParent();
// If these aren't equal, this is probably an infinite loop.
}
-void SILowerControlFlowPass::Kill(MachineInstr &MI) {
+void SILowerControlFlow::Kill(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
const MachineOperand &Op = MI.getOperand(0);
#ifndef NDEBUG
- const SIMachineFunctionInfo *MFI
- = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
+ CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
// Kill is only allowed in pixel / geometry shaders.
- assert(MFI->getShaderType() == ShaderType::PIXEL ||
- MFI->getShaderType() == ShaderType::GEOMETRY);
+ assert(CallConv == CallingConv::AMDGPU_PS ||
+ CallConv == CallingConv::AMDGPU_GS);
#endif
// Clear this thread from the exec mask if the operand is negative
@@ -325,94 +411,209 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) {
MI.eraseFromParent();
}
-void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
+// All currently live registers must remain so in the remainder block.
+void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
+ const MachineRegisterInfo &MRI,
+ const MachineInstr &MI,
+ MachineBasicBlock &LoopBB,
+ MachineBasicBlock &RemainderBB,
+ unsigned SaveReg,
+ const MachineOperand &IdxReg) {
+ // Add reg defined in loop body.
+ RemainderLiveRegs.addReg(SaveReg);
+
+ if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
+ if (!Val->isUndef()) {
+ RemainderLiveRegs.addReg(Val->getReg());
+ LoopBB.addLiveIn(Val->getReg());
+ }
+ }
+
+ for (unsigned Reg : RemainderLiveRegs) {
+ if (MRI.isAllocatable(Reg))
+ RemainderBB.addLiveIn(Reg);
+ }
+
+ const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
+ if (!Src->isUndef())
+ LoopBB.addLiveIn(Src->getReg());
+
+ if (!IdxReg.isUndef())
+ LoopBB.addLiveIn(IdxReg.getReg());
+ LoopBB.sortUniqueLiveIns();
+}
+
+void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
+ DebugLoc DL,
+ MachineInstr *MovRel,
+ const MachineOperand &IdxReg,
+ int Offset) {
+ MachineBasicBlock::iterator I = LoopBB.begin();
+
+ // Read the next variant into VCC (lower 32 bits) <- also loop target
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
+ .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
+
+ // Move index from VCC into M0
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(AMDGPU::VCC_LO);
+
+ // Compare the just read M0 value to all possible Idx values
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
+ .addReg(AMDGPU::M0)
+ .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
+
+ // Update EXEC, save the original EXEC value to VCC
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+ .addReg(AMDGPU::VCC);
+
+ if (Offset != 0) {
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ .addReg(AMDGPU::M0)
+ .addImm(Offset);
+ }
+
+ // Do the actual move
+ LoopBB.insert(I, MovRel);
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::VCC);
+
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(&LoopBB);
+}
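
This is the classic "waterfall" idiom: pick the index held by the first active lane, service every lane that shares that index in one trip through the loop body, retire those lanes from EXEC, and repeat until EXEC is empty. A scalar C++ model of one 64-lane wavefront (a sketch; MovRel stands in for the indexed move executed under the narrowed exec mask, and __builtin_ctzll is a GCC/Clang builtin used for "first active lane"):

    #include <array>
    #include <cstdint>
    #include <functional>

    void waterfallLoadM0(const std::array<uint32_t, 64> &Idx, uint64_t Exec,
                         int Offset,
                         const std::function<void(uint32_t, uint64_t)> &MovRel) {
      while (Exec != 0) {                        // s_cbranch_execnz loops back
        unsigned Lane = __builtin_ctzll(Exec);   // first still-active lane
        uint32_t M0 = Idx[Lane];                 // v_readfirstlane + s_mov to M0
        uint64_t Match = 0;
        for (unsigned L = 0; L != 64; ++L)       // v_cmp_eq_u32: which active
          if (((Exec >> L) & 1) && Idx[L] == M0) // lanes share this index?
            Match |= uint64_t(1) << L;
        MovRel(M0 + Offset, Match);              // the indexed move, with EXEC
                                                 // narrowed to Match
        Exec &= ~Match;                          // net effect of the
                                                 // s_and_saveexec / s_xor pair
      }
    }
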
+
+MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
+ MachineFunction *MF = MBB.getParent();
+
+ MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF->insert(MBBI, SkipBB);
+
+ return SkipBB;
+}
+
+std::pair<MachineBasicBlock *, MachineBasicBlock *>
+SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
+ MachineFunction *MF = MBB.getParent();
+ // To insert the loop we need to split the block. Move everything after this
+ // point to a new block, and insert a new empty block between the two.
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF->insert(MBBI, LoopBB);
+ MF->insert(MBBI, RemainderBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessors(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+
+ MBB.addSuccessor(LoopBB);
+
+ return std::make_pair(LoopBB, RemainderBB);
+}
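
splitBlock is plain ilist surgery: everything from I onward moves to RemainderBB, an empty LoopBB is left in between for the caller to fill, and RemainderBB inherits MBB's successors. A toy model with std::list<int> standing in for the instruction list (illustrative only; successor and live-in bookkeeping are omitted):

    #include <cassert>
    #include <iterator>
    #include <list>
    #include <utility>

    std::pair<std::list<int>, std::list<int>>
    splitAt(std::list<int> &MBB, std::list<int>::iterator I) {
      std::list<int> LoopBB;       // stays empty here; filled by the caller
      std::list<int> RemainderBB;
      // Like RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()).
      RemainderBB.splice(RemainderBB.begin(), MBB, I, MBB.end());
      return {std::move(LoopBB), std::move(RemainderBB)};
    }

    int main() {
      std::list<int> MBB = {1, 2, 3, 4};
      auto [Loop, Rest] = splitAt(MBB, std::next(MBB.begin(), 2));
      assert(MBB == std::list<int>({1, 2}));
      assert(Loop.empty() && Rest == std::list<int>({3, 4}));
    }
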
+
+// Returns true if a new block was inserted.
+bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
- MachineBasicBlock::iterator I = MI;
+ MachineBasicBlock::iterator I(&MI);
- unsigned Save = MI.getOperand(1).getReg();
- unsigned Idx = MI.getOperand(3).getReg();
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
- if (AMDGPU::SReg_32RegClass.contains(Idx)) {
- if (Offset) {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addReg(Idx)
- .addImm(Offset);
+ if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
+ if (Offset != 0) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
+ .addImm(Offset);
} else {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(Idx);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
}
+
MBB.insert(I, MovRel);
- } else {
+ MI.eraseFromParent();
+ return false;
+ }
- assert(AMDGPU::SReg_64RegClass.contains(Save));
- assert(AMDGPU::VGPR_32RegClass.contains(Idx));
+ MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+ SaveOp->setIsDead(false);
+ unsigned Save = SaveOp->getReg();
- // Save the EXEC mask
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
- .addReg(AMDGPU::EXEC);
+ // Reading from a VGPR requires looping over all workitems in the wavefront.
+ assert(AMDGPU::SReg_64RegClass.contains(Save) &&
+ AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));
- // Read the next variant into VCC (lower 32 bits) <- also loop target
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- AMDGPU::VCC_LO)
- .addReg(Idx);
+ // Save the EXEC mask
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+ .addReg(AMDGPU::EXEC);
- // Move index from VCC into M0
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(AMDGPU::VCC_LO);
+ LivePhysRegs RemainderLiveRegs(TRI);
- // Compare the just read M0 value to all possible Idx values
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
- .addReg(AMDGPU::M0)
- .addReg(Idx);
+ RemainderLiveRegs.addLiveOuts(MBB);
- // Update EXEC, save the original EXEC value to VCC
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
+ MachineBasicBlock *LoopBB;
+ MachineBasicBlock *RemainderBB;
- if (Offset) {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addReg(AMDGPU::M0)
- .addImm(Offset);
- }
- // Do the actual move
- MBB.insert(I, MovRel);
+ std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);
- // Update EXEC, switch all done bits to 0 and all todo bits to 1
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(AMDGPU::VCC);
+ for (const MachineInstr &Inst : reverse(*RemainderBB))
+ RemainderLiveRegs.stepBackward(Inst);
- // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addImm(-7);
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ LoopBB->addSuccessor(RemainderBB);
+ LoopBB->addSuccessor(LoopBB);
- // Restore EXEC
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
- .addReg(Save);
+ splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
+ *RemainderBB, Save, *Idx);
+
+ emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);
+
+ MachineBasicBlock::iterator First = RemainderBB->begin();
+ BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(Save);
- }
MI.eraseFromParent();
+ return true;
}
-/// \param @VecReg The register which holds element zero of the vector
-/// being addressed into.
-/// \param[out] @Reg The base register to use in the indirect addressing instruction.
-/// \param[in,out] @Offset As an input, this is the constant offset part of the
-// indirect Index. e.g. v0 = v[VecReg + Offset]
-// As an output, this is a constant value that needs
-// to be added to the value stored in M0.
-void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg,
- unsigned &Reg,
- int &Offset) {
+/// \param @VecReg The register which holds element zero of the vector being
+/// addressed into.
+//
+/// \param[in] @Idx The index operand from the movrel instruction. This must be
+// a register, but may be NoRegister.
+///
+/// \param[in] @Offset As an input, this is the constant offset part of the
+// indirect Index. e.g. v0 = v[VecReg + Offset] As an output, this is a constant
+// value that needs to be added to the value stored in M0.
+std::pair<unsigned, int>
+SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
if (!SubReg)
SubReg = VecReg;
+ const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
- int RegIdx = TRI->getHWRegIndex(SubReg) + Offset;
+ int NumElts = SuperRC->getSize() / RC->getSize();
+
+ int BaseRegIdx = TRI->getHWRegIndex(SubReg);
+
+ // Skip out of bounds offsets, or else we would end up using an undefined
+ // register.
+ if (Offset >= NumElts)
+ return std::make_pair(RC->getRegister(BaseRegIdx), Offset);
+ int RegIdx = BaseRegIdx + Offset;
if (RegIdx < 0) {
Offset = RegIdx;
RegIdx = 0;
@@ -420,77 +621,102 @@ void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg,
Offset = 0;
}
- Reg = RC->getRegister(RegIdx);
+ unsigned Reg = RC->getRegister(RegIdx);
+ return std::make_pair(Reg, Offset);
}
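
The helper now returns the folded register and the leftover offset instead of mutating by-reference parameters, and refuses to fold an out-of-bounds constant offset (which would name an undefined register). A self-contained model of the arithmetic, with plain integer indices in place of TargetRegisterInfo (Base and NumElts abstract away getHWRegIndex and the register-class sizes):

    #include <cassert>
    #include <utility>

    // A vector of NumElts registers starts at hardware index Base. Fold as
    // much of the constant Offset into the register index as is legal; the
    // remainder must be added to M0 at run time.
    static std::pair<int, int> foldIndirectOffset(int Base, int NumElts,
                                                  int Offset) {
      if (Offset >= NumElts)  // out of bounds: keep the offset for M0
        return {Base, Offset};
      int RegIdx = Base + Offset;
      if (RegIdx < 0) {       // clamp; carry the negative remainder in Offset
        Offset = RegIdx;
        RegIdx = 0;
      } else {
        Offset = 0;
      }
      return {RegIdx, Offset};
    }

    int main() {
      // Element 2 of v[8:11] folds to v10 with nothing left for M0.
      assert(foldIndirectOffset(8, 4, 2) == std::make_pair(10, 0));
      // Offset 5 is past the end of a 4-element vector: leave it in M0.
      assert(foldIndirectOffset(8, 4, 5) == std::make_pair(8, 5));
    }
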
-void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
- unsigned Vec = MI.getOperand(2).getReg();
- int Off = MI.getOperand(4).getImm();
+ const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
+ int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
unsigned Reg;
- computeIndirectRegAndOffset(Vec, Reg, Off);
+ std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);
+
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ if (Idx->getReg() == AMDGPU::NoRegister) {
+ // Only had a constant offset, copy the register directly.
+ BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
+ .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
+ MI.eraseFromParent();
+ return false;
+ }
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(Reg)
- .addReg(Vec, RegState::Implicit);
+ .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
+ .addReg(SrcVec->getReg(), RegState::Implicit);
- LoadM0(MI, MovRel, Off);
+ return loadM0(MI, MovRel, Offset);
}
-void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
+ const DebugLoc &DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
- int Off = MI.getOperand(4).getImm();
- unsigned Val = MI.getOperand(5).getReg();
+ int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
unsigned Reg;
- computeIndirectRegAndOffset(Dst, Reg, Off);
+ const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
+ std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);
- MachineInstr *MovRel =
- BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
- .addReg(Reg, RegState::Define)
- .addReg(Val)
- .addReg(Dst, RegState::Implicit);
+ MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ if (Idx->getReg() == AMDGPU::NoRegister) {
+ // Only had a constant offset, copy the register directly.
+ BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
+ .addOperand(*Val);
+ MI.eraseFromParent();
+ return false;
+ }
+
+ MachineInstr *MovRel =
+ BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
+ .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
+ .addReg(Dst, RegState::Implicit);
- LoadM0(MI, MovRel, Off);
+ return loadM0(MI, MovRel, Offset);
}
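
Both indirect paths bottom out in a single relative move once M0 holds the dynamic index, and the new NoRegister fast path degenerates to a plain V_MOV_B32 when the index is entirely constant. Per-lane semantics of the two movrel flavors, sketched in C++ (Vgpr models one lane's register file; a model, not the ISA definition):

    #include <cstdint>
    #include <vector>

    // The constant part of the index was already folded into Reg by
    // computeIndirectRegAndOffset; loadM0 placed the dynamic part in M0.
    uint32_t movrelsModel(const std::vector<uint32_t> &Vgpr, unsigned Reg,
                          unsigned M0) {
      return Vgpr[Reg + M0];  // V_MOVRELS_B32_e32: Dst = VGPR[Reg + M0]
    }

    void movreldModel(std::vector<uint32_t> &Vgpr, unsigned Reg, unsigned M0,
                      uint32_t Val) {
      Vgpr[Reg + M0] = Val;   // V_MOVRELD_B32_e32: VGPR[Reg + M0] = Val
    }
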
-bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
- TRI =
- static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
bool HaveKill = false;
- bool NeedWQM = false;
bool NeedFlat = false;
unsigned Depth = 0;
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
+ MachineFunction::iterator NextBB;
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; BI = NextBB) {
+ NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;
+
+ MachineBasicBlock *EmptyMBBAtEnd = nullptr;
MachineBasicBlock::iterator I, Next;
+ bool ExecModified = false;
+
for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
- if (TII->isWQM(MI) || TII->isDS(MI))
- NeedWQM = true;
// Flat uses m0 in case it needs to access LDS.
if (TII->isFLAT(MI))
NeedFlat = true;
+ if (I->modifiesRegister(AMDGPU::EXEC, TRI))
+ ExecModified = true;
+
switch (MI.getOpcode()) {
default: break;
case AMDGPU::SI_IF:
@@ -499,7 +725,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
break;
case AMDGPU::SI_ELSE:
- Else(MI);
+ Else(MI, ExecModified);
break;
case AMDGPU::SI_BREAK:
@@ -521,16 +747,20 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SI_END_CF:
if (--Depth == 0 && HaveKill) {
- SkipIfDead(MI);
HaveKill = false;
+ // TODO: Insert skip if exec is 0?
}
+
EndCf(MI);
break;
- case AMDGPU::SI_KILL:
- if (Depth == 0)
- SkipIfDead(MI);
- else
+ case AMDGPU::SI_KILL_TERMINATOR:
+ if (Depth == 0) {
+ if (skipIfDead(MI, *NextBB)) {
+ NextBB = std::next(BI);
+ BE = MF.end();
+ }
+ } else
HaveKill = true;
Kill(MI);
break;
@@ -544,7 +774,15 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SI_INDIRECT_SRC_V4:
case AMDGPU::SI_INDIRECT_SRC_V8:
case AMDGPU::SI_INDIRECT_SRC_V16:
- IndirectSrc(MI);
+ if (indirectSrc(MI)) {
+ // The block was split at this point. We can safely skip the middle
+ // inserted block to the following which contains the rest of this
+ // block's instructions.
+ NextBB = std::next(BI);
+ BE = MF.end();
+ Next = MBB.end();
+ }
+
break;
case AMDGPU::SI_INDIRECT_DST_V1:
@@ -552,55 +790,46 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
case AMDGPU::SI_INDIRECT_DST_V4:
case AMDGPU::SI_INDIRECT_DST_V8:
case AMDGPU::SI_INDIRECT_DST_V16:
- IndirectDst(MI);
+ if (indirectDst(MI)) {
+ // The block was split at this point. We can safely skip the middle
+ // inserted block to the following which contains the rest of this
+ // block's instructions.
+ NextBB = std::next(BI);
+ BE = MF.end();
+ Next = MBB.end();
+ }
+
break;
+
+ case AMDGPU::SI_RETURN: {
+ assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+ // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
+ // because external bytecode will be appended at the end.
+ if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+ // SI_RETURN is not the last instruction. Add an empty block at
+ // the end and jump there.
+ if (!EmptyMBBAtEnd) {
+ EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), EmptyMBBAtEnd);
+ }
+
+ MBB.addSuccessor(EmptyMBBAtEnd);
+ BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(EmptyMBBAtEnd);
+ I->eraseFromParent();
+ }
+ break;
+ }
}
}
}
- if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
- MachineBasicBlock &MBB = MF.front();
- BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
- AMDGPU::EXEC).addReg(AMDGPU::EXEC);
- }
-
- // FIXME: This seems inappropriate to do here.
if (NeedFlat && MFI->IsKernel) {
- // Insert the prologue initializing the SGPRs pointing to the scratch space
- // for flat accesses.
- const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
-
// TODO: What to use with function calls?
-
- // FIXME: This is reporting stack size that is used in a scratch buffer
- // rather than registers as well.
- uint64_t StackSizeBytes = FrameInfo->getStackSize();
-
- int IndirectBegin
- = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
- // Convert register index to 256-byte unit.
- uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
-
- assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
- "Stack limits should be smaller than 16-bits");
-
- // Initialize the flat scratch register pair.
- // TODO: Can we use one s_mov_b64 here?
-
- // Offset is in units of 256-bytes.
- MachineBasicBlock &MBB = MF.front();
- DebugLoc NoDL;
- MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
- const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);
-
- assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));
-
- BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
- .addImm(StackOffset);
-
- // Documentation says size is "per-thread scratch size in bytes"
- BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
- .addImm(StackSizeBytes);
+ // We will need to Initialize the flat scratch register pair.
+ if (NeedFlat)
+ MFI->setHasFlatInstructions(true);
}
return true;
OpenPOWER on IntegriCloud