Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp')
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 930
1 file changed, 281 insertions(+), 649 deletions(-)
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index ee1d5da..7ed18f2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -58,7 +58,6 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Constants.h"
using namespace llvm;
@@ -68,63 +67,50 @@ namespace {
class SILowerControlFlow : public MachineFunctionPass {
private:
- static const unsigned SkipThreshold = 12;
-
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
+ LiveIntervals *LIS;
+ MachineRegisterInfo *MRI;
- bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
-
- void Skip(MachineInstr &From, MachineOperand &To);
- bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
-
- void If(MachineInstr &MI);
- void Else(MachineInstr &MI, bool ExecModified);
- void Break(MachineInstr &MI);
- void IfBreak(MachineInstr &MI);
- void ElseBreak(MachineInstr &MI);
- void Loop(MachineInstr &MI);
- void EndCf(MachineInstr &MI);
-
- void Kill(MachineInstr &MI);
- void Branch(MachineInstr &MI);
-
- MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
-
- std::pair<MachineBasicBlock *, MachineBasicBlock *>
- splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+ void emitIf(MachineInstr &MI);
+ void emitElse(MachineInstr &MI);
+ void emitBreak(MachineInstr &MI);
+ void emitIfBreak(MachineInstr &MI);
+ void emitElseBreak(MachineInstr &MI);
+ void emitLoop(MachineInstr &MI);
+ void emitEndCf(MachineInstr &MI);
- void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
- const MachineRegisterInfo &MRI,
- const MachineInstr &MI,
- MachineBasicBlock &LoopBB,
- MachineBasicBlock &RemainderBB,
- unsigned SaveReg,
- const MachineOperand &IdxReg);
+ void findMaskOperands(MachineInstr &MI, unsigned OpNo,
+ SmallVectorImpl<MachineOperand> &Src) const;
- void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
- MachineInstr *MovRel,
- const MachineOperand &IdxReg,
- int Offset);
-
- bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
- std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
- int Offset) const;
- bool indirectSrc(MachineInstr &MI);
- bool indirectDst(MachineInstr &MI);
+ void combineMasks(MachineInstr &MI);
public:
static char ID;
SILowerControlFlow() :
- MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
+ MachineFunctionPass(ID),
+ TRI(nullptr),
+ TII(nullptr),
+ LIS(nullptr),
+ MRI(nullptr) {}
bool runOnMachineFunction(MachineFunction &MF) override;
- const char *getPassName() const override {
+ StringRef getPassName() const override {
return "SI Lower control flow pseudo instructions";
}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // Should preserve the same set that TwoAddressInstructions does.
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreservedID(LiveVariablesID);
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
};
} // End anonymous namespace
@@ -132,555 +118,283 @@ public:
char SILowerControlFlow::ID = 0;
INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
- "SI lower control flow", false, false)
+ "SI lower control flow", false, false)
-char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
+static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
+ MachineOperand &ImpDefSCC = MI.getOperand(3);
+ assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
-
-FunctionPass *llvm::createSILowerControlFlowPass() {
- return new SILowerControlFlow();
+ ImpDefSCC.setIsDead(IsDead);
}
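// [Editor's note] A brief orientation, inferred from the code above: the
// S_AND/S_XOR instructions built in emitIf have three explicit operands
// (dst, src0, src1), so operand 3 is their implicit SCC def. This helper
// copies the dead flag from the pseudo's implicit SCC def onto the expanded
// instructions so later passes see accurate SCC liveness.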
-static bool opcodeEmitsNoInsts(unsigned Opc) {
- switch (Opc) {
- case TargetOpcode::IMPLICIT_DEF:
- case TargetOpcode::KILL:
- case TargetOpcode::BUNDLE:
- case TargetOpcode::CFI_INSTRUCTION:
- case TargetOpcode::EH_LABEL:
- case TargetOpcode::GC_LABEL:
- case TargetOpcode::DBG_VALUE:
- return true;
- default:
- return false;
- }
-}
-
-bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
- MachineBasicBlock *To) {
- if (From->succ_empty())
- return false;
-
- unsigned NumInstr = 0;
- MachineFunction *MF = From->getParent();
-
- for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
- MBBI != End && MBBI != ToI; ++MBBI) {
- MachineBasicBlock &MBB = *MBBI;
-
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- NumInstr < SkipThreshold && I != E; ++I) {
- if (opcodeEmitsNoInsts(I->getOpcode()))
- continue;
-
- // When a uniform loop is inside non-uniform control flow, the branch
- // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
- // when EXEC = 0. We should skip the loop lest it become infinite.
- if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
- I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
- return true;
-
- if (I->isInlineAsm()) {
- const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
- const char *AsmStr = I->getOperand(0).getSymbolName();
-
- // The inline asm length estimate is the number of bytes assuming the
- // longest instruction.
- uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
- NumInstr += MaxAsmSize / MAI->getMaxInstLength();
- } else {
- ++NumInstr;
- }
+char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
- if (NumInstr >= SkipThreshold)
- return true;
- }
- }
-
- return false;
-}
-
-void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
-
- if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
- return;
-
- DebugLoc DL = From.getDebugLoc();
- BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
- .addOperand(To);
-}
-
-bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
+void SILowerControlFlow::emitIf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- MachineFunction *MF = MBB.getParent();
-
- if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
- !shouldSkip(&MBB, &MBB.getParent()->back()))
- return false;
-
- MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
- MBB.addSuccessor(SkipBB);
-
const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator I(&MI);
- // If the exec mask is non-zero, skip the next two instructions
- BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addMBB(&NextBB);
-
- MachineBasicBlock::iterator Insert = SkipBB->begin();
-
- // Exec mask is zero: Export to NULL target...
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
- .addImm(0)
- .addImm(0x09) // V_008DFC_SQ_EXP_NULL
- .addImm(0)
- .addImm(1)
- .addImm(1)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef);
-
- // ... and terminate wavefront.
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
-
- return true;
-}
-
-void SILowerControlFlow::If(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- unsigned Reg = MI.getOperand(0).getReg();
- unsigned Vcc = MI.getOperand(1).getReg();
-
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
- .addReg(Vcc);
-
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
- .addReg(AMDGPU::EXEC)
- .addReg(Reg);
-
- Skip(MI, MI.getOperand(2));
+ MachineOperand &SaveExec = MI.getOperand(0);
+ MachineOperand &Cond = MI.getOperand(1);
+ assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
+ Cond.getSubReg() == AMDGPU::NoSubRegister);
- // Insert a pseudo terminator to help keep the verifier happy.
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
- .addOperand(MI.getOperand(2))
- .addReg(Reg);
+ unsigned SaveExecReg = SaveExec.getReg();
- MI.eraseFromParent();
-}
+ MachineOperand &ImpDefSCC = MI.getOperand(4);
+ assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
-void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
-
- BuildMI(MBB, MBB.getFirstNonPHI(), DL,
- TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
- .addReg(Src); // Saved EXEC
-
- if (ExecModified) {
- // Adjust the saved exec to account for the modifications during the flow
- // block that contains the ELSE. This can happen when WQM mode is switched
- // off.
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
- .addReg(AMDGPU::EXEC)
- .addReg(Dst);
+ // Add an implicit def of exec to discourage scheduling VALU after this,
+ // which would interfere with trying to form s_and_saveexec_b64 later.
+ unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ MachineInstr *CopyExec =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC, RegState::ImplicitDefine);
+
+ unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ MachineInstr *And =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp)
+ .addReg(CopyReg)
+ //.addReg(AMDGPU::EXEC)
+ .addReg(Cond.getReg());
+ setImpSCCDefDead(*And, true);
+
+ MachineInstr *Xor =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+ .addReg(Tmp)
+ .addReg(CopyReg);
+ setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+
+ // Use a copy that is a terminator to get correct spill code placement with
+ // the fast register allocator.
+ MachineInstr *SetExec =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC)
+ .addReg(Tmp, RegState::Kill);
+
+ // Insert a pseudo terminator to help keep the verifier happy. This will also
+ // be used later when inserting skips.
+ MachineInstr *NewBr =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addOperand(MI.getOperand(2));
+
+ if (!LIS) {
+ MI.eraseFromParent();
+ return;
}
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(Dst);
+ LIS->InsertMachineInstrInMaps(*CopyExec);
- Skip(MI, MI.getOperand(2));
+ // Replace MI with the And so we don't need to fix the live interval for the
+ // condition register.
+ LIS->ReplaceMachineInstrInMaps(MI, *And);
- // Insert a pseudo terminator to help keep the verifier happy.
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
- .addOperand(MI.getOperand(2))
- .addReg(Dst);
+ LIS->InsertMachineInstrInMaps(*Xor);
+ LIS->InsertMachineInstrInMaps(*SetExec);
+ LIS->InsertMachineInstrInMaps(*NewBr);
+ LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
MI.eraseFromParent();
-}
-
-void SILowerControlFlow::Break(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
-
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
- .addReg(AMDGPU::EXEC)
- .addReg(Src);
-
- MI.eraseFromParent();
+ // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
+ // hard to add another def here but I'm not sure how to correctly update the
+ // valno.
+ LIS->removeInterval(SaveExecReg);
+ LIS->createAndComputeVirtRegInterval(SaveExecReg);
+ LIS->createAndComputeVirtRegInterval(Tmp);
+ LIS->createAndComputeVirtRegInterval(CopyReg);
}
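// [Editor's note] A rough sketch of the sequence emitIf produces for
//   %sdst = SI_IF %cond, %endif
// (register names here are illustrative, not the pass's actual output):
//   %copy = COPY $exec               ; implicit-def of exec deters VALU motion
//   %tmp  = S_AND_B64 %copy, %cond   ; lanes that enter the 'then' region
//   %sdst = S_XOR_B64 %tmp, %copy    ; lanes to re-enable at else/end_cf
//   $exec = S_MOV_B64_term %tmp      ; terminator copy, see comment above
//   SI_MASK_BRANCH %endif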
-void SILowerControlFlow::IfBreak(MachineInstr &MI) {
+void SILowerControlFlow::emitElse(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
-
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Vcc = MI.getOperand(1).getReg();
- unsigned Src = MI.getOperand(2).getReg();
-
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
- .addReg(Vcc)
- .addReg(Src);
-
- MI.eraseFromParent();
-}
+ const DebugLoc &DL = MI.getDebugLoc();
-void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
+ unsigned DstReg = MI.getOperand(0).getReg();
+ assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Saved = MI.getOperand(1).getReg();
- unsigned Src = MI.getOperand(2).getReg();
+ bool ExecModified = MI.getOperand(3).getImm() != 0;
+ MachineBasicBlock::iterator Start = MBB.begin();
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
- .addReg(Saved)
- .addReg(Src);
+ // We are running before TwoAddressInstructions, and si_else's operands are
+ // tied. In order to correctly tie the registers, split this into a copy of
+ // the src, the way that pass does.
+ unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
+ .addOperand(MI.getOperand(1)); // Saved EXEC
- MI.eraseFromParent();
-}
+ // This must be inserted before phis and any spill code inserted before the
+ // else.
+ unsigned SaveReg = ExecModified ?
+ MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass) : DstReg;
+ MachineInstr *OrSaveExec =
+ BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), SaveReg)
+ .addReg(CopyReg);
-void SILowerControlFlow::Loop(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- unsigned Src = MI.getOperand(0).getReg();
+ MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(Src);
+ MachineBasicBlock::iterator ElsePt(MI);
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addOperand(MI.getOperand(1));
+ if (ExecModified) {
+ MachineInstr *And =
+ BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
+ .addReg(AMDGPU::EXEC)
+ .addReg(SaveReg);
- MI.eraseFromParent();
-}
+ if (LIS)
+ LIS->InsertMachineInstrInMaps(*And);
+ }
-void SILowerControlFlow::EndCf(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- unsigned Reg = MI.getOperand(0).getReg();
+ MachineInstr *Xor =
+ BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(DstReg);
- BuildMI(MBB, MBB.getFirstNonPHI(), DL,
- TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(Reg);
+ MachineInstr *Branch =
+ BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addMBB(DestBB);
- MI.eraseFromParent();
-}
-
-void SILowerControlFlow::Branch(MachineInstr &MI) {
- MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
- if (MBB == MI.getParent()->getNextNode())
+ if (!LIS) {
MI.eraseFromParent();
-
- // If these aren't equal, this is probably an infinite loop.
-}
-
-void SILowerControlFlow::Kill(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- const MachineOperand &Op = MI.getOperand(0);
-
-#ifndef NDEBUG
- CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
- // Kill is only allowed in pixel / geometry shaders.
- assert(CallConv == CallingConv::AMDGPU_PS ||
- CallConv == CallingConv::AMDGPU_GS);
-#endif
-
- // Clear this thread from the exec mask if the operand is negative
- if ((Op.isImm())) {
- // Constant operand: Set exec mask to 0 or do nothing
- if (Op.getImm() & 0x80000000) {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
- .addImm(0);
- }
- } else {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
- .addImm(0)
- .addOperand(Op);
+ return;
}
+ LIS->RemoveMachineInstrFromMaps(MI);
MI.eraseFromParent();
-}
-// All currently live registers must remain so in the remainder block.
-void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
- const MachineRegisterInfo &MRI,
- const MachineInstr &MI,
- MachineBasicBlock &LoopBB,
- MachineBasicBlock &RemainderBB,
- unsigned SaveReg,
- const MachineOperand &IdxReg) {
- // Add reg defined in loop body.
- RemainderLiveRegs.addReg(SaveReg);
-
- if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
- if (!Val->isUndef()) {
- RemainderLiveRegs.addReg(Val->getReg());
- LoopBB.addLiveIn(Val->getReg());
- }
- }
+ LIS->InsertMachineInstrInMaps(*OrSaveExec);
- for (unsigned Reg : RemainderLiveRegs) {
- if (MRI.isAllocatable(Reg))
- RemainderBB.addLiveIn(Reg);
- }
+ LIS->InsertMachineInstrInMaps(*Xor);
+ LIS->InsertMachineInstrInMaps(*Branch);
- const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
- if (!Src->isUndef())
- LoopBB.addLiveIn(Src->getReg());
+ // src reg is tied to dst reg.
+ LIS->removeInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(CopyReg);
+ if (ExecModified)
+ LIS->createAndComputeVirtRegInterval(SaveReg);
- if (!IdxReg.isUndef())
- LoopBB.addLiveIn(IdxReg.getReg());
- LoopBB.sortUniqueLiveIns();
+ // Let this be recomputed.
+ LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
}
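// [Editor's note] Illustrative expansion of
//   %dst = SI_ELSE %saved, %endif, ExecModified
// with hypothetical virtual register names:
//   %copy = COPY %saved                  ; at block start, before phis/spills
//   %save = S_OR_SAVEEXEC_B64 %copy
//   %dst  = S_AND_B64 $exec, %save       ; only when exec was modified
//   $exec = S_XOR_B64_term $exec, %dst   ; switch to the 'else' lanes
//   SI_MASK_BRANCH %endif
// When exec was not modified, %save and %dst are the same register.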
-void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
- DebugLoc DL,
- MachineInstr *MovRel,
- const MachineOperand &IdxReg,
- int Offset) {
- MachineBasicBlock::iterator I = LoopBB.begin();
-
- // Read the next variant into VCC (lower 32 bits) <- also loop target
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
- .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
-
- // Move index from VCC into M0
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(AMDGPU::VCC_LO);
-
- // Compare the just read M0 value to all possible Idx values
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
- .addReg(AMDGPU::M0)
- .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
-
- // Update EXEC, save the original EXEC value to VCC
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
-
- if (Offset != 0) {
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addReg(AMDGPU::M0)
- .addImm(Offset);
- }
-
- // Do the actual move
- LoopBB.insert(I, MovRel);
+void SILowerControlFlow::emitBreak(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Dst = MI.getOperand(0).getReg();
- // Update EXEC, switch all done bits to 0 and all todo bits to 1
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+ MachineInstr *Or =
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
.addReg(AMDGPU::EXEC)
- .addReg(AMDGPU::VCC);
+ .addOperand(MI.getOperand(1));
- // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addMBB(&LoopBB);
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *Or);
+ MI.eraseFromParent();
}
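// [Editor's note] SI_BREAK lowers to a plain S_OR_B64: the lanes still
// active in $exec are OR'd into the accumulated break mask, and emitLoop's
// S_ANDN2_B64_term later strips exactly those lanes from exec on the loop
// back-edge.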
-MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
- MachineFunction *MF = MBB.getParent();
-
- MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
- MachineFunction::iterator MBBI(MBB);
- ++MBBI;
-
- MF->insert(MBBI, SkipBB);
-
- return SkipBB;
+void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
+ MI.setDesc(TII->get(AMDGPU::S_OR_B64));
}
-std::pair<MachineBasicBlock *, MachineBasicBlock *>
-SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) {
- MachineFunction *MF = MBB.getParent();
-
- // To insert the loop we need to split the block. Move everything after this
- // point to a new block, and insert a new empty block between the two.
- MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
- MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
- MachineFunction::iterator MBBI(MBB);
- ++MBBI;
-
- MF->insert(MBBI, LoopBB);
- MF->insert(MBBI, RemainderBB);
-
- // Move the rest of the block into a new block.
- RemainderBB->transferSuccessors(&MBB);
- RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
-
- MBB.addSuccessor(LoopBB);
-
- return std::make_pair(LoopBB, RemainderBB);
+void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
+ MI.setDesc(TII->get(AMDGPU::S_OR_B64));
}
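// [Editor's note] SI_IF_BREAK and SI_ELSE_BREAK only swap the descriptor:
// setDesc turns the pseudo into an S_OR_B64 in place, reusing its existing
// operands, so nothing is rebuilt and no LiveIntervals update is needed.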
-// Returns true if a new block was inserted.
-bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
+void SILowerControlFlow::emitLoop(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- MachineBasicBlock::iterator I(&MI);
+ const DebugLoc &DL = MI.getDebugLoc();
- const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ MachineInstr *AndN2 =
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addOperand(MI.getOperand(0));
- if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
- if (Offset != 0) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
- .addImm(Offset);
- } else {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
- }
+ MachineInstr *Branch =
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addOperand(MI.getOperand(1));
- MBB.insert(I, MovRel);
- MI.eraseFromParent();
- return false;
+ if (LIS) {
+ LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
+ LIS->InsertMachineInstrInMaps(*Branch);
}
- MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
- SaveOp->setIsDead(false);
- unsigned Save = SaveOp->getReg();
-
- // Reading from a VGPR requires looping over all workitems in the wavefront.
- assert(AMDGPU::SReg_64RegClass.contains(Save) &&
- AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));
-
- // Save the EXEC mask
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
- .addReg(AMDGPU::EXEC);
-
- LivePhysRegs RemainderLiveRegs(TRI);
-
- RemainderLiveRegs.addLiveOuts(MBB);
-
- MachineBasicBlock *LoopBB;
- MachineBasicBlock *RemainderBB;
-
- std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);
-
- for (const MachineInstr &Inst : reverse(*RemainderBB))
- RemainderLiveRegs.stepBackward(Inst);
-
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- LoopBB->addSuccessor(RemainderBB);
- LoopBB->addSuccessor(LoopBB);
-
- splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
- *RemainderBB, Save, *Idx);
-
- emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);
-
- MachineBasicBlock::iterator First = RemainderBB->begin();
- BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
- .addReg(Save);
-
MI.eraseFromParent();
- return true;
-}
-
-/// \param @VecReg The register which holds element zero of the vector being
-/// addressed into.
-//
-/// \param[in] @Idx The index operand from the movrel instruction. This must be
-// a register, but may be NoRegister.
-///
-/// \param[in] @Offset As an input, this is the constant offset part of the
-// indirect Index. e.g. v0 = v[VecReg + Offset] As an output, this is a constant
-// value that needs to be added to the value stored in M0.
-std::pair<unsigned, int>
-SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
- unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
- if (!SubReg)
- SubReg = VecReg;
-
- const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
- const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
- int NumElts = SuperRC->getSize() / RC->getSize();
-
- int BaseRegIdx = TRI->getHWRegIndex(SubReg);
-
- // Skip out of bounds offsets, or else we would end up using an undefined
- // register.
- if (Offset >= NumElts)
- return std::make_pair(RC->getRegister(BaseRegIdx), Offset);
-
- int RegIdx = BaseRegIdx + Offset;
- if (RegIdx < 0) {
- Offset = RegIdx;
- RegIdx = 0;
- } else {
- Offset = 0;
- }
-
- unsigned Reg = RC->getRegister(RegIdx);
- return std::make_pair(Reg, Offset);
}
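// [Editor's note] Sketch of the back-edge lowering, operands illustrative:
//   $exec = S_ANDN2_B64_term $exec, %breakmask   ; drop lanes that broke out
//   S_CBRANCH_EXECNZ %loop_header                ; iterate while lanes remain
// The _term form keeps the exec update a block terminator, mirroring the
// S_MOV_B64_term used in emitIf.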
-// Return true if a new block was inserted.
-bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
+void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- unsigned Dst = MI.getOperand(0).getReg();
- const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
- int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
- unsigned Reg;
-
- std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);
+ MachineBasicBlock::iterator InsPt = MBB.begin();
+ MachineInstr *NewMI =
+ BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addOperand(MI.getOperand(0));
- const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
- if (Idx->getReg() == AMDGPU::NoRegister) {
- // Only had a constant offset, copy the register directly.
- BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
- .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
- MI.eraseFromParent();
- return false;
- }
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
- MachineInstr *MovRel =
- BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
- .addReg(SrcVec->getReg(), RegState::Implicit);
+ MI.eraseFromParent();
- return loadM0(MI, MovRel, Offset);
+ if (LIS)
+ LIS->handleMove(*NewMI);
}
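// [Editor's note] The restore
//   $exec = S_OR_B64 $exec, %saved
// is placed at MBB.begin(), ahead of phis and any spill code, rather than at
// the pseudo's own position; ReplaceMachineInstrInMaps plus handleMove then
// informs LiveIntervals of that relocation.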
-// Return true if a new block was inserted.
-bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- const DebugLoc &DL = MI.getDebugLoc();
-
- unsigned Dst = MI.getOperand(0).getReg();
- int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
- unsigned Reg;
-
- const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
- std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);
-
- MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
- if (Idx->getReg() == AMDGPU::NoRegister) {
- // Only had a constant offset, copy the register directly.
- BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
- .addOperand(*Val);
- MI.eraseFromParent();
- return false;
+// Returns replacement operands for a logical operation: either the single
+// operand itself (e.g. exec or an opaque value), or two operands if the
+// source was another equivalent operation.
+void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
+ SmallVectorImpl<MachineOperand> &Src) const {
+ MachineOperand &Op = MI.getOperand(OpNo);
+ if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+ Src.push_back(Op);
+ return;
}
- MachineInstr *MovRel =
- BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
- .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
- .addReg(Dst, RegState::Implicit);
+ MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+ if (!Def || Def->getParent() != MI.getParent() ||
+ !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
+ return;
- return loadM0(MI, MovRel, Offset);
+ // Make sure we do not modify exec between def and use. A copy with an
+ // implicitly defined exec inserted earlier is an exception: it does not
+ // really modify exec.
+ for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
+ if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
+ !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
+ return;
+
+ for (const auto &SrcOp : Def->explicit_operands())
+ if (SrcOp.isUse() && (!SrcOp.isReg() ||
+ TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
+ SrcOp.getReg() == AMDGPU::EXEC))
+ Src.push_back(SrcOp);
+}
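// [Editor's note] After the two findMaskOperands calls in combineMasks
// below, Src holds either a single opaque operand or 1 + 2 = 3 operands;
// only the three-operand case is foldable.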
+
+// Search for and combine pairs of equivalent instructions, like
+// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
+// S_OR_B64 x, (S_OR_B64 x, y) => S_OR_B64 x, y
+// where one of the operands is the exec mask.
+void SILowerControlFlow::combineMasks(MachineInstr &MI) {
+ assert(MI.getNumExplicitOperands() == 3);
+ SmallVector<MachineOperand, 4> Ops;
+ unsigned OpToReplace = 1;
+ findMaskOperands(MI, 1, Ops);
+ if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
+ findMaskOperands(MI, 2, Ops);
+ if (Ops.size() != 3) return;
+
+ unsigned UniqueOpndIdx;
+ if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
+ else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+ else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+ else return;
+
+ unsigned Reg = MI.getOperand(OpToReplace).getReg();
+ MI.RemoveOperand(OpToReplace);
+ MI.addOperand(Ops[UniqueOpndIdx]);
+ if (MRI->use_empty(Reg))
+ MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}
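// [Editor's note] A concrete instance of the fold, with illustrative vregs:
//   %a = S_AND_B64 $exec, %x
//   %b = S_AND_B64 $exec, %a
// combineMasks rewrites the second instruction to
//   %b = S_AND_B64 $exec, %x
// and erases the now-unused definition of %a.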
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
@@ -688,148 +402,66 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
- bool HaveKill = false;
- bool NeedFlat = false;
- unsigned Depth = 0;
+ // This doesn't actually need LiveIntervals, but we can preserve them.
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
+ MRI = &MF.getRegInfo();
MachineFunction::iterator NextBB;
-
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; BI = NextBB) {
NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;
- MachineBasicBlock *EmptyMBBAtEnd = nullptr;
- MachineBasicBlock::iterator I, Next;
- bool ExecModified = false;
+ MachineBasicBlock::iterator I, Next, Last;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
Next = std::next(I);
-
MachineInstr &MI = *I;
- // Flat uses m0 in case it needs to access LDS.
- if (TII->isFLAT(MI))
- NeedFlat = true;
-
- if (I->modifiesRegister(AMDGPU::EXEC, TRI))
- ExecModified = true;
-
switch (MI.getOpcode()) {
- default: break;
- case AMDGPU::SI_IF:
- ++Depth;
- If(MI);
- break;
-
- case AMDGPU::SI_ELSE:
- Else(MI, ExecModified);
- break;
-
- case AMDGPU::SI_BREAK:
- Break(MI);
- break;
-
- case AMDGPU::SI_IF_BREAK:
- IfBreak(MI);
- break;
-
- case AMDGPU::SI_ELSE_BREAK:
- ElseBreak(MI);
- break;
-
- case AMDGPU::SI_LOOP:
- ++Depth;
- Loop(MI);
- break;
-
- case AMDGPU::SI_END_CF:
- if (--Depth == 0 && HaveKill) {
- HaveKill = false;
- // TODO: Insert skip if exec is 0?
- }
-
- EndCf(MI);
- break;
-
- case AMDGPU::SI_KILL_TERMINATOR:
- if (Depth == 0) {
- if (skipIfDead(MI, *NextBB)) {
- NextBB = std::next(BI);
- BE = MF.end();
- }
- } else
- HaveKill = true;
- Kill(MI);
- break;
-
- case AMDGPU::S_BRANCH:
- Branch(MI);
- break;
-
- case AMDGPU::SI_INDIRECT_SRC_V1:
- case AMDGPU::SI_INDIRECT_SRC_V2:
- case AMDGPU::SI_INDIRECT_SRC_V4:
- case AMDGPU::SI_INDIRECT_SRC_V8:
- case AMDGPU::SI_INDIRECT_SRC_V16:
- if (indirectSrc(MI)) {
- // The block was split at this point. We can safely skip the middle
- // inserted block to the following which contains the rest of this
- // block's instructions.
- NextBB = std::next(BI);
- BE = MF.end();
- Next = MBB.end();
- }
-
- break;
-
- case AMDGPU::SI_INDIRECT_DST_V1:
- case AMDGPU::SI_INDIRECT_DST_V2:
- case AMDGPU::SI_INDIRECT_DST_V4:
- case AMDGPU::SI_INDIRECT_DST_V8:
- case AMDGPU::SI_INDIRECT_DST_V16:
- if (indirectDst(MI)) {
- // The block was split at this point. We can safely skip the middle
- // inserted block to the following which contains the rest of this
- // block's instructions.
- NextBB = std::next(BI);
- BE = MF.end();
- Next = MBB.end();
- }
-
- break;
-
- case AMDGPU::SI_RETURN: {
- assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
-
- // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
- // because external bytecode will be appended at the end.
- if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
- // SI_RETURN is not the last instruction. Add an empty block at
- // the end and jump there.
- if (!EmptyMBBAtEnd) {
- EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
- MF.insert(MF.end(), EmptyMBBAtEnd);
- }
-
- MBB.addSuccessor(EmptyMBBAtEnd);
- BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
- .addMBB(EmptyMBBAtEnd);
- I->eraseFromParent();
- }
- break;
- }
+ case AMDGPU::SI_IF:
+ emitIf(MI);
+ break;
+
+ case AMDGPU::SI_ELSE:
+ emitElse(MI);
+ break;
+
+ case AMDGPU::SI_BREAK:
+ emitBreak(MI);
+ break;
+
+ case AMDGPU::SI_IF_BREAK:
+ emitIfBreak(MI);
+ break;
+
+ case AMDGPU::SI_ELSE_BREAK:
+ emitElseBreak(MI);
+ break;
+
+ case AMDGPU::SI_LOOP:
+ emitLoop(MI);
+ break;
+
+ case AMDGPU::SI_END_CF:
+ emitEndCf(MI);
+ break;
+
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_OR_B64:
+ // Clean up bit manipulations on the exec mask
+ combineMasks(MI);
+ Last = I;
+ continue;
+
+ default:
+ Last = I;
+ continue;
}
- }
- }
- if (NeedFlat && MFI->IsKernel) {
- // TODO: What to use with function calls?
- // We will need to Initialize the flat scratch register pair.
- if (NeedFlat)
- MFI->setHasFlatInstructions(true);
+ // Replay newly inserted code to combine masks
+ Next = (Last == MBB.end()) ? MBB.begin() : Last;
+ }
}
return true;