Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 930
1 file changed, 281 insertions(+), 649 deletions(-)
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index ee1d5da..7ed18f2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -58,7 +58,6 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Constants.h"
 
 using namespace llvm;
 
@@ -68,63 +67,50 @@ namespace {
 
 class SILowerControlFlow : public MachineFunctionPass {
 private:
-  static const unsigned SkipThreshold = 12;
-
   const SIRegisterInfo *TRI;
   const SIInstrInfo *TII;
+  LiveIntervals *LIS;
+  MachineRegisterInfo *MRI;
 
-  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
-
-  void Skip(MachineInstr &From, MachineOperand &To);
-  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
-
-  void If(MachineInstr &MI);
-  void Else(MachineInstr &MI, bool ExecModified);
-  void Break(MachineInstr &MI);
-  void IfBreak(MachineInstr &MI);
-  void ElseBreak(MachineInstr &MI);
-  void Loop(MachineInstr &MI);
-  void EndCf(MachineInstr &MI);
-
-  void Kill(MachineInstr &MI);
-  void Branch(MachineInstr &MI);
-
-  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
-
-  std::pair<MachineBasicBlock *, MachineBasicBlock *>
-  splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+  void emitIf(MachineInstr &MI);
+  void emitElse(MachineInstr &MI);
+  void emitBreak(MachineInstr &MI);
+  void emitIfBreak(MachineInstr &MI);
+  void emitElseBreak(MachineInstr &MI);
+  void emitLoop(MachineInstr &MI);
+  void emitEndCf(MachineInstr &MI);
 
-  void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
-                               const MachineRegisterInfo &MRI,
-                               const MachineInstr &MI,
-                               MachineBasicBlock &LoopBB,
-                               MachineBasicBlock &RemainderBB,
-                               unsigned SaveReg,
-                               const MachineOperand &IdxReg);
+  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
+                        SmallVectorImpl<MachineOperand> &Src) const;
 
-  void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
-                              MachineInstr *MovRel,
-                              const MachineOperand &IdxReg,
-                              int Offset);
-
-  bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
-  std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg,
-                                                       int Offset) const;
-  bool indirectSrc(MachineInstr &MI);
-  bool indirectDst(MachineInstr &MI);
+  void combineMasks(MachineInstr &MI);
 
 public:
   static char ID;
 
   SILowerControlFlow() :
-    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
+    MachineFunctionPass(ID),
+    TRI(nullptr),
+    TII(nullptr),
+    LIS(nullptr),
+    MRI(nullptr) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
-  const char *getPassName() const override {
+  StringRef getPassName() const override {
     return "SI Lower control flow pseudo instructions";
   }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // Should preserve the same set that TwoAddressInstructions does.
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreservedID(LiveVariablesID);
+    AU.addPreservedID(MachineLoopInfoID);
+    AU.addPreservedID(MachineDominatorsID);
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
 };
 
 } // End anonymous namespace
 
@@ -132,555 +118,283 @@ public:
 char SILowerControlFlow::ID = 0;
 
 INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
-               "SI lower control flow", false, false)
+                "SI lower control flow", false, false)
 
-char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
+static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
+  MachineOperand &ImpDefSCC = MI.getOperand(3);
+  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
 
-
-FunctionPass *llvm::createSILowerControlFlowPass() {
-  return new SILowerControlFlow();
+  ImpDefSCC.setIsDead(IsDead);
 }
 
-static bool opcodeEmitsNoInsts(unsigned Opc) {
-  switch (Opc) {
-  case TargetOpcode::IMPLICIT_DEF:
-  case TargetOpcode::KILL:
-  case TargetOpcode::BUNDLE:
-  case TargetOpcode::CFI_INSTRUCTION:
-  case TargetOpcode::EH_LABEL:
-  case TargetOpcode::GC_LABEL:
-  case TargetOpcode::DBG_VALUE:
-    return true;
-  default:
-    return false;
-  }
-}
-
-bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
-                                    MachineBasicBlock *To) {
-  if (From->succ_empty())
-    return false;
-
-  unsigned NumInstr = 0;
-  MachineFunction *MF = From->getParent();
-
-  for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
-       MBBI != End && MBBI != ToI; ++MBBI) {
-    MachineBasicBlock &MBB = *MBBI;
-
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
-         NumInstr < SkipThreshold && I != E; ++I) {
-      if (opcodeEmitsNoInsts(I->getOpcode()))
-        continue;
-
-      // When a uniform loop is inside non-uniform control flow, the branch
-      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
-      // when EXEC = 0. We should skip the loop lest it becomes infinite.
-      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
-          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
-        return true;
-
-      if (I->isInlineAsm()) {
-        const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
-        const char *AsmStr = I->getOperand(0).getSymbolName();
-
-        // inlineasm length estimate is number of bytes assuming the longest
-        // instruction.
-        uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
-        NumInstr += MaxAsmSize / MAI->getMaxInstLength();
-      } else {
-        ++NumInstr;
-      }
+char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
-      if (NumInstr >= SkipThreshold)
-        return true;
-    }
-  }
-
-  return false;
-}
-
-void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
-
-  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
-    return;
-
-  DebugLoc DL = From.getDebugLoc();
-  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
-    .addOperand(To);
-}
-
-bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
+void SILowerControlFlow::emitIf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  MachineFunction *MF = MBB.getParent();
-
-  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
-      !shouldSkip(&MBB, &MBB.getParent()->back()))
-    return false;
-
-  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
-  MBB.addSuccessor(SkipBB);
-
   const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I(&MI);
 
-  // If the exec mask is non-zero, skip the next two instructions
-  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-    .addMBB(&NextBB);
-
-  MachineBasicBlock::iterator Insert = SkipBB->begin();
-
-  // Exec mask is zero: Export to NULL target...
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
-    .addImm(0)
-    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
-    .addImm(0)
-    .addImm(1)
-    .addImm(1)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef);
-
-  // ... and terminate wavefront.
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
-
-  return true;
-}
-
-void SILowerControlFlow::If(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Reg = MI.getOperand(0).getReg();
-  unsigned Vcc = MI.getOperand(1).getReg();
-
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
-    .addReg(Vcc);
-
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Reg);
-
-  Skip(MI, MI.getOperand(2));
+  MachineOperand &SaveExec = MI.getOperand(0);
+  MachineOperand &Cond = MI.getOperand(1);
+  assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
+         Cond.getSubReg() == AMDGPU::NoSubRegister);
 
-  // Insert a pseudo terminator to help keep the verifier happy.
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
-    .addOperand(MI.getOperand(2))
-    .addReg(Reg);
+  unsigned SaveExecReg = SaveExec.getReg();
 
-  MI.eraseFromParent();
-}
+  MachineOperand &ImpDefSCC = MI.getOperand(4);
+  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
 
-void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Src = MI.getOperand(1).getReg();
-
-  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
-          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
-    .addReg(Src); // Saved EXEC
-
-  if (ExecModified) {
-    // Adjust the saved exec to account for the modifications during the flow
-    // block that contains the ELSE. This can happen when WQM mode is switched
-    // off.
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
-      .addReg(AMDGPU::EXEC)
-      .addReg(Dst);
+  // Add an implicit def of exec to discourage scheduling VALU after this which
+  // will interfere with trying to form s_and_saveexec_b64 later.
+  unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  MachineInstr *CopyExec =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
+    .addReg(AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC, RegState::ImplicitDefine);
+
+  unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  MachineInstr *And =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), Tmp)
+    .addReg(CopyReg)
+    //.addReg(AMDGPU::EXEC)
+    .addReg(Cond.getReg());
+  setImpSCCDefDead(*And, true);
+
+  MachineInstr *Xor =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+    .addReg(Tmp)
+    .addReg(CopyReg);
+  setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+
+  // Use a copy that is a terminator to get correct spill code placement with
+  // fast regalloc.
+  MachineInstr *SetExec =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64_term), AMDGPU::EXEC)
+    .addReg(Tmp, RegState::Kill);
+
+  // Insert a pseudo terminator to help keep the verifier happy. This will also
+  // be used later when inserting skips.
+  MachineInstr *NewBr =
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    .addOperand(MI.getOperand(2));
+
+  if (!LIS) {
+    MI.eraseFromParent();
+    return;
   }
 
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Dst);
+  LIS->InsertMachineInstrInMaps(*CopyExec);
 
-  Skip(MI, MI.getOperand(2));
+  // Replace with the and so we don't need to fix the live interval for the
+  // condition register.
+  LIS->ReplaceMachineInstrInMaps(MI, *And);
 
-  // Insert a pseudo terminator to help keep the verifier happy.
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
-    .addOperand(MI.getOperand(2))
-    .addReg(Dst);
+  LIS->InsertMachineInstrInMaps(*Xor);
+  LIS->InsertMachineInstrInMaps(*SetExec);
+  LIS->InsertMachineInstrInMaps(*NewBr);
+
+  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
 
   MI.eraseFromParent();
-}
-
-void SILowerControlFlow::Break(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-
-  unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Src = MI.getOperand(1).getReg();
-
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Src);
-
-  MI.eraseFromParent();
+  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
+  // hard to add another def here but I'm not sure how to correctly update the
  // valno.
+  LIS->removeInterval(SaveExecReg);
+  LIS->createAndComputeVirtRegInterval(SaveExecReg);
+  LIS->createAndComputeVirtRegInterval(Tmp);
+  LIS->createAndComputeVirtRegInterval(CopyReg);
 }
 
-void SILowerControlFlow::IfBreak(MachineInstr &MI) {
+void SILowerControlFlow::emitElse(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-
-  unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Vcc = MI.getOperand(1).getReg();
-  unsigned Src = MI.getOperand(2).getReg();
-
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-    .addReg(Vcc)
-    .addReg(Src);
-
-  MI.eraseFromParent();
-}
+  const DebugLoc &DL = MI.getDebugLoc();
 
-void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
+  unsigned DstReg = MI.getOperand(0).getReg();
+  assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
 
-  unsigned Dst = MI.getOperand(0).getReg();
-  unsigned Saved = MI.getOperand(1).getReg();
-  unsigned Src = MI.getOperand(2).getReg();
+  bool ExecModified = MI.getOperand(3).getImm() != 0;
+  MachineBasicBlock::iterator Start = MBB.begin();
 
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-    .addReg(Saved)
-    .addReg(Src);
+  // We are running before TwoAddressInstructions, and si_else's operands are
+  // tied. In order to correctly tie the registers, split this into a copy of
+  // the src like it does.
+  unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
+    .addOperand(MI.getOperand(1)); // Saved EXEC
 
-  MI.eraseFromParent();
-}
+  // This must be inserted before phis and any spill code inserted before the
+  // else.
+  unsigned SaveReg = ExecModified ?
+    MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass) : DstReg;
+  MachineInstr *OrSaveExec =
+    BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), SaveReg)
+    .addReg(CopyReg);
 
-void SILowerControlFlow::Loop(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Src = MI.getOperand(0).getReg();
+  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
 
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Src);
+  MachineBasicBlock::iterator ElsePt(MI);
 
-  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-    .addOperand(MI.getOperand(1));
+  if (ExecModified) {
+    MachineInstr *And =
+      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
+      .addReg(AMDGPU::EXEC)
+      .addReg(SaveReg);
 
-  MI.eraseFromParent();
-}
+    if (LIS)
+      LIS->InsertMachineInstrInMaps(*And);
+  }
 
-void SILowerControlFlow::EndCf(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  unsigned Reg = MI.getOperand(0).getReg();
+  MachineInstr *Xor =
+    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addReg(DstReg);
 
-  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
-          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
-    .addReg(AMDGPU::EXEC)
-    .addReg(Reg);
+  MachineInstr *Branch =
+    BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+    .addMBB(DestBB);
 
-  MI.eraseFromParent();
-}
-
-void SILowerControlFlow::Branch(MachineInstr &MI) {
-  MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
-  if (MBB == MI.getParent()->getNextNode())
+  if (!LIS) {
     MI.eraseFromParent();
-
-  // If these aren't equal, this is probably an infinite loop.
-}
-
-void SILowerControlFlow::Kill(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  const MachineOperand &Op = MI.getOperand(0);
-
-#ifndef NDEBUG
-  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
-  // Kill is only allowed in pixel / geometry shaders.
-  assert(CallConv == CallingConv::AMDGPU_PS ||
-         CallConv == CallingConv::AMDGPU_GS);
-#endif
-
-  // Clear this thread from the exec mask if the operand is negative
-  if ((Op.isImm())) {
-    // Constant operand: Set exec mask to 0 or do nothing
-    if (Op.getImm() & 0x80000000) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-        .addImm(0);
-    }
-  } else {
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
-      .addImm(0)
-      .addOperand(Op);
+    return;
   }
 
+  LIS->RemoveMachineInstrFromMaps(MI);
   MI.eraseFromParent();
-}
 
-// All currently live registers must remain so in the remainder block.
-void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
-                                                 const MachineRegisterInfo &MRI,
-                                                 const MachineInstr &MI,
-                                                 MachineBasicBlock &LoopBB,
-                                                 MachineBasicBlock &RemainderBB,
-                                                 unsigned SaveReg,
-                                                 const MachineOperand &IdxReg) {
-  // Add reg defined in loop body.
-  RemainderLiveRegs.addReg(SaveReg);
-
-  if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) {
-    if (!Val->isUndef()) {
-      RemainderLiveRegs.addReg(Val->getReg());
-      LoopBB.addLiveIn(Val->getReg());
-    }
-  }
+  LIS->InsertMachineInstrInMaps(*OrSaveExec);
 
-  for (unsigned Reg : RemainderLiveRegs) {
-    if (MRI.isAllocatable(Reg))
-      RemainderBB.addLiveIn(Reg);
-  }
+  LIS->InsertMachineInstrInMaps(*Xor);
+  LIS->InsertMachineInstrInMaps(*Branch);
 
-  const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
-  if (!Src->isUndef())
-    LoopBB.addLiveIn(Src->getReg());
+  // src reg is tied to dst reg.
+  LIS->removeInterval(DstReg);
+  LIS->createAndComputeVirtRegInterval(DstReg);
+  LIS->createAndComputeVirtRegInterval(CopyReg);
+  if (ExecModified)
+    LIS->createAndComputeVirtRegInterval(SaveReg);
 
-  if (!IdxReg.isUndef())
-    LoopBB.addLiveIn(IdxReg.getReg());
-  LoopBB.sortUniqueLiveIns();
+  // Let this be recomputed.
+  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
 }
 
-void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
-                                                DebugLoc DL,
-                                                MachineInstr *MovRel,
-                                                const MachineOperand &IdxReg,
-                                                int Offset) {
-  MachineBasicBlock::iterator I = LoopBB.begin();
-
-  // Read the next variant into VCC (lower 32 bits) <- also loop target
-  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
-    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
-
-  // Move index from VCC into M0
-  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
-    .addReg(AMDGPU::VCC_LO);
-
-  // Compare the just read M0 value to all possible Idx values
-  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
-    .addReg(AMDGPU::M0)
-    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
-
-  // Update EXEC, save the original EXEC value to VCC
-  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
-    .addReg(AMDGPU::VCC);
-
-  if (Offset != 0) {
-    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
-      .addReg(AMDGPU::M0)
-      .addImm(Offset);
-  }
-
-  // Do the actual move
-  LoopBB.insert(I, MovRel);
+void SILowerControlFlow::emitBreak(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  unsigned Dst = MI.getOperand(0).getReg();
 
-  // Update EXEC, switch all done bits to 0 and all todo bits to 1
-  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+  MachineInstr *Or =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
     .addReg(AMDGPU::EXEC)
-    .addReg(AMDGPU::VCC);
+    .addOperand(MI.getOperand(1));
 
-  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
-  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-    .addMBB(&LoopBB);
+  if (LIS)
+    LIS->ReplaceMachineInstrInMaps(MI, *Or);
+
+  MI.eraseFromParent();
 }
 
-MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
-  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
-  MachineFunction *MF = MBB.getParent();
-
-  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
-  MachineFunction::iterator MBBI(MBB);
-  ++MBBI;
-
-  MF->insert(MBBI, SkipBB);
-
-  return SkipBB;
+void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
+  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
 }
 
-std::pair<MachineBasicBlock *, MachineBasicBlock *>
-SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
-                               MachineBasicBlock::iterator I) {
-  MachineFunction *MF = MBB.getParent();
-
-  // To insert the loop we need to split the block. Move everything after this
-  // point to a new block, and insert a new empty block between the two.
-  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
-  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
-  MachineFunction::iterator MBBI(MBB);
-  ++MBBI;
-
-  MF->insert(MBBI, LoopBB);
-  MF->insert(MBBI, RemainderBB);
-
-  // Move the rest of the block into a new block.
-  RemainderBB->transferSuccessors(&MBB);
-  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
-
-  MBB.addSuccessor(LoopBB);
-
-  return std::make_pair(LoopBB, RemainderBB);
+void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
+  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
 }
 
-// Returns true if a new block was inserted.
-bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
+void SILowerControlFlow::emitLoop(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
-  MachineBasicBlock::iterator I(&MI);
+  const DebugLoc &DL = MI.getDebugLoc();
 
-  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+  MachineInstr *AndN2 =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addOperand(MI.getOperand(0));
 
-  if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) {
-    if (Offset != 0) {
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
-        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()))
-        .addImm(Offset);
-    } else {
-      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
-        .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef()));
-    }
+  MachineInstr *Branch =
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addOperand(MI.getOperand(1));
 
-    MBB.insert(I, MovRel);
-    MI.eraseFromParent();
-    return false;
+  if (LIS) {
+    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
+    LIS->InsertMachineInstrInMaps(*Branch);
   }
 
-  MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
-  SaveOp->setIsDead(false);
-  unsigned Save = SaveOp->getReg();
-
-  // Reading from a VGPR requires looping over all workitems in the wavefront.
-  assert(AMDGPU::SReg_64RegClass.contains(Save) &&
-         AMDGPU::VGPR_32RegClass.contains(Idx->getReg()));
-
-  // Save the EXEC mask
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
-    .addReg(AMDGPU::EXEC);
-
-  LivePhysRegs RemainderLiveRegs(TRI);
-
-  RemainderLiveRegs.addLiveOuts(MBB);
-
-  MachineBasicBlock *LoopBB;
-  MachineBasicBlock *RemainderBB;
-
-  std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);
-
-  for (const MachineInstr &Inst : reverse(*RemainderBB))
-    RemainderLiveRegs.stepBackward(Inst);
-
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-  LoopBB->addSuccessor(RemainderBB);
-  LoopBB->addSuccessor(LoopBB);
-
-  splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
-                          *RemainderBB, Save, *Idx);
-
-  emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);
-
-  MachineBasicBlock::iterator First = RemainderBB->begin();
-  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-    .addReg(Save);
 
   MI.eraseFromParent();
-  return true;
-}
-
-/// \param @VecReg The register which holds element zero of the vector being
-/// addressed into.
-//
-/// \param[in] @Idx The index operand from the movrel instruction. This must be
-// a register, but may be NoRegister.
-///
-/// \param[in] @Offset As an input, this is the constant offset part of the
-// indirect Index. e.g. v0 = v[VecReg + Offset] As an output, this is a constant
-// value that needs to be added to the value stored in M0.
-std::pair<unsigned, int>
-SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const {
-  unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
-  if (!SubReg)
-    SubReg = VecReg;
-
-  const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg);
-  const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
-  int NumElts = SuperRC->getSize() / RC->getSize();
-
-  int BaseRegIdx = TRI->getHWRegIndex(SubReg);
-
-  // Skip out of bounds offsets, or else we would end up using an undefined
-  // register.
-  if (Offset >= NumElts)
-    return std::make_pair(RC->getRegister(BaseRegIdx), Offset);
-
-  int RegIdx = BaseRegIdx + Offset;
-  if (RegIdx < 0) {
-    Offset = RegIdx;
-    RegIdx = 0;
-  } else {
-    Offset = 0;
-  }
-
-  unsigned Reg = RC->getRegister(RegIdx);
-  return std::make_pair(Reg, Offset);
 }
 
-// Return true if a new block was inserted.
-bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
+void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
 
-  unsigned Dst = MI.getOperand(0).getReg();
-  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
-  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
-  unsigned Reg;
-
-  std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset);
+  MachineBasicBlock::iterator InsPt = MBB.begin();
+  MachineInstr *NewMI =
+    BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+    .addReg(AMDGPU::EXEC)
+    .addOperand(MI.getOperand(0));
 
-  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
-  if (Idx->getReg() == AMDGPU::NoRegister) {
-    // Only had a constant offset, copy the register directly.
-    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
-      .addReg(Reg, getUndefRegState(SrcVec->isUndef()));
-    MI.eraseFromParent();
-    return false;
-  }
+  if (LIS)
+    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
 
-  MachineInstr *MovRel =
-    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
-    .addReg(Reg, getUndefRegState(SrcVec->isUndef()))
-    .addReg(SrcVec->getReg(), RegState::Implicit);
+  MI.eraseFromParent();
 
-  return loadM0(MI, MovRel, Offset);
+  if (LIS)
+    LIS->handleMove(*NewMI);
 }
 
-// Return true if a new block was inserted.
-bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  const DebugLoc &DL = MI.getDebugLoc();
-
-  unsigned Dst = MI.getOperand(0).getReg();
-  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
-  unsigned Reg;
-
-  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
-  std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset);
-
-  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
-  if (Idx->getReg() == AMDGPU::NoRegister) {
-    // Only had a constant offset, copy the register directly.
-    BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg)
-      .addOperand(*Val);
-    MI.eraseFromParent();
-    return false;
+// Returns replace operands for a logical operation, either single result
+// for exec or two operands if source was another equivalent operation.
+void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
+                                          SmallVectorImpl<MachineOperand> &Src) const {
+  MachineOperand &Op = MI.getOperand(OpNo);
+  if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+    Src.push_back(Op);
+    return;
   }
 
-  MachineInstr *MovRel =
-    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg)
-    .addReg(Val->getReg(), getUndefRegState(Val->isUndef()))
-    .addReg(Dst, RegState::Implicit);
+  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+  if (!Def || Def->getParent() != MI.getParent() ||
+      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
+    return;
 
-  return loadM0(MI, MovRel, Offset);
-}
+  // Make sure we do not modify exec between def and use.
+  // A copy with implicitly defined exec inserted earlier is an exclusion;
+  // it does not really modify exec.
+  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
+    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
+        !(I->isCopy() && I->getOperand(0).getReg() != AMDGPU::EXEC))
+      return;
+
+  for (const auto &SrcOp : Def->explicit_operands())
+    if (SrcOp.isUse() && (!SrcOp.isReg() ||
+        TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
+        SrcOp.getReg() == AMDGPU::EXEC))
+      Src.push_back(SrcOp);
+}
+
+// Search and combine pairs of equivalent instructions, like
+// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
+// S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
+// One of the operands is exec mask.
+void SILowerControlFlow::combineMasks(MachineInstr &MI) {
+  assert(MI.getNumExplicitOperands() == 3);
+  SmallVector<MachineOperand, 4> Ops;
+  unsigned OpToReplace = 1;
+  findMaskOperands(MI, 1, Ops);
+  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
+  findMaskOperands(MI, 2, Ops);
+  if (Ops.size() != 3) return;
+
+  unsigned UniqueOpndIdx;
+  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
+  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
+  else return;
+
+  unsigned Reg = MI.getOperand(OpToReplace).getReg();
+  MI.RemoveOperand(OpToReplace);
+  MI.addOperand(Ops[UniqueOpndIdx]);
+  if (MRI->use_empty(Reg))
+    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
+}
 
 bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
@@ -688,148 +402,66 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
 
-  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
-  bool HaveKill = false;
-  bool NeedFlat = false;
-  unsigned Depth = 0;
+  // This doesn't actually need LiveIntervals, but we can preserve them.
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
+  MRI = &MF.getRegInfo();
 
   MachineFunction::iterator NextBB;
-
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; BI = NextBB) {
     NextBB = std::next(BI);
     MachineBasicBlock &MBB = *BI;
 
-    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
-    MachineBasicBlock::iterator I, Next;
-    bool ExecModified = false;
+    MachineBasicBlock::iterator I, Next, Last;
 
-    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+    for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
       Next = std::next(I);
-
       MachineInstr &MI = *I;
 
-      // Flat uses m0 in case it needs to access LDS.
-      if (TII->isFLAT(MI))
-        NeedFlat = true;
-
-      if (I->modifiesRegister(AMDGPU::EXEC, TRI))
-        ExecModified = true;
-
       switch (MI.getOpcode()) {
-      default: break;
-      case AMDGPU::SI_IF:
-        ++Depth;
-        If(MI);
-        break;
-
-      case AMDGPU::SI_ELSE:
-        Else(MI, ExecModified);
-        break;
-
-      case AMDGPU::SI_BREAK:
-        Break(MI);
-        break;
-
-      case AMDGPU::SI_IF_BREAK:
-        IfBreak(MI);
-        break;
-
-      case AMDGPU::SI_ELSE_BREAK:
-        ElseBreak(MI);
-        break;
-
-      case AMDGPU::SI_LOOP:
-        ++Depth;
-        Loop(MI);
-        break;
-
-      case AMDGPU::SI_END_CF:
-        if (--Depth == 0 && HaveKill) {
-          HaveKill = false;
-          // TODO: Insert skip if exec is 0?
-        }
-
-        EndCf(MI);
-        break;
-
-      case AMDGPU::SI_KILL_TERMINATOR:
-        if (Depth == 0) {
-          if (skipIfDead(MI, *NextBB)) {
-            NextBB = std::next(BI);
-            BE = MF.end();
-          }
-        } else
-          HaveKill = true;
-        Kill(MI);
-        break;
-
-      case AMDGPU::S_BRANCH:
-        Branch(MI);
-        break;
-
-      case AMDGPU::SI_INDIRECT_SRC_V1:
-      case AMDGPU::SI_INDIRECT_SRC_V2:
-      case AMDGPU::SI_INDIRECT_SRC_V4:
-      case AMDGPU::SI_INDIRECT_SRC_V8:
-      case AMDGPU::SI_INDIRECT_SRC_V16:
-        if (indirectSrc(MI)) {
-          // The block was split at this point. We can safely skip the middle
-          // inserted block to the following which contains the rest of this
-          // block's instructions.
-          NextBB = std::next(BI);
-          BE = MF.end();
-          Next = MBB.end();
-        }
-
-        break;
-
-      case AMDGPU::SI_INDIRECT_DST_V1:
-      case AMDGPU::SI_INDIRECT_DST_V2:
-      case AMDGPU::SI_INDIRECT_DST_V4:
-      case AMDGPU::SI_INDIRECT_DST_V8:
-      case AMDGPU::SI_INDIRECT_DST_V16:
-        if (indirectDst(MI)) {
-          // The block was split at this point. We can safely skip the middle
-          // inserted block to the following which contains the rest of this
-          // block's instructions.
-          NextBB = std::next(BI);
-          BE = MF.end();
-          Next = MBB.end();
-        }
-
-        break;
-
-      case AMDGPU::SI_RETURN: {
-        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
-
-        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
-        // because external bytecode will be appended at the end.
-        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
-          // SI_RETURN is not the last instruction. Add an empty block at
-          // the end and jump there.
-          if (!EmptyMBBAtEnd) {
-            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
-            MF.insert(MF.end(), EmptyMBBAtEnd);
-          }
-
-          MBB.addSuccessor(EmptyMBBAtEnd);
-          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
-            .addMBB(EmptyMBBAtEnd);
-          I->eraseFromParent();
-        }
-        break;
-      }
+      case AMDGPU::SI_IF:
+        emitIf(MI);
+        break;
+
+      case AMDGPU::SI_ELSE:
+        emitElse(MI);
+        break;
+
+      case AMDGPU::SI_BREAK:
+        emitBreak(MI);
+        break;
+
+      case AMDGPU::SI_IF_BREAK:
+        emitIfBreak(MI);
+        break;
+
+      case AMDGPU::SI_ELSE_BREAK:
+        emitElseBreak(MI);
+        break;
+
+      case AMDGPU::SI_LOOP:
+        emitLoop(MI);
+        break;
+
+      case AMDGPU::SI_END_CF:
+        emitEndCf(MI);
+        break;
+
+      case AMDGPU::S_AND_B64:
+      case AMDGPU::S_OR_B64:
+        // Cleanup bit manipulations on exec mask
+        combineMasks(MI);
+        Last = I;
+        continue;
+
+      default:
+        Last = I;
+        continue;
       }
-    }
-  }
 
-  if (NeedFlat && MFI->IsKernel) {
-    // TODO: What to use with function calls?
-    // We will need to Initialize the flat scratch register pair.
-    if (NeedFlat)
-      MFI->setHasFlatInstructions(true);
+      // Replay newly inserted code to combine masks
+      Next = (Last == MBB.end()) ? MBB.begin() : Last;
+    }
   }
 
   return true;
 }
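
The heart of this rewrite is ordinary lane-mask algebra: the deleted S_AND_SAVEEXEC_B64 is split into a COPY, an S_AND_B64, and an S_XOR_B64 so that every intermediate mask lives in a virtual register that LiveIntervals can track. The following standalone C++ sketch models what the lowered sequences compute, with one bit per lane in a uint64_t standing in for EXEC. It is illustrative only: the Wave struct, its method names, and the lane masks are ours, not part of the patch or of the LLVM API.

// Lane-mask model of the lowered control-flow ops (illustrative only; the
// Wave struct and its names are ours, not LLVM's). One bit = one lane.
#include <cassert>
#include <cstdint>

struct Wave {
  uint64_t Exec = ~0ull; // stands in for the 64-bit EXEC register

  // SI_IF (emitIf): activate the condition-true lanes and return the mask of
  // lanes parked until SI_END_CF. Mirrors the COPY + S_AND_B64 + S_XOR_B64 +
  // S_MOV_B64_term sequence from the patch.
  uint64_t If(uint64_t Cond) {
    uint64_t Copy = Exec;        // COPY %copy, exec
    uint64_t Tmp = Copy & Cond;  // S_AND_B64 %tmp, %copy, %cond
    uint64_t Saved = Tmp ^ Copy; // S_XOR_B64 %saveexec, %tmp, %copy
    Exec = Tmp;                  // S_MOV_B64_term exec, %tmp
    return Saved;
  }

  // SI_END_CF (emitEndCf): rejoin the parked lanes via S_OR_B64.
  void EndCf(uint64_t Saved) { Exec |= Saved; }
};

int main() {
  Wave W;
  uint64_t Cond = 0x00000000ffffffffull; // lanes 0-31 take the "then" side
  uint64_t Saved = W.If(Cond);
  // The and/xor pair splits the old EXEC into two disjoint halves.
  assert(W.Exec == Cond && (W.Exec ^ Saved) == ~0ull && (W.Exec & Saved) == 0);
  W.EndCf(Saved);
  assert(W.Exec == ~0ull); // every lane active again after the join

  // combineMasks relies on this identity (one operand being exec or a copy):
  // S_AND_B64 x, (S_AND_B64 x, y) == S_AND_B64 x, y, likewise for S_OR_B64.
  uint64_t X = 0xf0f0, Y = 0xff00;
  assert((X & (X & Y)) == (X & Y) && (X | (X | Y)) == (X | Y));
  return 0;
}

The and/xor pair is the same trick the deleted S_AND_SAVEEXEC_B64 + S_XOR_B64 sequence performed, just split apart so the pieces can be scheduled and spilled like ordinary virtual-register code.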
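Loops follow the same algebra: emitBreak/emitIfBreak/emitElseBreak reduce to a single S_OR_B64 that accumulates the lanes done with the loop, and emitLoop's S_ANDN2_B64_term removes them from EXEC until none remain. A second sketch under the same caveats — the per-lane trip counts here are hypothetical, chosen only to exercise the masks:

// Break-mask model of SI_IF_BREAK + SI_LOOP (illustrative only).
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Exec = 0xf;    // four active lanes
  uint64_t BreakMask = 0; // the loop's accumulated break mask
  int Trip = 0;
  do {
    ++Trip;
    uint64_t WantsBreak = 0; // hypothetical: lane i exits after i+1 trips
    for (int Lane = 0; Lane < 4; ++Lane)
      if (Trip > Lane)
        WantsBreak |= 1ull << Lane;
    BreakMask |= Exec & WantsBreak; // S_OR_B64 %brk, %cond, %brk (emitIfBreak)
    Exec &= ~BreakMask;             // S_ANDN2_B64_term exec, exec, %brk (emitLoop)
  } while (Exec != 0);              // S_CBRANCH_EXECNZ back to the header
  assert(Trip == 4 && BreakMask == 0xf); // last lane retires on trip four
  return 0;
}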