diff options
Diffstat (limited to 'contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp | 86 |
1 files changed, 53 insertions, 33 deletions
diff --git a/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp index 75b5a5e..068b22f 100644 --- a/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp +++ b/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp @@ -49,8 +49,10 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -86,7 +88,6 @@ private: void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); - void InitM0ForLDS(MachineBasicBlock::iterator MI); void LoadM0(MachineInstr &MI, MachineInstr *MovRel); void IndirectSrc(MachineInstr &MI); void IndirectDst(MachineInstr &MI); @@ -307,10 +308,9 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { #endif // Clear this thread from the exec mask if the operand is negative - if ((Op.isImm() || Op.isFPImm())) { + if ((Op.isImm())) { // Constant operand: Set exec mask to 0 or do nothing - if (Op.isImm() ? (Op.getImm() & 0x80000000) : - Op.getFPImm()->isNegative()) { + if (Op.getImm() & 0x80000000) { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) .addImm(0); } @@ -323,14 +323,6 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { MI.eraseFromParent(); } -/// The m0 register stores the maximum allowable address for LDS reads and -/// writes. Its value must be at least the size in bytes of LDS allocated by -/// the shader. For simplicity, we set it to the maximum possible value. -void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) { - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), - AMDGPU::M0).addImm(0xffffffff); -} - void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { MachineBasicBlock &MBB = *MI.getParent(); @@ -347,7 +339,7 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { } else { assert(AMDGPU::SReg_64RegClass.contains(Save)); - assert(AMDGPU::VReg_32RegClass.contains(Idx)); + assert(AMDGPU::VGPR_32RegClass.contains(Idx)); // Save the EXEC mask BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) @@ -389,12 +381,6 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) { .addReg(Save); } - // FIXME: Are there any values other than the LDS address clamp that need to - // be stored in the m0 register and may be live for more than a few - // instructions? If so, we should save the m0 register at the beginning - // of this function and restore it here. - // FIXME: Add support for LDS direct loads. - InitM0ForLDS(&MI); MI.eraseFromParent(); } @@ -442,13 +428,14 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { } bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo()); - TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo()); + TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + TRI = + static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); bool HaveKill = false; - bool NeedM0 = false; bool NeedWQM = false; + bool NeedFlat = false; unsigned Depth = 0; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); @@ -460,10 +447,12 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; - if (TII->isDS(MI.getOpcode())) { - NeedM0 = true; + if (TII->isDS(MI.getOpcode())) NeedWQM = true; - } + + // Flat uses m0 in case it needs to access LDS. + if (TII->isFLAT(MI.getOpcode())) + NeedFlat = true; switch (MI.getOpcode()) { default: break; @@ -530,23 +519,54 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::V_INTERP_MOV_F32: NeedWQM = true; break; - } } } - if (NeedM0) { - MachineBasicBlock &MBB = MF.front(); - // Initialize M0 to a value that won't cause LDS access to be discarded - // due to offset clamping - InitM0ForLDS(MBB.getFirstNonPHI()); - } - if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { MachineBasicBlock &MBB = MF.front(); BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC).addReg(AMDGPU::EXEC); } + // FIXME: This seems inappropriate to do here. + if (NeedFlat && MFI->IsKernel) { + // Insert the prologue initializing the SGPRs pointing to the scratch space + // for flat accesses. + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + // TODO: What to use with function calls? + + // FIXME: This is reporting stack size that is used in a scratch buffer + // rather than registers as well. + uint64_t StackSizeBytes = FrameInfo->getStackSize(); + + int IndirectBegin + = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF); + // Convert register index to 256-byte unit. + uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); + + assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && + "Stack limits should be smaller than 16-bits"); + + // Initialize the flat scratch register pair. + // TODO: Can we use one s_mov_b64 here? + + // Offset is in units of 256-bytes. + MachineBasicBlock &MBB = MF.front(); + DebugLoc NoDL; + MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); + const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); + + assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes)); + + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) + .addImm(StackOffset); + + // Documentation says size is "per-thread scratch size in bytes" + BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) + .addImm(StackSizeBytes); + } + return true; } |