Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1260
1 file changed, 864 insertions(+), 396 deletions(-)
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 9190819..26a8d22 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -28,6 +28,13 @@ using namespace llvm; +// Must be at least 4 to be able to branch over minimum unconditional branch +// code. This is only for making it possible to write reasonably small tests for +// long branches. +static cl::opt<unsigned> +BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), + cl::desc("Restrict range of branch instructions (DEBUG)")); + SIInstrInfo::SIInstrInfo(const SISubtarget &ST) : AMDGPUInstrInfo(ST), RI(), ST(ST) {} @@ -258,7 +265,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, } if (isMUBUF(LdSt) || isMTBUF(LdSt)) { - if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) + const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset); + if (SOffset && SOffset->isReg()) return false; const MachineOperand *AddrReg = @@ -270,6 +278,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, getNamedOperand(LdSt, AMDGPU::OpName::offset); BaseReg = AddrReg->getReg(); Offset = OffsetImm->getImm(); + + if (SOffset) // soffset can be an inline immediate. + Offset += SOffset->getImm(); + return true; } @@ -287,7 +299,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, } if (isFLAT(LdSt)) { - const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::addr); + const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); BaseReg = AddrReg->getReg(); Offset = 0; return true; @@ -302,20 +314,16 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, const MachineOperand *FirstDst = nullptr; const MachineOperand *SecondDst = nullptr; - if (isDS(FirstLdSt) && isDS(SecondLdSt)) { - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); - } - - if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); - } - if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) { FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); + } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); + } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); } if (!FirstDst || !SecondDst) @@ -342,62 +350,32 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { + const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); - // If we are trying to copy to or from SCC, there is a bug somewhere else in - // the backend. While it may be theoretically possible to do this, it should - // never be necessary. 
- assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC); - - static const int16_t Sub0_15[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, - AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, - AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, - }; - - static const int16_t Sub0_15_64[] = { - AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, - AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, - AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, - AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, - }; - - static const int16_t Sub0_7[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, - }; - - static const int16_t Sub0_7_64[] = { - AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, - AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, - }; - - static const int16_t Sub0_3[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - }; - - static const int16_t Sub0_3_64[] = { - AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, - }; - - static const int16_t Sub0_2[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, - }; - - static const int16_t Sub0_1[] = { - AMDGPU::sub0, AMDGPU::sub1, - }; + if (RC == &AMDGPU::VGPR_32RegClass) { + assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || + AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } - unsigned Opcode; - ArrayRef<int16_t> SubIndices; + if (RC == &AMDGPU::SReg_32_XM0RegClass || + RC == &AMDGPU::SReg_32RegClass) { + if (SrcReg == AMDGPU::SCC) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) + .addImm(-1) + .addImm(0); + return; + } - if (AMDGPU::SReg_32RegClass.contains(DestReg)) { assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; + } - } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) { + if (RC == &AMDGPU::SReg_64RegClass) { if (DestReg == AMDGPU::VCC) { if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) @@ -405,7 +383,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else { // FIXME: Hack until VReg_1 removed. 
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32)) + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) .addImm(0) .addReg(SrcReg, getKillRegState(KillSrc)); } @@ -417,62 +395,29 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; + } - } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B64; - SubIndices = Sub0_3_64; - - } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B64; - SubIndices = Sub0_7_64; - - } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { - assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B64; - SubIndices = Sub0_15_64; - - } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { - assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || - AMDGPU::SReg_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); + if (DestReg == AMDGPU::SCC) { + assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0); return; + } - } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_64RegClass.contains(SrcReg) || - AMDGPU::SReg_64RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_1; - - } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_2; - - } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || - AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_3; - - } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_256RegClass.contains(SrcReg) || - AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_7; - - } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) { - assert(AMDGPU::VReg_512RegClass.contains(SrcReg) || - AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::V_MOV_B32_e32; - SubIndices = Sub0_15; - - } else { - llvm_unreachable("Can't copy register!"); + unsigned EltSize = 4; + unsigned Opcode = AMDGPU::V_MOV_B32_e32; + if (RI.isSGPRClass(RC)) { + if (RC->getSize() > 4) { + Opcode = AMDGPU::S_MOV_B64; + EltSize = 8; + } else { + Opcode = AMDGPU::S_MOV_B32; + EltSize = 4; + } } + ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { @@ -497,9 +442,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } } -int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const { - const unsigned Opcode = MI.getOpcode(); - +int SIInstrInfo::commuteOpcode(unsigned Opcode) const { int NewOpc; // Try to map original to commuted opcode @@ -573,11 +516,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); DebugLoc DL = 
MBB.findDebugLoc(MI); - unsigned Size = FrameInfo->getObjectSize(FrameIndex); - unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); + unsigned Size = FrameInfo.getObjectSize(FrameIndex); + unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); MachineMemOperand *MMO @@ -587,20 +530,31 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); + // We are only allowed to create one new instruction when spilling + // registers, so we need to use pseudo instruction for spilling SGPRs. + const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize())); + + // The SGPR spill/restore instructions only work on number sgprs, so we need + // to make sure we are using the correct register class. if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) { - // m0 may not be allowed for readlane. MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); } - // We are only allowed to create one new instruction when spilling - // registers, so we need to use pseudo instruction for spilling - // SGPRs. - unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize()); - BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg, getKillRegState(isKill)) // src - .addFrameIndex(FrameIndex) // frame_idx - .addMemOperand(MMO); + MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc) + .addReg(SrcReg, getKillRegState(isKill)) // data + .addFrameIndex(FrameIndex) // addr + .addMemOperand(MMO) + .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) + .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); + // Add the scratch resource registers as implicit uses because we may end up + // needing them, and need to ensure that the reserved registers are + // correctly handled. + + if (ST.hasScalarStores()) { + // m0 is used for offset to scalar stores if used to spill. 
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + } return; } @@ -620,11 +574,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg, getKillRegState(isKill)) // src - .addFrameIndex(FrameIndex) // frame_idx - .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc - .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset - .addImm(0) // offset + .addReg(SrcReg, getKillRegState(isKill)) // data + .addFrameIndex(FrameIndex) // addr + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addImm(0) // offset .addMemOperand(MMO); } @@ -671,10 +625,10 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + MachineFrameInfo &FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); - unsigned Size = FrameInfo->getObjectSize(FrameIndex); + unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); + unsigned Size = FrameInfo.getObjectSize(FrameIndex); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); @@ -685,17 +639,22 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, if (RI.isSGPRClass(RC)) { // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. - unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize()); - + const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize())); if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) { - // m0 may not be allowed for readlane. MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } - BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // frame_idx - .addMemOperand(MMO); + MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) + .addFrameIndex(FrameIndex) // addr + .addMemOperand(MMO) + .addReg(MFI->getScratchRSrcReg(), RegState::Implicit) + .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit); + + if (ST.hasScalarStores()) { + // m0 is used for offset to scalar stores if used to spill. 
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine); + } return; } @@ -713,7 +672,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // frame_idx + .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset .addImm(0) // offset @@ -729,7 +688,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); + unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); unsigned WavefrontSize = ST.getWavefrontSize(); unsigned TIDReg = MFI->getTIDReg(); @@ -808,7 +767,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( } // Add FrameIndex to LDS offset - unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize); + unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize); BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) .addImm(LDSOffset) .addReg(TIDReg); @@ -851,7 +810,24 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { DebugLoc DL = MBB.findDebugLoc(MI); switch (MI.getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - + case AMDGPU::S_MOV_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_MOV_B64)); + break; + } + case AMDGPU::S_XOR_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_XOR_B64)); + break; + } + case AMDGPU::S_ANDN2_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_ANDN2_B64)); + break; + } case AMDGPU::V_MOV_B64_PSEUDO: { unsigned Dst = MI.getOperand(0).getReg(); unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -880,36 +856,37 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } + case AMDGPU::V_MOVRELD_B32_V1: + case AMDGPU::V_MOVRELD_B32_V2: + case AMDGPU::V_MOVRELD_B32_V4: + case AMDGPU::V_MOVRELD_B32_V8: + case AMDGPU::V_MOVRELD_B32_V16: { + const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32); + unsigned VecReg = MI.getOperand(0).getReg(); + bool IsUndef = MI.getOperand(1).isUndef(); + unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm(); + assert(VecReg == MI.getOperand(1).getReg()); + + MachineInstr *MovRel = + BuildMI(MBB, MI, DL, MovRelDesc) + .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) + .addOperand(MI.getOperand(2)) + .addReg(VecReg, RegState::ImplicitDefine) + .addReg(VecReg, RegState::Implicit | (IsUndef ? 
RegState::Undef : 0)); + + const int ImpDefIdx = + MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses(); + const int ImpUseIdx = ImpDefIdx + 1; + MovRel->tieOperands(ImpDefIdx, ImpUseIdx); - case AMDGPU::V_CNDMASK_B64_PSEUDO: { - unsigned Dst = MI.getOperand(0).getReg(); - unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); - unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); - unsigned Src0 = MI.getOperand(1).getReg(); - unsigned Src1 = MI.getOperand(2).getReg(); - const MachineOperand &SrcCond = MI.getOperand(3); - - BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) - .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) - .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) - .addReg(SrcCond.getReg()) - .addReg(Dst, RegState::Implicit | RegState::Define); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) - .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) - .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) - .addReg(SrcCond.getReg(), getKillRegState(SrcCond.isKill())) - .addReg(Dst, RegState::Implicit | RegState::Define); MI.eraseFromParent(); break; } - case AMDGPU::SI_PC_ADD_REL_OFFSET: { - const SIRegisterInfo *TRI - = static_cast<const SIRegisterInfo *>(ST.getRegisterInfo()); MachineFunction &MF = *MBB.getParent(); unsigned Reg = MI.getOperand(0).getReg(); - unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0); - unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1); + unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); // Create a bundle so these instructions won't be re-ordered by the // post-RA scheduler. @@ -921,10 +898,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) .addReg(RegLo) .addOperand(MI.getOperand(1))); - Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) - .addReg(RegHi) - .addImm(0)); + MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi); + if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE) + MIB.addImm(0); + else + MIB.addOperand(MI.getOperand(2)); + + Bundler.append(MIB); llvm::finalizeBundle(MBB, Bundler.begin()); MI.eraseFromParent(); @@ -934,91 +916,96 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } -/// Commutes the operands in the given instruction. -/// The commutable operands are specified by their indices OpIdx0 and OpIdx1. -/// -/// Do not call this method for a non-commutable instruction or for -/// non-commutable pair of operand indices OpIdx0 and OpIdx1. -/// Even though the instruction is commutable, the method may still -/// fail to commute the operands, null pointer is returned in such cases. 
-MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, - unsigned OpIdx0, - unsigned OpIdx1) const { - int CommutedOpcode = commuteOpcode(MI); - if (CommutedOpcode == -1) - return nullptr; +bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, + MachineOperand &Src0, + unsigned Src0OpName, + MachineOperand &Src1, + unsigned Src1OpName) const { + MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); + if (!Src0Mods) + return false; - int Src0Idx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); - if (!Src0.isReg()) + MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); + assert(Src1Mods && + "All commutable instructions have both src0 and src1 modifiers"); + + int Src0ModsVal = Src0Mods->getImm(); + int Src1ModsVal = Src1Mods->getImm(); + + Src1Mods->setImm(Src0ModsVal); + Src0Mods->setImm(Src1ModsVal); + return true; +} + +static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, + MachineOperand &RegOp, + MachineOperand &NonRegOp) { + unsigned Reg = RegOp.getReg(); + unsigned SubReg = RegOp.getSubReg(); + bool IsKill = RegOp.isKill(); + bool IsDead = RegOp.isDead(); + bool IsUndef = RegOp.isUndef(); + bool IsDebug = RegOp.isDebug(); + + if (NonRegOp.isImm()) + RegOp.ChangeToImmediate(NonRegOp.getImm()); + else if (NonRegOp.isFI()) + RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); + else return nullptr; - int Src1Idx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); + NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); + NonRegOp.setSubReg(SubReg); - if ((OpIdx0 != static_cast<unsigned>(Src0Idx) || - OpIdx1 != static_cast<unsigned>(Src1Idx)) && - (OpIdx0 != static_cast<unsigned>(Src1Idx) || - OpIdx1 != static_cast<unsigned>(Src0Idx))) + return &MI; +} + +MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, + unsigned Src0Idx, + unsigned Src1Idx) const { + assert(!NewMI && "this should never be used"); + + unsigned Opc = MI.getOpcode(); + int CommutedOpcode = commuteOpcode(Opc); + if (CommutedOpcode == -1) return nullptr; - MachineOperand &Src1 = MI.getOperand(Src1Idx); + assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == + static_cast<int>(Src0Idx) && + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == + static_cast<int>(Src1Idx) && + "inconsistency with findCommutedOpIndices"); - if (isVOP2(MI) || isVOPC(MI)) { - const MCInstrDesc &InstrDesc = MI.getDesc(); - // For VOP2 and VOPC instructions, any operand type is valid to use for - // src0. Make sure we can use the src0 as src1. - // - // We could be stricter here and only allow commuting if there is a reason - // to do so. i.e. if both operands are VGPRs there is no real benefit, - // although MachineCSE attempts to find matches by commuting. - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) - return nullptr; - } + MachineOperand &Src0 = MI.getOperand(Src0Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); - MachineInstr *CommutedMI = &MI; - if (!Src1.isReg()) { - // Allow commuting instructions with Imm operands. - if (NewMI || !Src1.isImm() || (!isVOP2(MI) && !isVOP3(MI))) { - return nullptr; + MachineInstr *CommutedMI = nullptr; + if (Src0.isReg() && Src1.isReg()) { + if (isOperandLegal(MI, Src1Idx, &Src0)) { + // Be sure to copy the source modifiers to the right place. 
+ CommutedMI + = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); } - // Be sure to copy the source modifiers to the right place. - if (MachineOperand *Src0Mods = - getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) { - MachineOperand *Src1Mods = - getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); - - int Src0ModsVal = Src0Mods->getImm(); - if (!Src1Mods && Src0ModsVal != 0) - return nullptr; - - // XXX - This assert might be a lie. It might be useful to have a neg - // modifier with 0.0. - int Src1ModsVal = Src1Mods->getImm(); - assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates"); - - Src1Mods->setImm(Src0ModsVal); - Src0Mods->setImm(Src1ModsVal); - } - - unsigned Reg = Src0.getReg(); - unsigned SubReg = Src0.getSubReg(); - if (Src1.isImm()) - Src0.ChangeToImmediate(Src1.getImm()); - else - llvm_unreachable("Should only have immediates"); - Src1.ChangeToRegister(Reg, false); - Src1.setSubReg(SubReg); + } else if (Src0.isReg() && !Src1.isReg()) { + // src0 should always be able to support any operand type, so no need to + // check operand legality. + CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); + } else if (!Src0.isReg() && Src1.isReg()) { + if (isOperandLegal(MI, Src1Idx, &Src0)) + CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); } else { - CommutedMI = - TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); + // FIXME: Found two non registers to commute. This does happen. + return nullptr; } - if (CommutedMI) + + if (CommutedMI) { + swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, + Src1, AMDGPU::OpName::src1_modifiers); + CommutedMI->setDesc(get(CommutedOpcode)); + } return CommutedMI; } @@ -1028,8 +1015,7 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, // TargetInstrInfo::commuteInstruction uses it. bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const { - const MCInstrDesc &MCID = MI.getDesc(); - if (!MCID.isCommutable()) + if (!MI.isCommutable()) return false; unsigned Opc = MI.getOpcode(); @@ -1037,34 +1023,135 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, if (Src0Idx == -1) return false; - // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on - // immediate. Also, immediate src0 operand is not handled in - // SIInstrInfo::commuteInstruction(); - if (!MI.getOperand(Src0Idx).isReg()) - return false; - int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return false; - MachineOperand &Src1 = MI.getOperand(Src1Idx); - if (Src1.isImm()) { - // SIInstrInfo::commuteInstruction() does support commuting the immediate - // operand src1 in 2 and 3 operand instructions. - if (!isVOP2(MI.getOpcode()) && !isVOP3(MI.getOpcode())) - return false; - } else if (Src1.isReg()) { - // If any source modifiers are set, the generic instruction commuting won't - // understand how to copy the source modifiers. - if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)) - return false; - } else - return false; - return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); } +bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, + int64_t BrOffset) const { + // BranchRelaxation should never have to check s_setpc_b64 because its dest + // block is unanalyzable. + assert(BranchOp != AMDGPU::S_SETPC_B64); + + // Convert to dwords. 
+ BrOffset /= 4; + + // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is + // from the next instruction. + BrOffset -= 1; + + return isIntN(BranchOffsetBits, BrOffset); +} + +MachineBasicBlock *SIInstrInfo::getBranchDestBlock( + const MachineInstr &MI) const { + if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { + // This would be a difficult analysis to perform, but can always be legal so + // there's no need to analyze it. + return nullptr; + } + + return MI.getOperand(0).getMBB(); +} + +unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, + MachineBasicBlock &DestBB, + const DebugLoc &DL, + int64_t BrOffset, + RegScavenger *RS) const { + assert(RS && "RegScavenger required for long branching"); + assert(MBB.empty() && + "new block should be inserted for expanding unconditional branch"); + assert(MBB.pred_size() == 1); + + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // FIXME: Virtual register workaround for RegScavenger not working with empty + // blocks. + unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + auto I = MBB.end(); + + // We need to compute the offset relative to the instruction immediately after + // s_getpc_b64. Insert pc arithmetic code before last terminator. + MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); + + // TODO: Handle > 32-bit block address. + if (BrOffset >= 0) { + BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) + .addReg(PCReg, RegState::Define, AMDGPU::sub0) + .addReg(PCReg, 0, AMDGPU::sub0) + .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD); + BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) + .addReg(PCReg, RegState::Define, AMDGPU::sub1) + .addReg(PCReg, 0, AMDGPU::sub1) + .addImm(0); + } else { + // Backwards branch. + BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) + .addReg(PCReg, RegState::Define, AMDGPU::sub0) + .addReg(PCReg, 0, AMDGPU::sub0) + .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD); + BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) + .addReg(PCReg, RegState::Define, AMDGPU::sub1) + .addReg(PCReg, 0, AMDGPU::sub1) + .addImm(0); + } + + // Insert the indirect branch after the other terminator. + BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) + .addReg(PCReg); + + // FIXME: If spilling is necessary, this will fail because this scavenger has + // no emergency stack slots. It is non-trivial to spill in this situation, + // because the restore code needs to be specially placed after the + // jump. BranchRelaxation then needs to be made aware of the newly inserted + // block. + // + // If a spill is needed for the pc register pair, we need to insert a spill + // restore block right before the destination block, and insert a short branch + // into the old destination block's fallthrough predecessor. + // e.g.: + // + // s_cbranch_scc0 skip_long_branch: + // + // long_branch_bb: + // spill s[8:9] + // s_getpc_b64 s[8:9] + // s_add_u32 s8, s8, restore_bb + // s_addc_u32 s9, s9, 0 + // s_setpc_b64 s[8:9] + // + // skip_long_branch: + // foo; + // + // ..... 
+ // + // dest_bb_fallthrough_predecessor: + // bar; + // s_branch dest_bb + // + // restore_bb: + // restore s[8:9] + // fallthrough dest_bb + /// + // dest_bb: + // buzz; + + RS->enterBasicBlockEnd(MBB); + unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass, + MachineBasicBlock::iterator(GetPC), 0); + MRI.replaceRegWith(PCReg, Scav); + MRI.clearVirtRegs(); + RS->setRegUsed(Scav); + + return 4 + 8 + 4 + 4; +} + unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { switch (Cond) { case SIInstrInfo::SCC_TRUE: @@ -1103,15 +1190,12 @@ SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { } } -bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const { - MachineBasicBlock::iterator I = MBB.getFirstTerminator(); - - if (I == MBB.end()) - return false; - +bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { if (I->getOpcode() == AMDGPU::S_BRANCH) { // Unconditional Branch TBB = I->getOperand(0).getMBB(); @@ -1124,6 +1208,7 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *CondBB = I->getOperand(0).getMBB(); Cond.push_back(MachineOperand::CreateImm(Pred)); + Cond.push_back(I->getOperand(1)); // Save the branch register. ++I; @@ -1142,29 +1227,81 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, return true; } -unsigned SIInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { +bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + MachineBasicBlock::iterator I = MBB.getFirstTerminator(); + if (I == MBB.end()) + return false; + + if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH) + return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); + + ++I; + + // TODO: Should be able to treat as fallthrough? + if (I == MBB.end()) + return true; + + if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify)) + return true; + + MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB(); + + // Specifically handle the case where the conditional branch is to the same + // destination as the mask branch. e.g. + // + // si_mask_branch BB8 + // s_cbranch_execz BB8 + // s_cbranch BB9 + // + // This is required to understand divergent loops which may need the branches + // to be relaxed. 
+ if (TBB != MaskBrDest || Cond.empty()) + return true; + + auto Pred = Cond[0].getImm(); + return (Pred != EXECZ && Pred != EXECNZ); +} + +unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, + int *BytesRemoved) const { MachineBasicBlock::iterator I = MBB.getFirstTerminator(); unsigned Count = 0; + unsigned RemovedSize = 0; while (I != MBB.end()) { MachineBasicBlock::iterator Next = std::next(I); + if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) { + I = Next; + continue; + } + + RemovedSize += getInstSizeInBytes(*I); I->eraseFromParent(); ++Count; I = Next; } + if (BytesRemoved) + *BytesRemoved = RemovedSize; + return Count; } -unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB, +unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, - const DebugLoc &DL) const { + const DebugLoc &DL, + int *BytesAdded) const { if (!FBB && Cond.empty()) { BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) .addMBB(TBB); + if (BytesAdded) + *BytesAdded = 4; return 1; } @@ -1174,24 +1311,42 @@ unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB, = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); if (!FBB) { - BuildMI(&MBB, DL, get(Opcode)) + Cond[1].isUndef(); + MachineInstr *CondBr = + BuildMI(&MBB, DL, get(Opcode)) .addMBB(TBB); + + // Copy the flags onto the implicit condition register operand. + MachineOperand &CondReg = CondBr->getOperand(1); + CondReg.setIsUndef(Cond[1].isUndef()); + CondReg.setIsKill(Cond[1].isKill()); + + if (BytesAdded) + *BytesAdded = 4; return 1; } assert(TBB && FBB); - BuildMI(&MBB, DL, get(Opcode)) + MachineInstr *CondBr = + BuildMI(&MBB, DL, get(Opcode)) .addMBB(TBB); BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) .addMBB(FBB); + MachineOperand &CondReg = CondBr->getOperand(1); + CondReg.setIsUndef(Cond[1].isUndef()); + CondReg.setIsKill(Cond[1].isKill()); + + if (BytesAdded) + *BytesAdded = 8; + return 2; } -bool SIInstrInfo::ReverseBranchCondition( +bool SIInstrInfo::reverseBranchCondition( SmallVectorImpl<MachineOperand> &Cond) const { - assert(Cond.size() == 1); + assert(Cond.size() == 2); Cond[0].setImm(-Cond[0].getImm()); return false; } @@ -1210,15 +1365,43 @@ static void removeModOperands(MachineInstr &MI) { MI.RemoveOperand(Src0ModIdx); } -// TODO: Maybe this should be removed this and custom fold everything in -// SIFoldOperands? bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg, MachineRegisterInfo *MRI) const { if (!MRI->hasOneNonDBGUse(Reg)) return false; unsigned Opc = UseMI.getOpcode(); - if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) { + if (Opc == AMDGPU::COPY) { + bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); + switch (DefMI.getOpcode()) { + default: + return false; + case AMDGPU::S_MOV_B64: + // TODO: We could fold 64-bit immediates, but this get compilicated + // when there are sub-registers. + return false; + + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::S_MOV_B32: + break; + } + unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; + const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); + assert(ImmOp); + // FIXME: We could handle FrameIndex values here. 
+ if (!ImmOp->isImm()) { + return false; + } + UseMI.setDesc(get(NewOpc)); + UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); + UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); + return true; + } + + if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) { + bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; + // Don't fold if we are using source modifiers. The new VOP2 instructions // don't have them. if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) || @@ -1232,14 +1415,16 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // If this is a free constant, there's no reason to do this. // TODO: We could fold this here instead of letting SIFoldOperands do it // later. - if (isInlineConstant(ImmOp, 4)) + MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); + + // Any src operand can be used for the legality check. + if (isInlineConstant(UseMI, *Src0, ImmOp)) return false; - MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); - // Multiplied part is the constant: Use v_madmk_f32 + // Multiplied part is the constant: Use v_madmk_{f16, f32}. // We should only expect these to be on src0 due to canonicalizations. if (Src0->isReg() && Src0->getReg() == Reg) { if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) @@ -1267,15 +1452,15 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Src0->setSubReg(Src1SubReg); Src0->setIsKill(Src1->isKill()); - if (Opc == AMDGPU::V_MAC_F32_e64) { + if (Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_MAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); - } Src1->ChangeToImmediate(Imm); removeModOperands(UseMI); - UseMI.setDesc(get(AMDGPU::V_MADMK_F32)); + UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -1284,7 +1469,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, return true; } - // Added part is the constant: Use v_madak_f32 + // Added part is the constant: Use v_madak_{f16, f32}. if (Src2->isReg() && Src2->getReg() == Reg) { // Not allowed to use constant bus for another operand. // We can however allow an inline immediate as src0. @@ -1306,17 +1491,17 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, UseMI.RemoveOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); - if (Opc == AMDGPU::V_MAC_F32_e64) { + if (Opc == AMDGPU::V_MAC_F32_e64 || + Opc == AMDGPU::V_MAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); - } // ChangingToImmediate adds Src2 back to the instruction. Src2->ChangeToImmediate(Imm); // These come before src2. removeModOperands(UseMI); - UseMI.setDesc(get(AMDGPU::V_MADAK_F32)); + UseMI.setDesc(get(IsF32 ? 
AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) @@ -1375,6 +1560,17 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; + if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) { + const MachineMemOperand *MMOa = *MIa.memoperands_begin(); + const MachineMemOperand *MMOb = *MIb.memoperands_begin(); + if (MMOa->getValue() && MMOb->getValue()) { + MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo()); + MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo()); + if (!AA->alias(LocA, LocB)) + return true; + } + } + // TODO: Should we check the address space from the MachineMemOperand? That // would allow us to distinguish objects we know don't alias based on the // underlying address space, even if it was lowered to a different one, @@ -1414,15 +1610,22 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, MachineInstr &MI, LiveVariables *LV) const { + bool IsF16 = false; switch (MI.getOpcode()) { default: return nullptr; + case AMDGPU::V_MAC_F16_e64: + IsF16 = true; case AMDGPU::V_MAC_F32_e64: break; + case AMDGPU::V_MAC_F16_e32: + IsF16 = true; case AMDGPU::V_MAC_F32_e32: { - const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); - if (Src0->isImm() && !isInlineConstant(*Src0, 4)) + int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::src0); + const MachineOperand *Src0 = &MI.getOperand(Src0Idx); + if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) return nullptr; break; } @@ -1433,7 +1636,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); - return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32)) + return BuildMI(*MBB, MI, MI.getDebugLoc(), + get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) .addOperand(*Dst) .addImm(0) // Src0 mods .addOperand(*Src0) @@ -1445,6 +1649,20 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, .addImm(0); // omod } +// It's not generally safe to move VALU instructions across these since it will +// start using the register as a base index rather than directly. +// XXX - Why isn't hasSideEffects sufficient for these? +static bool changesVGPRIndexingMode(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AMDGPU::S_SET_GPR_IDX_ON: + case AMDGPU::S_SET_GPR_IDX_MODE: + case AMDGPU::S_SET_GPR_IDX_OFF: + return true; + default: + return false; + } +} + bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const { @@ -1454,67 +1672,78 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, // when they operate on VGPRs. Treating EXEC modifications as scheduling // boundaries prevents incorrect movements of such instructions. 
return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || - MI.modifiesRegister(AMDGPU::EXEC, &RI); + MI.modifiesRegister(AMDGPU::EXEC, &RI) || + MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || + MI.getOpcode() == AMDGPU::S_SETREG_B32 || + changesVGPRIndexingMode(MI); } bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { - int64_t SVal = Imm.getSExtValue(); - if (SVal >= -16 && SVal <= 64) - return true; - - if (Imm.getBitWidth() == 64) { - uint64_t Val = Imm.getZExtValue(); - return (DoubleToBits(0.0) == Val) || - (DoubleToBits(1.0) == Val) || - (DoubleToBits(-1.0) == Val) || - (DoubleToBits(0.5) == Val) || - (DoubleToBits(-0.5) == Val) || - (DoubleToBits(2.0) == Val) || - (DoubleToBits(-2.0) == Val) || - (DoubleToBits(4.0) == Val) || - (DoubleToBits(-4.0) == Val); - } - - // The actual type of the operand does not seem to matter as long - // as the bits match one of the inline immediate values. For example: - // - // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, - // so it is a legal inline immediate. - // - // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in - // floating-point, so it is a legal inline immediate. - uint32_t Val = Imm.getZExtValue(); - - return (FloatToBits(0.0f) == Val) || - (FloatToBits(1.0f) == Val) || - (FloatToBits(-1.0f) == Val) || - (FloatToBits(0.5f) == Val) || - (FloatToBits(-0.5f) == Val) || - (FloatToBits(2.0f) == Val) || - (FloatToBits(-2.0f) == Val) || - (FloatToBits(4.0f) == Val) || - (FloatToBits(-4.0f) == Val); + switch (Imm.getBitWidth()) { + case 32: + return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(), + ST.hasInv2PiInlineImm()); + case 64: + return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), + ST.hasInv2PiInlineImm()); + case 16: + return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), + ST.hasInv2PiInlineImm()); + default: + llvm_unreachable("invalid bitwidth"); + } } bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, - unsigned OpSize) const { - if (MO.isImm()) { - // MachineOperand provides no way to tell the true operand size, since it - // only records a 64-bit value. We need to know the size to determine if a - // 32-bit floating point immediate bit pattern is legal for an integer - // immediate. It would be for any 32-bit integer operand, but would not be - // for a 64-bit one. + uint8_t OperandType) const { + if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET) + return false; - unsigned BitSize = 8 * OpSize; - return isInlineConstant(APInt(BitSize, MO.getImm(), true)); - } + // MachineOperand provides no way to tell the true operand size, since it only + // records a 64-bit value. We need to know the size to determine if a 32-bit + // floating point immediate bit pattern is legal for an integer immediate. It + // would be for any 32-bit integer operand, but would not be for a 64-bit one. 
+ + int64_t Imm = MO.getImm(); + switch (operandBitWidth(OperandType)) { + case 32: { + int32_t Trunc = static_cast<int32_t>(Imm); + return Trunc == Imm && + AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); + } + case 64: { + return AMDGPU::isInlinableLiteral64(MO.getImm(), + ST.hasInv2PiInlineImm()); + } + case 16: { + if (isInt<16>(Imm) || isUInt<16>(Imm)) { + int16_t Trunc = static_cast<int16_t>(Imm); + return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); + } - return false; + return false; + } + default: + llvm_unreachable("invalid bitwidth"); + } } -bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, - unsigned OpSize) const { - return MO.isImm() && !isInlineConstant(MO, OpSize); +bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, + const MCOperandInfo &OpInfo) const { + switch (MO.getType()) { + case MachineOperand::MO_Register: + return false; + case MachineOperand::MO_Immediate: + return !isInlineConstant(MO, OpInfo); + case MachineOperand::MO_FrameIndex: + case MachineOperand::MO_MachineBasicBlock: + case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_MCSymbol: + return true; + default: + llvm_unreachable("unexpected operand type"); + } } static bool compareMachineOp(const MachineOperand &Op0, @@ -1544,11 +1773,10 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, if (OpInfo.RegClass < 0) return false; - unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); - if (isLiteralConstant(MO, OpSize)) - return RI.opCanUseLiteralConstant(OpInfo.OperandType); + if (MO.isImm() && isInlineConstant(MO, OpInfo)) + return RI.opCanUseInlineConstant(OpInfo.OperandType); - return RI.opCanUseInlineConstant(OpInfo.OperandType); + return RI.opCanUseLiteralConstant(OpInfo.OperandType); } bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { @@ -1575,12 +1803,17 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, - unsigned OpSize) const { + const MCOperandInfo &OpInfo) const { // Literal constants use the constant bus. - if (isLiteralConstant(MO, OpSize)) - return true; + //if (isLiteralConstantLike(MO, OpInfo)) + // return true; + if (MO.isImm()) + return !isInlineConstant(MO, OpInfo); + + if (!MO.isReg()) + return true; // Misc other operands like FrameIndex - if (!MO.isReg() || !MO.isUse()) + if (!MO.isUse()) return false; if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) @@ -1644,6 +1877,16 @@ static bool shouldReadExec(const MachineInstr &MI) { return true; } +static bool isSubRegOf(const SIRegisterInfo &TRI, + const MachineOperand &SuperVec, + const MachineOperand &SubReg) { + if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg())) + return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg()); + + return SubReg.getSubReg() != AMDGPU::NoSubRegister && + SubReg.getReg() == SuperVec.getReg(); +} + bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { uint16_t Opcode = MI.getOpcode(); @@ -1660,6 +1903,28 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } + if (MI.isInlineAsm()) { + // Verify register classes for inlineasm constraints. 
+ for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands(); + I != E; ++I) { + const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI); + if (!RC) + continue; + + const MachineOperand &Op = MI.getOperand(I); + if (!Op.isReg()) + continue; + + unsigned Reg = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) { + ErrInfo = "inlineasm operand has incorrect register class."; + return false; + } + } + + return true; + } + // Make sure the register classes are correct. for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { if (MI.getOperand(i).isFPImm()) { @@ -1677,15 +1942,22 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } break; - case AMDGPU::OPERAND_REG_IMM32: + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: break; - case AMDGPU::OPERAND_REG_INLINE_C: - if (isLiteralConstant(MI.getOperand(i), - RI.getRegClass(RegClass)->getSize())) { + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: { + const MachineOperand &MO = MI.getOperand(i); + if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { ErrInfo = "Illegal immediate value for operand."; return false; } break; + } case MCOI::OPERAND_IMMEDIATE: case AMDGPU::OPERAND_KIMM32: // Check if this operand is an immediate. @@ -1695,7 +1967,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, ErrInfo = "Expected immediate, but got non-immediate"; return false; } - // Fall-through + LLVM_FALLTHROUGH; default: continue; } @@ -1737,7 +2009,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (OpIdx == -1) break; const MachineOperand &MO = MI.getOperand(OpIdx); - if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { + if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { if (MO.isReg()) { if (MO.getReg() != SGPRUsed) ++ConstantBusCount; @@ -1768,6 +2040,65 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (isSOPK(MI)) { + int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); + if (sopkIsZext(MI)) { + if (!isUInt<16>(Imm)) { + ErrInfo = "invalid immediate for SOPK instruction"; + return false; + } + } else { + if (!isInt<16>(Imm)) { + ErrInfo = "invalid immediate for SOPK instruction"; + return false; + } + } + } + + if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 || + Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 || + Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || + Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) { + const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 || + Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64; + + const unsigned StaticNumOps = Desc.getNumOperands() + + Desc.getNumImplicitUses(); + const unsigned NumImplicitOps = IsDst ? 2 : 1; + + // Allow additional implicit operands. This allows a fixup done by the post + // RA scheduler where the main implicit operand is killed and implicit-defs + // are added for sub-registers that remain live after this instruction. 
+ if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) { + ErrInfo = "missing implicit register operands"; + return false; + } + + const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); + if (IsDst) { + if (!Dst->isUse()) { + ErrInfo = "v_movreld_b32 vdst should be a use operand"; + return false; + } + + unsigned UseOpIdx; + if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) || + UseOpIdx != StaticNumOps + 1) { + ErrInfo = "movrel implicit operands should be tied"; + return false; + } + } + + const MachineOperand &Src0 = MI.getOperand(Src0Idx); + const MachineOperand &ImpUse + = MI.getOperand(StaticNumOps + NumImplicitOps - 1); + if (!ImpUse.isReg() || !ImpUse.isUse() || + !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) { + ErrInfo = "src0 should be subreg of implicit vector use"; + return false; + } + } + // Make sure we aren't losing exec uses in the td files. This mostly requires // being careful when using let Uses to try to add other use registers. if (shouldReadExec(MI)) { @@ -1777,6 +2108,18 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (isSMRD(MI)) { + if (MI.mayStore()) { + // The register offset form of scalar stores may only use m0 as the + // soffset register. + const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff); + if (Soff && Soff->getReg() != AMDGPU::M0) { + ErrInfo = "scalar stores must use m0 as offset register"; + return false; + } + } + } + return true; } @@ -1797,13 +2140,13 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; - case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32; - case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32; - case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32; - case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32; - case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32; - case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32; - case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32; + case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; + case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; + case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; + case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; + case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; + case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; + case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64; case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32; case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64; case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32; @@ -1830,6 +2173,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; + case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32; + case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32; case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; @@ -1937,11 +2282,10 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm( unsigned SubIdx, const TargetRegisterClass *SubRC) const { if (Op.isImm()) { - // XXX - Is there a better way to do this? 
if (SubIdx == AMDGPU::sub0) - return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF); + return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); if (SubIdx == AMDGPU::sub1) - return MachineOperand::CreateImm(Op.getImm() >> 32); + return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32)); llvm_unreachable("Unhandled register index for immediate"); } @@ -1978,8 +2322,8 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, // In order to be legal, the common sub-class must be equal to the // class of the current operand. For example: // - // v_mov_b32 s0 ; Operand defined as vsrc_32 - // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // v_mov_b32 s0 ; Operand defined as vsrc_b32 + // ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL // // s_sendmsg 0, s0 ; Operand defined as m0reg // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL @@ -2008,7 +2352,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, if (!MO) MO = &MI.getOperand(OpIdx); - if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) { + if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { RegSubRegPair SGPRUsed; if (MO->isReg()) @@ -2020,7 +2364,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand &Op = MI.getOperand(i); if (Op.isReg()) { if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && - usesConstantBus(MRI, Op, getOpSize(MI, i))) { + usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { return false; } } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { @@ -2202,6 +2546,39 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, } } +void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, + MachineBasicBlock::iterator I, + const TargetRegisterClass *DstRC, + MachineOperand &Op, + MachineRegisterInfo &MRI, + const DebugLoc &DL) const { + + unsigned OpReg = Op.getReg(); + unsigned OpSubReg = Op.getSubReg(); + + const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg( + RI.getRegClassForReg(MRI, OpReg), OpSubReg); + + // Check if operand is already the correct register class. + if (DstRC == OpRC) + return; + + unsigned DstReg = MRI.createVirtualRegister(DstRC); + MachineInstr *Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg) + .addOperand(Op); + + Op.setReg(DstReg); + Op.setSubReg(0); + + MachineInstr *Def = MRI.getVRegDef(OpReg); + if (!Def) + return; + + // Try to eliminate the copy if it is copying an immediate value. + if (Def->isMoveImmediate()) + FoldImmediate(*Copy, *Def, OpReg, &MRI); +} + void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { MachineFunction &MF = *MI.getParent()->getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -2260,15 +2637,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; - unsigned DstReg = MRI.createVirtualRegister(RC); // MI is a PHI instruction. MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); - BuildMI(*InsertBB, Insert, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) - .addOperand(Op); - Op.setReg(DstReg); + // Avoid creating no-op copies with the same src and dst reg class. These + // confuse some of the machine passes. 
+ legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc()); } } @@ -2292,12 +2668,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { if (VRC == OpRC) continue; - unsigned DstReg = MRI.createVirtualRegister(VRC); - - BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) - .addOperand(Op); - - Op.setReg(DstReg); + legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc()); Op.setIsKill(); } } @@ -2313,11 +2684,9 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); if (DstRC != Src0RC) { - MachineBasicBlock &MBB = *MI.getParent(); - unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); - BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), NewSrc0) - .addReg(Src0); - MI.getOperand(1).setReg(NewSrc0); + MachineBasicBlock *MBB = MI.getParent(); + MachineOperand &Op = MI.getOperand(1); + legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc()); } return; } @@ -2664,6 +3033,22 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { continue; unsigned DstReg = Inst.getOperand(0).getReg(); + if (Inst.isCopy() && + TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) && + NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { + // Instead of creating a copy where src and dst are the same register + // class, we just replace all uses of dst with src. These kinds of + // copies interfere with the heuristics MachineSink uses to decide + // whether or not to split a critical edge. Since the pass assumes + // that copies will end up as machine instructions and not be + // eliminated. + addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); + MRI.clearKillFlags(Inst.getOperand(1).getReg()); + Inst.getOperand(0).setReg(DstReg); + continue; + } + NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); } @@ -2927,10 +3312,16 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( MachineRegisterInfo &MRI, SmallVectorImpl<MachineInstr *> &Worklist) const { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), - E = MRI.use_end(); I != E; ++I) { + E = MRI.use_end(); I != E;) { MachineInstr &UseMI = *I->getParent(); if (!canReadVGPR(UseMI, I.getOperandNo())) { Worklist.push_back(&UseMI); + + do { + ++I; + } while (I != E && I->getParent() == &UseMI); + } else { + ++I; } } } @@ -3098,6 +3489,56 @@ bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const { return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); } +unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, + int &FrameIndex) const { + const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr); + if (!Addr || !Addr->isFI()) + return AMDGPU::NoRegister; + + assert(!MI.memoperands_empty() && + (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); + + FrameIndex = Addr->getIndex(); + return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); +} + +unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, + int &FrameIndex) const { + const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr); + assert(Addr && Addr->isFI()); + FrameIndex = Addr->getIndex(); + return getNamedOperand(MI, AMDGPU::OpName::data)->getReg(); +} + +unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, + int &FrameIndex) const { + + if (!MI.mayLoad()) + return AMDGPU::NoRegister; + + if 
(isMUBUF(MI) || isVGPRSpill(MI)) + return isStackAccess(MI, FrameIndex); + + if (isSGPRSpill(MI)) + return isSGPRStackAccess(MI, FrameIndex); + + return AMDGPU::NoRegister; +} + +unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, + int &FrameIndex) const { + if (!MI.mayStore()) + return AMDGPU::NoRegister; + + if (isMUBUF(MI) || isVGPRSpill(MI)) + return isStackAccess(MI, FrameIndex); + + if (isSGPRSpill(MI)) + return isSGPRStackAccess(MI, FrameIndex); + + return AMDGPU::NoRegister; +} + unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); @@ -3105,32 +3546,45 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { // If we have a definitive size, we can use it. Otherwise we need to inspect // the operands to know the size. - if (DescSize == 8 || DescSize == 4) + // + // FIXME: Instructions that have a base 32-bit encoding report their size as + // 4, even though they are really 8 bytes if they have a literal operand. + if (DescSize != 0 && DescSize != 4) return DescSize; - assert(DescSize == 0); + if (Opc == AMDGPU::WAVE_BARRIER) + return 0; // 4-byte instructions may have a 32-bit literal encoded after them. Check // operands that coud ever be literals. if (isVALU(MI) || isSALU(MI)) { + if (isFixedSize(MI)) { + assert(DescSize == 4); + return DescSize; + } + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) return 4; // No operands. - if (isLiteralConstant(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx))) + if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx])) return 8; int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return 4; - if (isLiteralConstant(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx))) + if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx])) return 8; return 4; } + if (DescSize == 4) + return 4; + switch (Opc) { + case AMDGPU::SI_MASK_BRANCH: case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: case TargetOpcode::DBG_VALUE: @@ -3147,6 +3601,20 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { } } +bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { + if (!isFLAT(MI)) + return false; + + if (MI.memoperands_empty()) + return true; + + for (const MachineMemOperand *MMO : MI.memoperands()) { + if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) + return true; + } + return false; +} + ArrayRef<std::pair<int, const char *>> SIInstrInfo::getSerializableTargetIndices() const { static const std::pair<int, const char *> TargetIndices[] = { |
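The addUsersToMoveToVALUWorklist change above switches to manual iterator
advancement so that an instruction using the register in several operands is
pushed onto the worklist only once: after queuing it, the loop skips past the
remaining uses that belong to the same instruction. A minimal standalone
sketch of that pattern follows; it assumes the use list keeps a given
instruction's uses adjacent (as the real loop does) and uses a hypothetical
needsMove() predicate in place of !canReadVGPR(). The types are simplified
stand-ins, not MachineRegisterInfo.

#include <cassert>
#include <vector>

struct Use { int UserInstr; int OperandNo; };

static bool needsMove(int /*UserInstr*/) { return true; } // stand-in predicate

std::vector<int> collectUsers(const std::vector<Use> &Uses) {
  std::vector<int> Worklist;
  for (auto I = Uses.begin(), E = Uses.end(); I != E;) {
    int User = I->UserInstr;
    if (needsMove(User)) {
      Worklist.push_back(User);
      // Skip every remaining use operand of the instruction just queued so it
      // is not pushed a second time.
      do {
        ++I;
      } while (I != E && I->UserInstr == User);
    } else {
      ++I;
    }
  }
  return Worklist;
}

int main() {
  // Instruction 7 reads the register in two operands but is queued only once.
  std::vector<Use> Uses = {{3, 0}, {7, 1}, {7, 2}, {9, 0}};
  assert((collectUsers(Uses) == std::vector<int>{3, 7, 9}));
  return 0;
}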
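The getInstSizeInBytes hunk above works around descriptors that report a
4-byte size even when a trailing 32-bit literal makes the real encoding 8
bytes, by inspecting src0/src1 for literal-like operands. The sketch below
shows only that literal rule, under stated assumptions: OperandKind,
canEncodeInline, and the -16..64 range are illustrative stand-ins for the real
operand-info tables, which also treat certain floating-point bit patterns as
inline constants and special-case fixed-size and meta instructions.

#include <cassert>
#include <cstdint>
#include <optional>

enum class OperandKind { Reg, Imm };
struct Operand { OperandKind Kind; int64_t Imm = 0; };

static bool canEncodeInline(int64_t Imm) {
  // Simplified inline-constant test; the hardware also accepts selected
  // floating-point values, which are omitted here.
  return Imm >= -16 && Imm <= 64;
}

static unsigned instSizeBytes(const std::optional<Operand> &Src0,
                              const std::optional<Operand> &Src1) {
  for (const auto &Src : {Src0, Src1})
    if (Src && Src->Kind == OperandKind::Imm && !canEncodeInline(Src->Imm))
      return 8;      // base encoding plus a trailing 32-bit literal
  return 4;          // base encoding only
}

int main() {
  assert(instSizeBytes(Operand{OperandKind::Reg}, std::nullopt) == 4);
  assert(instSizeBytes(Operand{OperandKind::Imm, 7}, std::nullopt) == 4);
  assert(instSizeBytes(Operand{OperandKind::Imm, 0x12345678}, std::nullopt) == 8);
  return 0;
}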