path: root/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
author     dim <dim@FreeBSD.org>    2015-12-30 13:13:10 +0000
committer  dim <dim@FreeBSD.org>    2015-12-30 13:13:10 +0000
commit     9b5bf5c4f53d65d6a48722d7410ed7cb15f5ba3a (patch)
tree       b466a4817f79516eb1df8eae92bccf62ecc84003 /contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
parent     f09a28d1de99fda4f5517fb12670fc36552f4927 (diff)
parent     e194cd6d03d91631334d9d5e55b506036f423cc8 (diff)
Update llvm to trunk r256633.
Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp')
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 271
1 file changed, 192 insertions(+), 79 deletions(-)
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index e9e8412..3cdffef 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-
#include "SIRegisterInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
@@ -33,6 +32,40 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
Reserved.set(*R);
}
+unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
+ const MachineFunction &MF) const {
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ if (ST.hasSGPRInitBug()) {
+ unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4;
+ unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+ }
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the
+ // next sgpr128 down.
+ return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
+ }
+
+ return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
+}
+
+unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
+ const MachineFunction &MF) const {
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ if (ST.hasSGPRInitBug()) {
+ unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5;
+ return AMDGPU::SGPR_32RegClass.getRegister(Idx);
+ }
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // Next register before reservations for flat_scr and vcc.
+ return AMDGPU::SGPR97;
+ }
+
+ return AMDGPU::SGPR95;
+}
+
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
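For the reservedPrivateSegment*Reg helpers added above, the SGPR-init-bug path works out as follows. This is a worked example only: it assumes FIXED_SGPR_COUNT_FOR_INIT_BUG is 80 (its value in this revision of AMDGPUSubtarget.h) and that SGPR_32's register order starts at SGPR0, so the concrete register names are illustrative.

  // Buffer resource:  BaseIdx = 80 - 4 - 4 = 72
  //   SGPR_32RegClass.getRegister(72)                     -> SGPR72
  //   getMatchingSuperReg(SGPR72, sub0, SReg_128RegClass) -> SGPR72_SGPR73_SGPR74_SGPR75
  // Wave byte offset: Idx = 80 - 4 - 5 = 71               -> SGPR71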
@@ -42,13 +75,22 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::EXEC);
reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
- // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs
- reserveRegisterTuples(Reserved, AMDGPU::VGPR254);
- reserveRegisterTuples(Reserved, AMDGPU::VGPR255);
+ // Reserve the last 2 registers so we will always have at least 2 more that
+ // will physically contain VCC.
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103);
+
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
+ // for VCC/FLAT_SCR.
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
+ }
// Tonga and Iceland can only allocate a fixed number of SGPRs due
// to a hw bug.
- if (MF.getSubtarget<AMDGPUSubtarget>().hasSGPRInitBug()) {
+ if (ST.hasSGPRInitBug()) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
// Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs).
// Assume XNACK_MASK is unused.
@@ -60,34 +102,57 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
}
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+ if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
+ // Reserve 1 SGPR for scratch wave offset in case we need to spill.
+ reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
+ }
+
+ unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
+ if (ScratchRSrcReg != AMDGPU::NoRegister) {
+ // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
+ // to spill.
+ // TODO: May need to reserve a VGPR if doing LDS spilling.
+ reserveRegisterTuples(Reserved, ScratchRSrcReg);
+ assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
+ }
+
return Reserved;
}
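As a summary of the cases handled above (no additional behavior), the reservations on a VI-class target without the SGPR init bug come out to:

  // Always reserved:   INDIRECT_BASE_ADDR, EXEC, FLAT_SCR, SGPR102_SGPR103
  // VI and later:      SGPR98_SGPR99 and SGPR100_SGPR101 (shifted down for FLAT_SCR/VCC)
  // If already chosen: MFI->getScratchWaveOffsetReg() and MFI->getScratchRSrcReg()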
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const {
-
const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>();
// FIXME: We should adjust the max number of waves based on LDS size.
unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(),
STI.getMaxWavesPerCU());
unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU());
+ unsigned VSLimit = SGPRLimit + VGPRLimit;
+
for (regclass_iterator I = regclass_begin(), E = regclass_end();
I != E; ++I) {
+ const TargetRegisterClass *RC = *I;
- unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1);
+ unsigned NumSubRegs = std::max((int)RC->getSize() / 4, 1);
unsigned Limit;
- if (isSGPRClass(*I)) {
+ if (isPseudoRegClass(RC)) {
+ // FIXME: This is a hack. We should never be considering the pressure of
+ // these since no virtual register should ever have this class.
+ Limit = VSLimit;
+ } else if (isSGPRClass(RC)) {
Limit = SGPRLimit / NumSubRegs;
} else {
Limit = VGPRLimit / NumSubRegs;
}
- const int *Sets = getRegClassPressureSets(*I);
+ const int *Sets = getRegClassPressureSets(RC);
assert(Sets);
for (unsigned i = 0; Sets[i] != -1; ++i) {
- if (Sets[i] == (int)Idx)
+ if (Sets[i] == (int)Idx)
return Limit;
}
}
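A worked example of the per-class limit computed in the loop above, where SGPRLimit and VGPRLimit are whatever getNumSGPRsAllowed/getNumVGPRsAllowed return for the subtarget:

  // 64-bit SGPR class: RC->getSize() == 8, NumSubRegs = max(8/4, 1) = 2, Limit = SGPRLimit / 2
  // 32-bit VGPR class: RC->getSize() == 4, NumSubRegs = 1,               Limit = VGPRLimit
  // Pseudo VS classes: Limit = VSLimit = SGPRLimit + VGPRLimit (see the FIXME above)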
@@ -174,17 +239,17 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
unsigned SubReg = NumSubRegs > 1 ?
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
Value;
- bool IsKill = (i == e - 1);
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
- .addReg(SubReg, getDefRegState(IsLoad))
- .addReg(ScratchRsrcReg, getKillRegState(IsKill))
- .addReg(SOffset)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addReg(Value, RegState::Implicit | getDefRegState(IsLoad));
+ .addReg(SubReg, getDefRegState(IsLoad))
+ .addReg(ScratchRsrcReg)
+ .addReg(SOffset)
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addReg(Value, RegState::Implicit | getDefRegState(IsLoad))
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
}
}
@@ -228,6 +293,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
.addReg(SubReg)
.addImm(Spill.Lane);
+ // FIXME: Since this spills to another register instead of an actual
+ // frame index, we should delete the frame index when all references to
+ // it are fixed.
}
MI->eraseFromParent();
break;
@@ -263,16 +331,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// TODO: only do this when it is needed
switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) {
case AMDGPUSubtarget::SOUTHERN_ISLANDS:
- // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI
- TII->insertNOPs(MI, 3);
+ // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states
+ // ("S_NOP 3") on SI
+ TII->insertWaitStates(MI, 4);
break;
case AMDGPUSubtarget::SEA_ISLANDS:
break;
default: // VOLCANIC_ISLANDS and later
- // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI
- // and later. This also applies to VALUs which write VCC, but we're
- // unlikely to see VMEM use VCC.
- TII->insertNOPs(MI, 4);
+ // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states
+ // ("S_NOP 4") on VI and later. This also applies to VALUs which write
+ // VCC, but we're unlikely to see VMEM use VCC.
+ TII->insertWaitStates(MI, 5);
}
MI->eraseFromParent();
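The S_NOP immediate encodes one fewer than the number of wait states it provides, which is why the hazard sizes above map to "S_NOP 3" and "S_NOP 4". A minimal illustration of that count-to-immediate relationship (not the actual SIInstrInfo::insertWaitStates implementation):

  // insertWaitStates(MI, 4)  ->  S_NOP 3   (SI: VALU writes SGPR, SMRD reads it)
  // insertWaitStates(MI, 5)  ->  S_NOP 4   (VI+: VALU writes SGPR, VMEM reads it)
  // i.e. S_NOP immediate = wait states - 1, for counts small enough to fit one S_NOP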
@@ -322,22 +391,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
}
-const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
- MVT VT) const {
- switch(VT.SimpleTy) {
- default:
- case MVT::i32: return &AMDGPU::VGPR_32RegClass;
- }
-}
-
unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
return getEncodingValue(Reg) & 0xff;
}
+// FIXME: This is very slow. It might be worth creating a map from physreg to
+// register class.
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
assert(!TargetRegisterInfo::isVirtualRegister(Reg));
- static const TargetRegisterClass *BaseClasses[] = {
+ static const TargetRegisterClass *const BaseClasses[] = {
&AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass,
&AMDGPU::VReg_64RegClass,
@@ -359,33 +422,45 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
return nullptr;
}
+// TODO: It might be helpful to have some target specific flags in
+// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
- return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) ||
- getCommonSubClass(&AMDGPU::VReg_512RegClass, RC);
+ switch (RC->getSize()) {
+ case 4:
+ return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
+ case 8:
+ return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
+ case 12:
+ return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
+ case 16:
+ return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
+ case 32:
+ return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
+ case 64:
+ return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
+ default:
+ llvm_unreachable("Invalid register class size");
+ }
}
const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
const TargetRegisterClass *SRC) const {
- if (hasVGPRs(SRC)) {
- return SRC;
- } else if (SRC == &AMDGPU::SCCRegRegClass) {
- return &AMDGPU::VCCRegRegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) {
- return &AMDGPU::VGPR_32RegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) {
- return &AMDGPU::VReg_64RegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) {
- return &AMDGPU::VReg_128RegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) {
- return &AMDGPU::VReg_256RegClass;
- } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) {
- return &AMDGPU::VReg_512RegClass;
- }
- return nullptr;
+ switch (SRC->getSize()) {
+ case 4:
+ return &AMDGPU::VGPR_32RegClass;
+ case 8:
+ return &AMDGPU::VReg_64RegClass;
+ case 12:
+ return &AMDGPU::VReg_96RegClass;
+ case 16:
+ return &AMDGPU::VReg_128RegClass;
+ case 32:
+ return &AMDGPU::VReg_256RegClass;
+ case 64:
+ return &AMDGPU::VReg_512RegClass;
+ default:
+ llvm_unreachable("Invalid register class size");
+ }
}
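A usage sketch of the size-based mapping above; the caller context is hypothetical, and only getEquivalentVGPRClass and the register class names come from this file:

  // Hypothetical caller: widen a scalar class to the VGPR class of the same size.
  const TargetRegisterClass *SRC = &AMDGPU::SReg_64RegClass;         // getSize() == 8
  const TargetRegisterClass *VRC = TRI->getEquivalentVGPRClass(SRC); // &AMDGPU::VReg_64RegClass
  // An SReg_128 input (size 16) would map to VReg_128 the same way.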
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
@@ -402,6 +477,30 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
}
}
+bool SIRegisterInfo::shouldRewriteCopySrc(
+ const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const {
+ // We want to prefer the smallest register class possible, so we don't want to
+ // stop and rewrite on anything that looks like a subregister
+ // extract. Operations mostly don't care about the super register class, so we
+ // only want to stop on the most basic of copies between the same register
+ // class.
+ //
+ // e.g. if we have something like
+ // vreg0 = ...
+ // vreg1 = ...
+ // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
+ // vreg3 = COPY vreg2, sub0
+ //
+ // We want to look through the COPY to find:
+ // => vreg3 = COPY vreg0
+
+ // Plain copy.
+ return getCommonSubClass(DefRC, SrcRC) != nullptr;
+}
+
unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
const TargetRegisterClass *SubRC,
unsigned Channel) const {
@@ -462,30 +561,47 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
return OpType == AMDGPU::OPERAND_REG_INLINE_C;
}
+// FIXME: Most of these are flexible with HSA and we don't need to reserve them
+// as input registers if unused. Whether the dispatch ptr is necessary should be
+// easy to detect from used intrinsics. Scratch setup is harder to know.
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ (void)ST;
switch (Value) {
- case SIRegisterInfo::TGID_X:
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0);
- case SIRegisterInfo::TGID_Y:
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1);
- case SIRegisterInfo::TGID_Z:
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
- case SIRegisterInfo::SCRATCH_WAVE_OFFSET:
- if (MFI->getShaderType() != ShaderType::COMPUTE)
- return MFI->ScratchOffsetReg;
- return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
- case SIRegisterInfo::SCRATCH_PTR:
- return AMDGPU::SGPR2_SGPR3;
- case SIRegisterInfo::INPUT_PTR:
- return AMDGPU::SGPR0_SGPR1;
- case SIRegisterInfo::TIDIG_X:
+ case SIRegisterInfo::WORKGROUP_ID_X:
+ assert(MFI->hasWorkGroupIDX());
+ return MFI->WorkGroupIDXSystemSGPR;
+ case SIRegisterInfo::WORKGROUP_ID_Y:
+ assert(MFI->hasWorkGroupIDY());
+ return MFI->WorkGroupIDYSystemSGPR;
+ case SIRegisterInfo::WORKGROUP_ID_Z:
+ assert(MFI->hasWorkGroupIDZ());
+ return MFI->WorkGroupIDZSystemSGPR;
+ case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
+ return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
+ case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
+ assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations");
+ assert(MFI->hasPrivateSegmentBuffer());
+ return MFI->PrivateSegmentBufferUserSGPR;
+ case SIRegisterInfo::KERNARG_SEGMENT_PTR:
+ assert(MFI->hasKernargSegmentPtr());
+ return MFI->KernargSegmentPtrUserSGPR;
+ case SIRegisterInfo::DISPATCH_PTR:
+ assert(MFI->hasDispatchPtr());
+ return MFI->DispatchPtrUserSGPR;
+ case SIRegisterInfo::QUEUE_PTR:
+ llvm_unreachable("not implemented");
+ case SIRegisterInfo::WORKITEM_ID_X:
+ assert(MFI->hasWorkItemIDX());
return AMDGPU::VGPR0;
- case SIRegisterInfo::TIDIG_Y:
+ case SIRegisterInfo::WORKITEM_ID_Y:
+ assert(MFI->hasWorkItemIDY());
return AMDGPU::VGPR1;
- case SIRegisterInfo::TIDIG_Z:
+ case SIRegisterInfo::WORKITEM_ID_Z:
+ assert(MFI->hasWorkItemIDZ());
return AMDGPU::VGPR2;
}
llvm_unreachable("unexpected preloaded value type");
@@ -496,12 +612,9 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
// AMDGPU::NoRegister.
unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC) const {
-
- for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
- I != E; ++I) {
- if (!MRI.isPhysRegUsed(*I))
- return *I;
- }
+ for (unsigned Reg : *RC)
+ if (!MRI.isPhysRegUsed(Reg))
+ return Reg;
return AMDGPU::NoRegister;
}
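A usage sketch for findUnusedRegister; the caller is hypothetical, the signature is the one defined above:

  // Hypothetical caller: try to grab a completely unused VGPR as a scratch register.
  unsigned Tmp = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
  if (Tmp != AMDGPU::NoRegister) {
    // ... safe to use Tmp without spilling ...
  }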