Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 373
1 file changed, 373 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
new file mode 100644
index 0000000..96e37c5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -0,0 +1,373 @@
+//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Copies from VGPR to SGPR registers are illegal and the register coalescer
+/// will sometimes generate these illegal copies in situations like this:
+///
+/// Register Class <vsrc> is the union of <vgpr> and <sgpr>
+///
+/// BB0:
+///   %vreg0 <sgpr> = SCALAR_INST
+///   %vreg1 <vsrc> = COPY %vreg0 <sgpr>
+///   ...
+///   BRANCH %cond BB1, BB2
+/// BB1:
+///   %vreg2 <vgpr> = VECTOR_INST
+///   %vreg3 <vsrc> = COPY %vreg2 <vgpr>
+/// BB2:
+///   %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vsrc>, <BB#1>
+///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc>
+///
+///
+/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
+/// code will look like this:
+///
+/// BB0:
+///   %vreg0 <sgpr> = SCALAR_INST
+///   ...
+///   BRANCH %cond BB1, BB2
+/// BB1:
+///   %vreg2 <vgpr> = VECTOR_INST
+///   %vreg3 <vsrc> = COPY %vreg2 <vgpr>
+/// BB2:
+///   %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <vsrc>, <BB#1>
+///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
+///
+/// Now that the result of the PHI instruction is an SGPR, the register
+/// allocator is forced to constrain the register class of %vreg3 to
+/// <sgpr>, so we end up with final code like this:
+///
+/// BB0:
+///   %vreg0 <sgpr> = SCALAR_INST
+///   ...
+///   BRANCH %cond BB1, BB2
+/// BB1:
+///   %vreg2 <vgpr> = VECTOR_INST
+///   %vreg3 <sgpr> = COPY %vreg2 <vgpr>
+/// BB2:
+///   %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1>
+///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
+///
+/// Now this code contains an illegal copy from a VGPR to an SGPR.
+///
+/// In order to avoid this problem, this pass searches for PHI instructions
+/// that define a <vsrc> register and constrains the definition class to
+/// <vgpr> if a user of the PHI's result is a vector instruction. If the
+/// PHI's definition class is constrained to <vgpr>, the coalescer will be
+/// unable to perform the COPY removal from the above example, which
+/// ultimately led to the creation of an illegal COPY.
+//===----------------------------------------------------------------------===//
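[Editorial note] The constraint step the header describes amounts to moving a PHI's result out of a pure-SGPR class before the coalescer can exploit it. A minimal sketch of that idea, as a hypothetical helper that is not part of this patch (it assumes the SIRegisterInfo::hasVGPRs and getEquivalentVGPRClass queries used later in this file):

    // Hypothetical sketch: force a PHI result into an equivalent VGPR class.
    static void constrainPHIResultToVGPR(MachineInstr &PHI,
                                         MachineRegisterInfo &MRI,
                                         const SIRegisterInfo &TRI) {
      unsigned Def = PHI.getOperand(0).getReg();
      const TargetRegisterClass *RC = MRI.getRegClass(Def);
      if (!TRI.hasVGPRs(RC)) // still a pure SGPR class
        MRI.setRegClass(Def, TRI.getEquivalentVGPRClass(RC));
    }

In the pass itself this job is done by SIInstrInfo::moveToVALU(), which not only retags the register but also rewrites the instruction and its users to VALU equivalents.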
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sgpr-copies"
+
+namespace {
+
+class SIFixSGPRCopies : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SIFixSGPRCopies() : MachineFunctionPass(ID) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI Fix SGPR copies";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace
+
+INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE,
+                "SI Fix SGPR copies", false, false)
+
+char SIFixSGPRCopies::ID = 0;
+
+char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;
+
+FunctionPass *llvm::createSIFixSGPRCopiesPass() {
+  return new SIFixSGPRCopies();
+}
+
+static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    if (!MI.getOperand(i).isReg() ||
+        !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
+      continue;
+
+    if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg())))
+      return true;
+  }
+  return false;
+}
+
+static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
+getCopyRegClasses(const MachineInstr &Copy,
+                  const SIRegisterInfo &TRI,
+                  const MachineRegisterInfo &MRI) {
+  unsigned DstReg = Copy.getOperand(0).getReg();
+  unsigned SrcReg = Copy.getOperand(1).getReg();
+
+  const TargetRegisterClass *SrcRC =
+    TargetRegisterInfo::isVirtualRegister(SrcReg) ?
+    MRI.getRegClass(SrcReg) :
+    TRI.getPhysRegClass(SrcReg);
+
+  // We don't really care about the subregister here.
+  // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());
+
+  const TargetRegisterClass *DstRC =
+    TargetRegisterInfo::isVirtualRegister(DstReg) ?
+    MRI.getRegClass(DstReg) :
+    TRI.getPhysRegClass(DstReg);
+
+  return std::make_pair(SrcRC, DstRC);
+}
+
+static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
+                             const TargetRegisterClass *DstRC,
+                             const SIRegisterInfo &TRI) {
+  return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
+}
+
+static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
+                             const TargetRegisterClass *DstRC,
+                             const SIRegisterInfo &TRI) {
+  return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
+}
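[Editorial note] These two classifiers are direction-specific: together with getCopyRegClasses() they let the pass decide how to treat each COPY. As a minimal sketch of how they compose (a hypothetical helper, not part of the patch):

    // Hypothetical sketch: flag the one direction the hardware cannot do.
    static bool isIllegalCopy(const MachineInstr &Copy,
                              const SIRegisterInfo &TRI,
                              const MachineRegisterInfo &MRI) {
      const TargetRegisterClass *SrcRC, *DstRC;
      std::tie(SrcRC, DstRC) = getCopyRegClasses(Copy, TRI, MRI);
      // VGPR -> SGPR cannot be performed directly; it must be moved to VALU.
      return isVGPRToSGPRCopy(SrcRC, DstRC, TRI);
    }

runOnMachineFunction() below applies exactly this check to each AMDGPU::COPY, while the opposite direction (SGPR -> VGPR) is legal and is only rewritten when it enables the REG_SEQUENCE distribution that follows.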
+
+// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
+//
+//   SGPRx = ...
+//   SGPRy = REG_SEQUENCE SGPRx, sub0 ...
+//   VGPRz = COPY SGPRy
+//
+// ==>
+//
+//   VGPRx = COPY SGPRx
+//   VGPRz = REG_SEQUENCE VGPRx, sub0
+//
+// This exposes immediate folding opportunities when materializing 64-bit
+// immediates.
+static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
+                                        const SIRegisterInfo *TRI,
+                                        const SIInstrInfo *TII,
+                                        MachineRegisterInfo &MRI) {
+  assert(MI.isRegSequence());
+
+  unsigned DstReg = MI.getOperand(0).getReg();
+  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
+    return false;
+
+  if (!MRI.hasOneUse(DstReg))
+    return false;
+
+  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
+  if (!CopyUse.isCopy())
+    return false;
+
+  const TargetRegisterClass *SrcRC, *DstRC;
+  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
+
+  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
+    return false;
+
+  // TODO: Could have multiple extracts?
+  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
+  if (SubReg != AMDGPU::NoSubRegister)
+    return false;
+
+  MRI.setRegClass(DstReg, DstRC);
+
+  // SGPRx = ...
+  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
+  // VGPRz = COPY SGPRy
+
+  // =>
+  // VGPRx = COPY SGPRx
+  // VGPRz = REG_SEQUENCE VGPRx, sub0
+
+  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
+
+  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
+    unsigned SrcReg = MI.getOperand(I).getReg();
+    unsigned SrcSubReg = MI.getOperand(I).getSubReg();
+
+    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+    assert(TRI->isSGPRClass(SrcRC) &&
+           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
+
+    SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
+    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
+
+    unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);
+
+    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
+            TmpReg)
+      .addOperand(MI.getOperand(I));
+
+    MI.getOperand(I).setReg(TmpReg);
+  }
+
+  CopyUse.eraseFromParent();
+  return true;
+}
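[Editorial note] To make the payoff concrete, here is a hypothetical before/after in the same pseudo style as the comments in this file (register names and the split constant are illustrative, not taken from the patch):

    // Before: a 64-bit immediate is built in SGPRs, then copied whole.
    //   SGPR0 = S_MOV_B32 <lo32>
    //   SGPR1 = S_MOV_B32 <hi32>
    //   SGPR2_SGPR3 = REG_SEQUENCE SGPR0, sub0, SGPR1, sub1
    //   VGPR0_VGPR1 = COPY SGPR2_SGPR3
    //
    // After foldVGPRCopyIntoRegSequence:
    //   VGPR0 = COPY SGPR0
    //   VGPR1 = COPY SGPR1
    //   VGPR0_VGPR1 = REG_SEQUENCE VGPR0, sub0, VGPR1, sub1

Because each half now flows through its own 32-bit copy, a later folding pass can rewrite those copies into per-half immediate moves instead of keeping the whole 64-bit value live in SGPRs. Note the hasOneUse() guard above: distributing the copy is only a clear win when the SGPR REG_SEQUENCE has no other consumers.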
+
+bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  SmallVector<MachineInstr *, 16> Worklist;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      MachineInstr &MI = *I;
+
+      switch (MI.getOpcode()) {
+      default:
+        continue;
+      case AMDGPU::COPY: {
+        // If the destination register is a physical register there isn't
+        // really much we can do to fix this.
+        if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
+          continue;
+
+        const TargetRegisterClass *SrcRC, *DstRC;
+        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
+        if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
+          DEBUG(dbgs() << "Fixing VGPR -> SGPR copy: " << MI);
+          TII->moveToVALU(MI);
+        }
+
+        break;
+      }
+      case AMDGPU::PHI: {
+        DEBUG(dbgs() << "Fixing PHI: " << MI);
+        unsigned Reg = MI.getOperand(0).getReg();
+        if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
+          break;
+
+        // If a PHI node defines an SGPR and any of its operands are VGPRs,
+        // then we need to move it to the VALU.
+        //
+        // Also, if a PHI node defines an SGPR and has all SGPR operands
+        // we must move it to the VALU, because the SGPR operands will
+        // all end up being assigned the same register, which means
+        // there is a potential for a conflict if different threads take
+        // different control flow paths.
+        //
+        // For Example:
+        //
+        // sgpr0 = def;
+        // ...
+        // sgpr1 = def;
+        // ...
+        // sgpr2 = PHI sgpr0, sgpr1
+        // use sgpr2;
+        //
+        // Will Become:
+        //
+        // sgpr2 = def;
+        // ...
+        // sgpr2 = def;
+        // ...
+        // use sgpr2
+        //
+        // FIXME: This is OK if the branching decision is made based on an
+        // SGPR value.
+        bool SGPRBranch = false;
+
+        // The one exception to this rule is when one of the operands
+        // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
+        // instruction. In this case, we know the program will never
+        // enter the second block (the loop) without entering the first
+        // block (where the condition is computed), so there is no
+        // chance for values to be over-written.
+
+        bool HasBreakDef = false;
+        for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+          unsigned Reg = MI.getOperand(i).getReg();
+          if (TRI->hasVGPRs(MRI.getRegClass(Reg))) {
+            TII->moveToVALU(MI);
+            break;
+          }
+          MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg);
+          assert(DefInstr);
+          switch (DefInstr->getOpcode()) {
+          case AMDGPU::SI_BREAK:
+          case AMDGPU::SI_IF_BREAK:
+          case AMDGPU::SI_ELSE_BREAK:
+          // If we see a PHI instruction that defines an SGPR, then that PHI
+          // instruction has already been considered and should have
+          // a *_BREAK as an operand.
+          case AMDGPU::PHI:
+            HasBreakDef = true;
+            break;
+          }
+        }
+
+        if (!SGPRBranch && !HasBreakDef)
+          TII->moveToVALU(MI);
+        break;
+      }
+      case AMDGPU::REG_SEQUENCE: {
+        if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
+            !hasVGPROperands(MI, TRI)) {
+          foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
+          continue;
+        }
+
+        DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
+
+        TII->moveToVALU(MI);
+        break;
+      }
+      case AMDGPU::INSERT_SUBREG: {
+        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
+        DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
+        Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
+        Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
+        if (TRI->isSGPRClass(DstRC) &&
+            (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
+          DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
+          TII->moveToVALU(MI);
+        }
+        break;
+      }
+      }
+    }
+  }
+
+  return true;
+}
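[Editorial note] For orientation beyond the patch itself: the pass is exposed through SIFixSGPRCopiesID and createSIFixSGPRCopiesPass() above, and the target pipeline runs it shortly after instruction selection, while the function is still in machine-SSA form (the pass relies on getUniqueVRegDef and virtual registers). A rough sketch of that wiring, simplified and hypothetical in its details (the real registration lives in AMDGPUTargetMachine.cpp):

    // Sketch: scheduling the pass in the GCN codegen pipeline.
    bool GCNPassConfig::addInstSelector() {
      AMDGPUPassConfig::addInstSelector();
      // Fix illegal VGPR -> SGPR copies before later passes rely on
      // well-formed register classes.
      addPass(&SIFixSGPRCopiesID);
      return false;
    }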