Diffstat (limited to 'contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 316 |
1 file changed, 274 insertions, 42 deletions
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index a5c0d49..0aad8f0 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -12,6 +12,8 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -34,9 +36,12 @@ struct FoldCandidate {
   };
   unsigned char UseOpNo;
   MachineOperand::MachineOperandType Kind;
+  bool Commuted;
 
-  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
-    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) {
+  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
+                bool Commuted_ = false) :
+    UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
+    Commuted(Commuted_) {
     if (FoldOp->isImm()) {
       ImmToFold = FoldOp->getImm();
     } else if (FoldOp->isFI()) {
@@ -58,6 +63,10 @@ struct FoldCandidate {
   bool isReg() const {
     return Kind == MachineOperand::MO_Register;
   }
+
+  bool isCommuted() const {
+    return Commuted;
+  }
 };
 
 class SIFoldOperands : public MachineFunctionPass {
@@ -66,6 +75,7 @@ public:
   MachineRegisterInfo *MRI;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
+  const SISubtarget *ST;
 
   void foldOperand(MachineOperand &OpToFold,
                    MachineInstr *UseMI,
@@ -75,6 +85,12 @@ public:
 
   void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
 
+  const MachineOperand *isClamp(const MachineInstr &MI) const;
+  bool tryFoldClamp(MachineInstr &MI);
+
+  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
+  bool tryFoldOMod(MachineInstr &MI);
+
 public:
   SIFoldOperands() : MachineFunctionPass(ID) {
     initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
   }
@@ -121,6 +137,7 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
         = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
       return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
     }
+    return false;
   }
   default:
     return false;
   }
@@ -131,27 +148,6 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
   return new SIFoldOperands();
 }
 
-static bool isSafeToFold(const MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  case AMDGPU::V_MOV_B32_e32:
-  case AMDGPU::V_MOV_B32_e64:
-  case AMDGPU::V_MOV_B64_PSEUDO: {
-    // If there are additional implicit register operands, this may be used for
-    // register indexing so the source register operand isn't simply copied.
-    unsigned NumOps = MI.getDesc().getNumOperands() +
-      MI.getDesc().getNumImplicitUses();
-
-    return MI.getNumOperands() == NumOps;
-  }
-  case AMDGPU::S_MOV_B32:
-  case AMDGPU::S_MOV_B64:
-  case AMDGPU::COPY:
-    return true;
-  default:
-    return false;
-  }
-}
-
 static bool updateOperand(FoldCandidate &Fold,
                           const TargetRegisterInfo &TRI) {
   MachineInstr *MI = Fold.UseMI;
@@ -172,6 +168,8 @@ static bool updateOperand(FoldCandidate &Fold,
   if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
       TargetRegisterInfo::isVirtualRegister(New->getReg())) {
     Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
+
+    Old.setIsUndef(New->isUndef());
     return true;
   }
 
@@ -250,8 +248,13 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
         !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
       return false;
 
-    if (!TII->isOperandLegal(*MI, OpNo, OpToFold))
+    if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+      TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
       return false;
+    }
+
+    FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true));
+    return true;
   }
 
   FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
@@ -260,9 +263,10 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
 
 // If the use operand doesn't care about the value, this may be an operand only
 // used for register indexing, in which case it is unsafe to fold.
-static bool isUseSafeToFold(const MachineInstr &MI,
+static bool isUseSafeToFold(const SIInstrInfo *TII,
+                            const MachineInstr &MI,
                             const MachineOperand &UseMO) {
-  return !UseMO.isUndef();
+  return !UseMO.isUndef() && !TII->isSDWA(MI);
 
   //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
 }
@@ -274,7 +278,7 @@ void SIFoldOperands::foldOperand(
     SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
   const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
 
-  if (!isUseSafeToFold(*UseMI, UseOp))
+  if (!isUseSafeToFold(TII, *UseMI, UseOp))
     return;
 
   // FIXME: Fold operands with subregs.
@@ -359,8 +363,6 @@ void SIFoldOperands::foldOperand(
   const TargetRegisterClass *FoldRC =
     TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
 
-  APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
-            OpToFold.getImm());
 
   // Split 64-bit constants into 32-bits for folding.
   if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
@@ -370,21 +372,25 @@ void SIFoldOperands::foldOperand(
       MRI->getRegClass(UseReg) :
       TRI->getPhysRegClass(UseReg);
 
-    assert(Imm.getBitWidth() == 64);
-
     if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
       return;
 
+    APInt Imm(64, OpToFold.getImm());
     if (UseOp.getSubReg() == AMDGPU::sub0) {
       Imm = Imm.getLoBits(32);
     } else {
      assert(UseOp.getSubReg() == AMDGPU::sub1);
       Imm = Imm.getHiBits(32);
     }
+
+    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+    return;
   }
 
-  MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
-  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+
+
+  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
 }
 
 static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
@@ -468,7 +474,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
     return &Op;
 
   MachineInstr *Def = MRI.getVRegDef(Op.getReg());
-  if (Def->isMoveImmediate()) {
+  if (Def && Def->isMoveImmediate()) {
     MachineOperand &ImmSrc = Def->getOperand(1);
     if (ImmSrc.isImm())
       return &ImmSrc;
@@ -581,6 +587,32 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
   return false;
 }
 
+// Try to fold an instruction into a simpler one
+static bool tryFoldInst(const SIInstrInfo *TII,
+                        MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+
+  if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
+      Opc == AMDGPU::V_CNDMASK_B32_e64 ||
+      Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
+    const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
+    const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
+    if (Src1->isIdenticalTo(*Src0)) {
+      DEBUG(dbgs() << "Folded " << *MI << " into ");
+      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+      if (Src2Idx != -1)
+        MI->RemoveOperand(Src2Idx);
+      MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
+      mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
+                                                : getMovOpc(false)));
+      DEBUG(dbgs() << *MI << '\n');
+      return true;
+    }
+  }
+
+  return false;
+}
+
 void SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                      MachineOperand &OpToFold) const {
   // We need mutate the operands of new mov instructions to add implicit
@@ -621,6 +653,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
         // again. The same constant folded instruction could also have a second
         // use operand.
         NextUse = MRI->use_begin(Dst.getReg());
+        FoldList.clear();
         continue;
       }
 
@@ -682,31 +715,230 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
       }
       DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
             static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
+      tryFoldInst(TII, Fold.UseMI);
+    } else if (Fold.isCommuted()) {
+      // Restoring instruction's original operand order if fold has failed.
+      TII->commuteInstruction(*Fold.UseMI, false);
     }
   }
 }
 
+const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
+  unsigned Op = MI.getOpcode();
+  switch (Op) {
+  case AMDGPU::V_MAX_F32_e64:
+  case AMDGPU::V_MAX_F16_e64:
+  case AMDGPU::V_MAX_F64: {
+    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
+      return nullptr;
+
+    // Make sure sources are identical.
+    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    if (!Src0->isReg() || !Src1->isReg() ||
+        Src0->getSubReg() != Src1->getSubReg() ||
+        Src0->getSubReg() != AMDGPU::NoSubRegister)
+      return nullptr;
+
+    // Can't fold up if we have modifiers.
+    if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+      return nullptr;
+    return Src0;
+  }
+  default:
+    return nullptr;
+  }
+}
+
+// We obviously have multiple uses in a clamp since the register is used twice
+// in the same instruction.
+static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
+  int Count = 0;
+  for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
+       I != E; ++I) {
+    if (++Count > 1)
+      return false;
+  }
+
+  return true;
+}
+
+bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
+  const MachineOperand *ClampSrc = isClamp(MI);
+  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
+    return false;
+
+  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
+  if (!TII->hasFPClamp(*Def))
+    return false;
+  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
+  if (!DefClamp)
+    return false;
+
+  DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');
+
+  // Clamp is applied after omod, so it is OK if omod is set.
+  DefClamp->setImm(1);
+  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+  MI.eraseFromParent();
+  return true;
+}
+
+static int getOModValue(unsigned Opc, int64_t Val) {
+  switch (Opc) {
+  case AMDGPU::V_MUL_F32_e64: {
+    switch (static_cast<uint32_t>(Val)) {
+    case 0x3f000000: // 0.5
+      return SIOutMods::DIV2;
+    case 0x40000000: // 2.0
+      return SIOutMods::MUL2;
+    case 0x40800000: // 4.0
+      return SIOutMods::MUL4;
+    default:
+      return SIOutMods::NONE;
+    }
+  }
+  case AMDGPU::V_MUL_F16_e64: {
+    switch (static_cast<uint16_t>(Val)) {
+    case 0x3800: // 0.5
+      return SIOutMods::DIV2;
+    case 0x4000: // 2.0
+      return SIOutMods::MUL2;
+    case 0x4400: // 4.0
+      return SIOutMods::MUL4;
+    default:
+      return SIOutMods::NONE;
+    }
+  }
+  default:
+    llvm_unreachable("invalid mul opcode");
+  }
+}
+
+// FIXME: Does this really not support denormals with f16?
+// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
+// handled, so will anything other than that break?
+std::pair<const MachineOperand *, int>
+SIFoldOperands::isOMod(const MachineInstr &MI) const {
+  unsigned Op = MI.getOpcode();
+  switch (Op) {
+  case AMDGPU::V_MUL_F32_e64:
+  case AMDGPU::V_MUL_F16_e64: {
+    // If output denormals are enabled, omod is ignored.
+    if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
+        (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
+      return std::make_pair(nullptr, SIOutMods::NONE);
+
+    const MachineOperand *RegOp = nullptr;
+    const MachineOperand *ImmOp = nullptr;
+    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    if (Src0->isImm()) {
+      ImmOp = Src0;
+      RegOp = Src1;
+    } else if (Src1->isImm()) {
+      ImmOp = Src1;
+      RegOp = Src0;
+    } else
+      return std::make_pair(nullptr, SIOutMods::NONE);
+
+    int OMod = getOModValue(Op, ImmOp->getImm());
+    if (OMod == SIOutMods::NONE ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+      return std::make_pair(nullptr, SIOutMods::NONE);
+
+    return std::make_pair(RegOp, OMod);
+  }
+  case AMDGPU::V_ADD_F32_e64:
+  case AMDGPU::V_ADD_F16_e64: {
+    // If output denormals are enabled, omod is ignored.
+    if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
+        (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
+      return std::make_pair(nullptr, SIOutMods::NONE);
+
+    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
+    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+
+    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
+        Src0->getSubReg() == Src1->getSubReg() &&
+        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
+        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
+        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
+        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+      return std::make_pair(Src0, SIOutMods::MUL2);
+
+    return std::make_pair(nullptr, SIOutMods::NONE);
+  }
+  default:
+    return std::make_pair(nullptr, SIOutMods::NONE);
+  }
+}
+
+// FIXME: Does this need to check IEEE bit on function?
+bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
+  const MachineOperand *RegOp;
+  int OMod;
+  std::tie(RegOp, OMod) = isOMod(MI);
+  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
+      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
+      !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
+    return false;
+
+  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
+  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
+  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
+    return false;
+
+  // Clamp is applied after omod. If the source already has clamp set, don't
+  // fold it.
+  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
+    return false;
+
+  DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
+
+  DefOMod->setImm(OMod);
+  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+  MI.eraseFromParent();
+  return true;
+}
+
 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(*MF.getFunction()))
     return false;
 
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-
   MRI = &MF.getRegInfo();
-  TII = ST.getInstrInfo();
+  ST = &MF.getSubtarget<SISubtarget>();
+  TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
 
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-                                  BI != BE; ++BI) {
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
+  // correctly handle signed zeros.
+  //
+  // TODO: Check nsz on instructions when fast math flags are preserved to MI
+  // level.
+  bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
 
-    MachineBasicBlock &MBB = *BI;
+  for (MachineBasicBlock *MBB : depth_first(&MF)) {
     MachineBasicBlock::iterator I, Next;
-    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+    for (I = MBB->begin(); I != MBB->end(); I = Next) {
       Next = std::next(I);
       MachineInstr &MI = *I;
 
-      if (!isSafeToFold(MI))
+      tryFoldInst(TII, &MI);
+
+      if (!TII->isFoldableCopy(MI)) {
+        if (IsIEEEMode || !tryFoldOMod(MI))
+          tryFoldClamp(MI);
         continue;
+      }
 
       MachineOperand &OpToFold = MI.getOperand(1);
       bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();