Diffstat (limited to 'contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp | 681
1 file changed, 681 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp
new file mode 100644
index 0000000..edaf278
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -0,0 +1,681 @@
+//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass turns all control flow pseudo instructions into native ones,
+/// computing their address on the fly; it also sets the STACK_SIZE info.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Debug.h"
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "r600cf"
+
+namespace {
+
+struct CFStack {
+
+  enum StackItem {
+    ENTRY = 0,
+    SUB_ENTRY = 1,
+    FIRST_NON_WQM_PUSH = 2,
+    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
+  };
+
+  const AMDGPUSubtarget &ST;
+  std::vector<StackItem> BranchStack;
+  std::vector<StackItem> LoopStack;
+  unsigned MaxStackSize;
+  unsigned CurrentEntries;
+  unsigned CurrentSubEntries;
+
+  CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
+      // We need to reserve a stack entry for CALL_FS in vertex shaders.
+      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
+      CurrentEntries(0), CurrentSubEntries(0) { }
+
+  unsigned getLoopDepth();
+  bool branchStackContains(CFStack::StackItem);
+  bool requiresWorkAroundForInst(unsigned Opcode);
+  unsigned getSubEntrySize(CFStack::StackItem Item);
+  void updateMaxStackSize();
+  void pushBranch(unsigned Opcode, bool isWQM = false);
+  void pushLoop();
+  void popBranch();
+  void popLoop();
+};
+
+unsigned CFStack::getLoopDepth() {
+  return LoopStack.size();
+}
+
+bool CFStack::branchStackContains(CFStack::StackItem Item) {
+  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
+       E = BranchStack.end(); I != E; ++I) {
+    if (*I == Item)
+      return true;
+  }
+  return false;
+}
+
+bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
+  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() &&
+      getLoopDepth() > 1)
+    return true;
+
+  if (!ST.hasCFAluBug())
+    return false;
+
+  switch(Opcode) {
+  default: return false;
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+  case AMDGPU::CF_ALU_ELSE_AFTER:
+  case AMDGPU::CF_ALU_BREAK:
+  case AMDGPU::CF_ALU_CONTINUE:
+    if (CurrentSubEntries == 0)
+      return false;
+    if (ST.getWavefrontSize() == 64) {
+      // We are being conservative here.  We only require this work-around if
+      // CurrentSubEntries > 3 &&
+      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
+      //
+      // We have to be conservative, because we don't know for certain that
+      // our stack allocation algorithm for Evergreen/NI is correct.  Applying
+      // this work-around when CurrentSubEntries > 3 allows us to over-allocate
+      // stack resources without any problems.
+      return CurrentSubEntries > 3;
+    } else {
+      assert(ST.getWavefrontSize() == 32);
+      // We are being conservative here.  We only require the work-around if
+      // CurrentSubEntries > 7 &&
+      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
+      // See the comment on the wavefront size == 64 case for why we are
+      // being conservative.
+      return CurrentSubEntries > 7;
+    }
+  }
+}
+
+unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
+  switch(Item) {
+  default:
+    return 0;
+  case CFStack::FIRST_NON_WQM_PUSH:
+    assert(!ST.hasCaymanISA());
+    if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
+      // +1 For the push operation.
+      // +2 Extra space required.
+      return 3;
+    } else {
+      // Some documentation says that this is not necessary on Evergreen,
+      // but experimentation has shown that we need to allocate 1 extra
+      // sub-entry for the first non-WQM push.
+      // +1 For the push operation.
+      // +1 Extra space required.
+      return 2;
+    }
+  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
+    assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+    // +1 For the push operation.
+    // +1 Extra space required.
+    return 2;
+  case CFStack::SUB_ENTRY:
+    return 1;
+  }
+}
+
+void CFStack::updateMaxStackSize() {
+  unsigned CurrentStackSize = CurrentEntries +
+                              (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
+  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
+}
+
+void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
+  CFStack::StackItem Item = CFStack::ENTRY;
+  switch(Opcode) {
+  case AMDGPU::CF_PUSH_EG:
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+    if (!isWQM) {
+      if (!ST.hasCaymanISA() &&
+          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
+        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on Evergreen/NI
+                                             // See comment in
+                                             // CFStack::getSubEntrySize()
+      else if (CurrentEntries > 0 &&
+               ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
+               !ST.hasCaymanISA() &&
+               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
+        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
+      else
+        Item = CFStack::SUB_ENTRY;
+    } else
+      Item = CFStack::ENTRY;
+    break;
+  }
+  BranchStack.push_back(Item);
+  if (Item == CFStack::ENTRY)
+    CurrentEntries++;
+  else
+    CurrentSubEntries += getSubEntrySize(Item);
+  updateMaxStackSize();
+}
+
+void CFStack::pushLoop() {
+  LoopStack.push_back(CFStack::ENTRY);
+  CurrentEntries++;
+  updateMaxStackSize();
+}
+
+void CFStack::popBranch() {
+  CFStack::StackItem Top = BranchStack.back();
+  if (Top == CFStack::ENTRY)
+    CurrentEntries--;
+  else
+    CurrentSubEntries -= getSubEntrySize(Top);
+  BranchStack.pop_back();
+}
+
+void CFStack::popLoop() {
+  CurrentEntries--;
+  LoopStack.pop_back();
+}
+
+class R600ControlFlowFinalizer : public MachineFunctionPass {
+
+private:
+  typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;
+
+  enum ControlFlowInstruction {
+    CF_TC,
+    CF_VC,
+    CF_CALL_FS,
+    CF_WHILE_LOOP,
+    CF_END_LOOP,
+    CF_LOOP_BREAK,
+    CF_LOOP_CONTINUE,
+    CF_JUMP,
+    CF_ELSE,
+    CF_POP,
+    CF_END
+  };
+
+  static char ID;
+  const R600InstrInfo *TII;
+  const R600RegisterInfo *TRI;
+  unsigned MaxFetchInst;
+  const AMDGPUSubtarget &ST;
+
+  bool IsTrivialInst(MachineInstr *MI) const {
+    switch (MI->getOpcode()) {
+    case AMDGPU::KILL:
+    case AMDGPU::RETURN:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
+    unsigned Opcode = 0;
+    bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+    switch (CFI) {
+    case CF_TC:
+      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
+      break;
+    case CF_VC:
+      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
+      break;
+    case CF_CALL_FS:
+      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
+      break;
+    case CF_WHILE_LOOP:
+      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
+      break;
+    case CF_END_LOOP:
+      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
+      break;
+    case CF_LOOP_BREAK:
+      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
+      break;
+    case CF_LOOP_CONTINUE:
+      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
+      break;
+    case CF_JUMP:
+      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
+      break;
+    case CF_ELSE:
+      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
+      break;
+    case CF_POP:
+      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
+      break;
+    case CF_END:
+      if (ST.hasCaymanISA()) {
+        Opcode = AMDGPU::CF_END_CM;
+        break;
+      }
+      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
+      break;
+    }
+    assert (Opcode && "No opcode selected");
+    return TII->get(Opcode);
+  }
+
+  bool isCompatibleWithClause(const MachineInstr *MI,
+                              std::set<unsigned> &DstRegs) const {
+    unsigned DstMI, SrcMI;
+    for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
+         E = MI->operands_end(); I != E; ++I) {
+      const MachineOperand &MO = *I;
+      if (!MO.isReg())
+        continue;
+      if (MO.isDef()) {
+        unsigned Reg = MO.getReg();
+        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+          DstMI = Reg;
+        else
+          DstMI = TRI->getMatchingSuperReg(Reg,
+              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+              &AMDGPU::R600_Reg128RegClass);
+      }
+      if (MO.isUse()) {
+        unsigned Reg = MO.getReg();
+        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+          SrcMI = Reg;
+        else
+          SrcMI = TRI->getMatchingSuperReg(Reg,
+              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+              &AMDGPU::R600_Reg128RegClass);
+      }
+    }
+    if ((DstRegs.find(SrcMI) == DstRegs.end())) {
+      DstRegs.insert(DstMI);
+      return true;
+    } else
+      return false;
+  }
+
+  ClauseFile
+  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
+      const {
+    MachineBasicBlock::iterator ClauseHead = I;
+    std::vector<MachineInstr *> ClauseContent;
+    unsigned AluInstCount = 0;
+    bool IsTex = TII->usesTextureCache(ClauseHead);
+    std::set<unsigned> DstRegs;
+    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
+      if (IsTrivialInst(I))
+        continue;
+      if (AluInstCount >= MaxFetchInst)
+        break;
+      if ((IsTex && !TII->usesTextureCache(I)) ||
+          (!IsTex && !TII->usesVertexCache(I)))
+        break;
+      if (!isCompatibleWithClause(I, DstRegs))
+        break;
+      AluInstCount++;
+      ClauseContent.push_back(I);
+    }
+    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
+        getHWInstrDesc(IsTex?CF_TC:CF_VC))
+        .addImm(0) // ADDR
+        .addImm(AluInstCount - 1); // COUNT
+    return ClauseFile(MIb, std::move(ClauseContent));
+  }
+
+  void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
+    static const unsigned LiteralRegs[] = {
+      AMDGPU::ALU_LITERAL_X,
+      AMDGPU::ALU_LITERAL_Y,
+      AMDGPU::ALU_LITERAL_Z,
+      AMDGPU::ALU_LITERAL_W
+    };
+    const SmallVector<std::pair<MachineOperand *, int64_t>, 3 > Srcs =
+        TII->getSrcs(MI);
+    for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
+      if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
+        continue;
+      int64_t Imm = Srcs[i].second;
+      std::vector<int64_t>::iterator It =
+          std::find(Lits.begin(), Lits.end(), Imm);
+      if (It != Lits.end()) {
+        unsigned Index = It - Lits.begin();
+        Srcs[i].first->setReg(LiteralRegs[Index]);
+      } else {
+        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
+        Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
+        Lits.push_back(Imm);
+      }
+    }
+  }
+
+  MachineBasicBlock::iterator insertLiterals(
+      MachineBasicBlock::iterator InsertPos,
+      const std::vector<unsigned> &Literals) const {
+    MachineBasicBlock *MBB = InsertPos->getParent();
+    for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
+      unsigned LiteralPair0 = Literals[i];
+      unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0;
+      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
+          TII->get(AMDGPU::LITERALS))
+          .addImm(LiteralPair0)
+          .addImm(LiteralPair1);
+    }
+    return InsertPos;
+  }
+
+  ClauseFile
+  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
+      const {
+    MachineBasicBlock::iterator ClauseHead = I;
+    std::vector<MachineInstr *> ClauseContent;
+    I++;
+    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
+      if (IsTrivialInst(I)) {
+        ++I;
+        continue;
+      }
+      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
+        break;
+      std::vector<int64_t> Literals;
+      if (I->isBundle()) {
+        MachineInstr *DeleteMI = I;
+        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
+        while (++BI != E && BI->isBundledWithPred()) {
+          BI->unbundleFromPred();
+          for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
+            MachineOperand &MO = BI->getOperand(i);
+            if (MO.isReg() && MO.isInternalRead())
+              MO.setIsInternalRead(false);
+          }
+          getLiteral(BI, Literals);
+          ClauseContent.push_back(BI);
+        }
+        I = BI;
+        DeleteMI->eraseFromParent();
+      } else {
+        getLiteral(I, Literals);
+        ClauseContent.push_back(I);
+        I++;
+      }
+      for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
+        unsigned literal0 = Literals[i];
+        unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0;
+        MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
+            TII->get(AMDGPU::LITERALS))
+            .addImm(literal0)
+            .addImm(literal2);
+        ClauseContent.push_back(MILit);
+      }
+    }
+    assert(ClauseContent.size() < 128 && "ALU clause is too big");
+    ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
+    return ClauseFile(ClauseHead, std::move(ClauseContent));
+  }
+
+  void
+  EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
+      unsigned &CfCount) {
+    CounterPropagateAddr(Clause.first, CfCount);
+    MachineBasicBlock *BB = Clause.first->getParent();
+    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
+        .addImm(CfCount);
+    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
+      BB->splice(InsertPos, BB, Clause.second[i]);
+    }
+    CfCount += 2 * Clause.second.size();
+  }
+
+  void
+  EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
+      unsigned &CfCount) {
+    Clause.first->getOperand(0).setImm(0);
+    CounterPropagateAddr(Clause.first, CfCount);
+    MachineBasicBlock *BB = Clause.first->getParent();
+    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
+        .addImm(CfCount);
+    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
+      BB->splice(InsertPos, BB, Clause.second[i]);
+    }
+    CfCount += Clause.second.size();
+  }
+
+  void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
+    MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
+  }
+  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
+                            unsigned Addr) const {
+    for (MachineInstr *MI : MIs) {
+      CounterPropagateAddr(MI, Addr);
+    }
+  }
+
+public:
+  R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
+      TII(nullptr), TRI(nullptr),
+      ST(tm.getSubtarget<AMDGPUSubtarget>()) {
+    const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
+    MaxFetchInst = ST.getTexVTXClauseSize();
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
+    TRI = static_cast<const R600RegisterInfo *>(
+        MF.getSubtarget().getRegisterInfo());
+    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+
+    CFStack CFStack(ST, MFI->getShaderType());
+    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
+        ++MB) {
+      MachineBasicBlock &MBB = *MB;
+      unsigned CfCount = 0;
+      std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
+      std::vector<MachineInstr * > IfThenElseStack;
+      if (MFI->getShaderType() == ShaderType::VERTEX) {
+        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
+            getHWInstrDesc(CF_CALL_FS));
+        CfCount++;
+      }
+      std::vector<ClauseFile> FetchClauses, AluClauses;
+      std::vector<MachineInstr *> LastAlu(1);
+      std::vector<MachineInstr *> ToPopAfter;
+
+      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+          I != E;) {
+        if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
+          DEBUG(dbgs() << CfCount << ":"; I->dump(););
+          FetchClauses.push_back(MakeFetchClause(MBB, I));
+          CfCount++;
+          LastAlu.back() = nullptr;
+          continue;
+        }
+
+        MachineBasicBlock::iterator MI = I;
+        if (MI->getOpcode() != AMDGPU::ENDIF)
+          LastAlu.back() = nullptr;
+        if (MI->getOpcode() == AMDGPU::CF_ALU)
+          LastAlu.back() = MI;
+        I++;
+        bool RequiresWorkAround =
+            CFStack.requiresWorkAroundForInst(MI->getOpcode());
+        switch (MI->getOpcode()) {
+        case AMDGPU::CF_ALU_PUSH_BEFORE:
+          if (RequiresWorkAround) {
+            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
+            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
+                .addImm(CfCount + 1)
+                .addImm(1);
+            MI->setDesc(TII->get(AMDGPU::CF_ALU));
+            CfCount++;
+            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
+          } else
+            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
+
+        case AMDGPU::CF_ALU:
+          I = MI;
+          AluClauses.push_back(MakeALUClause(MBB, I));
+          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+          CfCount++;
+          break;
+        case AMDGPU::WHILELOOP: {
+          CFStack.pushLoop();
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_WHILE_LOOP))
+              .addImm(1);
+          std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
+              std::set<MachineInstr *>());
+          Pair.second.insert(MIb);
+          LoopStack.push_back(std::move(Pair));
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::ENDLOOP: {
+          CFStack.popLoop();
+          std::pair<unsigned, std::set<MachineInstr *> > Pair =
+              std::move(LoopStack.back());
+          LoopStack.pop_back();
+          CounterPropagateAddr(Pair.second, CfCount);
+          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
+              .addImm(Pair.first + 1);
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::IF_PREDICATE_SET: {
+          LastAlu.push_back(nullptr);
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_JUMP))
+              .addImm(0)
+              .addImm(0);
+          IfThenElseStack.push_back(MIb);
+          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::ELSE: {
+          MachineInstr * JumpInst = IfThenElseStack.back();
+          IfThenElseStack.pop_back();
+          CounterPropagateAddr(JumpInst, CfCount);
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_ELSE))
+              .addImm(0)
+              .addImm(0);
+          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+          IfThenElseStack.push_back(MIb);
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::ENDIF: {
+          CFStack.popBranch();
+          if (LastAlu.back()) {
+            ToPopAfter.push_back(LastAlu.back());
+          } else {
+            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+                getHWInstrDesc(CF_POP))
+                .addImm(CfCount + 1)
+                .addImm(1);
+            (void)MIb;
+            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+            CfCount++;
+          }
+
+          MachineInstr *IfOrElseInst = IfThenElseStack.back();
+          IfThenElseStack.pop_back();
+          CounterPropagateAddr(IfOrElseInst, CfCount);
+          IfOrElseInst->getOperand(1).setImm(1);
+          LastAlu.pop_back();
+          MI->eraseFromParent();
+          break;
+        }
+        case AMDGPU::BREAK: {
+          CfCount++;
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_LOOP_BREAK))
+              .addImm(0);
+          LoopStack.back().second.insert(MIb);
+          MI->eraseFromParent();
+          break;
+        }
+        case AMDGPU::CONTINUE: {
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              getHWInstrDesc(CF_LOOP_CONTINUE))
+              .addImm(0);
+          LoopStack.back().second.insert(MIb);
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::RETURN: {
+          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
+          CfCount++;
+          MI->eraseFromParent();
+          if (CfCount % 2) {
+            BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
+            CfCount++;
+          }
+          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
+            EmitFetchClause(I, FetchClauses[i], CfCount);
+          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
+            EmitALUClause(I, AluClauses[i], CfCount);
+        }
+        default:
+          if (TII->isExport(MI->getOpcode())) {
+            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+            CfCount++;
+          }
+          break;
+        }
+      }
+      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
+        MachineInstr *Alu = ToPopAfter[i];
+        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
+            TII->get(AMDGPU::CF_ALU_POP_AFTER))
+            .addImm(Alu->getOperand(0).getImm())
+            .addImm(Alu->getOperand(1).getImm())
+            .addImm(Alu->getOperand(2).getImm())
+            .addImm(Alu->getOperand(3).getImm())
+            .addImm(Alu->getOperand(4).getImm())
+            .addImm(Alu->getOperand(5).getImm())
+            .addImm(Alu->getOperand(6).getImm())
+            .addImm(Alu->getOperand(7).getImm())
+            .addImm(Alu->getOperand(8).getImm());
+        Alu->eraseFromParent();
+      }
+      MFI->StackSize = CFStack.MaxStackSize;
+    }
+
+    return false;
+  }
+
+  const char *getPassName() const override {
+    return "R600 Control Flow Finalizer Pass";
+  }
+};
+
+char R600ControlFlowFinalizer::ID = 0;
+
+} // end anonymous namespace
+
+
+llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
+  return new R600ControlFlowFinalizer(TM);
+}
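
Editor's note, not part of the commit above: the STACK_SIZE value the pass reports comes from CFStack::updateMaxStackSize(), where four sub-entries are packed into one full hardware stack entry (rounded up). The following is a minimal stand-alone sketch of that accounting; the names StackModel and update() are hypothetical, chosen only for illustration.

// Illustrative model of the CF stack accounting used by the pass:
// full entries (loops, WQM pushes) count as 1 each, while non-WQM pushes
// contribute fractional "sub-entries", four of which share one full entry.
#include <algorithm>
#include <cstdio>

struct StackModel {
  unsigned CurrentEntries = 0;     // full hardware stack entries
  unsigned CurrentSubEntries = 0;  // fractional entries from non-WQM pushes
  unsigned MaxStackSize = 0;

  void update() {
    // (x + 3) / 4 == RoundUpToAlignment(x, 4) / 4, i.e. ceil(x / 4).
    unsigned Current = CurrentEntries + (CurrentSubEntries + 3) / 4;
    MaxStackSize = std::max(Current, MaxStackSize);
  }
};

int main() {
  StackModel S;
  S.CurrentEntries = 1;    // e.g. one enclosing loop
  S.CurrentSubEntries = 5; // e.g. five nested non-WQM pushes of size 1
  S.update();
  std::printf("STACK_SIZE = %u\n", S.MaxStackSize); // 1 + ceil(5/4) = 3
  return 0;
}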