Diffstat (limited to 'lib/Target/R600')
45 files changed, 2364 insertions, 2388 deletions
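Among other things, the AMDGPUInstructions.td hunk below replaces the deleted AMDILPeepholeOptimizer's IR-level bit tricks with TableGen selection patterns (BFIPatterns, SHA256MaPattern). As a standalone sanity check on the bit identities those patterns encode — not part of the patch; bfi() is a hypothetical helper modelling BFI_INT as the ISA-doc comment in the hunk defines it, (y & x) | (z & ~x):

  #include <cassert>
  #include <cstdint>

  // Models BFI_INT(x, y, z) = (y & x) | (z & ~x), per the ISA-doc comment in the patch.
  static uint32_t bfi(uint32_t x, uint32_t y, uint32_t z) {
    return (y & x) | (z & ~x);
  }

  int main() {
    const uint32_t vals[] = {0u, 1u, 0x12345678u, 0xDEADBEEFu, 0xFFFFFFFFu};
    for (uint32_t x : vals)
      for (uint32_t y : vals)
        for (uint32_t z : vals) {
          // SHA-256 Ch:  z ^ (x & (y ^ z))       == BFI_INT(x, y, z)
          assert((z ^ (x & (y ^ z))) == bfi(x, y, z));
          // SHA-256 Ma:  (x & z) | (y & (x | z)) == BFI_INT(x ^ y, z, y)
          assert(((x & z) | (y & (x | z))) == bfi(x ^ y, z, y));
        }
    return 0;
  }

Both identities hold bitwise: when a bit of x is set, Ch selects the corresponding bit of y, otherwise the bit of z, which is exactly what BFI_INT computes; the Ma form is the 3-input majority function and reduces to BFI_INT with x ^ y as the select mask.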
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 0b01433..9792bd8 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -24,6 +24,7 @@ class AMDGPUTargetMachine; FunctionPass* createR600KernelParametersPass(const DataLayout *TD); FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm); +FunctionPass *createR600Packetizer(TargetMachine &tm); FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm); // SI Passes diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index f600144..4c35ecf 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -19,9 +19,16 @@ #include "AMDGPUAsmPrinter.h" #include "AMDGPU.h" +#include "SIDefines.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "R600Defines.h" +#include "R600MachineFunctionInfo.h" +#include "R600RegisterInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -50,15 +57,82 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (OutStreamer.hasRawTextSupport()) { OutStreamer.EmitRawText("@" + MF.getName() + ":"); } - OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); + + const MCSectionELF *ConfigSection = getObjFileLowering().getContext() + .getELFSection(".AMDGPU.config", + ELF::SHT_PROGBITS, 0, + SectionKind::getReadOnly()); + OutStreamer.SwitchSection(ConfigSection); if (STM.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { - EmitProgramInfo(MF); + EmitProgramInfoSI(MF); + } else { + EmitProgramInfoR600(MF); } + OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); EmitFunctionBody(); return false; } -void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) { +void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) { + unsigned MaxGPR = 0; + bool killPixel = false; + const R600RegisterInfo * RI = + static_cast<const R600RegisterInfo*>(TM.getRegisterInfo()); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + if (MI.getOpcode() == AMDGPU::KILLGT) + killPixel = true; + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + MachineOperand & MO = MI.getOperand(op_idx); + if (!MO.isReg()) + continue; + unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; + + // Register with value > 127 aren't GPR + if (HWReg > 127) + continue; + MaxGPR = std::max(MaxGPR, HWReg); + } + } + } + + unsigned RsrcReg; + if (STM.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX) { + // Evergreen / Northern Islands + switch (MFI->ShaderType) { + default: // Fall through + case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; + case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; + case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; + case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; + } + } else { + // R600 / R700 + switch (MFI->ShaderType) { + default: // Fall 
through + case ShaderType::GEOMETRY: // Fall through + case ShaderType::COMPUTE: // Fall through + case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; + case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; + } + } + + OutStreamer.EmitIntValue(RsrcReg, 4); + OutStreamer.EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | + S_STACK_SIZE(MFI->StackSize), 4); + OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); + OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); +} + +void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) { unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; bool VCCUsed = false; @@ -107,6 +181,9 @@ void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) { } else if (AMDGPU::VReg_64RegClass.contains(reg)) { isSGPR = false; width = 2; + } else if (AMDGPU::VReg_96RegClass.contains(reg)) { + isSGPR = false; + width = 3; } else if (AMDGPU::SReg_128RegClass.contains(reg)) { isSGPR = true; width = 4; @@ -139,7 +216,19 @@ void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) { MaxSGPR += 2; } SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>(); - OutStreamer.EmitIntValue(MaxSGPR + 1, 4); - OutStreamer.EmitIntValue(MaxVGPR + 1, 4); - OutStreamer.EmitIntValue(MFI->PSInputAddr, 4); + unsigned RsrcReg; + switch (MFI->ShaderType) { + default: // Fall through + case ShaderType::COMPUTE: RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break; + case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break; + case ShaderType::PIXEL: RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break; + case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break; + } + + OutStreamer.EmitIntValue(RsrcReg, 4); + OutStreamer.EmitIntValue(S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8), 4); + if (MFI->ShaderType == ShaderType::PIXEL) { + OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); + OutStreamer.EmitIntValue(MFI->PSInputAddr, 4); + } } diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h index 3812282..f425ef4 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -33,7 +33,8 @@ public: /// \brief Emit register usage information so that the GPU driver /// can correctly setup the GPU state. - void EmitProgramInfo(MachineFunction &MF); + void EmitProgramInfoR600(MachineFunction &MF); + void EmitProgramInfoSI(MachineFunction &MF); /// Implemented in AMDGPUMCInstLower.cpp virtual void EmitInstruction(const MachineInstr *MI); diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td index 45ae37e..9c30515 100644 --- a/lib/Target/R600/AMDGPUCallingConv.td +++ b/lib/Target/R600/AMDGPUCallingConv.td @@ -32,8 +32,14 @@ def CC_SI : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31 - ]>>> + ]>>>, + // This is the default for i64 values. + // XXX: We should change this once clang understands the CC_AMDGPU. 
+ CCIfType<[i64], CCAssignToRegWithShadow< + [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ], + [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ] + >> ]>; def CC_AMDGPU : CallingConv<[ diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index f31b646..c2a79ea 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -116,6 +116,7 @@ enum { BRANCH_COND, // End AMDIL ISD Opcodes BITALIGN, + BUFFER_STORE, DWORDADDR, FRACT, FMAX, diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index e740348..d2620b2 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -94,6 +94,7 @@ class Constants { int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; int TWO_PI_INV = 0x3e22f983; +int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding } def CONST : Constants; @@ -115,21 +116,21 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst < (outs rc:$dst), (ins rc:$src0), "CLAMP $dst, $src0", - [(set rc:$dst, (int_AMDIL_clamp rc:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] + [(set f32:$dst, (int_AMDIL_clamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] >; class FABS <RegisterClass rc> : AMDGPUShaderInst < (outs rc:$dst), (ins rc:$src0), "FABS $dst, $src0", - [(set rc:$dst, (fabs rc:$src0))] + [(set f32:$dst, (fabs f32:$src0))] >; class FNEG <RegisterClass rc> : AMDGPUShaderInst < (outs rc:$dst), (ins rc:$src0), "FNEG $dst, $src0", - [(set rc:$dst, (fneg rc:$src0))] + [(set f32:$dst, (fneg f32:$src0))] >; } // usesCustomInserter = 1 @@ -140,8 +141,7 @@ multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass, (outs dstClass:$dst), (ins addrClass:$addr, i32imm:$chan), "RegisterLoad $dst, $addr", - [(set (i32 dstClass:$dst), (AMDGPUregister_load addrPat:$addr, - (i32 timm:$chan)))] + [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))] > { let isRegisterLoad = 1; } @@ -150,7 +150,7 @@ multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass, (outs), (ins dstClass:$val, addrClass:$addr, i32imm:$chan), "RegisterStore $val, $addr", - [(AMDGPUregister_store (i32 dstClass:$val), addrPat:$addr, (i32 timm:$chan))] + [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))] > { let isRegisterStore = 1; } @@ -161,105 +161,140 @@ multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass, /* Generic helper patterns for intrinsics */ /* -------------------------------------- */ -class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul, - RegisterClass rc> : Pat < - (fpow rc:$src0, rc:$src1), - (exp_ieee (mul rc:$src1, (log_ieee rc:$src0))) +class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul> + : Pat < + (fpow f32:$src0, f32:$src1), + (exp_ieee (mul f32:$src1, (log_ieee f32:$src0))) >; /* Other helper patterns */ /* --------------------- */ /* Extract element pattern */ -class Extract_Element <ValueType sub_type, ValueType vec_type, - RegisterClass vec_class, int sub_idx, - SubRegIndex sub_reg>: Pat< - (sub_type (vector_extract (vec_type vec_class:$src), sub_idx)), - (EXTRACT_SUBREG vec_class:$src, sub_reg) +class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, + SubRegIndex sub_reg> + : Pat< + (sub_type (vector_extract vec_type:$src, sub_idx)), + (EXTRACT_SUBREG $src, sub_reg) >; /* Insert element pattern */ class Insert_Element <ValueType elem_type, ValueType vec_type, - RegisterClass elem_class, RegisterClass 
vec_class, - int sub_idx, SubRegIndex sub_reg> : Pat < - - (vec_type (vector_insert (vec_type vec_class:$vec), - (elem_type elem_class:$elem), sub_idx)), - (INSERT_SUBREG vec_class:$vec, elem_class:$elem, sub_reg) + int sub_idx, SubRegIndex sub_reg> + : Pat < + (vector_insert vec_type:$vec, elem_type:$elem, sub_idx), + (INSERT_SUBREG $vec, $elem, sub_reg) >; // Vector Build pattern -class Vector1_Build <ValueType vecType, RegisterClass vectorClass, - ValueType elemType, RegisterClass elemClass> : Pat < - (vecType (build_vector (elemType elemClass:$src))), - (vecType elemClass:$src) +class Vector1_Build <ValueType vecType, ValueType elemType, + RegisterClass rc> : Pat < + (vecType (build_vector elemType:$src)), + (vecType (COPY_TO_REGCLASS $src, rc)) >; -class Vector2_Build <ValueType vecType, RegisterClass vectorClass, - ValueType elemType, RegisterClass elemClass> : Pat < - (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1))), +class Vector2_Build <ValueType vecType, ValueType elemType> : Pat < + (vecType (build_vector elemType:$sub0, elemType:$sub1)), (INSERT_SUBREG (INSERT_SUBREG - (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1) + (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1) >; -class Vector4_Build <ValueType vecType, RegisterClass vectorClass, - ValueType elemType, RegisterClass elemClass> : Pat < - (vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y), - (elemType elemClass:$z), (elemType elemClass:$w))), +class Vector4_Build <ValueType vecType, ValueType elemType> : Pat < + (vecType (build_vector elemType:$x, elemType:$y, elemType:$z, elemType:$w)), (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG - (vecType (IMPLICIT_DEF)), elemClass:$x, sub0), elemClass:$y, sub1), - elemClass:$z, sub2), elemClass:$w, sub3) + (vecType (IMPLICIT_DEF)), $x, sub0), $y, sub1), $z, sub2), $w, sub3) >; -class Vector8_Build <ValueType vecType, RegisterClass vectorClass, - ValueType elemType, RegisterClass elemClass> : Pat < - (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1), - (elemType elemClass:$sub2), (elemType elemClass:$sub3), - (elemType elemClass:$sub4), (elemType elemClass:$sub5), - (elemType elemClass:$sub6), (elemType elemClass:$sub7))), - (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG +class Vector8_Build <ValueType vecType, ValueType elemType> : Pat < + (vecType (build_vector elemType:$sub0, elemType:$sub1, + elemType:$sub2, elemType:$sub3, + elemType:$sub4, elemType:$sub5, + elemType:$sub6, elemType:$sub7)), (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG - (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1), - elemClass:$sub2, sub2), elemClass:$sub3, sub3), - elemClass:$sub4, sub4), elemClass:$sub5, sub5), - elemClass:$sub6, sub6), elemClass:$sub7, sub7) + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1), + $sub2, sub2), $sub3, sub3), + $sub4, sub4), $sub5, sub5), + $sub6, sub6), $sub7, sub7) >; -class Vector16_Build <ValueType vecType, RegisterClass vectorClass, - ValueType elemType, RegisterClass elemClass> : Pat < - (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1), - (elemType elemClass:$sub2), (elemType elemClass:$sub3), - (elemType elemClass:$sub4), (elemType elemClass:$sub5), - (elemType elemClass:$sub6), (elemType elemClass:$sub7), - (elemType elemClass:$sub8), (elemType elemClass:$sub9), - (elemType elemClass:$sub10), (elemType 
elemClass:$sub11), - (elemType elemClass:$sub12), (elemType elemClass:$sub13), - (elemType elemClass:$sub14), (elemType elemClass:$sub15))), - (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG - (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG - (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG +class Vector16_Build <ValueType vecType, ValueType elemType> : Pat < + (vecType (build_vector elemType:$sub0, elemType:$sub1, + elemType:$sub2, elemType:$sub3, + elemType:$sub4, elemType:$sub5, + elemType:$sub6, elemType:$sub7, + elemType:$sub8, elemType:$sub9, + elemType:$sub10, elemType:$sub11, + elemType:$sub12, elemType:$sub13, + elemType:$sub14, elemType:$sub15)), (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG - (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1), - elemClass:$sub2, sub2), elemClass:$sub3, sub3), - elemClass:$sub4, sub4), elemClass:$sub5, sub5), - elemClass:$sub6, sub6), elemClass:$sub7, sub7), - elemClass:$sub8, sub8), elemClass:$sub9, sub9), - elemClass:$sub10, sub10), elemClass:$sub11, sub11), - elemClass:$sub12, sub12), elemClass:$sub13, sub13), - elemClass:$sub14, sub14), elemClass:$sub15, sub15) + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1), + $sub2, sub2), $sub3, sub3), + $sub4, sub4), $sub5, sub5), + $sub6, sub6), $sub7, sub7), + $sub8, sub8), $sub9, sub9), + $sub10, sub10), $sub11, sub11), + $sub12, sub12), $sub13, sub13), + $sub14, sub14), $sub15, sub15) >; +// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer +// can handle COPY instructions. // bitconvert pattern class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat < (dt (bitconvert (st rc:$src0))), (dt rc:$src0) >; +// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer +// can handle COPY instructions. 
class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat < (vt (AMDGPUdwordaddr (vt rc:$addr))), (vt rc:$addr) >; +// BFI_INT patterns + +multiclass BFIPatterns <Instruction BFI_INT> { + + // Definition from ISA doc: + // (y & x) | (z & ~x) + def : Pat < + (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), + (BFI_INT $x, $y, $z) + >; + + // SHA-256 Ch function + // z ^ (x & (y ^ z)) + def : Pat < + (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), + (BFI_INT $x, $y, $z) + >; + +} + +// SHA-256 Ma patterns + +// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y +class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat < + (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), + (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) +>; + +// Bitfield extract patterns + +def legalshift32 : ImmLeaf <i32, [{return Imm >=0 && Imm < 32;}]>; +def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}], + SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>; + +class BFEPattern <Instruction BFE> : Pat < + (and (srl i32:$x, legalshift32:$y), bfemask:$z), + (BFE $x, $y, $z) +>; + include "R600Instructions.td" include "SIInstrInfo.td" diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp index 0223ec8..0461025 100644 --- a/lib/Target/R600/AMDGPUMachineFunction.cpp +++ b/lib/Target/R600/AMDGPUMachineFunction.cpp @@ -1,4 +1,5 @@ #include "AMDGPUMachineFunction.h" +#include "AMDGPU.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" @@ -8,6 +9,7 @@ const char *AMDGPUMachineFunction::ShaderTypeAttribute = "ShaderType"; AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo() { + ShaderType = ShaderType::COMPUTE; AttributeSet Set = MF.getFunction()->getAttributes(); Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, ShaderTypeAttribute); diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index 0f356a1..a7e1d7b 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -33,6 +33,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : DefaultSize[0] = 64; DefaultSize[1] = 1; DefaultSize[2] = 1; + HasVertexCache = false; ParseSubtargetFeatures(GPU, FS); DevName = GPU; Device = AMDGPUDeviceInfo::getDeviceFromName(DevName, this, Is64bit); @@ -53,6 +54,10 @@ AMDGPUSubtarget::is64bit() const { return Is64bit; } bool +AMDGPUSubtarget::hasVertexCache() const { + return HasVertexCache; +} +bool AMDGPUSubtarget::isTargetELF() const { return false; } diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 1973fc6..b6501a4 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -36,6 +36,7 @@ private: bool Is32on64bit; bool DumpCode; bool R600ALUInst; + bool HasVertexCache; InstrItineraryData InstrItins; @@ -48,6 +49,7 @@ public: bool isOverride(AMDGPUDeviceInfo::Caps) const; bool is64bit() const; + bool hasVertexCache() const; // Helper functions to simplify if statements bool isTargetELF() const; diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index e7ea876..31fbf32 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -115,7 +115,6 @@ AMDGPUPassConfig::addPreISel() { } bool AMDGPUPassConfig::addInstSelector() { - addPass(createAMDGPUPeepholeOpt(*TM)); 
addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); @@ -153,8 +152,9 @@ bool AMDGPUPassConfig::addPreEmitPass() { addPass(createAMDGPUCFGStructurizerPass(*TM)); addPass(createR600EmitClauseMarkers(*TM)); addPass(createR600ExpandSpecialInstrsPass(*TM)); - addPass(createR600ControlFlowFinalizer(*TM)); addPass(&FinalizeMachineBundlesID); + addPass(createR600Packetizer(*TM)); + addPass(createR600ControlFlowFinalizer(*TM)); } else { addPass(createSILowerControlFlowPass(*TM)); } diff --git a/lib/Target/R600/AMDILBase.td b/lib/Target/R600/AMDILBase.td index c12cedc..e221110 100644 --- a/lib/Target/R600/AMDILBase.td +++ b/lib/Target/R600/AMDILBase.td @@ -74,6 +74,10 @@ def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", "false", "Older version of ALU instructions encoding.">; +def FeatureVertexCache : SubtargetFeature<"HasVertexCache", + "HasVertexCache", + "true", + "Specify use of dedicated vertex cache.">; //===----------------------------------------------------------------------===// // Register File, Calling Conv, Instruction Descriptions diff --git a/lib/Target/R600/AMDILDeviceInfo.cpp b/lib/Target/R600/AMDILDeviceInfo.cpp index 9605fbe..126514b 100644 --- a/lib/Target/R600/AMDILDeviceInfo.cpp +++ b/lib/Target/R600/AMDILDeviceInfo.cpp @@ -44,7 +44,7 @@ AMDGPUDevice* getDeviceFromName(const std::string &deviceName, " on 32bit pointers!"); #endif return new AMDGPUEvergreenDevice(ptr); - } else if (deviceName == "redwood") { + } else if (deviceName == "redwood" || deviceName == "sumo") { #if DEBUG assert(!is64bit && "This device does not support 64bit pointers!"); assert(!is64on32bit && "This device does not support 64bit" @@ -79,7 +79,10 @@ AMDGPUDevice* getDeviceFromName(const std::string &deviceName, " on 32bit pointers!"); #endif return new AMDGPUNIDevice(ptr); - } else if (deviceName == "SI") { + } else if (deviceName == "SI" || + deviceName == "tahiti" || deviceName == "pitcairn" || + deviceName == "verde" || deviceName == "oland" || + deviceName == "hainan") { return new AMDGPUSIDevice(ptr); } else { #if DEBUG diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp index fa8f62d..ba75a44 100644 --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -191,6 +191,29 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs, 2 * N->getNumOperands() + 1); } + case ISD::BUILD_PAIR: { + SDValue RC, SubReg0, SubReg1; + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + break; + } + if (N->getValueType(0) == MVT::i128) { + RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32); + SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32); + SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32); + } else if (N->getValueType(0) == MVT::i64) { + RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32); + SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32); + SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32); + } else { + llvm_unreachable("Unhandled value type for BUILD_PAIR"); + } + const SDValue Ops[] = { RC, N->getOperand(0), SubReg0, + N->getOperand(1), SubReg1 }; + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, + N->getDebugLoc(), N->getValueType(0), Ops); + } + case ISD::ConstantFP: case ISD::Constant: { const AMDGPUSubtarget &ST 
= TM.getSubtarget<AMDGPUSubtarget>(); diff --git a/lib/Target/R600/AMDILPeepholeOptimizer.cpp b/lib/Target/R600/AMDILPeepholeOptimizer.cpp deleted file mode 100644 index 3a28038..0000000 --- a/lib/Target/R600/AMDILPeepholeOptimizer.cpp +++ /dev/null @@ -1,1215 +0,0 @@ -//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//==-----------------------------------------------------------------------===// - -#define DEBUG_TYPE "PeepholeOpt" -#ifdef DEBUG -#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) -#else -#define DEBUGME 0 -#endif - -#include "AMDILDevices.h" -#include "AMDGPUInstrInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Twine.h" -#include "llvm/IR/Constants.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" - -#include <sstream> - -#if 0 -STATISTIC(PointerAssignments, "Number of dynamic pointer " - "assigments discovered"); -STATISTIC(PointerSubtract, "Number of pointer subtractions discovered"); -#endif - -using namespace llvm; -// The Peephole optimization pass is used to do simple last minute optimizations -// that are required for correct code or to remove redundant functions -namespace { - -class OpaqueType; - -class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass { -public: - TargetMachine &TM; - static char ID; - AMDGPUPeepholeOpt(TargetMachine &tm); - ~AMDGPUPeepholeOpt(); - const char *getPassName() const; - bool runOnFunction(Function &F); - bool doInitialization(Module &M); - bool doFinalization(Module &M); - void getAnalysisUsage(AnalysisUsage &AU) const; -protected: -private: - // Function to initiate all of the instruction level optimizations. - bool instLevelOptimizations(BasicBlock::iterator *inst); - // Quick check to see if we need to dump all of the pointers into the - // arena. If this is correct, then we set all pointers to exist in arena. This - // is a workaround for aliasing of pointers in a struct/union. - bool dumpAllIntoArena(Function &F); - // Because I don't want to invalidate any pointers while in the - // safeNestedForEachFunction. I push atomic conversions to a vector and handle - // it later. This function does the conversions if required. - void doAtomicConversionIfNeeded(Function &F); - // Because __amdil_is_constant cannot be properly evaluated if - // optimizations are disabled, the call's are placed in a vector - // and evaluated after the __amdil_image* functions are evaluated - // which should allow the __amdil_is_constant function to be - // evaluated correctly. - void doIsConstCallConversionIfNeeded(); - bool mChanged; - bool mDebug; - bool mConvertAtomics; - CodeGenOpt::Level optLevel; - // Run a series of tests to see if we can optimize a CALL instruction. - bool optimizeCallInst(BasicBlock::iterator *bbb); - // A peephole optimization to optimize bit extract sequences. - bool optimizeBitExtract(Instruction *inst); - // A peephole optimization to optimize bit insert sequences. 
- bool optimizeBitInsert(Instruction *inst); - bool setupBitInsert(Instruction *base, - Instruction *&src, - Constant *&mask, - Constant *&shift); - // Expand the bit field insert instruction on versions of OpenCL that - // don't support it. - bool expandBFI(CallInst *CI); - // Expand the bit field mask instruction on version of OpenCL that - // don't support it. - bool expandBFM(CallInst *CI); - // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in - // this case we need to expand them. These functions check for 24bit functions - // and then expand. - bool isSigned24BitOps(CallInst *CI); - void expandSigned24BitOps(CallInst *CI); - // One optimization that can occur is that if the required workgroup size is - // specified then the result of get_local_size is known at compile time and - // can be returned accordingly. - bool isRWGLocalOpt(CallInst *CI); - // On northern island cards, the division is slightly less accurate than on - // previous generations, so we need to utilize a more accurate division. So we - // can translate the accurate divide to a normal divide on all other cards. - bool convertAccurateDivide(CallInst *CI); - void expandAccurateDivide(CallInst *CI); - // If the alignment is set incorrectly, it can produce really inefficient - // code. This checks for this scenario and fixes it if possible. - bool correctMisalignedMemOp(Instruction *inst); - - // If we are in no opt mode, then we need to make sure that - // local samplers are properly propagated as constant propagation - // doesn't occur and we need to know the value of kernel defined - // samplers at compile time. - bool propagateSamplerInst(CallInst *CI); - - // Helper functions - - // Group of functions that recursively calculate the size of a structure based - // on it's sub-types. - size_t getTypeSize(Type * const T, bool dereferencePtr = false); - size_t getTypeSize(StructType * const ST, bool dereferencePtr = false); - size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false); - size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false); - size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false); - size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false); - size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false); - size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false); - - LLVMContext *mCTX; - Function *mF; - const AMDGPUSubtarget *mSTM; - SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs; - SmallVector<CallInst *, 16> isConstVec; -}; // class AMDGPUPeepholeOpt - char AMDGPUPeepholeOpt::ID = 0; - -// A template function that has two levels of looping before calling the -// function with a pointer to the current iterator. 
-template<class InputIterator, class SecondIterator, class Function> -Function safeNestedForEach(InputIterator First, InputIterator Last, - SecondIterator S, Function F) { - for ( ; First != Last; ++First) { - SecondIterator sf, sl; - for (sf = First->begin(), sl = First->end(); - sf != sl; ) { - if (!F(&sf)) { - ++sf; - } - } - } - return F; -} - -} // anonymous namespace - -namespace llvm { - FunctionPass * - createAMDGPUPeepholeOpt(TargetMachine &tm) { - return new AMDGPUPeepholeOpt(tm); - } -} // llvm namespace - -AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm) - : FunctionPass(ID), TM(tm) { - mDebug = DEBUGME; - optLevel = TM.getOptLevel(); - -} - -AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() { -} - -const char * -AMDGPUPeepholeOpt::getPassName() const { - return "AMDGPU PeepHole Optimization Pass"; -} - -bool -containsPointerType(Type *Ty) { - if (!Ty) { - return false; - } - switch(Ty->getTypeID()) { - default: - return false; - case Type::StructTyID: { - const StructType *ST = dyn_cast<StructType>(Ty); - for (StructType::element_iterator stb = ST->element_begin(), - ste = ST->element_end(); stb != ste; ++stb) { - if (!containsPointerType(*stb)) { - continue; - } - return true; - } - break; - } - case Type::VectorTyID: - case Type::ArrayTyID: - return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType()); - case Type::PointerTyID: - return true; - }; - return false; -} - -bool -AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) { - bool dumpAll = false; - for (Function::const_arg_iterator cab = F.arg_begin(), - cae = F.arg_end(); cab != cae; ++cab) { - const Argument *arg = cab; - const PointerType *PT = dyn_cast<PointerType>(arg->getType()); - if (!PT) { - continue; - } - Type *DereferencedType = PT->getElementType(); - if (!dyn_cast<StructType>(DereferencedType) - ) { - continue; - } - if (!containsPointerType(DereferencedType)) { - continue; - } - // FIXME: Because a pointer inside of a struct/union may be aliased to - // another pointer we need to take the conservative approach and place all - // pointers into the arena until more advanced detection is implemented. - dumpAll = true; - } - return dumpAll; -} -void -AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() { - if (isConstVec.empty()) { - return; - } - for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) { - CallInst *CI = isConstVec[x]; - Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); - Type *aType = Type::getInt32Ty(*mCTX); - Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) - : ConstantInt::get(aType, 0); - CI->replaceAllUsesWith(Val); - CI->eraseFromParent(); - } - isConstVec.clear(); -} -void -AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) { - // Don't do anything if we don't have any atomic operations. 
- if (atomicFuncs.empty()) { - return; - } - // Change the function name for the atomic if it is required - uint32_t size = atomicFuncs.size(); - for (uint32_t x = 0; x < size; ++x) { - atomicFuncs[x].first->setOperand( - atomicFuncs[x].first->getNumOperands()-1, - atomicFuncs[x].second); - - } - mChanged = true; - if (mConvertAtomics) { - return; - } -} - -bool -AMDGPUPeepholeOpt::runOnFunction(Function &MF) { - mChanged = false; - mF = &MF; - mSTM = &TM.getSubtarget<AMDGPUSubtarget>(); - if (mDebug) { - MF.dump(); - } - mCTX = &MF.getType()->getContext(); - mConvertAtomics = true; - safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(), - std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations), - this)); - - doAtomicConversionIfNeeded(MF); - doIsConstCallConversionIfNeeded(); - - if (mDebug) { - MF.dump(); - } - return mChanged; -} - -bool -AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) { - Instruction *inst = (*bbb); - CallInst *CI = dyn_cast<CallInst>(inst); - if (!CI) { - return false; - } - if (isSigned24BitOps(CI)) { - expandSigned24BitOps(CI); - ++(*bbb); - CI->eraseFromParent(); - return true; - } - if (propagateSamplerInst(CI)) { - return false; - } - if (expandBFI(CI) || expandBFM(CI)) { - ++(*bbb); - CI->eraseFromParent(); - return true; - } - if (convertAccurateDivide(CI)) { - expandAccurateDivide(CI); - ++(*bbb); - CI->eraseFromParent(); - return true; - } - - StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName(); - if (calleeName.startswith("__amdil_is_constant")) { - // If we do not have optimizations, then this - // cannot be properly evaluated, so we add the - // call instruction to a vector and process - // them at the end of processing after the - // samplers have been correctly handled. - if (optLevel == CodeGenOpt::None) { - isConstVec.push_back(CI); - return false; - } else { - Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); - Type *aType = Type::getInt32Ty(*mCTX); - Value *Val = (CV != NULL) ? 
ConstantInt::get(aType, 1) - : ConstantInt::get(aType, 0); - CI->replaceAllUsesWith(Val); - ++(*bbb); - CI->eraseFromParent(); - return true; - } - } - - if (calleeName.equals("__amdil_is_asic_id_i32")) { - ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0)); - Type *aType = Type::getInt32Ty(*mCTX); - Value *Val = CV; - if (Val) { - Val = ConstantInt::get(aType, - mSTM->device()->getDeviceFlag() & CV->getZExtValue()); - } else { - Val = ConstantInt::get(aType, 0); - } - CI->replaceAllUsesWith(Val); - ++(*bbb); - CI->eraseFromParent(); - return true; - } - Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1)); - if (!F) { - return false; - } - if (F->getName().startswith("__atom") && !CI->getNumUses() - && F->getName().find("_xchg") == StringRef::npos) { - std::string buffer(F->getName().str() + "_noret"); - F = dyn_cast<Function>( - F->getParent()->getOrInsertFunction(buffer, F->getFunctionType())); - atomicFuncs.push_back(std::make_pair(CI, F)); - } - - if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment) - && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) { - return false; - } - if (!mConvertAtomics) { - return false; - } - StringRef name = F->getName(); - if (name.startswith("__atom") && name.find("_g") != StringRef::npos) { - mConvertAtomics = false; - } - return false; -} - -bool -AMDGPUPeepholeOpt::setupBitInsert(Instruction *base, - Instruction *&src, - Constant *&mask, - Constant *&shift) { - if (!base) { - if (mDebug) { - dbgs() << "Null pointer passed into function.\n"; - } - return false; - } - bool andOp = false; - if (base->getOpcode() == Instruction::Shl) { - shift = dyn_cast<Constant>(base->getOperand(1)); - } else if (base->getOpcode() == Instruction::And) { - mask = dyn_cast<Constant>(base->getOperand(1)); - andOp = true; - } else { - if (mDebug) { - dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n"; - } - // If the base is neither a Shl or a And, we don't fit any of the patterns above. - return false; - } - src = dyn_cast<Instruction>(base->getOperand(0)); - if (!src) { - if (mDebug) { - dbgs() << "Failed setup since the base operand is not an instruction!\n"; - } - return false; - } - // If we find an 'and' operation, then we don't need to - // find the next operation as we already know the - // bits that are valid at this point. - if (andOp) { - return true; - } - if (src->getOpcode() == Instruction::Shl && !shift) { - shift = dyn_cast<Constant>(src->getOperand(1)); - src = dyn_cast<Instruction>(src->getOperand(0)); - } else if (src->getOpcode() == Instruction::And && !mask) { - mask = dyn_cast<Constant>(src->getOperand(1)); - } - if (!mask && !shift) { - if (mDebug) { - dbgs() << "Failed setup since both mask and shift are NULL!\n"; - } - // Did not find a constant mask or a shift. - return false; - } - return true; -} -bool -AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) { - if (!inst) { - return false; - } - if (!inst->isBinaryOp()) { - return false; - } - if (inst->getOpcode() != Instruction::Or) { - return false; - } - if (optLevel == CodeGenOpt::None) { - return false; - } - // We want to do an optimization on a sequence of ops that in the end equals a - // single ISA instruction. 
- // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F) - // Some simplified versions of this pattern are as follows: - // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0 - // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E - // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B - // (A & B) | (D << F) when (1 << F) >= B - // (A << C) | (D & E) when (1 << C) >= E - if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { - // The HD4XXX hardware doesn't support the ubit_insert instruction. - return false; - } - Type *aType = inst->getType(); - bool isVector = aType->isVectorTy(); - int numEle = 1; - // This optimization only works on 32bit integers. - if (aType->getScalarType() - != Type::getInt32Ty(inst->getContext())) { - return false; - } - if (isVector) { - const VectorType *VT = dyn_cast<VectorType>(aType); - numEle = VT->getNumElements(); - // We currently cannot support more than 4 elements in a intrinsic and we - // cannot support Vec3 types. - if (numEle > 4 || numEle == 3) { - return false; - } - } - // TODO: Handle vectors. - if (isVector) { - if (mDebug) { - dbgs() << "!!! Vectors are not supported yet!\n"; - } - return false; - } - Instruction *LHSSrc = NULL, *RHSSrc = NULL; - Constant *LHSMask = NULL, *RHSMask = NULL; - Constant *LHSShift = NULL, *RHSShift = NULL; - Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0)); - Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1)); - if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) { - if (mDebug) { - dbgs() << "Found an OR Operation that failed setup!\n"; - inst->dump(); - if (LHS) { LHS->dump(); } - if (LHSSrc) { LHSSrc->dump(); } - if (LHSMask) { LHSMask->dump(); } - if (LHSShift) { LHSShift->dump(); } - } - // There was an issue with the setup for BitInsert. - return false; - } - if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) { - if (mDebug) { - dbgs() << "Found an OR Operation that failed setup!\n"; - inst->dump(); - if (RHS) { RHS->dump(); } - if (RHSSrc) { RHSSrc->dump(); } - if (RHSMask) { RHSMask->dump(); } - if (RHSShift) { RHSShift->dump(); } - } - // There was an issue with the setup for BitInsert. - return false; - } - if (mDebug) { - dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n"; - dbgs() << "Op: "; inst->dump(); - dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; } - } - Constant *offset = NULL; - Constant *width = NULL; - uint32_t lhsMaskVal = 0, rhsMaskVal = 0; - uint32_t lhsShiftVal = 0, rhsShiftVal = 0; - uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0; - uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0; - lhsMaskVal = (LHSMask - ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0); - rhsMaskVal = (RHSMask - ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0); - lhsShiftVal = (LHSShift - ? 
dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0); - rhsShiftVal = (RHSShift - ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0); - lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal; - rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal; - lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal; - rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal; - // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks). - if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) { - return false; - } - if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) { - offset = ConstantInt::get(aType, lhsMaskOffset, false); - width = ConstantInt::get(aType, lhsMaskWidth, false); - RHSSrc = RHS; - if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) { - return false; - } - if (!LHSShift) { - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, - "MaskShr", LHS); - } else if (lhsShiftVal != lhsMaskOffset) { - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, - "MaskShr", LHS); - } - if (mDebug) { - dbgs() << "Optimizing LHS!\n"; - } - } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) { - offset = ConstantInt::get(aType, rhsMaskOffset, false); - width = ConstantInt::get(aType, rhsMaskWidth, false); - LHSSrc = RHSSrc; - RHSSrc = LHS; - if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) { - return false; - } - if (!RHSShift) { - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, - "MaskShr", RHS); - } else if (rhsShiftVal != rhsMaskOffset) { - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, - "MaskShr", RHS); - } - if (mDebug) { - dbgs() << "Optimizing RHS!\n"; - } - } else { - if (mDebug) { - dbgs() << "Failed constraint 3!\n"; - } - return false; - } - if (mDebug) { - dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; } - dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; } - dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; } - dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; } - } - if (!offset || !width) { - if (mDebug) { - dbgs() << "Either width or offset are NULL, failed detection!\n"; - } - return false; - } - // Lets create the function signature. 
- std::vector<Type *> callTypes; - callTypes.push_back(aType); - callTypes.push_back(aType); - callTypes.push_back(aType); - callTypes.push_back(aType); - FunctionType *funcType = FunctionType::get(aType, callTypes, false); - std::string name = "__amdil_ubit_insert"; - if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; } - Function *Func = - dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> - getOrInsertFunction(StringRef(name), funcType)); - Value *Operands[4] = { - width, - offset, - LHSSrc, - RHSSrc - }; - CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt"); - if (mDebug) { - dbgs() << "Old Inst: "; - inst->dump(); - dbgs() << "New Inst: "; - CI->dump(); - dbgs() << "\n\n"; - } - CI->insertBefore(inst); - inst->replaceAllUsesWith(CI); - return true; -} - -bool -AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) { - if (!inst) { - return false; - } - if (!inst->isBinaryOp()) { - return false; - } - if (inst->getOpcode() != Instruction::And) { - return false; - } - if (optLevel == CodeGenOpt::None) { - return false; - } - // We want to do some simple optimizations on Shift right/And patterns. The - // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a - // value smaller than 32 and C is a mask. If C is a constant value, then the - // following transformation can occur. For signed integers, it turns into the - // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned - // integers, it turns into the function call dst = - // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract - // can be found in Section 7.9 of the ATI IL spec of the stream SDK for - // Evergreen hardware. - if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { - // This does not work on HD4XXX hardware. - return false; - } - Type *aType = inst->getType(); - bool isVector = aType->isVectorTy(); - - // XXX Support vector types - if (isVector) { - return false; - } - int numEle = 1; - // This only works on 32bit integers - if (aType->getScalarType() - != Type::getInt32Ty(inst->getContext())) { - return false; - } - if (isVector) { - const VectorType *VT = dyn_cast<VectorType>(aType); - numEle = VT->getNumElements(); - // We currently cannot support more than 4 elements in a intrinsic and we - // cannot support Vec3 types. - if (numEle > 4 || numEle == 3) { - return false; - } - } - BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0)); - // If the first operand is not a shift instruction, then we can return as it - // doesn't match this pattern. - if (!ShiftInst || !ShiftInst->isShift()) { - return false; - } - // If we are a shift left, then we need don't match this pattern. - if (ShiftInst->getOpcode() == Instruction::Shl) { - return false; - } - bool isSigned = ShiftInst->isArithmeticShift(); - Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1)); - Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1)); - // Lets make sure that the shift value and the and mask are constant integers. 
- if (!AndMask || !ShrVal) { - return false; - } - Constant *newMaskConst; - Constant *shiftValConst; - if (isVector) { - // Handle the vector case - std::vector<Constant *> maskVals; - std::vector<Constant *> shiftVals; - ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask); - ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal); - Type *scalarType = AndMaskVec->getType()->getScalarType(); - assert(AndMaskVec->getNumOperands() == - ShrValVec->getNumOperands() && "cannot have a " - "combination where the number of elements to a " - "shift and an and are different!"); - for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) { - ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x)); - ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x)); - if (!AndCI || !ShiftIC) { - return false; - } - uint32_t maskVal = (uint32_t)AndCI->getZExtValue(); - if (!isMask_32(maskVal)) { - return false; - } - maskVal = (uint32_t)CountTrailingOnes_32(maskVal); - uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue(); - // If the mask or shiftval is greater than the bitcount, then break out. - if (maskVal >= 32 || shiftVal >= 32) { - return false; - } - // If the mask val is greater than the the number of original bits left - // then this optimization is invalid. - if (maskVal > (32 - shiftVal)) { - return false; - } - maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned)); - shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned)); - } - newMaskConst = ConstantVector::get(maskVals); - shiftValConst = ConstantVector::get(shiftVals); - } else { - // Handle the scalar case - uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue(); - // This must be a mask value where all lower bits are set to 1 and then any - // bit higher is set to 0. - if (!isMask_32(maskVal)) { - return false; - } - maskVal = (uint32_t)CountTrailingOnes_32(maskVal); - // Count the number of bits set in the mask, this is the width of the - // resulting bit set that is extracted from the source value. - uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue(); - // If the mask or shift val is greater than the bitcount, then break out. - if (maskVal >= 32 || shiftVal >= 32) { - return false; - } - // If the mask val is greater than the the number of original bits left then - // this optimization is invalid. - if (maskVal > (32 - shiftVal)) { - return false; - } - newMaskConst = ConstantInt::get(aType, maskVal, isSigned); - shiftValConst = ConstantInt::get(aType, shiftVal, isSigned); - } - // Lets create the function signature. - std::vector<Type *> callTypes; - callTypes.push_back(aType); - callTypes.push_back(aType); - callTypes.push_back(aType); - FunctionType *funcType = FunctionType::get(aType, callTypes, false); - std::string name = "llvm.AMDGPU.bit.extract.u32"; - if (isVector) { - name += ".v" + itostr(numEle) + "i32"; - } else { - name += "."; - } - // Lets create the function. 
- Function *Func = - dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> - getOrInsertFunction(StringRef(name), funcType)); - Value *Operands[3] = { - ShiftInst->getOperand(0), - shiftValConst, - newMaskConst - }; - // Lets create the Call with the operands - CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt"); - CI->setDoesNotAccessMemory(); - CI->insertBefore(inst); - inst->replaceAllUsesWith(CI); - return true; -} - -bool -AMDGPUPeepholeOpt::expandBFI(CallInst *CI) { - if (!CI) { - return false; - } - Value *LHS = CI->getOperand(CI->getNumOperands() - 1); - if (!LHS->getName().startswith("__amdil_bfi")) { - return false; - } - Type* type = CI->getOperand(0)->getType(); - Constant *negOneConst = NULL; - if (type->isVectorTy()) { - std::vector<Constant *> negOneVals; - negOneConst = ConstantInt::get(CI->getContext(), - APInt(32, StringRef("-1"), 10)); - for (size_t x = 0, - y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { - negOneVals.push_back(negOneConst); - } - negOneConst = ConstantVector::get(negOneVals); - } else { - negOneConst = ConstantInt::get(CI->getContext(), - APInt(32, StringRef("-1"), 10)); - } - // __amdil_bfi => (A & B) | (~A & C) - BinaryOperator *lhs = - BinaryOperator::Create(Instruction::And, CI->getOperand(0), - CI->getOperand(1), "bfi_and", CI); - BinaryOperator *rhs = - BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst, - "bfi_not", CI); - rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2), - "bfi_and", CI); - lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI); - CI->replaceAllUsesWith(lhs); - return true; -} - -bool -AMDGPUPeepholeOpt::expandBFM(CallInst *CI) { - if (!CI) { - return false; - } - Value *LHS = CI->getOperand(CI->getNumOperands() - 1); - if (!LHS->getName().startswith("__amdil_bfm")) { - return false; - } - // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f) - Constant *newMaskConst = NULL; - Constant *newShiftConst = NULL; - Type* type = CI->getOperand(0)->getType(); - if (type->isVectorTy()) { - std::vector<Constant*> newMaskVals, newShiftVals; - newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); - newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); - for (size_t x = 0, - y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { - newMaskVals.push_back(newMaskConst); - newShiftVals.push_back(newShiftConst); - } - newMaskConst = ConstantVector::get(newMaskVals); - newShiftConst = ConstantVector::get(newShiftVals); - } else { - newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); - newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); - } - BinaryOperator *lhs = - BinaryOperator::Create(Instruction::And, CI->getOperand(0), - newMaskConst, "bfm_mask", CI); - lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst, - lhs, "bfm_shl", CI); - lhs = BinaryOperator::Create(Instruction::Sub, lhs, - newShiftConst, "bfm_sub", CI); - BinaryOperator *rhs = - BinaryOperator::Create(Instruction::And, CI->getOperand(1), - newMaskConst, "bfm_mask", CI); - lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI); - CI->replaceAllUsesWith(lhs); - return true; -} - -bool -AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) { - Instruction *inst = (*bbb); - if (optimizeCallInst(bbb)) { - return true; - } - if (optimizeBitExtract(inst)) { - return false; - } - if (optimizeBitInsert(inst)) { - return false; - } - if (correctMisalignedMemOp(inst)) { - return false; - } - return 
false; -} -bool -AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) { - LoadInst *linst = dyn_cast<LoadInst>(inst); - StoreInst *sinst = dyn_cast<StoreInst>(inst); - unsigned alignment; - Type* Ty = inst->getType(); - if (linst) { - alignment = linst->getAlignment(); - Ty = inst->getType(); - } else if (sinst) { - alignment = sinst->getAlignment(); - Ty = sinst->getValueOperand()->getType(); - } else { - return false; - } - unsigned size = getTypeSize(Ty); - if (size == alignment || size < alignment) { - return false; - } - if (!Ty->isStructTy()) { - return false; - } - if (alignment < 4) { - if (linst) { - linst->setAlignment(0); - return true; - } else if (sinst) { - sinst->setAlignment(0); - return true; - } - } - return false; -} -bool -AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) { - if (!CI) { - return false; - } - Value *LHS = CI->getOperand(CI->getNumOperands() - 1); - std::string namePrefix = LHS->getName().substr(0, 14); - if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24" - && namePrefix != "__amdil__imul24_high") { - return false; - } - if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) { - return false; - } - return true; -} - -void -AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) { - assert(isSigned24BitOps(CI) && "Must be a " - "signed 24 bit operation to call this function!"); - Value *LHS = CI->getOperand(CI->getNumOperands()-1); - // On 7XX and 8XX we do not have signed 24bit, so we need to - // expand it to the following: - // imul24 turns into 32bit imul - // imad24 turns into 32bit imad - // imul24_high turns into 32bit imulhigh - if (LHS->getName().substr(0, 14) == "__amdil_imad24") { - Type *aType = CI->getOperand(0)->getType(); - bool isVector = aType->isVectorTy(); - int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1; - std::vector<Type*> callTypes; - callTypes.push_back(CI->getOperand(0)->getType()); - callTypes.push_back(CI->getOperand(1)->getType()); - callTypes.push_back(CI->getOperand(2)->getType()); - FunctionType *funcType = - FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); - std::string name = "__amdil_imad"; - if (isVector) { - name += "_v" + itostr(numEle) + "i32"; - } else { - name += "_i32"; - } - Function *Func = dyn_cast<Function>( - CI->getParent()->getParent()->getParent()-> - getOrInsertFunction(StringRef(name), funcType)); - Value *Operands[3] = { - CI->getOperand(0), - CI->getOperand(1), - CI->getOperand(2) - }; - CallInst *nCI = CallInst::Create(Func, Operands, "imad24"); - nCI->insertBefore(CI); - CI->replaceAllUsesWith(nCI); - } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") { - BinaryOperator *mulOp = - BinaryOperator::Create(Instruction::Mul, CI->getOperand(0), - CI->getOperand(1), "imul24", CI); - CI->replaceAllUsesWith(mulOp); - } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") { - Type *aType = CI->getOperand(0)->getType(); - - bool isVector = aType->isVectorTy(); - int numEle = isVector ? 
dyn_cast<VectorType>(aType)->getNumElements() : 1; - std::vector<Type*> callTypes; - callTypes.push_back(CI->getOperand(0)->getType()); - callTypes.push_back(CI->getOperand(1)->getType()); - FunctionType *funcType = - FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); - std::string name = "__amdil_imul_high"; - if (isVector) { - name += "_v" + itostr(numEle) + "i32"; - } else { - name += "_i32"; - } - Function *Func = dyn_cast<Function>( - CI->getParent()->getParent()->getParent()-> - getOrInsertFunction(StringRef(name), funcType)); - Value *Operands[2] = { - CI->getOperand(0), - CI->getOperand(1) - }; - CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high"); - nCI->insertBefore(CI); - CI->replaceAllUsesWith(nCI); - } -} - -bool -AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) { - return (CI != NULL - && CI->getOperand(CI->getNumOperands() - 1)->getName() - == "__amdil_get_local_size_int"); -} - -bool -AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) { - if (!CI) { - return false; - } - if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX - && (mSTM->getDeviceName() == "cayman")) { - return false; - } - return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) - == "__amdil_improved_div"; -} - -void -AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) { - assert(convertAccurateDivide(CI) - && "expanding accurate divide can only happen if it is expandable!"); - BinaryOperator *divOp = - BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0), - CI->getOperand(1), "fdiv32", CI); - CI->replaceAllUsesWith(divOp); -} - -bool -AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) { - if (optLevel != CodeGenOpt::None) { - return false; - } - - if (!CI) { - return false; - } - - unsigned funcNameIdx = 0; - funcNameIdx = CI->getNumOperands() - 1; - StringRef calleeName = CI->getOperand(funcNameIdx)->getName(); - if (calleeName != "__amdil_image2d_read_norm" - && calleeName != "__amdil_image2d_read_unnorm" - && calleeName != "__amdil_image3d_read_norm" - && calleeName != "__amdil_image3d_read_unnorm") { - return false; - } - - unsigned samplerIdx = 2; - samplerIdx = 1; - Value *sampler = CI->getOperand(samplerIdx); - LoadInst *lInst = dyn_cast<LoadInst>(sampler); - if (!lInst) { - return false; - } - - if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { - return false; - } - - GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand()); - // If we are loading from what is not a global value, then we - // fail and return. - if (!gv) { - return false; - } - - // If we don't have an initializer or we have an initializer and - // the initializer is not a 32bit integer, we fail. - if (!gv->hasInitializer() - || !gv->getInitializer()->getType()->isIntegerTy(32)) { - return false; - } - - // Now that we have the global variable initializer, lets replace - // all uses of the load instruction with the samplerVal and - // reparse the __amdil_is_constant() function. 
- Constant *samplerVal = gv->getInitializer(); - lInst->replaceAllUsesWith(samplerVal); - return true; -} - -bool -AMDGPUPeepholeOpt::doInitialization(Module &M) { - return false; -} - -bool -AMDGPUPeepholeOpt::doFinalization(Module &M) { - return false; -} - -void -AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<MachineFunctionAnalysis>(); - FunctionPass::getAnalysisUsage(AU); - AU.setPreservesAll(); -} - -size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) { - size_t size = 0; - if (!T) { - return size; - } - switch (T->getTypeID()) { - case Type::X86_FP80TyID: - case Type::FP128TyID: - case Type::PPC_FP128TyID: - case Type::LabelTyID: - assert(0 && "These types are not supported by this backend"); - default: - case Type::FloatTyID: - case Type::DoubleTyID: - size = T->getPrimitiveSizeInBits() >> 3; - break; - case Type::PointerTyID: - size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr); - break; - case Type::IntegerTyID: - size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr); - break; - case Type::StructTyID: - size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr); - break; - case Type::ArrayTyID: - size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr); - break; - case Type::FunctionTyID: - size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr); - break; - case Type::VectorTyID: - size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr); - break; - }; - return size; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST, - bool dereferencePtr) { - size_t size = 0; - if (!ST) { - return size; - } - Type *curType; - StructType::element_iterator eib; - StructType::element_iterator eie; - for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) { - curType = *eib; - size += getTypeSize(curType, dereferencePtr); - } - return size; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT, - bool dereferencePtr) { - return IT ? (IT->getBitWidth() >> 3) : 0; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT, - bool dereferencePtr) { - assert(0 && "Should not be able to calculate the size of an function type"); - return 0; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT, - bool dereferencePtr) { - return (size_t)(AT ? (getTypeSize(AT->getElementType(), - dereferencePtr) * AT->getNumElements()) - : 0); -} - -size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT, - bool dereferencePtr) { - return VT ? 
(VT->getBitWidth() >> 3) : 0; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT, - bool dereferencePtr) { - if (!PT) { - return 0; - } - Type *CT = PT->getElementType(); - if (CT->getTypeID() == Type::StructTyID && - PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { - return getTypeSize(dyn_cast<StructType>(CT)); - } else if (dereferencePtr) { - size_t size = 0; - for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) { - size += getTypeSize(PT->getContainedType(x), dereferencePtr); - } - return size; - } else { - return 4; - } -} - -size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT, - bool dereferencePtr) { - //assert(0 && "Should not be able to calculate the size of an opaque type"); - return 4; -} diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 8efba58..97f0a40 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -21,7 +21,6 @@ add_llvm_target(R600CodeGen AMDILISelDAGToDAG.cpp AMDILISelLowering.cpp AMDILNIDevice.cpp - AMDILPeepholeOptimizer.cpp AMDILSIDevice.cpp AMDGPUAsmPrinter.cpp AMDGPUFrameLowering.cpp @@ -42,6 +41,7 @@ add_llvm_target(R600CodeGen R600ISelLowering.cpp R600MachineFunctionInfo.cpp R600MachineScheduler.cpp + R600Packetizer.cpp R600RegisterInfo.cpp SIAnnotateControlFlow.cpp SIInsertWaits.cpp diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp index 10547a5..303cdf2 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -17,6 +17,7 @@ using namespace llvm; void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) { + OS.flush(); printInstruction(MI, OS); printAnnotation(OS, Annot); @@ -67,11 +68,14 @@ void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, } void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, - raw_ostream &O, StringRef Asm) { + raw_ostream &O, StringRef Asm, + StringRef Default) { const MCOperand &Op = MI->getOperand(OpNo); assert(Op.isImm()); if (Op.getImm() == 1) { O << Asm; + } else { + O << Default; } } @@ -98,7 +102,7 @@ void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printIfSet(MI, OpNo, O, " *"); + printIfSet(MI, OpNo, O.indent(20 - O.GetNumBytesInBuffer()), "*", " "); } void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, @@ -169,4 +173,41 @@ void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, O << "." 
<< chans[chan]; } +void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int BankSwizzle = MI->getOperand(OpNo).getImm(); + switch (BankSwizzle) { + case 1: + O << "BS:VEC_021"; + break; + case 2: + O << "BS:VEC_120"; + break; + case 3: + O << "BS:VEC_102"; + break; + case 4: + O << "BS:VEC_201"; + break; + case 5: + O << "BS:VEC_210"; + break; + default: + break; + } + return; +} + +void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int KCacheMode = MI->getOperand(OpNo).getImm(); + if (KCacheMode > 0) { + int KCacheBank = MI->getOperand(OpNo - 2).getImm(); + O << "CB" << KCacheBank <<":"; + int KCacheAddr = MI->getOperand(OpNo + 2).getImm(); + int LineSize = (KCacheMode == 1)?16:32; + O << KCacheAddr * 16 << "-" << KCacheAddr * 16 + LineSize; + } +} + #include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h index 767a708..c6fd053 100644 --- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -35,7 +35,8 @@ private: void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm); + void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, + StringRef Asm, StringRef Default = ""); void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -47,6 +48,8 @@ private: void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O); }; } // End namespace llvm diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp index 98fca43..a3397f3 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -44,7 +44,6 @@ public: AMDGPUAsmBackend(const Target &T) : MCAsmBackend() {} - virtual AMDGPUMCObjectWriter *createObjectWriter(raw_ostream &OS) const; virtual unsigned getNumFixupKinds() const { return 0; }; virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value) const; @@ -71,16 +70,6 @@ void AMDGPUMCObjectWriter::WriteObject(MCAssembler &Asm, } } -MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT, - StringRef CPU) { - return new AMDGPUAsmBackend(T); -} - -AMDGPUMCObjectWriter * AMDGPUAsmBackend::createObjectWriter( - raw_ostream &OS) const { - return new AMDGPUMCObjectWriter(OS); -} - void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value) const { @@ -88,3 +77,21 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, assert(Fixup.getKind() == FK_PCRel_4); *Dst = (Value - 4) / 4; } + +//===----------------------------------------------------------------------===// +// ELFAMDGPUAsmBackend class +//===----------------------------------------------------------------------===// + +class 
ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { +public: + ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { } + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createAMDGPUELFObjectWriter(OS); + } +}; + +MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT, + StringRef CPU) { + return new ELFAMDGPUAsmBackend(T); +} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp new file mode 100644 index 0000000..48fac9f --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -0,0 +1,39 @@ +//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" + +using namespace llvm; + +namespace { + +class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { +public: + AMDGPUELFObjectWriter(); +protected: + virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel, bool IsRelocWithSymbol, + int64_t Addend) const { + llvm_unreachable("Not implemented"); + } + +}; + + +} // End anonymous namespace + +AMDGPUELFObjectWriter::AMDGPUELFObjectWriter() + : MCELFObjectTargetWriter(false, 0, 0, false) { } + +MCObjectWriter *llvm::createAMDGPUELFObjectWriter(raw_ostream &OS) { + MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter(); + return createELFObjectWriter(MOTW, OS, true); +} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp index b7cdd7c..2aae26a 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -68,8 +68,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() { //===--- Dwarf Emission Directives -----------------------------------===// HasLEB128 = true; SupportsDebugInformation = true; - DwarfSectionOffsetDirective = ".offset"; - } const char* diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 072ee49..61d70bb 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -78,7 +78,7 @@ static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII, if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) { return createSIMCCodeEmitter(MCII, MRI, STI, Ctx); } else { - return createR600MCCodeEmitter(MCII, MRI, STI, Ctx); + return createR600MCCodeEmitter(MCII, MRI, STI); } } @@ -88,7 +88,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, MCCodeEmitter *_Emitter, bool RelaxAll, bool NoExecStack) { - return createPureStreamer(Ctx, MAB, _OS, _Emitter); + return createELFStreamer(Ctx, MAB, _OS, _Emitter, false, false); } extern "C" void LLVMInitializeR600TargetMC() { diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h index 363a4af..abb0320 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -23,16 +23,17 @@ class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCInstrInfo; +class MCObjectWriter; class MCRegisterInfo; class 
MCSubtargetInfo; class Target; +class raw_ostream; extern Target TheAMDGPUTarget; MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx); + const MCSubtargetInfo &STI); MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, @@ -41,6 +42,8 @@ MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, MCAsmBackend *createAMDGPUAsmBackend(const Target &T, StringRef TT, StringRef CPU); + +MCObjectWriter *createAMDGPUELFObjectWriter(raw_ostream &OS); } // End llvm namespace #define GET_REGINFO_ENUM diff --git a/lib/Target/R600/MCTargetDesc/CMakeLists.txt b/lib/Target/R600/MCTargetDesc/CMakeLists.txt index 37e714c..3ccdf42 100644 --- a/lib/Target/R600/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/R600/MCTargetDesc/CMakeLists.txt @@ -1,6 +1,7 @@ add_llvm_library(LLVMR600Desc AMDGPUAsmBackend.cpp + AMDGPUELFObjectWriter.cpp AMDGPUMCTargetDesc.cpp AMDGPUMCAsmInfo.cpp R600MCCodeEmitter.cpp diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 927bcbd..cb4cf0c 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -9,12 +9,8 @@ // /// \file /// -/// This code emitter outputs bytecode that is understood by the r600g driver -/// in the Mesa [1] project. The bytecode is very similar to the hardware's ISA, -/// but it still needs to be run through a finalizer in order to be executed -/// by the GPU. -/// -/// [1] http://www.mesa3d.org/ +/// \brief The R600 code emitter produces machine code that can be executed +/// directly on the GPU device. // //===----------------------------------------------------------------------===// @@ -30,9 +26,6 @@ #include "llvm/Support/raw_ostream.h" #include <stdio.h> -#define SRC_BYTE_COUNT 11 -#define DST_BYTE_COUNT 5 - using namespace llvm; namespace { @@ -43,13 +36,12 @@ class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { const MCInstrInfo &MCII; const MCRegisterInfo &MRI; const MCSubtargetInfo &STI; - MCContext &Ctx; public: R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, - const MCSubtargetInfo &sti, MCContext &ctx) - : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { } + const MCSubtargetInfo &sti) + : MCII(mcii), MRI(mri), STI(sti) { } /// \brief Encode the instruction and write it to the OS. 
virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS, @@ -60,30 +52,14 @@ public: SmallVectorImpl<MCFixup> &Fixups) const; private: - void EmitALUInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, - raw_ostream &OS) const; - void EmitSrc(const MCInst &MI, unsigned OpIdx, raw_ostream &OS) const; - void EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, unsigned SelOpIdx, - raw_ostream &OS) const; - void EmitDst(const MCInst &MI, raw_ostream &OS) const; - void EmitFCInstr(const MCInst &MI, raw_ostream &OS) const; - - void EmitNullBytes(unsigned int byteCount, raw_ostream &OS) const; - void EmitByte(unsigned int byte, raw_ostream &OS) const; - void EmitTwoBytes(uint32_t bytes, raw_ostream &OS) const; - void Emit(uint32_t value, raw_ostream &OS) const; void Emit(uint64_t value, raw_ostream &OS) const; unsigned getHWRegChan(unsigned reg) const; unsigned getHWReg(unsigned regNo) const; - bool isFCOp(unsigned opcode) const; - bool isTexOp(unsigned opcode) const; - bool isFlagSet(const MCInst &MI, unsigned Operand, unsigned Flag) const; - }; } // End anonymous namespace @@ -95,16 +71,6 @@ enum RegElement { ELEMENT_W }; -enum InstrTypes { - INSTR_ALU = 0, - INSTR_TEX, - INSTR_FC, - INSTR_NATIVE, - INSTR_VTX, - INSTR_EXPORT, - INSTR_CFALU -}; - enum FCInstr { FC_IF_PREDICATE = 0, FC_ELSE, @@ -132,355 +98,95 @@ enum TextureTypes { MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new R600MCCodeEmitter(MCII, MRI, STI, Ctx); + const MCSubtargetInfo &STI) { + return new R600MCCodeEmitter(MCII, MRI, STI); } void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups) const { - if (isFCOp(MI.getOpcode())){ - EmitFCInstr(MI, OS); - } else if (MI.getOpcode() == AMDGPU::RETURN || + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (MI.getOpcode() == AMDGPU::RETURN || + MI.getOpcode() == AMDGPU::FETCH_CLAUSE || + MI.getOpcode() == AMDGPU::ALU_CLAUSE || MI.getOpcode() == AMDGPU::BUNDLE || MI.getOpcode() == AMDGPU::KILL) { return; - } else { - switch(MI.getOpcode()) { - case AMDGPU::STACK_SIZE: { - EmitByte(MI.getOperand(0).getImm(), OS); - break; - } - case AMDGPU::RAT_WRITE_CACHELESS_32_eg: - case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - uint64_t inst = getBinaryCodeForInstr(MI, Fixups); - EmitByte(INSTR_NATIVE, OS); - Emit(inst, OS); - break; - } - case AMDGPU::CONSTANT_LOAD_eg: - case AMDGPU::VTX_READ_PARAM_8_eg: - case AMDGPU::VTX_READ_PARAM_16_eg: - case AMDGPU::VTX_READ_PARAM_32_eg: - case AMDGPU::VTX_READ_PARAM_128_eg: - case AMDGPU::VTX_READ_GLOBAL_8_eg: - case AMDGPU::VTX_READ_GLOBAL_32_eg: - case AMDGPU::VTX_READ_GLOBAL_128_eg: - case AMDGPU::TEX_VTX_CONSTBUF: - case AMDGPU::TEX_VTX_TEXBUF : { - uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); - uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset - - EmitByte(INSTR_VTX, OS); - Emit(InstWord01, OS); - Emit(InstWord2, OS); - break; - } - case AMDGPU::TEX_LD: - case AMDGPU::TEX_GET_TEXTURE_RESINFO: - case AMDGPU::TEX_SAMPLE: - case AMDGPU::TEX_SAMPLE_C: - case AMDGPU::TEX_SAMPLE_L: - case AMDGPU::TEX_SAMPLE_C_L: - case AMDGPU::TEX_SAMPLE_LB: - case AMDGPU::TEX_SAMPLE_C_LB: - case AMDGPU::TEX_SAMPLE_G: - case AMDGPU::TEX_SAMPLE_C_G: - case AMDGPU::TEX_GET_GRADIENTS_H: - case AMDGPU::TEX_GET_GRADIENTS_V: - case AMDGPU::TEX_SET_GRADIENTS_H: - case AMDGPU::TEX_SET_GRADIENTS_V: { - unsigned Opcode = MI.getOpcode(); - bool HasOffsets = (Opcode == AMDGPU::TEX_LD); - 
unsigned OpOffset = HasOffsets ? 3 : 0; - int64_t Sampler = MI.getOperand(OpOffset + 3).getImm(); - int64_t TextureType = MI.getOperand(OpOffset + 4).getImm(); - - uint32_t SrcSelect[4] = {0, 1, 2, 3}; - uint32_t Offsets[3] = {0, 0, 0}; - uint64_t CoordType[4] = {1, 1, 1, 1}; - - if (HasOffsets) - for (unsigned i = 0; i < 3; i++) { - int SignedOffset = MI.getOperand(i + 2).getImm(); - Offsets[i] = (SignedOffset & 0x1F); - } - - - if (TextureType == TEXTURE_RECT || - TextureType == TEXTURE_SHADOWRECT) { - CoordType[ELEMENT_X] = 0; - CoordType[ELEMENT_Y] = 0; - } - - if (TextureType == TEXTURE_1D_ARRAY || - TextureType == TEXTURE_SHADOW1D_ARRAY) { - if (Opcode == AMDGPU::TEX_SAMPLE_C_L || - Opcode == AMDGPU::TEX_SAMPLE_C_LB) { - CoordType[ELEMENT_Y] = 0; - } else { - CoordType[ELEMENT_Z] = 0; - SrcSelect[ELEMENT_Z] = ELEMENT_Y; - } - } else if (TextureType == TEXTURE_2D_ARRAY || - TextureType == TEXTURE_SHADOW2D_ARRAY) { - CoordType[ELEMENT_Z] = 0; + } else if (IS_VTX(Desc)) { + uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); + uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset + InstWord2 |= 1 << 19; + + Emit(InstWord01, OS); + Emit(InstWord2, OS); + Emit((u_int32_t) 0, OS); + } else if (IS_TEX(Desc)) { + unsigned Opcode = MI.getOpcode(); + bool HasOffsets = (Opcode == AMDGPU::TEX_LD); + unsigned OpOffset = HasOffsets ? 3 : 0; + int64_t Sampler = MI.getOperand(OpOffset + 3).getImm(); + int64_t TextureType = MI.getOperand(OpOffset + 4).getImm(); + + uint32_t SrcSelect[4] = {0, 1, 2, 3}; + uint32_t Offsets[3] = {0, 0, 0}; + uint64_t CoordType[4] = {1, 1, 1, 1}; + + if (HasOffsets) + for (unsigned i = 0; i < 3; i++) { + int SignedOffset = MI.getOperand(i + 2).getImm(); + Offsets[i] = (SignedOffset & 0x1F); } - - if ((TextureType == TEXTURE_SHADOW1D || - TextureType == TEXTURE_SHADOW2D || - TextureType == TEXTURE_SHADOWRECT || - TextureType == TEXTURE_SHADOW1D_ARRAY) && - Opcode != AMDGPU::TEX_SAMPLE_C_L && - Opcode != AMDGPU::TEX_SAMPLE_C_LB) { - SrcSelect[ELEMENT_W] = ELEMENT_Z; - } - - uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups) | - CoordType[ELEMENT_X] << 60 | CoordType[ELEMENT_Y] << 61 | - CoordType[ELEMENT_Z] << 62 | CoordType[ELEMENT_W] << 63; - uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 | - SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 | - SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 | - Offsets[2] << 10; - - EmitByte(INSTR_TEX, OS); - Emit(Word01, OS); - Emit(Word2, OS); - break; - } - case AMDGPU::EG_ExportSwz: - case AMDGPU::R600_ExportSwz: - case AMDGPU::EG_ExportBuf: - case AMDGPU::R600_ExportBuf: { - uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); - EmitByte(INSTR_EXPORT, OS); - Emit(Inst, OS); - break; - } - case AMDGPU::CF_ALU: - case AMDGPU::CF_ALU_PUSH_BEFORE: { - uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); - EmitByte(INSTR_CFALU, OS); - Emit(Inst, OS); - break; - } - case AMDGPU::CF_TC: - case AMDGPU::CF_VC: - case AMDGPU::CF_CALL_FS: - return; - case AMDGPU::WHILE_LOOP: - case AMDGPU::END_LOOP: - case AMDGPU::LOOP_BREAK: - case AMDGPU::CF_CONTINUE: - case AMDGPU::CF_JUMP: - case AMDGPU::CF_ELSE: - case AMDGPU::POP: { - uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); - EmitByte(INSTR_NATIVE, OS); - Emit(Inst, OS); - break; + if (TextureType == TEXTURE_RECT || + TextureType == TEXTURE_SHADOWRECT) { + CoordType[ELEMENT_X] = 0; + CoordType[ELEMENT_Y] = 0; } - default: - EmitALUInstr(MI, Fixups, OS); - break; - } - } -} - -void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI, - 
SmallVectorImpl<MCFixup> &Fixups, - raw_ostream &OS) const { - const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode()); - - // Emit instruction type - EmitByte(INSTR_ALU, OS); - - uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); - - //older alu have different encoding for instructions with one or two src - //parameters. - if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) && - !(MCDesc.TSFlags & R600_InstFlag::OP3)) { - uint64_t ISAOpCode = InstWord01 & (0x3FFULL << 39); - InstWord01 &= ~(0x3FFULL << 39); - InstWord01 |= ISAOpCode << 1; - } - - unsigned SrcNum = MCDesc.TSFlags & R600_InstFlag::OP3 ? 3 : - MCDesc.TSFlags & R600_InstFlag::OP2 ? 2 : 1; - - EmitByte(SrcNum, OS); - - const unsigned SrcOps[3][2] = { - {R600Operands::SRC0, R600Operands::SRC0_SEL}, - {R600Operands::SRC1, R600Operands::SRC1_SEL}, - {R600Operands::SRC2, R600Operands::SRC2_SEL} - }; - for (unsigned SrcIdx = 0; SrcIdx < SrcNum; ++SrcIdx) { - unsigned RegOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][0]]; - unsigned SelOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][1]]; - EmitSrcISA(MI, RegOpIdx, SelOpIdx, OS); - } - - Emit(InstWord01, OS); - return; -} - -void R600MCCodeEmitter::EmitSrc(const MCInst &MI, unsigned OpIdx, - raw_ostream &OS) const { - const MCOperand &MO = MI.getOperand(OpIdx); - union { - float f; - uint32_t i; - } Value; - Value.i = 0; - // Emit the source select (2 bytes). For GPRs, this is the register index. - // For other potential instruction operands, (e.g. constant registers) the - // value of the source select is defined in the r600isa docs. - if (MO.isReg()) { - unsigned reg = MO.getReg(); - EmitTwoBytes(getHWReg(reg), OS); - if (reg == AMDGPU::ALU_LITERAL_X) { - unsigned ImmOpIndex = MI.getNumOperands() - 1; - MCOperand ImmOp = MI.getOperand(ImmOpIndex); - if (ImmOp.isFPImm()) { - Value.f = ImmOp.getFPImm(); + if (TextureType == TEXTURE_1D_ARRAY || + TextureType == TEXTURE_SHADOW1D_ARRAY) { + if (Opcode == AMDGPU::TEX_SAMPLE_C_L || + Opcode == AMDGPU::TEX_SAMPLE_C_LB) { + CoordType[ELEMENT_Y] = 0; } else { - assert(ImmOp.isImm()); - Value.i = ImmOp.getImm(); + CoordType[ELEMENT_Z] = 0; + SrcSelect[ELEMENT_Z] = ELEMENT_Y; } + } else if (TextureType == TEXTURE_2D_ARRAY || + TextureType == TEXTURE_SHADOW2D_ARRAY) { + CoordType[ELEMENT_Z] = 0; } - } else { - // XXX: Handle other operand types. - EmitTwoBytes(0, OS); - } - - // Emit the source channel (1 byte) - if (MO.isReg()) { - EmitByte(getHWRegChan(MO.getReg()), OS); - } else { - EmitByte(0, OS); - } - - // XXX: Emit isNegated (1 byte) - if ((!(isFlagSet(MI, OpIdx, MO_FLAG_ABS))) - && (isFlagSet(MI, OpIdx, MO_FLAG_NEG) || - (MO.isReg() && - (MO.getReg() == AMDGPU::NEG_ONE || MO.getReg() == AMDGPU::NEG_HALF)))){ - EmitByte(1, OS); - } else { - EmitByte(0, OS); - } - - // Emit isAbsolute (1 byte) - if (isFlagSet(MI, OpIdx, MO_FLAG_ABS)) { - EmitByte(1, OS); - } else { - EmitByte(0, OS); - } - - // XXX: Emit relative addressing mode (1 byte) - EmitByte(0, OS); - - // Emit kc_bank, This will be adjusted later by r600_asm - EmitByte(0, OS); - // Emit the literal value, if applicable (4 bytes). - Emit(Value.i, OS); -} - -void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, - unsigned SelOpIdx, raw_ostream &OS) const { - const MCOperand &RegMO = MI.getOperand(RegOpIdx); - const MCOperand &SelMO = MI.getOperand(SelOpIdx); - - union { - float f; - uint32_t i; - } InlineConstant; - InlineConstant.i = 0; - // Emit source type (1 byte) and source select (4 bytes). 
For GPRs type is 0 - // and select is 0 (GPR index is encoded in the instr encoding. For constants - // type is 1 and select is the original const select passed from the driver. - unsigned Reg = RegMO.getReg(); - if (Reg == AMDGPU::ALU_CONST) { - EmitByte(1, OS); - uint32_t Sel = SelMO.getImm(); - Emit(Sel, OS); - } else { - EmitByte(0, OS); - Emit((uint32_t)0, OS); - } - - if (Reg == AMDGPU::ALU_LITERAL_X) { - unsigned ImmOpIndex = MI.getNumOperands() - 1; - MCOperand ImmOp = MI.getOperand(ImmOpIndex); - if (ImmOp.isFPImm()) { - InlineConstant.f = ImmOp.getFPImm(); - } else { - assert(ImmOp.isImm()); - InlineConstant.i = ImmOp.getImm(); + if ((TextureType == TEXTURE_SHADOW1D || + TextureType == TEXTURE_SHADOW2D || + TextureType == TEXTURE_SHADOWRECT || + TextureType == TEXTURE_SHADOW1D_ARRAY) && + Opcode != AMDGPU::TEX_SAMPLE_C_L && + Opcode != AMDGPU::TEX_SAMPLE_C_LB) { + SrcSelect[ELEMENT_W] = ELEMENT_Z; } - } - - // Emit the literal value, if applicable (4 bytes). - Emit(InlineConstant.i, OS); -} - -void R600MCCodeEmitter::EmitFCInstr(const MCInst &MI, raw_ostream &OS) const { - - // Emit instruction type - EmitByte(INSTR_FC, OS); - // Emit SRC - unsigned NumOperands = MI.getNumOperands(); - if (NumOperands > 0) { - assert(NumOperands == 1); - EmitSrc(MI, 0, OS); + uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups) | + CoordType[ELEMENT_X] << 60 | CoordType[ELEMENT_Y] << 61 | + CoordType[ELEMENT_Z] << 62 | CoordType[ELEMENT_W] << 63; + uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 | + SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 | + SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 | + Offsets[2] << 10; + + Emit(Word01, OS); + Emit(Word2, OS); + Emit((u_int32_t) 0, OS); } else { - EmitNullBytes(SRC_BYTE_COUNT, OS); - } - - // Emit FC Instruction - enum FCInstr instr; - switch (MI.getOpcode()) { - case AMDGPU::PREDICATED_BREAK: - instr = FC_BREAK_PREDICATE; - break; - case AMDGPU::CONTINUE: - instr = FC_CONTINUE; - break; - case AMDGPU::IF_PREDICATE_SET: - instr = FC_IF_PREDICATE; - break; - case AMDGPU::ELSE: - instr = FC_ELSE; - break; - case AMDGPU::ENDIF: - instr = FC_ENDIF; - break; - case AMDGPU::ENDLOOP: - instr = FC_ENDLOOP; - break; - case AMDGPU::WHILELOOP: - instr = FC_BGNLOOP; - break; - default: - abort(); - break; - } - EmitByte(instr, OS); -} - -void R600MCCodeEmitter::EmitNullBytes(unsigned int ByteCount, - raw_ostream &OS) const { - - for (unsigned int i = 0; i < ByteCount; i++) { - EmitByte(0, OS); + uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); + if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) && + ((Desc.TSFlags & R600_InstFlag::OP1) || + Desc.TSFlags & R600_InstFlag::OP2)) { + uint64_t ISAOpCode = Inst & (0x3FFULL << 39); + Inst &= ~(0x3FFULL << 39); + Inst |= ISAOpCode << 1; + } + Emit(Inst, OS); } } @@ -488,12 +194,6 @@ void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const { OS.write((uint8_t) Byte & 0xff); } -void R600MCCodeEmitter::EmitTwoBytes(unsigned int Bytes, - raw_ostream &OS) const { - OS.write((uint8_t) (Bytes & 0xff)); - OS.write((uint8_t) ((Bytes >> 8) & 0xff)); -} - void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { for (unsigned i = 0; i < 4; i++) { OS.write((uint8_t) ((Value >> (8 * i)) & 0xff)); @@ -531,55 +231,4 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, } } -//===----------------------------------------------------------------------===// -// Encoding helper functions 
-//===----------------------------------------------------------------------===// - -bool R600MCCodeEmitter::isFCOp(unsigned opcode) const { - switch(opcode) { - default: return false; - case AMDGPU::PREDICATED_BREAK: - case AMDGPU::CONTINUE: - case AMDGPU::IF_PREDICATE_SET: - case AMDGPU::ELSE: - case AMDGPU::ENDIF: - case AMDGPU::ENDLOOP: - case AMDGPU::WHILELOOP: - return true; - } -} - -bool R600MCCodeEmitter::isTexOp(unsigned opcode) const { - switch(opcode) { - default: return false; - case AMDGPU::TEX_LD: - case AMDGPU::TEX_GET_TEXTURE_RESINFO: - case AMDGPU::TEX_SAMPLE: - case AMDGPU::TEX_SAMPLE_C: - case AMDGPU::TEX_SAMPLE_L: - case AMDGPU::TEX_SAMPLE_C_L: - case AMDGPU::TEX_SAMPLE_LB: - case AMDGPU::TEX_SAMPLE_C_LB: - case AMDGPU::TEX_SAMPLE_G: - case AMDGPU::TEX_SAMPLE_C_G: - case AMDGPU::TEX_GET_GRADIENTS_H: - case AMDGPU::TEX_GET_GRADIENTS_V: - case AMDGPU::TEX_SET_GRADIENTS_H: - case AMDGPU::TEX_SET_GRADIENTS_V: - return true; - } -} - -bool R600MCCodeEmitter::isFlagSet(const MCInst &MI, unsigned Operand, - unsigned Flag) const { - const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode()); - unsigned FlagIndex = GET_FLAG_OPERAND_IDX(MCDesc.TSFlags); - if (FlagIndex == 0) { - return false; - } - assert(MI.getOperand(FlagIndex).isImm()); - return !!((MI.getOperand(FlagIndex).getImm() >> - (NUM_MO_FLAGS * Operand)) & Flag); -} - #include "AMDGPUGenMCCodeEmitter.inc" diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index 868810c..0cbe919 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -1,4 +1,4 @@ -//===-- Processors.td - TODO: Add brief description -------===// +//===-- Processors.td - R600 Processor definitions ------------------------===// // // The LLVM Compiler Infrastructure // @@ -6,25 +6,43 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// AMDIL processors supported. 
-// -//===----------------------------------------------------------------------===// class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features> : Processor<Name, itin, Features>; -def : Proc<"", R600_EG_Itin, [FeatureR600ALUInst]>; -def : Proc<"r600", R600_EG_Itin, [FeatureR600ALUInst]>; -def : Proc<"rv710", R600_EG_Itin, []>; -def : Proc<"rv730", R600_EG_Itin, []>; -def : Proc<"rv770", R600_EG_Itin, [FeatureFP64]>; -def : Proc<"cedar", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; -def : Proc<"redwood", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; -def : Proc<"juniper", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; -def : Proc<"cypress", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>; -def : Proc<"barts", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; -def : Proc<"turks", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; -def : Proc<"caicos", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>; -def : Proc<"cayman", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>; -def : Proc<"SI", SI_Itin, [Feature64BitPtr]>; - +def : Proc<"", R600_VLIW5_Itin, + [FeatureR600ALUInst, FeatureVertexCache]>; +def : Proc<"r600", R600_VLIW5_Itin, + [FeatureR600ALUInst , FeatureVertexCache]>; +def : Proc<"rs880", R600_VLIW5_Itin, + [FeatureR600ALUInst]>; +def : Proc<"rv670", R600_VLIW5_Itin, + [FeatureR600ALUInst, FeatureFP64, FeatureVertexCache]>; +def : Proc<"rv710", R600_VLIW5_Itin, + [FeatureVertexCache]>; +def : Proc<"rv730", R600_VLIW5_Itin, + [FeatureVertexCache]>; +def : Proc<"rv770", R600_VLIW5_Itin, + [FeatureFP64, FeatureVertexCache]>; +def : Proc<"cedar", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; +def : Proc<"redwood", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; +def : Proc<"sumo", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages]>; +def : Proc<"juniper", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; +def : Proc<"cypress", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages, FeatureFP64, FeatureVertexCache]>; +def : Proc<"barts", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; +def : Proc<"turks", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages, FeatureVertexCache]>; +def : Proc<"caicos", R600_VLIW5_Itin, + [FeatureByteAddress, FeatureImages]>; +def : Proc<"cayman", R600_VLIW4_Itin, + [FeatureByteAddress, FeatureImages, FeatureFP64]>;def : Proc<"SI", SI_Itin, [Feature64BitPtr, FeatureFP64]>; +def : Proc<"tahiti", SI_Itin, [Feature64BitPtr, FeatureFP64]>; +def : Proc<"pitcairn", SI_Itin, [Feature64BitPtr, FeatureFP64]>; +def : Proc<"verde", SI_Itin, [Feature64BitPtr, FeatureFP64]>; +def : Proc<"oland", SI_Itin, [Feature64BitPtr, FeatureFP64]>; +def : Proc<"hainan", SI_Itin, [Feature64BitPtr, FeatureFP64]>; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index 3a6c7ea..ffe3414 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -30,35 +30,27 @@ namespace llvm { class R600ControlFlowFinalizer : public MachineFunctionPass { private: + typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile; + + enum ControlFlowInstruction { + CF_TC, + CF_VC, + CF_CALL_FS, + CF_WHILE_LOOP, + CF_END_LOOP, + CF_LOOP_BREAK, + CF_LOOP_CONTINUE, + CF_JUMP, + CF_ELSE, + CF_POP, + CF_END + }; + static char ID; const R600InstrInfo *TII; + const R600RegisterInfo &TRI; 
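For orientation, a standalone sketch (not patch code) of the bookkeeping this pass performs: each ClauseFile keeps the clause-header instruction together with the instructions that form the clause body, and once the control-flow program has been laid out the bodies are spliced in after it and the header's ADDR operand is patched (see EmitFetchClause/EmitALUClause further down). The clause sizes and the starting CfCount below are made-up inputs; the per-instruction word counts mirror those two routines.

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // (isFetchClause, number of instructions) for each clause, in program order.
  std::vector<std::pair<bool, unsigned> > Clauses;
  Clauses.push_back(std::make_pair(true, 3));   // a vertex/texture fetch clause
  Clauses.push_back(std::make_pair(false, 5));  // an ALU clause

  unsigned CfCount = 4;  // CF words already emitted (clause headers, CF_END, padding)
  for (unsigned i = 0; i < Clauses.size(); ++i) {
    std::printf("%s clause %u starts at CF word %u\n",
                Clauses[i].first ? "fetch" : "ALU", i, CfCount);
    // EmitFetchClause advances by two CF words per fetch instruction,
    // EmitALUClause by one per ALU group.
    CfCount += (Clauses[i].first ? 2 : 1) * Clauses[i].second;
  }
  return 0;
}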
unsigned MaxFetchInst; - - bool isFetch(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - case AMDGPU::TEX_VTX_CONSTBUF: - case AMDGPU::TEX_VTX_TEXBUF: - case AMDGPU::TEX_LD: - case AMDGPU::TEX_GET_TEXTURE_RESINFO: - case AMDGPU::TEX_GET_GRADIENTS_H: - case AMDGPU::TEX_GET_GRADIENTS_V: - case AMDGPU::TEX_SET_GRADIENTS_H: - case AMDGPU::TEX_SET_GRADIENTS_V: - case AMDGPU::TEX_SAMPLE: - case AMDGPU::TEX_SAMPLE_C: - case AMDGPU::TEX_SAMPLE_L: - case AMDGPU::TEX_SAMPLE_C_L: - case AMDGPU::TEX_SAMPLE_LB: - case AMDGPU::TEX_SAMPLE_C_LB: - case AMDGPU::TEX_SAMPLE_G: - case AMDGPU::TEX_SAMPLE_C_G: - case AMDGPU::TXD: - case AMDGPU::TXD_SHADOW: - return true; - default: - return false; - } - } + const AMDGPUSubtarget &ST; bool IsTrivialInst(MachineInstr *MI) const { switch (MI->getOpcode()) { @@ -70,26 +62,226 @@ private: } } - MachineBasicBlock::iterator - MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned CfAddress) const { + const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { + unsigned Opcode = 0; + bool isEg = (ST.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX); + switch (CFI) { + case CF_TC: + Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; + break; + case CF_VC: + Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600; + break; + case CF_CALL_FS: + Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600; + break; + case CF_WHILE_LOOP: + Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600; + break; + case CF_END_LOOP: + Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600; + break; + case CF_LOOP_BREAK: + Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600; + break; + case CF_LOOP_CONTINUE: + Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600; + break; + case CF_JUMP: + Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600; + break; + case CF_ELSE: + Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600; + break; + case CF_POP: + Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600; + break; + case CF_END: + if (ST.device()->getDeviceFlag() == OCL_DEVICE_CAYMAN) { + Opcode = AMDGPU::CF_END_CM; + break; + } + Opcode = isEg ? 
AMDGPU::CF_END_EG : AMDGPU::CF_END_R600; + break; + } + assert (Opcode && "No opcode selected"); + return TII->get(Opcode); + } + + bool isCompatibleWithClause(const MachineInstr *MI, + std::set<unsigned> &DstRegs, std::set<unsigned> &SrcRegs) const { + unsigned DstMI, SrcMI; + for (MachineInstr::const_mop_iterator I = MI->operands_begin(), + E = MI->operands_end(); I != E; ++I) { + const MachineOperand &MO = *I; + if (!MO.isReg()) + continue; + if (MO.isDef()) + DstMI = MO.getReg(); + if (MO.isUse()) { + unsigned Reg = MO.getReg(); + if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + SrcMI = Reg; + else + SrcMI = TRI.getMatchingSuperReg(Reg, + TRI.getSubRegFromChannel(TRI.getHWRegChan(Reg)), + &AMDGPU::R600_Reg128RegClass); + } + } + if ((DstRegs.find(SrcMI) == DstRegs.end()) && + (SrcRegs.find(DstMI) == SrcRegs.end())) { + SrcRegs.insert(SrcMI); + DstRegs.insert(DstMI); + return true; + } else + return false; + } + + ClauseFile + MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) + const { MachineBasicBlock::iterator ClauseHead = I; + std::vector<MachineInstr *> ClauseContent; unsigned AluInstCount = 0; + bool IsTex = TII->usesTextureCache(ClauseHead); + std::set<unsigned> DstRegs, SrcRegs; for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { if (IsTrivialInst(I)) continue; - if (!isFetch(I)) + if (AluInstCount > MaxFetchInst) + break; + if ((IsTex && !TII->usesTextureCache(I)) || + (!IsTex && !TII->usesVertexCache(I))) + break; + if (!isCompatibleWithClause(I, DstRegs, SrcRegs)) break; AluInstCount ++; - if (AluInstCount > MaxFetchInst) + ClauseContent.push_back(I); + } + MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), + getHWInstrDesc(IsTex?CF_TC:CF_VC)) + .addImm(0) // ADDR + .addImm(AluInstCount - 1); // COUNT + return ClauseFile(MIb, ClauseContent); + } + + void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const { + unsigned LiteralRegs[] = { + AMDGPU::ALU_LITERAL_X, + AMDGPU::ALU_LITERAL_Y, + AMDGPU::ALU_LITERAL_Z, + AMDGPU::ALU_LITERAL_W + }; + for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + if (MO.getReg() != AMDGPU::ALU_LITERAL_X) + continue; + unsigned ImmIdx = TII->getOperandIdx(MI->getOpcode(), R600Operands::IMM); + int64_t Imm = MI->getOperand(ImmIdx).getImm(); + std::vector<int64_t>::iterator It = + std::find(Lits.begin(), Lits.end(), Imm); + if (It != Lits.end()) { + unsigned Index = It - Lits.begin(); + MO.setReg(LiteralRegs[Index]); + } else { + assert(Lits.size() < 4 && "Too many literals in Instruction Group"); + MO.setReg(LiteralRegs[Lits.size()]); + Lits.push_back(Imm); + } + } + } + + MachineBasicBlock::iterator insertLiterals( + MachineBasicBlock::iterator InsertPos, + const std::vector<unsigned> &Literals) const { + MachineBasicBlock *MBB = InsertPos->getParent(); + for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { + unsigned LiteralPair0 = Literals[i]; + unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0; + InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(), + TII->get(AMDGPU::LITERALS)) + .addImm(LiteralPair0) + .addImm(LiteralPair1); + } + return InsertPos; + } + + ClauseFile + MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) + const { + MachineBasicBlock::iterator ClauseHead = I; + std::vector<MachineInstr *> ClauseContent; + I++; + for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) { + if (IsTrivialInst(I)) { + ++I; + continue; + } + if (!I->isBundle() && 
!TII->isALUInstr(I->getOpcode())) break; + std::vector<int64_t> Literals; + if (I->isBundle()) { + MachineInstr *DeleteMI = I; + MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); + while (++BI != E && BI->isBundledWithPred()) { + BI->unbundleFromPred(); + for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = BI->getOperand(i); + if (MO.isReg() && MO.isInternalRead()) + MO.setIsInternalRead(false); + } + getLiteral(BI, Literals); + ClauseContent.push_back(BI); + } + I = BI; + DeleteMI->eraseFromParent(); + } else { + getLiteral(I, Literals); + ClauseContent.push_back(I); + I++; + } + for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { + unsigned literal0 = Literals[i]; + unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0; + MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(), + TII->get(AMDGPU::LITERALS)) + .addImm(literal0) + .addImm(literal2); + ClauseContent.push_back(MILit); + } } - BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), - TII->get(AMDGPU::CF_TC)) - .addImm(CfAddress) // ADDR - .addImm(AluInstCount); // COUNT - return I; + ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); + return ClauseFile(ClauseHead, ClauseContent); } + + void + EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, + unsigned &CfCount) { + CounterPropagateAddr(Clause.first, CfCount); + MachineBasicBlock *BB = Clause.first->getParent(); + BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE)) + .addImm(CfCount); + for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { + BB->splice(InsertPos, BB, Clause.second[i]); + } + CfCount += 2 * Clause.second.size(); + } + + void + EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, + unsigned &CfCount) { + CounterPropagateAddr(Clause.first, CfCount); + MachineBasicBlock *BB = Clause.first->getParent(); + BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE)) + .addImm(CfCount); + for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { + BB->splice(InsertPos, BB, Clause.second[i]); + } + CfCount += Clause.second.size(); + } + void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const { MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm()); } @@ -102,9 +294,27 @@ private: } } + unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const { + switch (ST.device()->getGeneration()) { + case AMDGPUDeviceInfo::HD4XXX: + if (hasPush) + StackSubEntry += 2; + break; + case AMDGPUDeviceInfo::HD5XXX: + if (hasPush) + StackSubEntry ++; + case AMDGPUDeviceInfo::HD6XXX: + StackSubEntry += 2; + break; + } + return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4 + } + public: R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID), - TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { + TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())), + TRI(TII->getRegisterInfo()), + ST(tm.getSubtarget<AMDGPUSubtarget>()) { const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>(); if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX) MaxFetchInst = 8; @@ -115,6 +325,7 @@ public: virtual bool runOnMachineFunction(MachineFunction &MF) { unsigned MaxStack = 0; unsigned CurrentStack = 0; + bool HasPush = false; for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; ++MB) { MachineBasicBlock &MBB = *MB; @@ -124,14 +335,16 @@ public: R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); if (MFI->ShaderType == 1) { BuildMI(MBB, MBB.begin(), 
MBB.findDebugLoc(MBB.begin()), - TII->get(AMDGPU::CF_CALL_FS)); + getHWInstrDesc(CF_CALL_FS)); CfCount++; + MaxStack = 1; } + std::vector<ClauseFile> FetchClauses, AluClauses; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { - if (isFetch(I)) { + if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { DEBUG(dbgs() << CfCount << ":"; I->dump();); - I = MakeFetchClause(MBB, I, 0); + FetchClauses.push_back(MakeFetchClause(MBB, I)); CfCount++; continue; } @@ -142,20 +355,25 @@ public: case AMDGPU::CF_ALU_PUSH_BEFORE: CurrentStack++; MaxStack = std::max(MaxStack, CurrentStack); + HasPush = true; case AMDGPU::CF_ALU: + I = MI; + AluClauses.push_back(MakeALUClause(MBB, I)); case AMDGPU::EG_ExportBuf: case AMDGPU::EG_ExportSwz: case AMDGPU::R600_ExportBuf: case AMDGPU::R600_ExportSwz: + case AMDGPU::RAT_WRITE_CACHELESS_32_eg: + case AMDGPU::RAT_WRITE_CACHELESS_128_eg: DEBUG(dbgs() << CfCount << ":"; MI->dump();); CfCount++; break; case AMDGPU::WHILELOOP: { - CurrentStack++; + CurrentStack+=4; MaxStack = std::max(MaxStack, CurrentStack); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - TII->get(AMDGPU::WHILE_LOOP)) - .addImm(2); + getHWInstrDesc(CF_WHILE_LOOP)) + .addImm(1); std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount, std::set<MachineInstr *>()); Pair.second.insert(MIb); @@ -165,12 +383,12 @@ public: break; } case AMDGPU::ENDLOOP: { - CurrentStack--; + CurrentStack-=4; std::pair<unsigned, std::set<MachineInstr *> > Pair = LoopStack.back(); LoopStack.pop_back(); CounterPropagateAddr(Pair.second, CfCount); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::END_LOOP)) + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) .addImm(Pair.first + 1); MI->eraseFromParent(); CfCount++; @@ -178,7 +396,7 @@ public: } case AMDGPU::IF_PREDICATE_SET: { MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - TII->get(AMDGPU::CF_JUMP)) + getHWInstrDesc(CF_JUMP)) .addImm(0) .addImm(0); IfThenElseStack.push_back(MIb); @@ -192,7 +410,7 @@ public: IfThenElseStack.pop_back(); CounterPropagateAddr(JumpInst, CfCount); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - TII->get(AMDGPU::CF_ELSE)) + getHWInstrDesc(CF_ELSE)) .addImm(0) .addImm(1); DEBUG(dbgs() << CfCount << ":"; MIb->dump();); @@ -207,9 +425,10 @@ public: IfThenElseStack.pop_back(); CounterPropagateAddr(IfOrElseInst, CfCount + 1); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - TII->get(AMDGPU::POP)) + getHWInstrDesc(CF_POP)) .addImm(CfCount + 1) .addImm(1); + (void)MIb; DEBUG(dbgs() << CfCount << ":"; MIb->dump();); MI->eraseFromParent(); CfCount++; @@ -218,13 +437,13 @@ public: case AMDGPU::PREDICATED_BREAK: { CurrentStack--; CfCount += 3; - BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_JUMP)) + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP)) .addImm(CfCount) .addImm(1); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - TII->get(AMDGPU::LOOP_BREAK)) + getHWInstrDesc(CF_LOOP_BREAK)) .addImm(0); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::POP)) + BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_POP)) .addImm(CfCount) .addImm(1); LoopStack.back().second.insert(MIb); @@ -233,20 +452,31 @@ public: } case AMDGPU::CONTINUE: { MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), - TII->get(AMDGPU::CF_CONTINUE)) + getHWInstrDesc(CF_LOOP_CONTINUE)) .addImm(0); LoopStack.back().second.insert(MIb); MI->eraseFromParent(); CfCount++; break; } + case AMDGPU::RETURN: { + BuildMI(MBB, MI, 
MBB.findDebugLoc(MI), getHWInstrDesc(CF_END)); + CfCount++; + MI->eraseFromParent(); + if (CfCount % 2) { + BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD)); + CfCount++; + } + for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) + EmitFetchClause(I, FetchClauses[i], CfCount); + for (unsigned i = 0, e = AluClauses.size(); i < e; i++) + EmitALUClause(I, AluClauses[i], CfCount); + } default: break; } } - BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), - TII->get(AMDGPU::STACK_SIZE)) - .addImm(MaxStack); + MFI->StackSize = getHWStackSize(MaxStack, HasPush); } return false; @@ -265,4 +495,3 @@ char R600ControlFlowFinalizer::ID = 0; llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) { return new R600ControlFlowFinalizer(TM); } - diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h index 16cfcf5..36bfb18 100644 --- a/lib/Target/R600/R600Defines.h +++ b/lib/Target/R600/R600Defines.h @@ -39,7 +39,9 @@ namespace R600_InstFlag { //FlagOperand bits 7, 8 NATIVE_OPERANDS = (1 << 9), OP1 = (1 << 10), - OP2 = (1 << 11) + OP2 = (1 << 11), + VTX_INST = (1 << 12), + TEX_INST = (1 << 13) }; } @@ -52,6 +54,9 @@ namespace R600_InstFlag { #define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT) #define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK) +#define IS_VTX(desc) ((desc).TSFlags & R600_InstFlag::VTX_INST) +#define IS_TEX(desc) ((desc).TSFlags & R600_InstFlag::TEX_INST) + namespace R600Operands { enum Ops { DST, @@ -78,6 +83,7 @@ namespace R600Operands { LAST, PRED_SEL, IMM, + BANK_SWIZZLE, COUNT }; @@ -85,13 +91,39 @@ namespace R600Operands { // W C S S S S S S S S S S S // R O D L S R R R R S R R R R S R R R L P // D U I M R A R C C C C R C C C C R C C C A R I -// S E U T O E M C 0 0 0 0 C 1 1 1 1 C 2 2 2 S E M -// T M P E D L P 0 N R A S 1 N R A S 2 N R S T D M - {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12}, - {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19}, - {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17} +// S E U T O E M C 0 0 0 0 C 1 1 1 1 C 2 2 2 S E M B +// T M P E D L P 0 N R A S 1 N R A S 2 N R S T D M S + {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12,13}, + {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19,20}, + {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17,18} }; } +//===----------------------------------------------------------------------===// +// Config register definitions +//===----------------------------------------------------------------------===// + +#define R_02880C_DB_SHADER_CONTROL 0x02880C +#define S_02880C_KILL_ENABLE(x) (((x) & 0x1) << 6) + +// These fields are the same for all shader types and families. 
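For orientation, a standalone sketch (not patch code) showing how these field helpers compose 32-bit config values. S_NUM_GPRS and S_STACK_SIZE are restated from the definitions that follow so the snippet compiles on its own; the GPR count, stack size and kill flag are made-up inputs.

#include <cstdio>

#define S_02880C_KILL_ENABLE(x) (((x) & 0x1) << 6)
#define S_NUM_GPRS(x)           (((x) & 0xFF) << 0)
#define S_STACK_SIZE(x)         (((x) & 0xFF) << 8)

int main() {
  unsigned NumGPRs = 12, StackSize = 1;
  bool KillPixel = true;
  // Pack the shared SQ_PGM_RESOURCES_* fields and the DB_SHADER_CONTROL kill bit.
  unsigned Rsrc = S_NUM_GPRS(NumGPRs) | S_STACK_SIZE(StackSize);
  unsigned DbShaderControl = S_02880C_KILL_ENABLE(KillPixel);
  std::printf("SQ_PGM_RESOURCES_* = 0x%08x, DB_SHADER_CONTROL = 0x%08x\n",
              Rsrc, DbShaderControl);   // 0x0000010c, 0x00000040
  return 0;
}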
+#define S_NUM_GPRS(x) (((x) & 0xFF) << 0) +#define S_STACK_SIZE(x) (((x) & 0xFF) << 8) +//===----------------------------------------------------------------------===// +// R600, R700 Registers +//===----------------------------------------------------------------------===// + +#define R_028850_SQ_PGM_RESOURCES_PS 0x028850 +#define R_028868_SQ_PGM_RESOURCES_VS 0x028868 + +//===----------------------------------------------------------------------===// +// Evergreen, Northern Islands Registers +//===----------------------------------------------------------------------===// + +#define R_028844_SQ_PGM_RESOURCES_PS 0x028844 +#define R_028860_SQ_PGM_RESOURCES_VS 0x028860 +#define R_028878_SQ_PGM_RESOURCES_GS 0x028878 +#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4 + #endif // R600DEFINES_H_ diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 53e6e51..7252235 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -43,11 +43,25 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::AND, MVT::v4i32, Expand); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand); + setOperationAction(ISD::MUL, MVT::v2i32, Expand); + setOperationAction(ISD::MUL, MVT::v4i32, Expand); + setOperationAction(ISD::OR, MVT::v4i32, Expand); + setOperationAction(ISD::OR, MVT::v2i32, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand); + setOperationAction(ISD::SHL, MVT::v4i32, Expand); + setOperationAction(ISD::SHL, MVT::v2i32, Expand); + setOperationAction(ISD::SRL, MVT::v4i32, Expand); + setOperationAction(ISD::SRL, MVT::v2i32, Expand); + setOperationAction(ISD::SRA, MVT::v4i32, Expand); + setOperationAction(ISD::SRA, MVT::v2i32, Expand); + setOperationAction(ISD::SUB, MVT::v4i32, Expand); + setOperationAction(ISD::SUB, MVT::v2i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand); setOperationAction(ISD::UDIV, MVT::v4i32, Expand); setOperationAction(ISD::UREM, MVT::v4i32, Expand); setOperationAction(ISD::SETCC, MVT::v4i32, Expand); + setOperationAction(ISD::XOR, MVT::v4i32, Expand); + setOperationAction(ISD::XOR, MVT::v2i32, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Expand); setOperationAction(ISD::BR_CC, MVT::f32, Expand); @@ -70,6 +84,9 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); + setOperationAction(ISD::VSELECT, MVT::v4i32, Expand); + setOperationAction(ISD::VSELECT, MVT::v2i32, Expand); + // Legalize loads and stores to the private address space. 
setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::LOAD, MVT::v2i32, Custom); @@ -93,6 +110,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::SELECT_CC); setBooleanContents(ZeroOrNegativeOneBooleanContent); + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); setSchedulingPreference(Sched::VLIW); } diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index b232188..37150c4 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "R600InstrInfo.h" +#include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "R600Defines.h" @@ -29,7 +30,8 @@ using namespace llvm; R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm) : AMDGPUInstrInfo(tm), - RI(tm, *this) + RI(tm, *this), + ST(tm.getSubtarget<AMDGPUSubtarget>()) { } const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { @@ -139,6 +141,33 @@ bool R600InstrInfo::isALUInstr(unsigned Opcode) const { (TargetFlags & R600_InstFlag::OP3)); } +bool R600InstrInfo::isTransOnly(unsigned Opcode) const { + return (get(Opcode).TSFlags & R600_InstFlag::TRANS_ONLY); +} + +bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const { + return isTransOnly(MI->getOpcode()); +} + +bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { + return ST.hasVertexCache() && IS_VTX(get(Opcode)); +} + +bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const { + const R600MachineFunctionInfo *MFI = MI->getParent()->getParent()->getInfo<R600MachineFunctionInfo>(); + return MFI->ShaderType != ShaderType::COMPUTE && usesVertexCache(MI->getOpcode()); +} + +bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { + return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode)); +} + +bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const { + const R600MachineFunctionInfo *MFI = MI->getParent()->getParent()->getInfo<R600MachineFunctionInfo>(); + return (MFI->ShaderType == ShaderType::COMPUTE && usesVertexCache(MI->getOpcode())) || + usesTextureCache(MI->getOpcode()); +} + bool R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts) const { @@ -183,10 +212,19 @@ R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) const { int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]); if (SrcIdx < 0) break; - if (MI->getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) { + unsigned Reg = MI->getOperand(SrcIdx).getReg(); + if (Reg == AMDGPU::ALU_CONST) { unsigned Const = MI->getOperand( getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm(); Consts.push_back(Const); + continue; + } + if (AMDGPU::R600_KC0RegClass.contains(Reg) || + AMDGPU::R600_KC1RegClass.contains(Reg)) { + unsigned Index = RI.getEncodingValue(Reg) & 0xff; + unsigned Chan = RI.getHWRegChan(Reg); + Consts.push_back((Index << 2) | Chan); + continue; } } } @@ -684,7 +722,8 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB //scheduling to the backend, we can change the default to 0. 
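buildDefaultInstruction below now appends a $bank_swizzle operand that defaults to 0. As a standalone sketch (not patch code), this is the value-to-text mapping the new printBankSwizzle callback applies when the operand is non-zero:

#include <cstdio>

static const char *bankSwizzleName(int BS) {
  switch (BS) {
  case 1: return "BS:VEC_021";
  case 2: return "BS:VEC_120";
  case 3: return "BS:VEC_102";
  case 4: return "BS:VEC_201";
  case 5: return "BS:VEC_210";
  default: return "";            // 0 (the default) or unknown: nothing is printed
  }
}

int main() {
  for (int BS = 0; BS <= 5; ++BS)
    std::printf("%d -> \"%s\"\n", BS, bankSwizzleName(BS));
  return 0;
}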
MIB.addImm(1) // $last .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel - .addImm(0); // $literal + .addImm(0) // $literal + .addImm(0); // $bank_swizzle return MIB; } diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index dbae900..babe4b8 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -33,6 +33,7 @@ namespace llvm { class R600InstrInfo : public AMDGPUInstrInfo { private: const R600RegisterInfo RI; + const AMDGPUSubtarget &ST; int getBranchInstr(const MachineOperand &op) const; @@ -53,6 +54,14 @@ namespace llvm { /// \returns true if this \p Opcode represents an ALU instruction. bool isALUInstr(unsigned Opcode) const; + bool isTransOnly(unsigned Opcode) const; + bool isTransOnly(const MachineInstr *MI) const; + + bool usesVertexCache(unsigned Opcode) const; + bool usesVertexCache(const MachineInstr *MI) const; + bool usesTextureCache(unsigned Opcode) const; + bool usesTextureCache(const MachineInstr *MI) const; + bool fitsConstReadLimitations(const std::vector<unsigned>&) const; bool canBundle(const std::vector<MachineInstr *> &) const; diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 663b41a..8f47523 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -13,11 +13,12 @@ include "R600Intrinsics.td" -class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern, +class InstR600 <dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin> : AMDGPUInst <outs, ins, asm, pattern> { field bits<64> Inst; + bit TransOnly = 0; bit Trig = 0; bit Op3 = 0; bit isVector = 0; @@ -25,9 +26,9 @@ class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern, bit Op1 = 0; bit Op2 = 0; bit HasNativeOperands = 0; + bit VTXInst = 0; + bit TEXInst = 0; - bits<11> op_code = inst; - //let Inst = inst; let Namespace = "AMDGPU"; let OutOperandList = outs; let InOperandList = ins; @@ -35,6 +36,7 @@ class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern, let Pattern = pattern; let Itinerary = itin; + let TSFlags{0} = TransOnly; let TSFlags{4} = Trig; let TSFlags{5} = Op3; @@ -45,11 +47,12 @@ class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern, let TSFlags{9} = HasNativeOperands; let TSFlags{10} = Op1; let TSFlags{11} = Op2; + let TSFlags{12} = VTXInst; + let TSFlags{13} = TEXInst; } class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> : - AMDGPUInst <outs, ins, asm, pattern> { - field bits<64> Inst; + InstR600 <outs, ins, asm, pattern, NullALU> { let Namespace = "AMDGPU"; } @@ -74,6 +77,9 @@ class InstFlag<string PM = "printOperand", int Default = 0> def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> { let PrintMethod = "printSel"; } +def BANK_SWIZZLE : OperandWithDefaultOps <i32, (ops (i32 0))> { + let PrintMethod = "printBankSwizzle"; +} def LITERAL : InstFlag<"printLiteral">; @@ -137,7 +143,7 @@ class R600ALU_Word1 { field bits<32> Word1; bits<11> dst; - bits<3> bank_swizzle = 0; + bits<3> bank_swizzle; bits<1> dst_rel; bits<1> clamp; @@ -346,15 +352,15 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { // and R600InstrInfo::getOperandIdx(). 
class R600_1OP <bits<11> inst, string opName, list<dag> pattern, InstrItinClass itin = AnyALU> : - InstR600 <0, - (outs R600_Reg32:$dst), + InstR600 <(outs R600_Reg32:$dst), (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), !strconcat(" ", opName, - "$clamp $dst$write$dst_rel$omod, " + "$last$clamp $dst$write$dst_rel$omod, " "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " - "$literal $pred_sel$last"), + "$pred_sel $bank_swizzle"), pattern, itin>, R600ALU_Word0, @@ -385,18 +391,18 @@ class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node, // R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx(). class R600_2OP <bits<11> inst, string opName, list<dag> pattern, InstrItinClass itin = AnyALU> : - InstR600 <inst, - (outs R600_Reg32:$dst), + InstR600 <(outs R600_Reg32:$dst), (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp, R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel, R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), !strconcat(" ", opName, - "$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, " + "$last$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, " "$src0_neg$src0_abs$src0$src0_abs$src0_rel, " "$src1_neg$src1_abs$src1$src1_abs$src1_rel, " - "$literal $pred_sel$last"), + "$pred_sel $bank_swizzle"), pattern, itin>, R600ALU_Word0, @@ -423,18 +429,19 @@ class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node, // R600InstrInfo::getOperandIdx(). 
class R600_3OP <bits<5> inst, string opName, list<dag> pattern, InstrItinClass itin = AnyALU> : - InstR600 <0, - (outs R600_Reg32:$dst), + InstR600 <(outs R600_Reg32:$dst), (ins REL:$dst_rel, CLAMP:$clamp, R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel, R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel, R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel, - LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal), - !strconcat(" ", opName, "$clamp $dst$dst_rel, " + LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal, + BANK_SWIZZLE:$bank_swizzle), + !strconcat(" ", opName, "$last$clamp $dst$dst_rel, " "$src0_neg$src0$src0_rel, " "$src1_neg$src1$src1_rel, " "$src2_neg$src2$src2_rel, " - "$literal $pred_sel$last"), + "$pred_sel" + "$bank_swizzle"), pattern, itin>, R600ALU_Word0, @@ -450,8 +457,7 @@ class R600_3OP <bits<5> inst, string opName, list<dag> pattern, class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern, InstrItinClass itin = VecALU> : - InstR600 <inst, - (outs R600_Reg32:$dst), + InstR600 <(outs R600_Reg32:$dst), ins, asm, pattern, @@ -459,8 +465,7 @@ class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern, class R600_TEX <bits<11> inst, string opName, list<dag> pattern, InstrItinClass itin = AnyALU> : - InstR600 <inst, - (outs R600_Reg128:$DST_GPR), + InstR600 <(outs R600_Reg128:$DST_GPR), (ins R600_Reg128:$SRC_GPR, i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, i32imm:$textureTarget), !strconcat(opName, "$DST_GPR, $SRC_GPR, $RESOURCE_ID, $SAMPLER_ID, $textureTarget"), pattern, @@ -481,11 +486,14 @@ class R600_TEX <bits<11> inst, string opName, list<dag> pattern, let FETCH_WHOLE_QUAD = 0; let ALT_CONST = 0; let SAMPLER_INDEX_MODE = 0; + let RESOURCE_INDEX_MODE = 0; let COORD_TYPE_X = 0; let COORD_TYPE_Y = 0; let COORD_TYPE_Z = 0; let COORD_TYPE_W = 0; + + let TEXInst = 1; } } // End mayLoad = 1, mayStore = 0, hasSideEffects = 0 @@ -738,7 +746,9 @@ multiclass SteamOutputExportPattern<Instruction ExportInst, 4095, imm:$mask, buf3inst, 0)>; } -let usesCustomInserter = 1 in { +// Export Instructions should not be duplicated by TailDuplication pass +// (which assumes that duplicable instruction are affected by exec mask) +let usesCustomInserter = 1, isNotDuplicable = 1 in { class ExportSwzInst : InstR600ISA<( outs), @@ -805,12 +815,15 @@ class CF_ALU_WORD1 { let Word1{31} = BARRIER; } +def KCACHE : InstFlag<"printKCache">; + class ALU_CLAUSE<bits<4> inst, string OpName> : AMDGPUInst <(outs), -(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1, i32imm:$KCACHE_MODE0, i32imm:$KCACHE_MODE1, -i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, i32imm:$COUNT), +(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1, +KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1, +i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, +i32imm:$COUNT), !strconcat(OpName, " $COUNT, @$ADDR, " -"KC0[CB$KCACHE_BANK0:$KCACHE_ADDR0-$KCACHE_ADDR0+32]" -", KC1[CB$KCACHE_BANK1:$KCACHE_ADDR1-$KCACHE_ADDR1+32]"), +"KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"), [] >, CF_ALU_WORD0, CF_ALU_WORD1 { field bits<64> Inst; @@ -823,109 +836,139 @@ i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, i32imm:$COUNT), let Inst{63-32} = Word1; } -class CF_WORD0 { +class CF_WORD0_R600 { field bits<32> Word0; - bits<24> ADDR; - bits<3> JUMPTABLE_SEL; + bits<32> ADDR; - let Word0{23-0} = ADDR; - let Word0{26-24} = JUMPTABLE_SEL; + let Word0 = ADDR; } -class CF_WORD1 { +class CF_WORD1_R600 { field bits<32> Word1; bits<3> POP_COUNT; bits<5> CF_CONST; bits<2> COND; - bits<6> COUNT; + 
bits<3> COUNT; + bits<6> CALL_COUNT; + bits<1> COUNT_3; + bits<1> END_OF_PROGRAM; bits<1> VALID_PIXEL_MODE; - bits<8> CF_INST; + bits<7> CF_INST; + bits<1> WHOLE_QUAD_MODE; bits<1> BARRIER; let Word1{2-0} = POP_COUNT; let Word1{7-3} = CF_CONST; let Word1{9-8} = COND; - let Word1{15-10} = COUNT; - let Word1{20} = VALID_PIXEL_MODE; - let Word1{29-22} = CF_INST; + let Word1{12-10} = COUNT; + let Word1{18-13} = CALL_COUNT; + let Word1{19} = COUNT_3; + let Word1{21} = END_OF_PROGRAM; + let Word1{22} = VALID_PIXEL_MODE; + let Word1{29-23} = CF_INST; + let Word1{30} = WHOLE_QUAD_MODE; let Word1{31} = BARRIER; } -class CF_CLAUSE <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), -ins, AsmPrint, [] >, CF_WORD0, CF_WORD1 { +class CF_CLAUSE_R600 <bits<7> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), +ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 { field bits<64> Inst; let CF_INST = inst; let BARRIER = 1; - let JUMPTABLE_SEL = 0; let CF_CONST = 0; let VALID_PIXEL_MODE = 0; let COND = 0; + let CALL_COUNT = 0; + let COUNT_3 = 0; + let END_OF_PROGRAM = 0; + let WHOLE_QUAD_MODE = 0; let Inst{31-0} = Word0; let Inst{63-32} = Word1; } -def CF_TC : CF_CLAUSE<1, (ins i32imm:$ADDR, i32imm:$COUNT), -"TEX $COUNT @$ADDR"> { - let POP_COUNT = 0; -} - -def CF_VC : CF_CLAUSE<2, (ins i32imm:$ADDR, i32imm:$COUNT), -"VTX $COUNT @$ADDR"> { - let POP_COUNT = 0; -} +class CF_WORD0_EG { + field bits<32> Word0; -def WHILE_LOOP : CF_CLAUSE<6, (ins i32imm:$ADDR), "LOOP_START_DX10 @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} + bits<24> ADDR; + bits<3> JUMPTABLE_SEL; -def END_LOOP : CF_CLAUSE<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; + let Word0{23-0} = ADDR; + let Word0{26-24} = JUMPTABLE_SEL; } -def LOOP_BREAK : CF_CLAUSE<9, (ins i32imm:$ADDR), "LOOP_BREAK @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} +class CF_WORD1_EG { + field bits<32> Word1; -def CF_CONTINUE : CF_CLAUSE<8, (ins i32imm:$ADDR), "CONTINUE @$ADDR"> { - let POP_COUNT = 0; - let COUNT = 0; -} + bits<3> POP_COUNT; + bits<5> CF_CONST; + bits<2> COND; + bits<6> COUNT; + bits<1> VALID_PIXEL_MODE; + bits<1> END_OF_PROGRAM; + bits<8> CF_INST; + bits<1> BARRIER; -def CF_JUMP : CF_CLAUSE<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "JUMP @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; + let Word1{2-0} = POP_COUNT; + let Word1{7-3} = CF_CONST; + let Word1{9-8} = COND; + let Word1{15-10} = COUNT; + let Word1{20} = VALID_PIXEL_MODE; + let Word1{21} = END_OF_PROGRAM; + let Word1{29-22} = CF_INST; + let Word1{31} = BARRIER; } -def CF_ELSE : CF_CLAUSE<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "ELSE @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; -} +class CF_CLAUSE_EG <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), +ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG { + field bits<64> Inst; -def CF_CALL_FS : CF_CLAUSE<19, (ins), "CALL_FS"> { - let ADDR = 0; - let COUNT = 0; - let POP_COUNT = 0; -} + let CF_INST = inst; + let BARRIER = 1; + let JUMPTABLE_SEL = 0; + let CF_CONST = 0; + let VALID_PIXEL_MODE = 0; + let COND = 0; + let END_OF_PROGRAM = 0; -def POP : CF_CLAUSE<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "POP @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; } def CF_ALU : ALU_CLAUSE<8, "ALU">; def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">; -def STACK_SIZE : AMDGPUInst <(outs), -(ins i32imm:$num), "nstack $num", [] > { +def FETCH_CLAUSE : AMDGPUInst <(outs), +(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > { field bits<8> 
Inst; bits<8> num; let Inst = num; } +def ALU_CLAUSE : AMDGPUInst <(outs), +(ins i32imm:$addr), "ALU clause starting at $addr:", [] > { + field bits<8> Inst; + bits<8> num; + let Inst = num; +} + +def LITERALS : AMDGPUInst <(outs), +(ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > { + field bits<64> Inst; + bits<32> literal1; + bits<32> literal2; + + let Inst{31-0} = literal1; + let Inst{63-32} = literal2; +} + +def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > { + field bits<64> Inst; +} + let Predicates = [isR600toCayman] in { //===----------------------------------------------------------------------===// @@ -944,58 +987,42 @@ def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>; // XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics. def SETE : R600_2OP < 0x08, "SETE", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, - COND_EQ))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_EQ))] >; def SGT : R600_2OP < 0x09, "SETGT", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, - COND_GT))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_GT))] >; def SGE : R600_2OP < 0xA, "SETGE", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, - COND_GE))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_GE))] >; def SNE : R600_2OP < 0xB, "SETNE", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, - COND_NE))] + [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_NE))] >; def SETE_DX10 : R600_2OP < 0xC, "SETE_DX10", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), - COND_EQ))] + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_EQ))] >; def SETGT_DX10 : R600_2OP < 0xD, "SETGT_DX10", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), - COND_GT))] + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_GT))] >; def SETGE_DX10 : R600_2OP < 0xE, "SETGE_DX10", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), - COND_GE))] + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_GE))] >; def SETNE_DX10 : R600_2OP < 0xF, "SETNE_DX10", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0), - COND_NE))] + [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_NE))] >; def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; @@ -1053,38 +1080,32 @@ def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", AMDGPUumin>; def SETE_INT : R600_2OP < 0x3A, "SETE_INT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETEQ))] + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETEQ))] >; def SETGT_INT : R600_2OP < 0x3B, "SETGT_INT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGT))] + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGT))] >; def SETGE_INT : R600_2OP < 0x3C, "SETGE_INT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGE))] + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGE))] >; def SETNE_INT : R600_2OP < 0x3D, "SETNE_INT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETNE))] + [(set i32:$dst, 
(selectcc i32:$src0, i32:$src1, -1, 0, SETNE))] >; def SETGT_UINT : R600_2OP < 0x3E, "SETGT_UINT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGT))] + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGT))] >; def SETGE_UINT : R600_2OP < 0x3F, "SETGE_UINT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGE))] + [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGE))] >; def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>; @@ -1094,26 +1115,17 @@ def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>; def CNDE_INT : R600_3OP < 0x1C, "CNDE_INT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), 0, - (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), - COND_EQ))] + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_EQ))] >; def CNDGE_INT : R600_3OP < 0x1E, "CNDGE_INT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), 0, - (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), - COND_GE))] + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_GE))] >; def CNDGT_INT : R600_3OP < 0x1D, "CNDGT_INT", - [(set (i32 R600_Reg32:$dst), - (selectcc (i32 R600_Reg32:$src0), 0, - (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), - COND_GT))] + [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_GT))] >; //===----------------------------------------------------------------------===// @@ -1122,7 +1134,7 @@ def CNDGT_INT : R600_3OP < def TEX_LD : R600_TEX < 0x03, "TEX_LD", - [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txf R600_Reg128:$SRC_GPR, + [(set v4f32:$DST_GPR, (int_AMDGPU_txf v4f32:$SRC_GPR, imm:$OFFSET_X, imm:$OFFSET_Y, imm:$OFFSET_Z, imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))] > { @@ -1135,19 +1147,19 @@ let InOperandList = (ins R600_Reg128:$SRC_GPR, i32imm:$OFFSET_X, def TEX_GET_TEXTURE_RESINFO : R600_TEX < 0x04, "TEX_GET_TEXTURE_RESINFO", - [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txq R600_Reg128:$SRC_GPR, + [(set v4f32:$DST_GPR, (int_AMDGPU_txq v4f32:$SRC_GPR, imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))] >; def TEX_GET_GRADIENTS_H : R600_TEX < 0x07, "TEX_GET_GRADIENTS_H", - [(set R600_Reg128:$DST_GPR, (int_AMDGPU_ddx R600_Reg128:$SRC_GPR, + [(set v4f32:$DST_GPR, (int_AMDGPU_ddx v4f32:$SRC_GPR, imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))] >; def TEX_GET_GRADIENTS_V : R600_TEX < 0x08, "TEX_GET_GRADIENTS_V", - [(set R600_Reg128:$DST_GPR, (int_AMDGPU_ddy R600_Reg128:$SRC_GPR, + [(set v4f32:$DST_GPR, (int_AMDGPU_ddy v4f32:$SRC_GPR, imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))] >; @@ -1163,37 +1175,37 @@ def TEX_SET_GRADIENTS_V : R600_TEX < def TEX_SAMPLE : R600_TEX < 0x10, "TEX_SAMPLE", - [(set R600_Reg128:$DST_GPR, (int_AMDGPU_tex R600_Reg128:$SRC_GPR, + [(set v4f32:$DST_GPR, (int_AMDGPU_tex v4f32:$SRC_GPR, imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))] >; def TEX_SAMPLE_C : R600_TEX < 0x18, "TEX_SAMPLE_C", - [(set R600_Reg128:$DST_GPR, (int_AMDGPU_tex R600_Reg128:$SRC_GPR, + [(set v4f32:$DST_GPR, (int_AMDGPU_tex v4f32:$SRC_GPR, imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))] >; def TEX_SAMPLE_L : R600_TEX < 0x11, "TEX_SAMPLE_L", - [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txl R600_Reg128:$SRC_GPR, + [(set v4f32:$DST_GPR, (int_AMDGPU_txl v4f32:$SRC_GPR, imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))] >; def TEX_SAMPLE_C_L : R600_TEX < 0x19, "TEX_SAMPLE_C_L", - [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txl R600_Reg128:$SRC_GPR, + [(set 
v4f32:$DST_GPR, (int_AMDGPU_txl v4f32:$SRC_GPR, imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))] >; def TEX_SAMPLE_LB : R600_TEX < 0x12, "TEX_SAMPLE_LB", - [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txb R600_Reg128:$SRC_GPR, + [(set v4f32:$DST_GPR, (int_AMDGPU_txb v4f32:$SRC_GPR, imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))] >; def TEX_SAMPLE_C_LB : R600_TEX < 0x1A, "TEX_SAMPLE_C_LB", - [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txb R600_Reg128:$SRC_GPR, + [(set v4f32:$DST_GPR, (int_AMDGPU_txb v4f32:$SRC_GPR, imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))] >; @@ -1223,32 +1235,22 @@ class MULADD_Common <bits<5> inst> : R600_3OP < class MULADD_IEEE_Common <bits<5> inst> : R600_3OP < inst, "MULADD_IEEE", - [(set (f32 R600_Reg32:$dst), - (fadd (fmul R600_Reg32:$src0, R600_Reg32:$src1), R600_Reg32:$src2))] + [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))] >; class CNDE_Common <bits<5> inst> : R600_3OP < inst, "CNDE", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), FP_ZERO, - (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), - COND_EQ))] + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_EQ))] >; class CNDGT_Common <bits<5> inst> : R600_3OP < inst, "CNDGT", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), FP_ZERO, - (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), - COND_GT))] + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GT))] >; class CNDGE_Common <bits<5> inst> : R600_3OP < inst, "CNDGE", - [(set R600_Reg32:$dst, - (selectcc (f32 R600_Reg32:$src0), FP_ZERO, - (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), - COND_GE))] + [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GE))] >; multiclass DOT4_Common <bits<11> inst> { @@ -1256,7 +1258,7 @@ multiclass DOT4_Common <bits<11> inst> { def _pseudo : R600_REDUCTION <inst, (ins R600_Reg128:$src0, R600_Reg128:$src1), "DOT4 $dst $src0, $src1", - [(set R600_Reg32:$dst, (int_AMDGPU_dp4 R600_Reg128:$src0, R600_Reg128:$src1))] + [(set f32:$dst, (int_AMDGPU_dp4 v4f32:$src0, v4f32:$src1))] >; def _real : R600_2OP <inst, "DOT4", []>; @@ -1266,11 +1268,10 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { multiclass CUBE_Common <bits<11> inst> { def _pseudo : InstR600 < - inst, (outs R600_Reg128:$dst), (ins R600_Reg128:$src), "CUBE $dst $src", - [(set R600_Reg128:$dst, (int_AMDGPU_cube R600_Reg128:$src))], + [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src))], VecALU > { let isPseudo = 1; @@ -1282,23 +1283,38 @@ multiclass CUBE_Common <bits<11> inst> { class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper < inst, "EXP_IEEE", fexp2 ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper < inst, "FLT_TO_INT", fp_to_sint ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper < inst, "INT_TO_FLT", sint_to_fp ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper < inst, "FLT_TO_UINT", fp_to_uint ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper < inst, "UINT_TO_FLT", uint_to_fp ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP < inst, "LOG_CLAMPED", [] @@ -1306,50 +1322,84 @@ class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP < class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper < 
inst, "LOG_IEEE", flog2 ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>; class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>; class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>; class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper < inst, "MULHI_INT", mulhs ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper < inst, "MULHI", mulhu ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper < inst, "MULLO_INT", mul ->; -class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []>; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} +class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []> { + let TransOnly = 1; + let Itinerary = TransALU; +} class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP < inst, "RECIP_CLAMPED", [] ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class RECIP_IEEE_Common <bits<11> inst> : R600_1OP < - inst, "RECIP_IEEE", [(set R600_Reg32:$dst, (fdiv FP_ONE, R600_Reg32:$src0))] ->; + inst, "RECIP_IEEE", [(set f32:$dst, (fdiv FP_ONE, f32:$src0))] +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper < inst, "RECIP_UINT", AMDGPUurecip ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper < inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP < inst, "RECIPSQRT_IEEE", [] ->; +> { + let TransOnly = 1; + let Itinerary = TransALU; +} class SIN_Common <bits<11> inst> : R600_1OP < inst, "SIN", []>{ let Trig = 1; + let TransOnly = 1; + let Itinerary = TransALU; } class COS_Common <bits<11> inst> : R600_1OP < inst, "COS", []> { let Trig = 1; + let TransOnly = 1; + let Itinerary = TransALU; } //===----------------------------------------------------------------------===// @@ -1358,19 +1408,20 @@ class COS_Common <bits<11> inst> : R600_1OP < multiclass DIV_Common <InstR600 recip_ieee> { def : Pat< - (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1), - (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) + (int_AMDGPU_div f32:$src0, f32:$src1), + (MUL_IEEE $src0, (recip_ieee $src1)) >; def : Pat< - (fdiv R600_Reg32:$src0, R600_Reg32:$src1), - (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1)) + (fdiv f32:$src0, f32:$src1), + (MUL_IEEE $src0, (recip_ieee $src1)) >; } -class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> : Pat < - (int_TGSI_lit_z R600_Reg32:$src_x, R600_Reg32:$src_y, R600_Reg32:$src_w), - (exp_ieee (mul_lit (log_clamped (MAX R600_Reg32:$src_y, (f32 ZERO))), R600_Reg32:$src_w, R600_Reg32:$src_x)) +class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> + : Pat < + (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w), + (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x)) >; //===----------------------------------------------------------------------===// @@ -1410,14 +1461,13 @@ let Predicates = [isR600] in { def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>; defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>; - def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL, R600_Reg32>; + def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>; def TGSI_LIT_Z_r600 
: TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>; - def : Pat<(fsqrt R600_Reg32:$src), - (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_r600 R600_Reg32:$src))>; + def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; def R600_ExportSwz : ExportSwzInst { - let Word1{20-17} = 1; // BURST_COUNT + let Word1{20-17} = 0; // BURST_COUNT let Word1{21} = eop; let Word1{22} = 1; // VALID_PIXEL_MODE let Word1{30-23} = inst; @@ -1426,25 +1476,77 @@ let Predicates = [isR600] in { defm : ExportPattern<R600_ExportSwz, 39>; def R600_ExportBuf : ExportBufInst { - let Word1{20-17} = 1; // BURST_COUNT + let Word1{20-17} = 0; // BURST_COUNT let Word1{21} = eop; let Word1{22} = 1; // VALID_PIXEL_MODE let Word1{30-23} = inst; let Word1{31} = 1; // BARRIER } defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 0x22, 0x23>; + + def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$COUNT), + "TEX $COUNT @$ADDR"> { + let POP_COUNT = 0; + } + def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$COUNT), + "VTX $COUNT @$ADDR"> { + let POP_COUNT = 0; + } + def WHILE_LOOP_R600 : CF_CLAUSE_R600<6, (ins i32imm:$ADDR), + "LOOP_START_DX10 @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def END_LOOP_R600 : CF_CLAUSE_R600<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def LOOP_BREAK_R600 : CF_CLAUSE_R600<9, (ins i32imm:$ADDR), + "LOOP_BREAK @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def CF_CONTINUE_R600 : CF_CLAUSE_R600<8, (ins i32imm:$ADDR), + "CONTINUE @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def CF_JUMP_R600 : CF_CLAUSE_R600<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "JUMP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; + } + def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "ELSE @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; + } + def CF_CALL_FS_R600 : CF_CLAUSE_R600<19, (ins), "CALL_FS"> { + let ADDR = 0; + let COUNT = 0; + let POP_COUNT = 0; + } + def POP_R600 : CF_CLAUSE_R600<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "POP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; + } + def CF_END_R600 : CF_CLAUSE_R600<0, (ins), "CF_END"> { + let COUNT = 0; + let POP_COUNT = 0; + let ADDR = 0; + let END_OF_PROGRAM = 1; + } + } // Helper pattern for normalizing inputs to triginomic instructions for R700+ // cards. 
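The "helper pattern for normalizing inputs" comment above refers to the SIN_PAT/COS_PAT classes that follow: fsin/fcos arguments are pre-multiplied by CONST.TWO_PI_INV before reaching the native SIN/COS op. A rough scalar model of that rewrite, on the editor's reading that the R700+ units take a 1/(2π)-scaled argument; the stand-in function and constant value are assumptions, not taken from the patch:

    #include <cmath>

    // Stand-in for the hardware SIN opcode, modeled as expecting a
    // pre-normalized ("turns") argument.
    static float nativeSIN(float Turns) {
      return std::sin(Turns * 2.0f * static_cast<float>(M_PI));
    }

    // What the SIN_PAT rewrite computes: scale by the assumed value of
    // CONST.TWO_PI_INV, then issue the native instruction.
    static float loweredSin(float X) {
      const float TwoPiInv = 1.0f / (2.0f * static_cast<float>(M_PI));
      return nativeSIN(X * TwoPiInv);
    }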
class COS_PAT <InstR600 trig> : Pat< - (fcos R600_Reg32:$src), - (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) + (fcos f32:$src), + (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), $src)) >; class SIN_PAT <InstR600 trig> : Pat< - (fsin R600_Reg32:$src), - (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) + (fsin f32:$src), + (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), $src)) >; //===----------------------------------------------------------------------===// @@ -1482,11 +1584,10 @@ def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; def SIN_eg : SIN_Common<0x8D>; def COS_eg : COS_Common<0x8E>; -def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL, R600_Reg32>; +def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>; def : SIN_PAT <SIN_eg>; def : COS_PAT <COS_eg>; -def : Pat<(fsqrt R600_Reg32:$src), - (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_eg R600_Reg32:$src))>; +def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; } // End Predicates = [isEG] //===----------------------------------------------------------------------===// @@ -1510,15 +1611,17 @@ let Predicates = [isEGorCayman] in { // (16,8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16 // (24,8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24 def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", - [(set R600_Reg32:$dst, (int_AMDIL_bit_extract_u32 R600_Reg32:$src0, - R600_Reg32:$src1, - R600_Reg32:$src2))], + [(set i32:$dst, (int_AMDIL_bit_extract_u32 i32:$src0, i32:$src1, + i32:$src2))], VecALU >; + def : BFEPattern <BFE_UINT_eg>; + + def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>; + defm : BFIPatterns <BFI_INT_eg>; def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", - [(set R600_Reg32:$dst, (AMDGPUbitalign R600_Reg32:$src0, R600_Reg32:$src1, - R600_Reg32:$src2))], + [(set i32:$dst, (AMDGPUbitalign i32:$src0, i32:$src1, i32:$src2))], VecALU >; @@ -1563,14 +1666,15 @@ let hasSideEffects = 1 in { // XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes, // which do not need to be truncated since the fp values are 0.0f or 1.0f. // We should look into handling these cases separately. 
- def : Pat<(fp_to_sint R600_Reg32:$src0), - (FLT_TO_INT_eg (TRUNC R600_Reg32:$src0))>; + def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>; + + def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; - def : Pat<(fp_to_uint R600_Reg32:$src0), - (FLT_TO_UINT_eg (TRUNC R600_Reg32:$src0))>; + // SHA-256 Patterns + def : SHA256MaPattern <BFI_INT_eg, XOR_INT>; def EG_ExportSwz : ExportSwzInst { - let Word1{19-16} = 1; // BURST_COUNT + let Word1{19-16} = 0; // BURST_COUNT let Word1{20} = 1; // VALID_PIXEL_MODE let Word1{21} = eop; let Word1{29-22} = inst; @@ -1580,7 +1684,7 @@ let hasSideEffects = 1 in { defm : ExportPattern<EG_ExportSwz, 83>; def EG_ExportBuf : ExportBufInst { - let Word1{19-16} = 1; // BURST_COUNT + let Word1{19-16} = 0; // BURST_COUNT let Word1{20} = 1; // VALID_PIXEL_MODE let Word1{21} = eop; let Word1{29-22} = inst; @@ -1589,6 +1693,57 @@ let hasSideEffects = 1 in { } defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>; + def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT), + "TEX $COUNT @$ADDR"> { + let POP_COUNT = 0; + } + def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT), + "VTX $COUNT @$ADDR"> { + let POP_COUNT = 0; + } + def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR), + "LOOP_START_DX10 @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR), + "LOOP_BREAK @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR), + "CONTINUE @$ADDR"> { + let POP_COUNT = 0; + let COUNT = 0; + } + def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "JUMP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; + } + def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "ELSE @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; + } + def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> { + let ADDR = 0; + let COUNT = 0; + let POP_COUNT = 0; + } + def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "POP @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; + } + def CF_END_EG : CF_CLAUSE_EG<0, (ins), "CF_END"> { + let COUNT = 0; + let POP_COUNT = 0; + let ADDR = 0; + let END_OF_PROGRAM = 1; + } + //===----------------------------------------------------------------------===// // Memory read/write instructions //===----------------------------------------------------------------------===// @@ -1618,14 +1773,14 @@ class RAT_WRITE_CACHELESS_eg <dag ins, bits<4> comp_mask, string name, def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg < (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), 0x1, "RAT_WRITE_CACHELESS_32_eg", - [(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)] + [(global_store i32:$rw_gpr, i32:$index_gpr)] >; //128-bit store def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg < (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), 0xf, "RAT_WRITE_CACHELESS_128", - [(global_store (v4i32 R600_Reg128:$rw_gpr), R600_TReg32_X:$index_gpr)] + [(global_store v4i32:$rw_gpr, i32:$index_gpr)] >; class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> @@ -1679,6 +1834,8 @@ class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> // VTX_WORD3 (Padding) // // Inst{127-96} = 0; + + let VTXInst = 1; } class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern> 
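Earlier in this hunk, BFI_INT_eg is introduced alongside BFIPatterns and a SHA256MaPattern that pairs it with XOR_INT. The link is the majority-function identity; a small sketch of it, assuming BFI_INT has the usual (mask & a) | (~mask & b) semantics (the pattern classes themselves live outside this diff, so treat this as the editor's reading rather than the backend's definition):

    #include <cstdint>

    // Assumed BFI semantics: take bits of A where Mask is set, bits of B elsewhere.
    static uint32_t bfi(uint32_t Mask, uint32_t A, uint32_t B) {
      return (Mask & A) | (~Mask & B);
    }

    // SHA-256 Ma(x,y,z) = (x & y) | (x & z) | (y & z); with x^y in hand it is a
    // single BFI, which is why the pattern also needs XOR_INT.
    static uint32_t sha256Ma(uint32_t X, uint32_t Y, uint32_t Z) {
      return bfi(X ^ Y, Z, Y);
    }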
@@ -1748,19 +1905,19 @@ class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern> //===----------------------------------------------------------------------===// def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0, - [(set (i32 R600_TReg32_X:$dst), (load_param_zexti8 ADDRVTX_READ:$ptr))] + [(set i32:$dst, (load_param_zexti8 ADDRVTX_READ:$ptr))] >; def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0, - [(set (i32 R600_TReg32_X:$dst), (load_param_zexti16 ADDRVTX_READ:$ptr))] + [(set i32:$dst, (load_param_zexti16 ADDRVTX_READ:$ptr))] >; def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0, - [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))] + [(set i32:$dst, (load_param ADDRVTX_READ:$ptr))] >; def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, - [(set (v4i32 R600_Reg128:$dst), (load_param ADDRVTX_READ:$ptr))] + [(set v4i32:$dst, (load_param ADDRVTX_READ:$ptr))] >; //===----------------------------------------------------------------------===// @@ -1769,17 +1926,17 @@ def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, // 8-bit reads def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1, - [(set (i32 R600_TReg32_X:$dst), (zextloadi8_global ADDRVTX_READ:$ptr))] + [(set i32:$dst, (zextloadi8_global ADDRVTX_READ:$ptr))] >; // 32-bit reads def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1, - [(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))] + [(set i32:$dst, (global_load ADDRVTX_READ:$ptr))] >; // 128-bit reads def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, - [(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))] + [(set v4i32:$dst, (global_load ADDRVTX_READ:$ptr))] >; //===----------------------------------------------------------------------===// @@ -1788,7 +1945,7 @@ def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, //===----------------------------------------------------------------------===// def CONSTANT_LOAD_eg : VTX_READ_32_eg <1, - [(set (i32 R600_TReg32_X:$dst), (constant_load ADDRVTX_READ:$ptr))] + [(set i32:$dst, (constant_load ADDRVTX_READ:$ptr))] >; } @@ -1818,22 +1975,27 @@ def SIN_cm : SIN_Common<0x8D>; def COS_cm : COS_Common<0x8E>; } // End isVector = 1 -def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL, R600_Reg32>; +def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>; def : SIN_PAT <SIN_cm>; def : COS_PAT <COS_cm>; defm DIV_cm : DIV_Common<RECIP_IEEE_cm>; // RECIP_UINT emulation for Cayman +// The multiplication scales from [0,1] to the unsigned integer range def : Pat < - (AMDGPUurecip R600_Reg32:$src0), - (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg R600_Reg32:$src0)), - (MOV_IMM_I32 0x4f800000))) + (AMDGPUurecip i32:$src0), + (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)), + (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1))) >; + def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> { + let ADDR = 0; + let POP_COUNT = 0; + let COUNT = 0; + } -def : Pat<(fsqrt R600_Reg32:$src), - (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm R600_Reg32:$src))>; +def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>; } // End isCayman @@ -1855,21 +2017,21 @@ def PREDICATED_BREAK : ILFormat<(outs), (ins GPRI32:$src), let isPseudo = 1 in { def PRED_X : InstR600 < - 0, (outs R600_Predicate_Bit:$dst), + (outs R600_Predicate_Bit:$dst), (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags), "", [], NullALU> { let FlagOperandIdx = 3; } let isTerminator = 1, isBranch = 1 in { -def JUMP_COND : InstR600 <0x10, +def JUMP_COND : InstR600 < (outs), (ins brtarget:$target, R600_Predicate_Bit:$p), "JUMP $target ($p)", [], AnyALU >; -def JUMP : InstR600 <0x10, +def JUMP : 
InstR600 < (outs), (ins brtarget:$target), "JUMP $target", @@ -1896,20 +2058,28 @@ def MASK_WRITE : AMDGPUShaderInst < } // End mayLoad = 0, mayStore = 0, hasSideEffects = 1 -def TXD: AMDGPUShaderInst < +def TXD: InstR600 < (outs R600_Reg128:$dst), - (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, + i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", - [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] ->; + [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, + imm:$resourceId, imm:$samplerId, imm:$textureTarget))], + NullALU > { + let TEXInst = 1; +} -def TXD_SHADOW: AMDGPUShaderInst < +def TXD_SHADOW: InstR600 < (outs R600_Reg128:$dst), - (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), + (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, + i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", - [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] ->; - + [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, + imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))], + NullALU +> { + let TEXInst = 1; +} } // End isPseudo = 1 } // End usesCustomInserter = 1 @@ -1946,7 +2116,7 @@ def CONST_COPY : Instruction { def TEX_VTX_CONSTBUF : InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr", - [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>, + [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>, VTX_WORD1_GPR, VTX_WORD0 { let VC_INST = 0; @@ -1995,11 +2165,12 @@ def TEX_VTX_CONSTBUF : // VTX_WORD3 (Padding) // // Inst{127-96} = 0; + let VTXInst = 1; } def TEX_VTX_TEXBUF: InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr", - [(set R600_Reg128:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, + [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, VTX_WORD1_GPR, VTX_WORD0 { let VC_INST = 0; @@ -2048,6 +2219,7 @@ let Inst{63-32} = Word1; // VTX_WORD3 (Padding) // // Inst{127-96} = 0; + let VTXInst = 1; } @@ -2124,9 +2296,8 @@ let isTerminator=1 in { // CND*_INT Pattterns for f32 True / False values class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat < - (selectcc (i32 R600_Reg32:$src0), 0, (f32 R600_Reg32:$src1), - R600_Reg32:$src2, cc), - (cnd R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2) + (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc), + (cnd $src0, $src1, $src2) >; def : CND_INT_f32 <CNDE_INT, SETEQ>; @@ -2135,9 +2306,8 @@ def : CND_INT_f32 <CNDGE_INT, SETGE>; //CNDGE_INT extra pattern def : Pat < - (selectcc (i32 R600_Reg32:$src0), -1, (i32 R600_Reg32:$src1), - (i32 R600_Reg32:$src2), COND_GT), - (CNDGE_INT R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2) + (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_GT), + (CNDGE_INT $src0, $src1, $src2) >; // KIL Patterns @@ -2147,56 +2317,56 @@ def KILP : Pat < >; def 
KIL : Pat < - (int_AMDGPU_kill R600_Reg32:$src0), - (MASK_WRITE (KILLGT (f32 ZERO), (f32 R600_Reg32:$src0))) + (int_AMDGPU_kill f32:$src0), + (MASK_WRITE (KILLGT (f32 ZERO), $src0)) >; // SGT Reverse args def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LT), - (SGT R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_LT), + (SGT $src1, $src0) >; // SGE Reverse args def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LE), - (SGE R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_LE), + (SGE $src1, $src0) >; // SETGT_DX10 reverse args def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LT), - (SETGT_DX10 R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc f32:$src0, f32:$src1, -1, 0, COND_LT), + (SETGT_DX10 $src1, $src0) >; // SETGE_DX10 reverse args def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LE), - (SETGE_DX10 R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc f32:$src0, f32:$src1, -1, 0, COND_LE), + (SETGE_DX10 $src1, $src0) >; // SETGT_INT reverse args def : Pat < - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLT), - (SETGT_INT R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc i32:$src0, i32:$src1, -1, 0, SETLT), + (SETGT_INT $src1, $src0) >; // SETGE_INT reverse args def : Pat < - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLE), - (SETGE_INT R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc i32:$src0, i32:$src1, -1, 0, SETLE), + (SETGE_INT $src1, $src0) >; // SETGT_UINT reverse args def : Pat < - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULT), - (SETGT_UINT R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc i32:$src0, i32:$src1, -1, 0, SETULT), + (SETGT_UINT $src1, $src0) >; // SETGE_UINT reverse args def : Pat < - (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULE), - (SETGE_UINT R600_Reg32:$src1, R600_Reg32:$src0) + (selectcc i32:$src0, i32:$src1, -1, 0, SETULE), + (SETGE_UINT $src1, $src0) >; // The next two patterns are special cases for handling 'true if ordered' and @@ -2209,50 +2379,50 @@ def : Pat < //SETE - 'true if ordered' def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETO), - (SETE R600_Reg32:$src0, R600_Reg32:$src1) + (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, SETO), + (SETE $src0, $src1) >; //SETE_DX10 - 'true if ordered' def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETO), - (SETE_DX10 R600_Reg32:$src0, R600_Reg32:$src1) + (selectcc f32:$src0, f32:$src1, -1, 0, SETO), + (SETE_DX10 $src0, $src1) >; //SNE - 'true if unordered' def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETUO), - (SNE R600_Reg32:$src0, R600_Reg32:$src1) + (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, SETUO), + (SNE $src0, $src1) >; //SETNE_DX10 - 'true if ordered' def : Pat < - (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUO), - (SETNE_DX10 R600_Reg32:$src0, R600_Reg32:$src1) + (selectcc f32:$src0, f32:$src1, -1, 0, SETUO), + (SETNE_DX10 $src0, $src1) >; -def : Extract_Element <f32, v4f32, R600_Reg128, 0, sub0>; -def : Extract_Element <f32, v4f32, R600_Reg128, 1, sub1>; -def : Extract_Element <f32, v4f32, R600_Reg128, 2, sub2>; -def : Extract_Element <f32, v4f32, R600_Reg128, 3, sub3>; +def : Extract_Element <f32, v4f32, 0, sub0>; +def : Extract_Element <f32, v4f32, 1, sub1>; +def : Extract_Element <f32, v4f32, 2, 
sub2>; +def : Extract_Element <f32, v4f32, 3, sub3>; -def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 0, sub0>; -def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 1, sub1>; -def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 2, sub2>; -def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 3, sub3>; +def : Insert_Element <f32, v4f32, 0, sub0>; +def : Insert_Element <f32, v4f32, 1, sub1>; +def : Insert_Element <f32, v4f32, 2, sub2>; +def : Insert_Element <f32, v4f32, 3, sub3>; -def : Extract_Element <i32, v4i32, R600_Reg128, 0, sub0>; -def : Extract_Element <i32, v4i32, R600_Reg128, 1, sub1>; -def : Extract_Element <i32, v4i32, R600_Reg128, 2, sub2>; -def : Extract_Element <i32, v4i32, R600_Reg128, 3, sub3>; +def : Extract_Element <i32, v4i32, 0, sub0>; +def : Extract_Element <i32, v4i32, 1, sub1>; +def : Extract_Element <i32, v4i32, 2, sub2>; +def : Extract_Element <i32, v4i32, 3, sub3>; -def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 0, sub0>; -def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sub1>; -def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, sub2>; -def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>; +def : Insert_Element <i32, v4i32, 0, sub0>; +def : Insert_Element <i32, v4i32, 1, sub1>; +def : Insert_Element <i32, v4i32, 2, sub2>; +def : Insert_Element <i32, v4i32, 3, sub3>; -def : Vector4_Build <v4f32, R600_Reg128, f32, R600_Reg32>; -def : Vector4_Build <v4i32, R600_Reg128, i32, R600_Reg32>; +def : Vector4_Build <v4f32, f32>; +def : Vector4_Build <v4i32, i32>; // bitconvert patterns diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h index 99c1f91..70fddbb 100644 --- a/lib/Target/R600/R600MachineFunctionInfo.h +++ b/lib/Target/R600/R600MachineFunctionInfo.h @@ -25,6 +25,7 @@ public: R600MachineFunctionInfo(const MachineFunction &MF); SmallVector<unsigned, 4> LiveOuts; std::vector<unsigned> IndirectRegs; + unsigned StackSize; }; } // End llvm namespace diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp new file mode 100644 index 0000000..cd7b7d0 --- /dev/null +++ b/lib/Target/R600/R600Packetizer.cpp @@ -0,0 +1,459 @@ +//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass implements instructions packetization for R600. It unsets isLast +/// bit of instructions inside a bundle and substitutes src register with +/// PreviousVector when applicable. 
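The R600Packetizer.cpp header above summarizes the pass; concretely, getPreviousVector() (further down in the file) maps each register written by the immediately preceding instruction group to the matching PV channel, and substitutePV() rewrites sources accordingly. A condensed sketch of that channel mapping, mirroring the switch in the pass and assuming the backend's AMDGPU register names; illustrative only:

    // Mirror of the chan -> PV register mapping used by getPreviousVector().
    // The AMDGPU::PV_* definitions come from the R600RegisterInfo.td change
    // later in this commit.
    static unsigned pvRegForChannel(unsigned Chan) {
      switch (Chan) {
      case 0: return AMDGPU::PV_X;
      case 1: return AMDGPU::PV_Y;
      case 2: return AMDGPU::PV_Z;
      case 3: return AMDGPU::PV_W;
      }
      llvm_unreachable("Invalid Chan");
    }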
+// +//===----------------------------------------------------------------------===// + +#ifndef R600PACKETIZER_CPP +#define R600PACKETIZER_CPP + +#define DEBUG_TYPE "packets" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "AMDGPU.h" +#include "R600InstrInfo.h" + +namespace llvm { + +class R600Packetizer : public MachineFunctionPass { + +public: + static char ID; + R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + const char *getPassName() const { + return "R600 Packetizer"; + } + + bool runOnMachineFunction(MachineFunction &Fn); +}; +char R600Packetizer::ID = 0; + +class R600PacketizerList : public VLIWPacketizerList { + +private: + const R600InstrInfo *TII; + const R600RegisterInfo &TRI; + + enum BankSwizzle { + ALU_VEC_012 = 0, + ALU_VEC_021, + ALU_VEC_120, + ALU_VEC_102, + ALU_VEC_201, + ALU_VEC_210 + }; + + unsigned getSlot(const MachineInstr *MI) const { + return TRI.getHWRegChan(MI->getOperand(0).getReg()); + } + + /// \returns register to PV chan mapping for bundle/single instructions that + /// immediatly precedes I. + DenseMap<unsigned, unsigned> getPreviousVector(MachineBasicBlock::iterator I) + const { + DenseMap<unsigned, unsigned> Result; + I--; + if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle()) + return Result; + MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); + if (I->isBundle()) + BI++; + do { + if (TII->isPredicated(BI)) + continue; + if (TII->isTransOnly(BI)) + continue; + int OperandIdx = TII->getOperandIdx(BI->getOpcode(), R600Operands::WRITE); + if (OperandIdx < 0) + continue; + if (BI->getOperand(OperandIdx).getImm() == 0) + continue; + unsigned Dst = BI->getOperand(0).getReg(); + if (BI->getOpcode() == AMDGPU::DOT4_r600_real) { + Result[Dst] = AMDGPU::PV_X; + continue; + } + unsigned PVReg = 0; + switch (TRI.getHWRegChan(Dst)) { + case 0: + PVReg = AMDGPU::PV_X; + break; + case 1: + PVReg = AMDGPU::PV_Y; + break; + case 2: + PVReg = AMDGPU::PV_Z; + break; + case 3: + PVReg = AMDGPU::PV_W; + break; + default: + llvm_unreachable("Invalid Chan"); + } + Result[Dst] = PVReg; + } while ((++BI)->isBundledWithPred()); + return Result; + } + + void substitutePV(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PVs) + const { + R600Operands::Ops Ops[] = { + R600Operands::SRC0, + R600Operands::SRC1, + R600Operands::SRC2 + }; + for (unsigned i = 0; i < 3; i++) { + int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]); + if (OperandIdx < 0) + continue; + unsigned Src = MI->getOperand(OperandIdx).getReg(); + const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src); + if (It != PVs.end()) + MI->getOperand(OperandIdx).setReg(It->second); + } + } +public: + // Ctor. 
+ R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI, + MachineDominatorTree &MDT) + : VLIWPacketizerList(MF, MLI, MDT, true), + TII (static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo())), + TRI(TII->getRegisterInfo()) { } + + // initPacketizerState - initialize some internal flags. + void initPacketizerState() { } + + // ignorePseudoInstruction - Ignore bundling of pseudo instructions. + bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB) { + return false; + } + + // isSoloInstruction - return true if instruction MI can not be packetized + // with any other instruction, which means that MI itself is a packet. + bool isSoloInstruction(MachineInstr *MI) { + if (TII->isVector(*MI)) + return true; + if (!TII->isALUInstr(MI->getOpcode())) + return true; + if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TRANS_ONLY) + return true; + if (TII->isTransOnly(MI)) + return true; + return false; + } + + // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ + // together. + bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { + MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); + if (getSlot(MII) <= getSlot(MIJ)) + return false; + // Does MII and MIJ share the same pred_sel ? + int OpI = TII->getOperandIdx(MII->getOpcode(), R600Operands::PRED_SEL), + OpJ = TII->getOperandIdx(MIJ->getOpcode(), R600Operands::PRED_SEL); + unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, + PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; + if (PredI != PredJ) + return false; + if (SUJ->isSucc(SUI)) { + for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) { + const SDep &Dep = SUJ->Succs[i]; + if (Dep.getSUnit() != SUI) + continue; + if (Dep.getKind() == SDep::Anti) + continue; + if (Dep.getKind() == SDep::Output) + if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg()) + continue; + return false; + } + } + return true; + } + + // isLegalToPruneDependencies - Is it legal to prune dependece between SUI + // and SUJ. 
+ bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {return false;} + + void setIsLastBit(MachineInstr *MI, unsigned Bit) const { + unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), R600Operands::LAST); + MI->getOperand(LastOp).setImm(Bit); + } + + MachineBasicBlock::iterator addToPacket(MachineInstr *MI) { + CurrentPacketMIs.push_back(MI); + bool FitsConstLimits = TII->canBundle(CurrentPacketMIs); + DEBUG( + if (!FitsConstLimits) { + dbgs() << "Couldn't pack :\n"; + MI->dump(); + dbgs() << "with the following packets :\n"; + for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { + CurrentPacketMIs[i]->dump(); + dbgs() << "\n"; + } + dbgs() << "because of Consts read limitations\n"; + }); + const DenseMap<unsigned, unsigned> &PV = + getPreviousVector(CurrentPacketMIs.front()); + bool FitsReadPortLimits = fitsReadPortLimitation(CurrentPacketMIs, PV); + DEBUG( + if (!FitsReadPortLimits) { + dbgs() << "Couldn't pack :\n"; + MI->dump(); + dbgs() << "with the following packets :\n"; + for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { + CurrentPacketMIs[i]->dump(); + dbgs() << "\n"; + } + dbgs() << "because of Read port limitations\n"; + }); + bool isBundlable = FitsConstLimits && FitsReadPortLimits; + CurrentPacketMIs.pop_back(); + if (!isBundlable) { + endPacket(MI->getParent(), MI); + substitutePV(MI, getPreviousVector(MI)); + return VLIWPacketizerList::addToPacket(MI); + } + if (!CurrentPacketMIs.empty()) + setIsLastBit(CurrentPacketMIs.back(), 0); + substitutePV(MI, PV); + return VLIWPacketizerList::addToPacket(MI); + } +private: + std::vector<std::pair<int, unsigned> > + ExtractSrcs(const MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV) + const { + R600Operands::Ops Ops[] = { + R600Operands::SRC0, + R600Operands::SRC1, + R600Operands::SRC2 + }; + std::vector<std::pair<int, unsigned> > Result; + for (unsigned i = 0; i < 3; i++) { + int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]); + if (OperandIdx < 0){ + Result.push_back(std::pair<int, unsigned>(-1,0)); + continue; + } + unsigned Src = MI->getOperand(OperandIdx).getReg(); + if (PV.find(Src) != PV.end()) { + Result.push_back(std::pair<int, unsigned>(-1,0)); + continue; + } + unsigned Reg = TRI.getEncodingValue(Src) & 0xff; + if (Reg > 127) { + Result.push_back(std::pair<int, unsigned>(-1,0)); + continue; + } + unsigned Chan = TRI.getHWRegChan(Src); + Result.push_back(std::pair<int, unsigned>(Reg, Chan)); + } + return Result; + } + + std::vector<std::pair<int, unsigned> > + Swizzle(std::vector<std::pair<int, unsigned> > Src, + BankSwizzle Swz) const { + switch (Swz) { + case ALU_VEC_012: + break; + case ALU_VEC_021: + std::swap(Src[1], Src[2]); + break; + case ALU_VEC_102: + std::swap(Src[0], Src[1]); + break; + case ALU_VEC_120: + std::swap(Src[0], Src[1]); + std::swap(Src[0], Src[2]); + break; + case ALU_VEC_201: + std::swap(Src[0], Src[2]); + std::swap(Src[0], Src[1]); + break; + case ALU_VEC_210: + std::swap(Src[0], Src[2]); + break; + } + return Src; + } + + bool isLegal(const std::vector<MachineInstr *> &IG, + const std::vector<BankSwizzle> &Swz, + const DenseMap<unsigned, unsigned> &PV) const { + assert (Swz.size() == IG.size()); + int Vector[4][3]; + memset(Vector, -1, sizeof(Vector)); + for (unsigned i = 0, e = IG.size(); i < e; i++) { + const std::vector<std::pair<int, unsigned> > &Srcs = + Swizzle(ExtractSrcs(IG[i], PV), Swz[i]); + for (unsigned j = 0; j < 3; j++) { + const std::pair<int, unsigned> &Src = Srcs[j]; + if (Src.first < 0) + continue; + if 
(Vector[Src.second][j] < 0) + Vector[Src.second][j] = Src.first; + if (Vector[Src.second][j] != Src.first) + return false; + } + } + return true; + } + + bool recursiveFitsFPLimitation( + std::vector<MachineInstr *> IG, + const DenseMap<unsigned, unsigned> &PV, + std::vector<BankSwizzle> &SwzCandidate, + std::vector<MachineInstr *> CurrentlyChecked) + const { + if (!isLegal(CurrentlyChecked, SwzCandidate, PV)) + return false; + if (IG.size() == CurrentlyChecked.size()) { + return true; + } + BankSwizzle AvailableSwizzle[] = { + ALU_VEC_012, + ALU_VEC_021, + ALU_VEC_120, + ALU_VEC_102, + ALU_VEC_201, + ALU_VEC_210 + }; + CurrentlyChecked.push_back(IG[CurrentlyChecked.size()]); + for (unsigned i = 0; i < 6; i++) { + SwzCandidate.push_back(AvailableSwizzle[i]); + if (recursiveFitsFPLimitation(IG, PV, SwzCandidate, CurrentlyChecked)) + return true; + SwzCandidate.pop_back(); + } + return false; + } + + bool fitsReadPortLimitation( + std::vector<MachineInstr *> IG, + const DenseMap<unsigned, unsigned> &PV) + const { + //Todo : support shared src0 - src1 operand + std::vector<BankSwizzle> SwzCandidate; + bool Result = recursiveFitsFPLimitation(IG, PV, SwzCandidate, + std::vector<MachineInstr *>()); + if (!Result) + return false; + for (unsigned i = 0, e = IG.size(); i < e; i++) { + MachineInstr *MI = IG[i]; + unsigned Op = TII->getOperandIdx(MI->getOpcode(), + R600Operands::BANK_SWIZZLE); + MI->getOperand(Op).setImm(SwzCandidate[i]); + } + return true; + } +}; + +bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { + const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo(); + MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); + MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); + + // Instantiate the packetizer. + R600PacketizerList Packetizer(Fn, MLI, MDT); + + // DFA state table should not be empty. + assert(Packetizer.getResourceTracker() && "Empty DFA table!"); + + // + // Loop over all basic blocks and remove KILL pseudo-instructions + // These instructions confuse the dependence analysis. Consider: + // D0 = ... (Insn 0) + // R0 = KILL R0, D0 (Insn 1) + // R0 = ... (Insn 2) + // Here, Insn 1 will result in the dependence graph not emitting an output + // dependence between Insn 0 and Insn 2. This can lead to incorrect + // packetization + // + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + MachineBasicBlock::iterator End = MBB->end(); + MachineBasicBlock::iterator MI = MBB->begin(); + while (MI != End) { + if (MI->isKill()) { + MachineBasicBlock::iterator DeleteMI = MI; + ++MI; + MBB->erase(DeleteMI); + End = MBB->end(); + continue; + } + ++MI; + } + } + + // Loop over all of the basic blocks. + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); + MBB != MBBe; ++MBB) { + // Find scheduling regions and schedule / packetize each region. + unsigned RemainingCount = MBB->size(); + for(MachineBasicBlock::iterator RegionEnd = MBB->end(); + RegionEnd != MBB->begin();) { + // The next region starts above the previous region. Look backward in the + // instruction stream until we find the nearest boundary. + MachineBasicBlock::iterator I = RegionEnd; + for(;I != MBB->begin(); --I, --RemainingCount) { + if (TII->isSchedulingBoundary(llvm::prior(I), MBB, Fn)) + break; + } + I = MBB->begin(); + + // Skip empty scheduling regions. + if (I == RegionEnd) { + RegionEnd = llvm::prior(RegionEnd); + --RemainingCount; + continue; + } + // Skip regions with one instruction. 
+ if (I == llvm::prior(RegionEnd)) { + RegionEnd = llvm::prior(RegionEnd); + continue; + } + + Packetizer.PacketizeMIs(MBB, I, RegionEnd); + RegionEnd = I; + } + } + + return true; + +} + +} + +llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) { + return new R600Packetizer(tm); +} + +#endif // R600PACKETIZER_CPP diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index 03f4976..bfc546b 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -88,8 +88,14 @@ def NEG_ONE : R600Reg<"-1.0", 249>; def ONE_INT : R600Reg<"1", 250>; def HALF : R600Reg<"0.5", 252>; def NEG_HALF : R600Reg<"-0.5", 252>; -def ALU_LITERAL_X : R600Reg<"literal.x", 253>; -def PV_X : R600Reg<"pv.x", 254>; +def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">; +def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">; +def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">; +def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">; +def PV_X : R600RegWithChan<"PV.x", 254, "X">; +def PV_Y : R600RegWithChan<"PV.y", 254, "Y">; +def PV_Z : R600RegWithChan<"PV.z", 254, "Z">; +def PV_W : R600RegWithChan<"PV.w", 254, "W">; def PREDICATE_BIT : R600Reg<"PredicateBit", 0>; def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; diff --git a/lib/Target/R600/R600Schedule.td b/lib/Target/R600/R600Schedule.td index 7ede181..78a460a 100644 --- a/lib/Target/R600/R600Schedule.td +++ b/lib/Target/R600/R600Schedule.td @@ -24,7 +24,7 @@ def AnyALU : InstrItinClass; def VecALU : InstrItinClass; def TransALU : InstrItinClass; -def R600_EG_Itin : ProcessorItineraries < +def R600_VLIW5_Itin : ProcessorItineraries < [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL], [], [ @@ -34,3 +34,14 @@ def R600_EG_Itin : ProcessorItineraries < InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]> ] >; + +def R600_VLIW4_Itin : ProcessorItineraries < + [ALU_X, ALU_Y, ALU_Z, ALU_W, ALU_NULL], + [], + [ + InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>, + InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_X, ALU_W]>]>, + InstrItinData<TransALU, [InstrStage<1, [ALU_NULL]>]>, + InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]> + ] +>; diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h new file mode 100644 index 0000000..716b093 --- /dev/null +++ b/lib/Target/R600/SIDefines.h @@ -0,0 +1,22 @@ +//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef SIDEFINES_H_ +#define SIDEFINES_H_ + +#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 +#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128 +#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228 +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) +#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) +#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC + +#endif // SIDEFINES_H_ diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 6f0c307..6bd82a5 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -49,6 +49,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass); addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); @@ -70,6 +71,10 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + + setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::i64, Custom); + setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); @@ -234,6 +239,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); } return SDValue(); } @@ -332,6 +338,32 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, return Chain; } +#define RSRC_DATA_FORMAT 0xf00000000000 + +SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + StoreSDNode *StoreNode = cast<StoreSDNode>(Op); + SDValue Chain = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue VirtualAddress = Op.getOperand(2); + DebugLoc DL = Op.getDebugLoc(); + + if (StoreNode->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) { + return SDValue(); + } + + SDValue SrcSrc = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, + DAG.getConstant(0, MVT::i64), + DAG.getConstant(RSRC_DATA_FORMAT, MVT::i64)); + + SDValue Ops[2]; + Ops[0] = DAG.getNode(AMDGPUISD::BUFFER_STORE, DL, MVT::Other, Chain, + Value, SrcSrc, VirtualAddress); + Ops[1] = Chain; + + return DAG.getMergeValues(Ops, 2, DL); + +} + SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -424,9 +456,12 @@ int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { float F; } Imm; - if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) + if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { + if (Node->getZExtValue() >> 32) { + return -1; + } Imm.I = Node->getSExtValue(); - else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) + } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) Imm.F = Node->getValueAPF().convertToFloat(); else return -1; // It isn't an immediate @@ -534,8 +569,9 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, Operand = SDValue(Node, 0); } -SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, - SelectionDAG &DAG) const { +/// \brief Try to fold the Nodes 
operands into the Node +SDNode *SITargetLowering::foldOperands(MachineSDNode *Node, + SelectionDAG &DAG) const { // Original encoding (either e32 or e64) int Opcode = Node->getMachineOpcode(); @@ -666,5 +702,116 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, // Create a complete new instruction return DAG.getMachineNode(Desc->Opcode, Node->getDebugLoc(), - Node->getVTList(), Ops.data(), Ops.size()); + Node->getVTList(), Ops); +} + +/// \brief Helper function for adjustWritemask +unsigned SubIdx2Lane(unsigned Idx) { + switch (Idx) { + default: return 0; + case AMDGPU::sub0: return 0; + case AMDGPU::sub1: return 1; + case AMDGPU::sub2: return 2; + case AMDGPU::sub3: return 3; + } +} + +/// \brief Adjust the writemask of MIMG instructions +void SITargetLowering::adjustWritemask(MachineSDNode *&Node, + SelectionDAG &DAG) const { + SDNode *Users[4] = { }; + unsigned Writemask = 0, Lane = 0; + + // Try to figure out the used register components + for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); + I != E; ++I) { + + // Abort if we can't understand the usage + if (!I->isMachineOpcode() || + I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) + return; + + Lane = SubIdx2Lane(I->getConstantOperandVal(1)); + + // Abort if we have more than one user per component + if (Users[Lane]) + return; + + Users[Lane] = *I; + Writemask |= 1 << Lane; + } + + // Abort if all components are used + if (Writemask == 0xf) + return; + + // Adjust the writemask in the node + std::vector<SDValue> Ops; + Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32)); + for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) + Ops.push_back(Node->getOperand(i)); + Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); + + // If we only got one lane, replace it with a copy + if (Writemask == (1U << Lane)) { + SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32); + SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + DebugLoc(), Users[Lane]->getValueType(0), + SDValue(Node, 0), RC); + DAG.ReplaceAllUsesWith(Users[Lane], Copy); + return; + } + + // Update the users of the node with the new indices + for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { + + SDNode *User = Users[i]; + if (!User) + continue; + + SDValue Op = DAG.getTargetConstant(Idx, MVT::i32); + DAG.UpdateNodeOperands(User, User->getOperand(0), Op); + + switch (Idx) { + default: break; + case AMDGPU::sub0: Idx = AMDGPU::sub1; break; + case AMDGPU::sub1: Idx = AMDGPU::sub2; break; + case AMDGPU::sub2: Idx = AMDGPU::sub3; break; + } + } +} + +/// \brief Fold the instructions after slecting them +SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, + SelectionDAG &DAG) const { + + if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1) + adjustWritemask(Node, DAG); + + return foldOperands(Node, DAG); +} + +/// \brief Assign the register class depending on the number of +/// bits set in the writemask +void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const { + if (AMDGPU::isMIMG(MI->getOpcode()) == -1) + return; + + unsigned VReg = MI->getOperand(0).getReg(); + unsigned Writemask = MI->getOperand(1).getImm(); + unsigned BitsSet = 0; + for (unsigned i = 0; i < 4; ++i) + BitsSet += Writemask & (1 << i) ? 
1 : 0; + + const TargetRegisterClass *RC; + switch (BitsSet) { + default: return; + case 1: RC = &AMDGPU::VReg_32RegClass; break; + case 2: RC = &AMDGPU::VReg_64RegClass; break; + case 3: RC = &AMDGPU::VReg_96RegClass; break; + } + + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + MRI.setRegClass(VReg, RC); } diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h index 5ad2f40..de637be 100644 --- a/lib/Target/R600/SIISelLowering.h +++ b/lib/Target/R600/SIISelLowering.h @@ -24,6 +24,7 @@ class SITargetLowering : public AMDGPUTargetLowering { const SIInstrInfo * TII; const TargetRegisterInfo * TRI; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; @@ -33,6 +34,9 @@ class SITargetLowering : public AMDGPUTargetLowering { void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, unsigned RegClass, bool &ScalarSlotUsed) const; + SDNode *foldOperands(MachineSDNode *N, SelectionDAG &DAG) const; + void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; + public: SITargetLowering(TargetMachine &tm); @@ -49,6 +53,8 @@ public: virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const; + virtual void AdjustInstrPostInstrSelection(MachineInstr *MI, + SDNode *Node) const; int32_t analyzeImmediate(const SDNode *N) const; }; diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td index 3891ddb..f737ddd 100644 --- a/lib/Target/R600/SIInstrFormats.td +++ b/lib/Target/R600/SIInstrFormats.td @@ -284,33 +284,33 @@ let Uses = [EXEC] in { class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : Enc64<outs, ins, asm, pattern> { - bits<8> VDATA; - bits<12> OFFSET; - bits<1> OFFEN; - bits<1> IDXEN; - bits<1> GLC; - bits<1> ADDR64; - bits<1> LDS; - bits<8> VADDR; - bits<7> SRSRC; - bits<1> SLC; - bits<1> TFE; - bits<8> SOFFSET; - - let Inst{11-0} = OFFSET; - let Inst{12} = OFFEN; - let Inst{13} = IDXEN; - let Inst{14} = GLC; - let Inst{15} = ADDR64; - let Inst{16} = LDS; + bits<12> offset; + bits<1> offen; + bits<1> idxen; + bits<1> glc; + bits<1> addr64; + bits<1> lds; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; + + let Inst{11-0} = offset; + let Inst{12} = offen; + let Inst{13} = idxen; + let Inst{14} = glc; + let Inst{15} = addr64; + let Inst{16} = lds; let Inst{24-18} = op; let Inst{31-26} = 0x38; //encoding - let Inst{39-32} = VADDR; - let Inst{47-40} = VDATA; - let Inst{52-48} = SRSRC{6-2}; - let Inst{54} = SLC; - let Inst{55} = TFE; - let Inst{63-56} = SOFFSET; + let Inst{39-32} = vaddr; + let Inst{47-40} = vdata; + let Inst{52-48} = srsrc{6-2}; + let Inst{54} = slc; + let Inst{55} = tfe; + let Inst{63-56} = soffset; let VM_CNT = 1; let EXP_CNT = 1; diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index 0bfcef5..9a04c60 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -58,6 +58,10 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0 }; + const int16_t Sub0_2[] = { + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0 + }; + const int16_t Sub0_1[] = { AMDGPU::sub0, AMDGPU::sub1, 0 }; @@ -125,6 +129,11 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, 
Opcode = AMDGPU::V_MOV_B32_e32; SubIndices = Sub0_1; + } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) { + assert(AMDGPU::VReg_96RegClass.contains(SrcReg)); + Opcode = AMDGPU::V_MOV_B32_e32; + SubIndices = Sub0_2; + } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) { assert(AMDGPU::VReg_128RegClass.contains(SrcReg) || AMDGPU::SReg_128RegClass.contains(SrcReg)); diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index d4e60e5..87eff4d 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -80,6 +80,7 @@ namespace AMDGPU { int getVOPe64(uint16_t Opcode); int getCommuteRev(uint16_t Opcode); int getCommuteOrig(uint16_t Opcode); + int isMIMG(uint16_t Opcode); } // End namespace AMDGPU diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index 617f0b8..c8aecb7 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -26,6 +26,10 @@ def HI32 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue() >> 32, MVT::i32); }]>; +def SIbuffer_store : SDNode<"AMDGPUISD::BUFFER_STORE", + SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>, + [SDNPHasChain, SDNPMayStore]>; + def IMM8bitDWORD : ImmLeaf < i32, [{ return (Imm & ~0x3FC) == 0; @@ -255,14 +259,14 @@ multiclass VOPC_64 <bits<8> op, string opName, class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 < op, (outs VReg_32:$dst), (ins VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2, - i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg), + InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern >, VOP <opName>; class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 < op, (outs VReg_64:$dst), (ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2, - i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg), + InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern >, VOP <opName>; @@ -285,17 +289,39 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF < op, - (outs regClass:$dst), + (outs regClass:$vdata), (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, i1imm:$lds, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset), - asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, " + asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, " #"$lds, $vaddr, $srsrc, $slc, $tfe, $soffset", []> { let mayLoad = 1; let mayStore = 0; } +class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass, + ValueType VT> : + MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr), + name#" $vdata, $srsrc + $vaddr", + [(SIbuffer_store (VT vdataClass:$vdata), (i128 SReg_128:$srsrc), + (i64 VReg_64:$vaddr))]> { + + let mayLoad = 0; + let mayStore = 1; + + // Encoding + let offset = 0; + let offen = 0; + let idxen = 0; + let glc = 0; + let addr64 = 1; + let lds = 0; + let slc = 0; + let tfe = 0; + let soffset = 128; // ZERO +} + class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < op, (outs regClass:$dst), @@ -309,7 +335,22 @@ class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF let mayStore = 0; } -class MIMG_Load_Helper <bits<7> op, string asm> : MIMG < +class MIMG_NoSampler_Helper <bits<7> op, string asm> : MIMG < + op, + (outs 
VReg_128:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, unknown:$vaddr, + SReg_256:$srsrc), + asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," + #" $tfe, $lwe, $slc, $vaddr, $srsrc", + []> { + let SSAMP = 0; + let mayLoad = 1; + let mayStore = 0; + let hasPostISelHook = 1; +} + +class MIMG_Sampler_Helper <bits<7> op, string asm> : MIMG < op, (outs VReg_128:$vdata), (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, @@ -320,6 +361,7 @@ class MIMG_Load_Helper <bits<7> op, string asm> : MIMG < []> { let mayLoad = 1; let mayStore = 0; + let hasPostISelHook = 1; } //===----------------------------------------------------------------------===// @@ -353,4 +395,13 @@ def getCommuteOrig : InstrMapping { let ValueCols = [["1"]]; } +// Test if the supplied opcode is an MIMG instruction +def isMIMG : InstrMapping { + let FilterClass = "MIMG"; + let RowFields = ["Inst"]; + let ColFields = ["Size"]; + let KeyCol = ["8"]; + let ValueCols = [["8"]]; +} + include "SIInstructions.td" diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 4f734f9..0d50c5d 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -108,7 +108,7 @@ VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 def S_CMPK_EQ_I32 : SOPK < 0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1), "S_CMPK_EQ_I32", - [(set SCCReg:$dst, (setcc SReg_32:$src0, imm:$src1, SETEQ))] + [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))] >; */ @@ -408,8 +408,14 @@ def BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", def BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>; //def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>; //def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>; -//def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>; -//def BUFFER_STORE_DWORDX2 : MUBUF_DWORDX2 <0x0000001d, "BUFFER_STORE_DWORDX2", []>; + +def BUFFER_STORE_DWORD : MUBUF_Store_Helper < + 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32 +>; + +def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper < + 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, i64 +>; //def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>; //def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>; //def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>; @@ -489,7 +495,7 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < //def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>; //def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; //def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>; -//def IMAGE_LOAD_MIP : MIMG_NoPattern_ <"IMAGE_LOAD_MIP", 0x00000001>; +def IMAGE_LOAD_MIP : MIMG_NoSampler_Helper <0x00000001, "IMAGE_LOAD_MIP">; //def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>; //def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>; //def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>; @@ -498,7 +504,7 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < //def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>; //def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>; //def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>; -//def IMAGE_GET_RESINFO : MIMG_NoPattern_ <"IMAGE_GET_RESINFO", 0x0000000e>; +def IMAGE_GET_RESINFO : MIMG_NoSampler_Helper <0x0000000e, 
"IMAGE_GET_RESINFO">; //def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>; //def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>; //def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>; @@ -516,20 +522,20 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < //def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>; //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>; //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>; -def IMAGE_SAMPLE : MIMG_Load_Helper <0x00000020, "IMAGE_SAMPLE">; +def IMAGE_SAMPLE : MIMG_Sampler_Helper <0x00000020, "IMAGE_SAMPLE">; //def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>; -def IMAGE_SAMPLE_D : MIMG_Load_Helper <0x00000022, "IMAGE_SAMPLE_D">; +def IMAGE_SAMPLE_D : MIMG_Sampler_Helper <0x00000022, "IMAGE_SAMPLE_D">; //def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>; -def IMAGE_SAMPLE_L : MIMG_Load_Helper <0x00000024, "IMAGE_SAMPLE_L">; -def IMAGE_SAMPLE_B : MIMG_Load_Helper <0x00000025, "IMAGE_SAMPLE_B">; +def IMAGE_SAMPLE_L : MIMG_Sampler_Helper <0x00000024, "IMAGE_SAMPLE_L">; +def IMAGE_SAMPLE_B : MIMG_Sampler_Helper <0x00000025, "IMAGE_SAMPLE_B">; //def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>; //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>; -def IMAGE_SAMPLE_C : MIMG_Load_Helper <0x00000028, "IMAGE_SAMPLE_C">; +def IMAGE_SAMPLE_C : MIMG_Sampler_Helper <0x00000028, "IMAGE_SAMPLE_C">; //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>; //def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>; //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>; -def IMAGE_SAMPLE_C_L : MIMG_Load_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">; -def IMAGE_SAMPLE_C_B : MIMG_Load_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">; +def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">; +def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">; //def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>; //def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>; //def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>; @@ -594,12 +600,14 @@ defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>; //defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>; //defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>; defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32", - [(set VReg_32:$dst, (sint_to_fp VSrc_32:$src0))] + [(set f32:$dst, (sint_to_fp i32:$src0))] +>; +defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", + [(set f32:$dst, (uint_to_fp i32:$src0))] >; -//defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>; -//defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>; +defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>; defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32", - [(set (i32 VReg_32:$dst), (fp_to_sint VSrc_32:$src0))] + [(set i32:$dst, (fp_to_sint f32:$src0))] >; defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; ////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>; @@ -616,35 +624,37 @@ defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; //defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>; //defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>; 
defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", - [(set VReg_32:$dst, (AMDGPUfract VSrc_32:$src0))] + [(set f32:$dst, (AMDGPUfract f32:$src0))] +>; +defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", + [(set f32:$dst, (int_AMDGPU_trunc f32:$src0))] >; -defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>; defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", - [(set VReg_32:$dst, (fceil VSrc_32:$src0))] + [(set f32:$dst, (fceil f32:$src0))] >; defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32", - [(set VReg_32:$dst, (frint VSrc_32:$src0))] + [(set f32:$dst, (frint f32:$src0))] >; defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32", - [(set VReg_32:$dst, (ffloor VSrc_32:$src0))] + [(set f32:$dst, (ffloor f32:$src0))] >; defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32", - [(set VReg_32:$dst, (fexp2 VSrc_32:$src0))] + [(set f32:$dst, (fexp2 f32:$src0))] >; defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>; defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", - [(set VReg_32:$dst, (flog2 VSrc_32:$src0))] + [(set f32:$dst, (flog2 f32:$src0))] >; defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", - [(set VReg_32:$dst, (fdiv FP_ONE, VSrc_32:$src0))] + [(set f32:$dst, (fdiv FP_ONE, f32:$src0))] >; defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>; defm V_RSQ_LEGACY_F32 : VOP1_32 < 0x0000002d, "V_RSQ_LEGACY_F32", - [(set VReg_32:$dst, (int_AMDGPU_rsq VSrc_32:$src0))] + [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))] >; defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>; defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>; @@ -787,14 +797,13 @@ def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst), (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2, InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", - [(set (i32 VReg_32:$dst), (select (i1 SSrc_64:$src2), - VSrc_32:$src1, VSrc_32:$src0))] + [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))] >; //f32 pattern for V_CNDMASK_B32_e64 def : Pat < - (f32 (select (i1 SSrc_64:$src2), VSrc_32:$src1, VSrc_32:$src0)), - (V_CNDMASK_B32_e64 VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2) + (f32 (select i1:$src2, f32:$src1, f32:$src0)), + (V_CNDMASK_B32_e64 $src0, $src1, $src2) >; defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>; @@ -802,11 +811,11 @@ defm V_WRITELANE_B32 : VOP2_32 <0x00000002, "V_WRITELANE_B32", []>; let isCommutable = 1 in { defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", - [(set VReg_32:$dst, (fadd VSrc_32:$src0, VReg_32:$src1))] + [(set f32:$dst, (fadd f32:$src0, f32:$src1))] >; defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", - [(set VReg_32:$dst, (fsub VSrc_32:$src0, VReg_32:$src1))] + [(set f32:$dst, (fsub f32:$src0, f32:$src1))] >; defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", [], "V_SUB_F32">; } // End isCommutable = 1 @@ -817,11 +826,11 @@ let isCommutable = 1 in { defm V_MUL_LEGACY_F32 : VOP2_32 < 0x00000007, "V_MUL_LEGACY_F32", - [(set VReg_32:$dst, (int_AMDGPU_mul VSrc_32:$src0, VReg_32:$src1))] + [(set f32:$dst, (int_AMDGPU_mul f32:$src0, f32:$src1))] >; defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", - [(set VReg_32:$dst, (fmul VSrc_32:$src0, VReg_32:$src1))] + [(set f32:$dst, (fmul f32:$src0, f32:$src1))] >; } // End isCommutable = 1 @@ -834,43 
+843,51 @@ defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", let isCommutable = 1 in { defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32", - [(set VReg_32:$dst, (AMDGPUfmin VSrc_32:$src0, VReg_32:$src1))] + [(set f32:$dst, (AMDGPUfmin f32:$src0, f32:$src1))] >; defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32", - [(set VReg_32:$dst, (AMDGPUfmax VSrc_32:$src0, VReg_32:$src1))] + [(set f32:$dst, (AMDGPUfmax f32:$src0, f32:$src1))] >; defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>; defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>; -defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>; -defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>; -defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>; -defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>; +defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", + [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))] +>; +defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", + [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))] +>; +defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", + [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))] +>; +defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", + [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))] +>; defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", - [(set VReg_32:$dst, (srl VSrc_32:$src0, (i32 VReg_32:$src1)))] + [(set i32:$dst, (srl i32:$src0, i32:$src1))] >; defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">; defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", - [(set VReg_32:$dst, (sra VSrc_32:$src0, (i32 VReg_32:$src1)))] + [(set i32:$dst, (sra i32:$src0, i32:$src1))] >; defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">; defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", - [(set VReg_32:$dst, (shl VSrc_32:$src0, (i32 VReg_32:$src1)))] + [(set i32:$dst, (shl i32:$src0, i32:$src1))] >; defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">; defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", - [(set VReg_32:$dst, (and VSrc_32:$src0, VReg_32:$src1))] + [(set i32:$dst, (and i32:$src0, i32:$src1))] >; defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", - [(set VReg_32:$dst, (or VSrc_32:$src0, VReg_32:$src1))] + [(set i32:$dst, (or i32:$src0, i32:$src1))] >; defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", - [(set VReg_32:$dst, (xor VSrc_32:$src0, VReg_32:$src1))] + [(set i32:$dst, (xor i32:$src0, i32:$src1))] >; } // End isCommutable = 1 @@ -885,11 +902,11 @@ defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>; let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", - [(set VReg_32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))] + [(set i32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))] >; defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", - [(set VReg_32:$dst, (sub (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))] + [(set i32:$dst, (sub i32:$src0, i32:$src1))] >; defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], "V_SUB_I32">; @@ -905,7 +922,7 @@ defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>; ////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>; ////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>; defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32", - [(set VReg_32:$dst, (int_SI_packf16 VSrc_32:$src0, VReg_32:$src1))] + [(set i32:$dst, (int_SI_packf16 f32:$src0, f32:$src1))] >; ////def V_CVT_PK_U16_U32 : VOP2_U16 
<0x00000030, "V_CVT_PK_U16_U32", []>; ////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>; @@ -942,6 +959,7 @@ def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>; def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>; def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>; def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>; +defm : BFIPatterns <V_BFI_B32>; def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>; def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>; //def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>; @@ -983,18 +1001,18 @@ def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; } // isCommutable = 1 def : Pat < - (mul VSrc_32:$src0, VReg_32:$src1), - (V_MUL_LO_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0) + (mul i32:$src0, i32:$src1), + (V_MUL_LO_I32 $src0, $src1, (i32 0)) >; def : Pat < - (mulhu VSrc_32:$src0, VReg_32:$src1), - (V_MUL_HI_U32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0) + (mulhu i32:$src0, i32:$src1), + (V_MUL_HI_U32 $src0, $src1, (i32 0)) >; def : Pat < - (mulhs VSrc_32:$src0, VReg_32:$src1), - (V_MUL_HI_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0) + (mulhs i32:$src0, i32:$src1), + (V_MUL_HI_I32 $src0, $src1, (i32 0)) >; def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; @@ -1019,34 +1037,27 @@ def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", []>; def S_CSELECT_B32 : SOP2 < 0x0000000a, (outs SReg_32:$dst), (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32", - [(set (i32 SReg_32:$dst), (select (i1 SCCReg:$scc), - SReg_32:$src0, SReg_32:$src1))] + [] >; def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; -// f32 pattern for S_CSELECT_B32 -def : Pat < - (f32 (select (i1 SCCReg:$scc), SReg_32:$src0, SReg_32:$src1)), - (S_CSELECT_B32 SReg_32:$src0, SReg_32:$src1, SCCReg:$scc) ->; - def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>; def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", - [(set SReg_64:$dst, (i64 (and SSrc_64:$src0, SSrc_64:$src1)))] + [(set i64:$dst, (and i64:$src0, i64:$src1))] >; def : Pat < - (i1 (and SSrc_64:$src0, SSrc_64:$src1)), - (S_AND_B64 SSrc_64:$src0, SSrc_64:$src1) + (i1 (and i1:$src0, i1:$src1)), + (S_AND_B64 $src0, $src1) >; def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>; def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>; def : Pat < - (i1 (or SSrc_64:$src0, SSrc_64:$src1)), - (S_OR_B64 SSrc_64:$src0, SSrc_64:$src1) + (i1 (or i1:$src0, i1:$src1)), + (S_OR_B64 $src0, $src1) >; def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>; def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>; @@ -1097,14 +1108,14 @@ def SI_IF : InstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), "SI_IF $dst, $vcc, $target", - [(set SReg_64:$dst, (int_SI_if SReg_64:$vcc, bb:$target))] + [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))] >; def SI_ELSE : InstSI < (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target), "SI_ELSE $dst, $src, $target", - [(set SReg_64:$dst, (int_SI_else SReg_64:$src, bb:$target))]> { + [(set i64:$dst, (int_SI_else i64:$src, bb:$target))]> { let Constraints = "$src = $dst"; } @@ -1113,7 +1124,7 @@ def SI_LOOP : InstSI < (outs), (ins SReg_64:$saved, brtarget:$target), "SI_LOOP $saved, $target", - [(int_SI_loop SReg_64:$saved, bb:$target)] + [(int_SI_loop i64:$saved, bb:$target)] >; } // end isBranch = 1, isTerminator = 1 @@ -1122,35 +1133,35 @@ def SI_BREAK : InstSI < (outs SReg_64:$dst), (ins SReg_64:$src), "SI_ELSE $dst, $src", - [(set SReg_64:$dst, (int_SI_break SReg_64:$src))] + [(set i64:$dst, 
(int_SI_break i64:$src))] >; def SI_IF_BREAK : InstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src), "SI_IF_BREAK $dst, $vcc, $src", - [(set SReg_64:$dst, (int_SI_if_break SReg_64:$vcc, SReg_64:$src))] + [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))] >; def SI_ELSE_BREAK : InstSI < (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), "SI_ELSE_BREAK $dst, $src0, $src1", - [(set SReg_64:$dst, (int_SI_else_break SReg_64:$src0, SReg_64:$src1))] + [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))] >; def SI_END_CF : InstSI < (outs), (ins SReg_64:$saved), "SI_END_CF $saved", - [(int_SI_end_cf SReg_64:$saved)] + [(int_SI_end_cf i64:$saved)] >; def SI_KILL : InstSI < (outs), (ins VReg_32:$src), "SI_KIL $src", - [(int_AMDGPU_kill VReg_32:$src)] + [(int_AMDGPU_kill f32:$src)] >; } // end mayLoad = 1, mayStore = 1, hasSideEffects = 1 @@ -1184,8 +1195,8 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; } // end IsCodeGenOnly, isPseudo def : Pat< - (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2), - (V_CNDMASK_B32_e64 VReg_32:$src2, VReg_32:$src1, (V_CMP_GT_F32_e64 0, VReg_32:$src0)) + (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), + (V_CNDMASK_B32_e64 $src2, $src1, (V_CMP_GT_F32_e64 0, $src0)) >; def : Pat < @@ -1195,93 +1206,110 @@ def : Pat < /* int_SI_vs_load_input */ def : Pat< - (int_SI_vs_load_input SReg_128:$tlst, IMM12bit:$attr_offset, - VReg_32:$buf_idx_vgpr), + (int_SI_vs_load_input v16i8:$tlst, IMM12bit:$attr_offset, + i32:$buf_idx_vgpr), (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0, - VReg_32:$buf_idx_vgpr, SReg_128:$tlst, - 0, 0, 0) + $buf_idx_vgpr, $tlst, 0, 0, 0) >; /* int_SI_export */ def : Pat < (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, - VReg_32:$src0,VReg_32:$src1, VReg_32:$src2, VReg_32:$src3), + f32:$src0, f32:$src1, f32:$src2, f32:$src3), (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm, - VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3) + $src0, $src1, $src2, $src3) >; +/********** ======================= **********/ +/********** Image sampling patterns **********/ +/********** ======================= **********/ /* int_SI_sample for simple 1D texture lookup */ def : Pat < - (int_SI_sample imm:$writemask, VReg_32:$addr, - SReg_256:$rsrc, SReg_128:$sampler, imm), - (IMAGE_SAMPLE imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_32:$addr, - SReg_256:$rsrc, SReg_128:$sampler) + (int_SI_sample v1i32:$addr, v32i8:$rsrc, v16i8:$sampler, imm), + (IMAGE_SAMPLE 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; -class SamplePattern<Intrinsic name, MIMG opcode, RegisterClass addr_class, - ValueType addr_type> : Pat < - (name imm:$writemask, (addr_type addr_class:$addr), - SReg_256:$rsrc, SReg_128:$sampler, imm), - (opcode imm:$writemask, 0, 0, 0, 0, 0, 0, 0, addr_class:$addr, - SReg_256:$rsrc, SReg_128:$sampler) +class SamplePattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, imm), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; -class SampleRectPattern<Intrinsic name, MIMG opcode, RegisterClass addr_class, - ValueType addr_type> : Pat < - (name imm:$writemask, (addr_type addr_class:$addr), - SReg_256:$rsrc, SReg_128:$sampler, TEX_RECT), - (opcode imm:$writemask, 1, 0, 0, 0, 0, 0, 0, addr_class:$addr, - SReg_256:$rsrc, SReg_128:$sampler) +class SampleRectPattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_RECT), + (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, 
$sampler) >; -class SampleArrayPattern<Intrinsic name, MIMG opcode, RegisterClass addr_class, - ValueType addr_type> : Pat < - (name imm:$writemask, (addr_type addr_class:$addr), - SReg_256:$rsrc, SReg_128:$sampler, TEX_ARRAY), - (opcode imm:$writemask, 0, 0, 1, 0, 0, 0, 0, addr_class:$addr, - SReg_256:$rsrc, SReg_128:$sampler) +class SampleArrayPattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SampleShadowPattern<Intrinsic name, MIMG opcode, - RegisterClass addr_class, ValueType addr_type> : Pat < - (name imm:$writemask, (addr_type addr_class:$addr), - SReg_256:$rsrc, SReg_128:$sampler, TEX_SHADOW), - (opcode imm:$writemask, 0, 0, 0, 0, 0, 0, 0, addr_class:$addr, - SReg_256:$rsrc, SReg_128:$sampler) + ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; class SampleShadowArrayPattern<Intrinsic name, MIMG opcode, - RegisterClass addr_class, ValueType addr_type> : Pat < - (name imm:$writemask, (addr_type addr_class:$addr), - SReg_256:$rsrc, SReg_128:$sampler, TEX_SHADOW_ARRAY), - (opcode imm:$writemask, 0, 0, 1, 0, 0, 0, 0, addr_class:$addr, - SReg_256:$rsrc, SReg_128:$sampler) + ValueType vt> : Pat < + (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) >; /* int_SI_sample* for texture lookups consuming more address parameters */ -multiclass SamplePatterns<RegisterClass addr_class, ValueType addr_type> { - def : SamplePattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>; - def : SampleRectPattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>; - def : SampleArrayPattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>; - def : SampleShadowPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_class, addr_type>; - def : SampleShadowArrayPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_class, addr_type>; - - def : SamplePattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_class, addr_type>; - def : SampleArrayPattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_class, addr_type>; - def : SampleShadowPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_class, addr_type>; - def : SampleShadowArrayPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_class, addr_type>; - - def : SamplePattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_class, addr_type>; - def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_class, addr_type>; - def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_class, addr_type>; - def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_class, addr_type>; +multiclass SamplePatterns<ValueType addr_type> { + def : SamplePattern <int_SI_sample, IMAGE_SAMPLE, addr_type>; + def : SampleRectPattern <int_SI_sample, IMAGE_SAMPLE, addr_type>; + def : SampleArrayPattern <int_SI_sample, IMAGE_SAMPLE, addr_type>; + def : SampleShadowPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_type>; + def : SampleShadowArrayPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_type>; + + def : SamplePattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_type>; + def : SampleArrayPattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_type>; + def : SampleShadowPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_type>; + def : SampleShadowArrayPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_type>; + + def : SamplePattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>; + def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, 
addr_type>; + def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>; + def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>; +} + +defm : SamplePatterns<v2i32>; +defm : SamplePatterns<v4i32>; +defm : SamplePatterns<v8i32>; +defm : SamplePatterns<v16i32>; + +/* int_SI_imageload for texture fetches consuming varying address parameters */ +class ImageLoadPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat < + (name addr_type:$addr, v32i8:$rsrc, imm), + (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) +>; + +class ImageLoadArrayPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat < + (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY), + (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) +>; + +multiclass ImageLoadPatterns<ValueType addr_type> { + def : ImageLoadPattern <int_SI_imageload, IMAGE_LOAD_MIP, addr_type>; + def : ImageLoadArrayPattern <int_SI_imageload, IMAGE_LOAD_MIP, addr_type>; } -defm : SamplePatterns<VReg_64, v2i32>; -defm : SamplePatterns<VReg_128, v4i32>; -defm : SamplePatterns<VReg_256, v8i32>; -defm : SamplePatterns<VReg_512, v16i32>; +defm : ImageLoadPatterns<v2i32>; +defm : ImageLoadPatterns<v4i32>; + +/* Image resource information */ +def : Pat < + (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm), + (IMAGE_GET_RESINFO 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) +>; + +def : Pat < + (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY), + (IMAGE_GET_RESINFO 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) +>; /********** ============================================ **********/ /********** Extraction, Insertion, Building and Casting **********/ @@ -1289,77 +1317,77 @@ defm : SamplePatterns<VReg_512, v16i32>; foreach Index = 0-2 in { def Extract_Element_v2i32_#Index : Extract_Element < - i32, v2i32, VReg_64, Index, !cast<SubRegIndex>(sub#Index) + i32, v2i32, Index, !cast<SubRegIndex>(sub#Index) >; def Insert_Element_v2i32_#Index : Insert_Element < - i32, v2i32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index) + i32, v2i32, Index, !cast<SubRegIndex>(sub#Index) >; def Extract_Element_v2f32_#Index : Extract_Element < - f32, v2f32, VReg_64, Index, !cast<SubRegIndex>(sub#Index) + f32, v2f32, Index, !cast<SubRegIndex>(sub#Index) >; def Insert_Element_v2f32_#Index : Insert_Element < - f32, v2f32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index) + f32, v2f32, Index, !cast<SubRegIndex>(sub#Index) >; } foreach Index = 0-3 in { def Extract_Element_v4i32_#Index : Extract_Element < - i32, v4i32, VReg_128, Index, !cast<SubRegIndex>(sub#Index) + i32, v4i32, Index, !cast<SubRegIndex>(sub#Index) >; def Insert_Element_v4i32_#Index : Insert_Element < - i32, v4i32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index) + i32, v4i32, Index, !cast<SubRegIndex>(sub#Index) >; def Extract_Element_v4f32_#Index : Extract_Element < - f32, v4f32, VReg_128, Index, !cast<SubRegIndex>(sub#Index) + f32, v4f32, Index, !cast<SubRegIndex>(sub#Index) >; def Insert_Element_v4f32_#Index : Insert_Element < - f32, v4f32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index) + f32, v4f32, Index, !cast<SubRegIndex>(sub#Index) >; } foreach Index = 0-7 in { def Extract_Element_v8i32_#Index : Extract_Element < - i32, v8i32, VReg_256, Index, !cast<SubRegIndex>(sub#Index) + i32, v8i32, Index, !cast<SubRegIndex>(sub#Index) >; def Insert_Element_v8i32_#Index : Insert_Element < - i32, v8i32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index) + i32, v8i32, Index, !cast<SubRegIndex>(sub#Index) >; def Extract_Element_v8f32_#Index : 
Extract_Element < - f32, v8f32, VReg_256, Index, !cast<SubRegIndex>(sub#Index) + f32, v8f32, Index, !cast<SubRegIndex>(sub#Index) >; def Insert_Element_v8f32_#Index : Insert_Element < - f32, v8f32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index) + f32, v8f32, Index, !cast<SubRegIndex>(sub#Index) >; } foreach Index = 0-15 in { def Extract_Element_v16i32_#Index : Extract_Element < - i32, v16i32, VReg_512, Index, !cast<SubRegIndex>(sub#Index) + i32, v16i32, Index, !cast<SubRegIndex>(sub#Index) >; def Insert_Element_v16i32_#Index : Insert_Element < - i32, v16i32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index) + i32, v16i32, Index, !cast<SubRegIndex>(sub#Index) >; def Extract_Element_v16f32_#Index : Extract_Element < - f32, v16f32, VReg_512, Index, !cast<SubRegIndex>(sub#Index) + f32, v16f32, Index, !cast<SubRegIndex>(sub#Index) >; def Insert_Element_v16f32_#Index : Insert_Element < - f32, v16f32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index) + f32, v16f32, Index, !cast<SubRegIndex>(sub#Index) >; } -def : Vector1_Build <v1i32, VReg_32, i32, VReg_32>; -def : Vector2_Build <v2i32, VReg_64, i32, VReg_32>; -def : Vector2_Build <v2f32, VReg_64, f32, VReg_32>; -def : Vector4_Build <v4i32, VReg_128, i32, VReg_32>; -def : Vector4_Build <v4f32, VReg_128, f32, VReg_32>; -def : Vector8_Build <v8i32, VReg_256, i32, VReg_32>; -def : Vector8_Build <v8f32, VReg_256, f32, VReg_32>; -def : Vector16_Build <v16i32, VReg_512, i32, VReg_32>; -def : Vector16_Build <v16f32, VReg_512, f32, VReg_32>; +def : Vector1_Build <v1i32, i32, VReg_32>; +def : Vector2_Build <v2i32, i32>; +def : Vector2_Build <v2f32, f32>; +def : Vector4_Build <v4i32, i32>; +def : Vector4_Build <v4f32, f32>; +def : Vector8_Build <v8i32, i32>; +def : Vector8_Build <v8f32, f32>; +def : Vector16_Build <v16i32, i32>; +def : Vector16_Build <v16f32, f32>; def : BitConvert <i32, f32, SReg_32>; def : BitConvert <i32, f32, VReg_32>; @@ -1372,20 +1400,20 @@ def : BitConvert <f32, i32, VReg_32>; /********** =================== **********/ def : Pat < - (int_AMDIL_clamp VReg_32:$src, (f32 FP_ZERO), (f32 FP_ONE)), - (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */), + (int_AMDIL_clamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)), + (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */), 0 /* ABS */, 1 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */) >; def : Pat < - (fabs VReg_32:$src), - (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */), + (fabs f32:$src), + (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */), 1 /* ABS */, 0 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */) >; def : Pat < - (fneg VReg_32:$src), - (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */), + (fneg f32:$src), + (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */), 0 /* ABS */, 0 /* CLAMP */, 0 /* OMOD */, 1 /* NEG */) >; @@ -1426,16 +1454,16 @@ def : Pat < /********** ===================== **********/ def : Pat < - (int_SI_fs_constant imm:$attr_chan, imm:$attr, M0Reg:$params), - (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, M0Reg:$params) + (int_SI_fs_constant imm:$attr_chan, imm:$attr, i32:$params), + (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, $params) >; def : Pat < - (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, VReg_64:$ij), - (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG VReg_64:$ij, sub0), - imm:$attr_chan, imm:$attr, M0Reg:$params), - (EXTRACT_SUBREG VReg_64:$ij, sub1), - imm:$attr_chan, imm:$attr, M0Reg:$params) + (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, v2i32:$ij), + (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG v2i32:$ij, sub0), + imm:$attr_chan, 
imm:$attr, i32:$params), + (EXTRACT_SUBREG $ij, sub1), + imm:$attr_chan, imm:$attr, $params) >; /********** ================== **********/ @@ -1443,101 +1471,111 @@ def : Pat < /********** ================== **********/ /* llvm.AMDGPU.pow */ -def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32, VReg_32>; +def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>; def : Pat < - (int_AMDGPU_div VSrc_32:$src0, VSrc_32:$src1), - (V_MUL_LEGACY_F32_e32 VSrc_32:$src0, (V_RCP_LEGACY_F32_e32 VSrc_32:$src1)) + (int_AMDGPU_div f32:$src0, f32:$src1), + (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) >; def : Pat< - (fdiv VSrc_32:$src0, VSrc_32:$src1), - (V_MUL_F32_e32 VSrc_32:$src0, (V_RCP_F32_e32 VSrc_32:$src1)) + (fdiv f32:$src0, f32:$src1), + (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1)) >; def : Pat < - (fcos VSrc_32:$src0), - (V_COS_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) + (fcos f32:$src0), + (V_COS_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) >; def : Pat < - (fsin VSrc_32:$src0), - (V_SIN_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) + (fsin f32:$src0), + (V_SIN_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV))) >; def : Pat < - (int_AMDGPU_cube VReg_128:$src), + (int_AMDGPU_cube v4f32:$src), (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), - (V_CUBETC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0), - (EXTRACT_SUBREG VReg_128:$src, sub1), - (EXTRACT_SUBREG VReg_128:$src, sub2), - 0, 0, 0, 0), sub0), - (V_CUBESC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0), - (EXTRACT_SUBREG VReg_128:$src, sub1), - (EXTRACT_SUBREG VReg_128:$src, sub2), - 0, 0, 0, 0), sub1), - (V_CUBEMA_F32 (EXTRACT_SUBREG VReg_128:$src, sub0), - (EXTRACT_SUBREG VReg_128:$src, sub1), - (EXTRACT_SUBREG VReg_128:$src, sub2), - 0, 0, 0, 0), sub2), - (V_CUBEID_F32 (EXTRACT_SUBREG VReg_128:$src, sub0), - (EXTRACT_SUBREG VReg_128:$src, sub1), - (EXTRACT_SUBREG VReg_128:$src, sub2), - 0, 0, 0, 0), sub3) + (V_CUBETC_F32 (EXTRACT_SUBREG $src, sub0), + (EXTRACT_SUBREG $src, sub1), + (EXTRACT_SUBREG $src, sub2)), + sub0), + (V_CUBESC_F32 (EXTRACT_SUBREG $src, sub0), + (EXTRACT_SUBREG $src, sub1), + (EXTRACT_SUBREG $src, sub2)), + sub1), + (V_CUBEMA_F32 (EXTRACT_SUBREG $src, sub0), + (EXTRACT_SUBREG $src, sub1), + (EXTRACT_SUBREG $src, sub2)), + sub2), + (V_CUBEID_F32 (EXTRACT_SUBREG $src, sub0), + (EXTRACT_SUBREG $src, sub1), + (EXTRACT_SUBREG $src, sub2)), + sub3) >; def : Pat < - (i32 (sext (i1 SReg_64:$src0))), - (V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0) + (i32 (sext i1:$src0)), + (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0) >; // 1. Offset as 8bit DWORD immediate def : Pat < - (int_SI_load_const SReg_128:$sbase, IMM8bitDWORD:$offset), - (S_BUFFER_LOAD_DWORD_IMM SReg_128:$sbase, IMM8bitDWORD:$offset) + (int_SI_load_const v16i8:$sbase, IMM8bitDWORD:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, IMM8bitDWORD:$offset) >; // 2. Offset loaded in an 32bit SGPR def : Pat < - (int_SI_load_const SReg_128:$sbase, imm:$offset), - (S_BUFFER_LOAD_DWORD_SGPR SReg_128:$sbase, (S_MOV_B32 imm:$offset)) + (int_SI_load_const v16i8:$sbase, imm:$offset), + (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) >; // 3. 
Offset in an 32Bit VGPR def : Pat < - (int_SI_load_const SReg_128:$sbase, VReg_32:$voff), - (BUFFER_LOAD_DWORD 0, 1, 0, 0, 0, 0, VReg_32:$voff, SReg_128:$sbase, 0, 0, 0) + (int_SI_load_const v16i8:$sbase, i32:$voff), + (BUFFER_LOAD_DWORD 0, 1, 0, 0, 0, 0, $voff, $sbase, 0, 0, 0) +>; + +// The multiplication scales from [0,1] to the unsigned integer range +def : Pat < + (AMDGPUurecip i32:$src0), + (V_CVT_U32_F32_e32 + (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1, + (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) >; /********** ================== **********/ /********** VOP3 Patterns **********/ /********** ================== **********/ -def : Pat <(f32 (fadd (fmul VSrc_32:$src0, VSrc_32:$src1), VSrc_32:$src2)), - (V_MAD_F32 VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2, - 0, 0, 0, 0)>; +def : Pat < + (f32 (fadd (fmul f32:$src0, f32:$src1), f32:$src2)), + (V_MAD_F32 $src0, $src1, $src2) +>; /********** ================== **********/ /********** SMRD Patterns **********/ /********** ================== **********/ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { + // 1. Offset as 8bit DWORD immediate def : Pat < - (constant_load (SIadd64bit32bit SReg_64:$sbase, IMM8bitDWORD:$offset)), - (vt (Instr_IMM SReg_64:$sbase, IMM8bitDWORD:$offset)) + (constant_load (SIadd64bit32bit i64:$sbase, IMM8bitDWORD:$offset)), + (vt (Instr_IMM $sbase, IMM8bitDWORD:$offset)) >; // 2. Offset loaded in an 32bit SGPR def : Pat < - (constant_load (SIadd64bit32bit SReg_64:$sbase, imm:$offset)), - (vt (Instr_SGPR SReg_64:$sbase, (S_MOV_B32 imm:$offset))) + (constant_load (SIadd64bit32bit i64:$sbase, imm:$offset)), + (vt (Instr_SGPR $sbase, (S_MOV_B32 imm:$offset))) >; // 3. No offset at all def : Pat < - (constant_load SReg_64:$sbase), - (vt (Instr_IMM SReg_64:$sbase, 0)) + (constant_load i64:$sbase), + (vt (Instr_IMM $sbase, 0)) >; } @@ -1550,45 +1588,37 @@ defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; /********** Indirect adressing **********/ /********** ====================== **********/ -multiclass SI_INDIRECT_Pattern <RegisterClass rc, ValueType vt, - SI_INDIRECT_DST IndDst> { +multiclass SI_INDIRECT_Pattern <ValueType vt, SI_INDIRECT_DST IndDst> { + // 1. Extract with offset def : Pat< - (vector_extract (vt rc:$vec), - (i64 (zext (i32 (add VReg_32:$idx, imm:$off)))) - ), - (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off)) + (vector_extract vt:$vec, (i64 (zext (add i32:$idx, imm:$off)))), + (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off)) >; // 2. Extract without offset def : Pat< - (vector_extract (vt rc:$vec), - (i64 (zext (i32 VReg_32:$idx))) - ), - (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0)) + (vector_extract vt:$vec, (i64 (zext i32:$idx))), + (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0)) >; // 3. Insert with offset def : Pat< - (vector_insert (vt rc:$vec), (f32 VReg_32:$val), - (i64 (zext (i32 (add VReg_32:$idx, imm:$off)))) - ), - (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off, VReg_32:$val)) + (vector_insert vt:$vec, f32:$val, (i64 (zext (add i32:$idx, imm:$off)))), + (IndDst (IMPLICIT_DEF), $vec, $idx, imm:$off, $val) >; // 4. 
Insert without offset def : Pat< - (vector_insert (vt rc:$vec), (f32 VReg_32:$val), - (i64 (zext (i32 VReg_32:$idx))) - ), - (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0, VReg_32:$val)) + (vector_insert vt:$vec, f32:$val, (i64 (zext i32:$idx))), + (IndDst (IMPLICIT_DEF), $vec, $idx, 0, $val) >; } -defm : SI_INDIRECT_Pattern <VReg_64, v2f32, SI_INDIRECT_DST_V2>; -defm : SI_INDIRECT_Pattern <VReg_128, v4f32, SI_INDIRECT_DST_V4>; -defm : SI_INDIRECT_Pattern <VReg_256, v8f32, SI_INDIRECT_DST_V8>; -defm : SI_INDIRECT_Pattern <VReg_512, v16f32, SI_INDIRECT_DST_V16>; +defm : SI_INDIRECT_Pattern <v2f32, SI_INDIRECT_DST_V2>; +defm : SI_INDIRECT_Pattern <v4f32, SI_INDIRECT_DST_V4>; +defm : SI_INDIRECT_Pattern <v8f32, SI_INDIRECT_DST_V8>; +defm : SI_INDIRECT_Pattern <v16f32, SI_INDIRECT_DST_V16>; /********** =============== **********/ /********** Conditions **********/ @@ -1596,12 +1626,18 @@ defm : SI_INDIRECT_Pattern <VReg_512, v16f32, SI_INDIRECT_DST_V16>; def : Pat< (i1 (setcc f32:$src0, f32:$src1, SETO)), - (V_CMP_O_F32_e64 f32:$src0, f32:$src1) + (V_CMP_O_F32_e64 $src0, $src1) >; def : Pat< (i1 (setcc f32:$src0, f32:$src1, SETUO)), - (V_CMP_U_F32_e64 f32:$src0, f32:$src1) + (V_CMP_U_F32_e64 $src0, $src1) >; +//============================================================================// +// Miscellaneous Optimization Patterns +//============================================================================// + +def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e32>; + } // End isSI predicate diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td index 0af378e..224cd2f 100644 --- a/lib/Target/R600/SIIntrinsics.td +++ b/lib/Target/R600/SIIntrinsics.td @@ -19,12 +19,16 @@ let TargetPrefix = "SI", isTarget = 1 in { def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>; def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ; - class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>; + class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>; def int_SI_sample : Sample; def int_SI_sampleb : Sample; def int_SI_samplel : Sample; + def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; + + def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; + /* Interpolation Intrinsics */ def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td index 4f14931..244d4c00 100644 --- a/lib/Target/R600/SIRegisterInfo.td +++ b/lib/Target/R600/SIRegisterInfo.td @@ -94,6 +94,12 @@ def VGPR_64 : RegisterTuples<[sub0, sub1], [(add (trunc VGPR_32, 255)), (add (shl VGPR_32, 1))]>; +// VGPR 96-bit registers +def VGPR_96 : RegisterTuples<[sub0, sub1, sub2], + [(add (trunc VGPR_32, 254)), + (add (shl VGPR_32, 1)), + (add (shl VGPR_32, 2))]>; + // VGPR 128-bit registers def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], [(add (trunc VGPR_32, 253)), @@ -151,7 +157,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [i64, i1], 64, (add SGPR_64, VCCReg, EXECReg) >; -def SReg_128 : RegisterClass<"AMDGPU", [v16i8], 128, (add SGPR_128)>; +def SReg_128 : RegisterClass<"AMDGPU", [v16i8, i128], 128, (add SGPR_128)>; def SReg_256 : 
RegisterClass<"AMDGPU", [v32i8], 256, (add SGPR_256)>; @@ -162,6 +168,10 @@ def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>; def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>; +def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> { + let Size = 96; +} + def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>; def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 256, (add VGPR_256)>;
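Note on the MIMG writemask shrinking introduced above: adjustWritemask() records which of the four result components are actually extracted, rewrites the dmask operand to match, and AdjustInstrPostInstrSelection() then re-classes the destination register according to the number of bits set (1 -> VReg_32, 2 -> VReg_64, 3 -> the new VReg_96 class added in SIRegisterInfo.td, otherwise the VReg_128 default is kept). A compact sketch of that final mapping follows; the function name is illustrative and not part of the patch.

#include <cstdio>

// Mirrors the BitsSet switch in SITargetLowering::AdjustInstrPostInstrSelection:
// the destination register width is chosen from the number of components
// enabled in the 4-bit MIMG dmask.  128 means "leave the default VReg_128".
static unsigned vregWidthForDmask(unsigned dmask) {
  unsigned bitsSet = 0;
  for (unsigned i = 0; i < 4; ++i)
    bitsSet += (dmask >> i) & 1;
  switch (bitsSet) {
  case 1:  return 32;   // VReg_32
  case 2:  return 64;   // VReg_64
  case 3:  return 96;   // VReg_96, the new three-dword class
  default: return 128;  // VReg_128, unchanged
  }
}

int main() {
  std::printf("dmask 0x5 -> VReg_%u\n", vregWidthForDmask(0x5)); // two lanes -> 64
  std::printf("dmask 0x7 -> VReg_%u\n", vregWidthForDmask(0x7)); // three lanes -> 96
  return 0;
}

This pairs with the copyPhysReg() change above, which teaches SIInstrInfo to copy VReg_96 registers one 32-bit sub-register at a time (the Sub0_2 index list).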