Diffstat (limited to 'lib/Target/R600')
-rw-r--r--  lib/Target/R600/AMDGPU.h                                |    1
-rw-r--r--  lib/Target/R600/AMDGPUAsmPrinter.cpp                    |  101
-rw-r--r--  lib/Target/R600/AMDGPUAsmPrinter.h                      |    3
-rw-r--r--  lib/Target/R600/AMDGPUCallingConv.td                    |    8
-rw-r--r--  lib/Target/R600/AMDGPUISelLowering.h                    |    1
-rw-r--r--  lib/Target/R600/AMDGPUInstructions.td                   |  169
-rw-r--r--  lib/Target/R600/AMDGPUMachineFunction.cpp               |    2
-rw-r--r--  lib/Target/R600/AMDGPUSubtarget.cpp                     |    5
-rw-r--r--  lib/Target/R600/AMDGPUSubtarget.h                       |    2
-rw-r--r--  lib/Target/R600/AMDGPUTargetMachine.cpp                 |    4
-rw-r--r--  lib/Target/R600/AMDILBase.td                            |    4
-rw-r--r--  lib/Target/R600/AMDILDeviceInfo.cpp                     |    7
-rw-r--r--  lib/Target/R600/AMDILISelDAGToDAG.cpp                   |   23
-rw-r--r--  lib/Target/R600/AMDILPeepholeOptimizer.cpp              | 1215
-rw-r--r--  lib/Target/R600/CMakeLists.txt                          |    2
-rw-r--r--  lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp       |   45
-rw-r--r--  lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h         |    5
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp       |   29
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp  |   39
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp        |    2
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp     |    4
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h       |    7
-rw-r--r--  lib/Target/R600/MCTargetDesc/CMakeLists.txt             |    1
-rw-r--r--  lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp      |  499
-rw-r--r--  lib/Target/R600/Processors.td                           |   58
-rw-r--r--  lib/Target/R600/R600ControlFlowFinalizer.cpp            |  341
-rw-r--r--  lib/Target/R600/R600Defines.h                           |   44
-rw-r--r--  lib/Target/R600/R600ISelLowering.cpp                    |   18
-rw-r--r--  lib/Target/R600/R600InstrInfo.cpp                       |   45
-rw-r--r--  lib/Target/R600/R600InstrInfo.h                         |    9
-rw-r--r--  lib/Target/R600/R600Instructions.td                     |  736
-rw-r--r--  lib/Target/R600/R600MachineFunctionInfo.h               |    1
-rw-r--r--  lib/Target/R600/R600Packetizer.cpp                      |  459
-rw-r--r--  lib/Target/R600/R600RegisterInfo.td                     |   10
-rw-r--r--  lib/Target/R600/R600Schedule.td                         |   13
-rw-r--r--  lib/Target/R600/SIDefines.h                             |   22
-rw-r--r--  lib/Target/R600/SIISelLowering.cpp                      |  157
-rw-r--r--  lib/Target/R600/SIISelLowering.h                        |    6
-rw-r--r--  lib/Target/R600/SIInstrFormats.td                       |   50
-rw-r--r--  lib/Target/R600/SIInstrInfo.cpp                         |    9
-rw-r--r--  lib/Target/R600/SIInstrInfo.h                           |    1
-rw-r--r--  lib/Target/R600/SIInstrInfo.td                          |   61
-rw-r--r--  lib/Target/R600/SIInstructions.td                       |  516
-rw-r--r--  lib/Target/R600/SIIntrinsics.td                         |    6
-rw-r--r--  lib/Target/R600/SIRegisterInfo.td                       |   12
45 files changed, 2364 insertions(+), 2388 deletions(-)
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 0b01433..9792bd8 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -24,6 +24,7 @@ class AMDGPUTargetMachine;
FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm);
+FunctionPass *createR600Packetizer(TargetMachine &tm);
FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
// SI Passes
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index f600144..4c35ecf 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -19,9 +19,16 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
+#include "SIDefines.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "R600Defines.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ELF.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -50,15 +57,82 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
if (OutStreamer.hasRawTextSupport()) {
OutStreamer.EmitRawText("@" + MF.getName() + ":");
}
- OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+
+ const MCSectionELF *ConfigSection = getObjFileLowering().getContext()
+ .getELFSection(".AMDGPU.config",
+ ELF::SHT_PROGBITS, 0,
+ SectionKind::getReadOnly());
+ OutStreamer.SwitchSection(ConfigSection);
if (STM.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
- EmitProgramInfo(MF);
+ EmitProgramInfoSI(MF);
+ } else {
+ EmitProgramInfoR600(MF);
}
+ OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
EmitFunctionBody();
return false;
}
-void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) {
+void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) {
+ unsigned MaxGPR = 0;
+ bool killPixel = false;
+ const R600RegisterInfo * RI =
+ static_cast<const R600RegisterInfo*>(TM.getRegisterInfo());
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+ const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+
+ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+ BB != BB_E; ++BB) {
+ MachineBasicBlock &MBB = *BB;
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ I != E; ++I) {
+ MachineInstr &MI = *I;
+ if (MI.getOpcode() == AMDGPU::KILLGT)
+ killPixel = true;
+ unsigned numOperands = MI.getNumOperands();
+ for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+ MachineOperand & MO = MI.getOperand(op_idx);
+ if (!MO.isReg())
+ continue;
+ unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;
+
+ // Registers with a value > 127 aren't GPRs
+ if (HWReg > 127)
+ continue;
+ MaxGPR = std::max(MaxGPR, HWReg);
+ }
+ }
+ }
+
+ unsigned RsrcReg;
+ if (STM.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX) {
+ // Evergreen / Northern Islands
+ switch (MFI->ShaderType) {
+ default: // Fall through
+ case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
+ case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
+ case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
+ case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
+ }
+ } else {
+ // R600 / R700
+ switch (MFI->ShaderType) {
+ default: // Fall through
+ case ShaderType::GEOMETRY: // Fall through
+ case ShaderType::COMPUTE: // Fall through
+ case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
+ case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
+ }
+ }
+
+ OutStreamer.EmitIntValue(RsrcReg, 4);
+ OutStreamer.EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
+ S_STACK_SIZE(MFI->StackSize), 4);
+ OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
+ OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
+}
+
+void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
unsigned MaxSGPR = 0;
unsigned MaxVGPR = 0;
bool VCCUsed = false;
@@ -107,6 +181,9 @@ void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) {
} else if (AMDGPU::VReg_64RegClass.contains(reg)) {
isSGPR = false;
width = 2;
+ } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
+ isSGPR = false;
+ width = 3;
} else if (AMDGPU::SReg_128RegClass.contains(reg)) {
isSGPR = true;
width = 4;
@@ -139,7 +216,19 @@ void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) {
MaxSGPR += 2;
}
SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
- OutStreamer.EmitIntValue(MaxSGPR + 1, 4);
- OutStreamer.EmitIntValue(MaxVGPR + 1, 4);
- OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
+ unsigned RsrcReg;
+ switch (MFI->ShaderType) {
+ default: // Fall through
+ case ShaderType::COMPUTE: RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break;
+ case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break;
+ case ShaderType::PIXEL: RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break;
+ case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break;
+ }
+
+ OutStreamer.EmitIntValue(RsrcReg, 4);
+ OutStreamer.EmitIntValue(S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8), 4);
+ if (MFI->ShaderType == ShaderType::PIXEL) {
+ OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
+ OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
+ }
}
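
The two EmitProgramInfo variants above stream raw (register, value) dword pairs into the .AMDGPU.config ELF section via EmitIntValue(..., 4). A minimal consumer-side sketch, assuming the section payload is a flat array of little-endian 32-bit pairs; parseConfig and RegWrite are hypothetical names for illustration, not part of LLVM or any driver:

#include <cstdint>
#include <cstring>
#include <vector>

struct RegWrite {
  uint32_t Reg;   // e.g. R_028850_SQ_PGM_RESOURCES_PS
  uint32_t Value; // e.g. S_NUM_GPRS(MaxGPR + 1) | S_STACK_SIZE(StackSize)
};

// Walk the section payload; each EmitIntValue(X, 4) above contributed one
// dword, and the dwords are always emitted in (register, value) pairs.
static std::vector<RegWrite> parseConfig(const uint8_t *Data, size_t Size) {
  std::vector<RegWrite> Writes;
  for (size_t Off = 0; Off + 8 <= Size; Off += 8) {
    RegWrite W;
    std::memcpy(&W.Reg, Data + Off, 4);
    std::memcpy(&W.Value, Data + Off + 4, 4);
    Writes.push_back(W);
  }
  return Writes;
}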
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
index 3812282..f425ef4 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.h
+++ b/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -33,7 +33,8 @@ public:
/// \brief Emit register usage information so that the GPU driver
/// can correctly setup the GPU state.
- void EmitProgramInfo(MachineFunction &MF);
+ void EmitProgramInfoR600(MachineFunction &MF);
+ void EmitProgramInfoSI(MachineFunction &MF);
/// Implemented in AMDGPUMCInstLower.cpp
virtual void EmitInstruction(const MachineInstr *MI);
diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
index 45ae37e..9c30515 100644
--- a/lib/Target/R600/AMDGPUCallingConv.td
+++ b/lib/Target/R600/AMDGPUCallingConv.td
@@ -32,8 +32,14 @@ def CC_SI : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
- ]>>>
+ ]>>>,
+ // This is the default for i64 values.
+ // XXX: We should change this once clang understands CC_AMDGPU.
+ CCIfType<[i64], CCAssignToRegWithShadow<
+ [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
+ [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
+ >>
]>;
def CC_AMDGPU : CallingConv<[
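
CCAssignToRegWithShadow pairs each register in the first list with a shadow from the second, so assigning an i64 to an even SGPR also marks the following odd SGPR as used; each value effectively occupies an aligned SGPR pair. A toy model of that allocation order, ignoring the VGPR rule above; a sketch only, not how CCState is actually driven:

#include <cstdio>

int main() {
  // The Nth i64 argument takes the pair (SGPR[2N], SGPR[2N+1]); the lists
  // above provide eight such pairs, SGPR0:SGPR1 through SGPR14:SGPR15.
  for (int Arg = 0; Arg < 8; ++Arg)
    std::printf("i64 arg %d -> SGPR%d (shadow SGPR%d)\n",
                Arg, 2 * Arg, 2 * Arg + 1);
  return 0;
}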
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index f31b646..c2a79ea 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -116,6 +116,7 @@ enum {
BRANCH_COND,
// End AMDIL ISD Opcodes
BITALIGN,
+ BUFFER_STORE,
DWORDADDR,
FRACT,
FMAX,
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index e740348..d2620b2 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -94,6 +94,7 @@ class Constants {
int TWO_PI = 0x40c90fdb;
int PI = 0x40490fdb;
int TWO_PI_INV = 0x3e22f983;
+int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
}
def CONST : Constants;
@@ -115,21 +116,21 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
(outs rc:$dst),
(ins rc:$src0),
"CLAMP $dst, $src0",
- [(set rc:$dst, (int_AMDIL_clamp rc:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
+ [(set f32:$dst, (int_AMDIL_clamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
>;
class FABS <RegisterClass rc> : AMDGPUShaderInst <
(outs rc:$dst),
(ins rc:$src0),
"FABS $dst, $src0",
- [(set rc:$dst, (fabs rc:$src0))]
+ [(set f32:$dst, (fabs f32:$src0))]
>;
class FNEG <RegisterClass rc> : AMDGPUShaderInst <
(outs rc:$dst),
(ins rc:$src0),
"FNEG $dst, $src0",
- [(set rc:$dst, (fneg rc:$src0))]
+ [(set f32:$dst, (fneg f32:$src0))]
>;
} // usesCustomInserter = 1
@@ -140,8 +141,7 @@ multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
(outs dstClass:$dst),
(ins addrClass:$addr, i32imm:$chan),
"RegisterLoad $dst, $addr",
- [(set (i32 dstClass:$dst), (AMDGPUregister_load addrPat:$addr,
- (i32 timm:$chan)))]
+ [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))]
> {
let isRegisterLoad = 1;
}
@@ -150,7 +150,7 @@ multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
(outs),
(ins dstClass:$val, addrClass:$addr, i32imm:$chan),
"RegisterStore $val, $addr",
- [(AMDGPUregister_store (i32 dstClass:$val), addrPat:$addr, (i32 timm:$chan))]
+ [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))]
> {
let isRegisterStore = 1;
}
@@ -161,105 +161,140 @@ multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
/* Generic helper patterns for intrinsics */
/* -------------------------------------- */
-class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul,
- RegisterClass rc> : Pat <
- (fpow rc:$src0, rc:$src1),
- (exp_ieee (mul rc:$src1, (log_ieee rc:$src0)))
+class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul>
+ : Pat <
+ (fpow f32:$src0, f32:$src1),
+ (exp_ieee (mul f32:$src1, (log_ieee f32:$src0)))
>;
/* Other helper patterns */
/* --------------------- */
/* Extract element pattern */
-class Extract_Element <ValueType sub_type, ValueType vec_type,
- RegisterClass vec_class, int sub_idx,
- SubRegIndex sub_reg>: Pat<
- (sub_type (vector_extract (vec_type vec_class:$src), sub_idx)),
- (EXTRACT_SUBREG vec_class:$src, sub_reg)
+class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx,
+ SubRegIndex sub_reg>
+ : Pat<
+ (sub_type (vector_extract vec_type:$src, sub_idx)),
+ (EXTRACT_SUBREG $src, sub_reg)
>;
/* Insert element pattern */
class Insert_Element <ValueType elem_type, ValueType vec_type,
- RegisterClass elem_class, RegisterClass vec_class,
- int sub_idx, SubRegIndex sub_reg> : Pat <
-
- (vec_type (vector_insert (vec_type vec_class:$vec),
- (elem_type elem_class:$elem), sub_idx)),
- (INSERT_SUBREG vec_class:$vec, elem_class:$elem, sub_reg)
+ int sub_idx, SubRegIndex sub_reg>
+ : Pat <
+ (vector_insert vec_type:$vec, elem_type:$elem, sub_idx),
+ (INSERT_SUBREG $vec, $elem, sub_reg)
>;
// Vector Build pattern
-class Vector1_Build <ValueType vecType, RegisterClass vectorClass,
- ValueType elemType, RegisterClass elemClass> : Pat <
- (vecType (build_vector (elemType elemClass:$src))),
- (vecType elemClass:$src)
+class Vector1_Build <ValueType vecType, ValueType elemType,
+ RegisterClass rc> : Pat <
+ (vecType (build_vector elemType:$src)),
+ (vecType (COPY_TO_REGCLASS $src, rc))
>;
-class Vector2_Build <ValueType vecType, RegisterClass vectorClass,
- ValueType elemType, RegisterClass elemClass> : Pat <
- (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1))),
+class Vector2_Build <ValueType vecType, ValueType elemType> : Pat <
+ (vecType (build_vector elemType:$sub0, elemType:$sub1)),
(INSERT_SUBREG (INSERT_SUBREG
- (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1)
+ (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1)
>;
-class Vector4_Build <ValueType vecType, RegisterClass vectorClass,
- ValueType elemType, RegisterClass elemClass> : Pat <
- (vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y),
- (elemType elemClass:$z), (elemType elemClass:$w))),
+class Vector4_Build <ValueType vecType, ValueType elemType> : Pat <
+ (vecType (build_vector elemType:$x, elemType:$y, elemType:$z, elemType:$w)),
(INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
- (vecType (IMPLICIT_DEF)), elemClass:$x, sub0), elemClass:$y, sub1),
- elemClass:$z, sub2), elemClass:$w, sub3)
+ (vecType (IMPLICIT_DEF)), $x, sub0), $y, sub1), $z, sub2), $w, sub3)
>;
-class Vector8_Build <ValueType vecType, RegisterClass vectorClass,
- ValueType elemType, RegisterClass elemClass> : Pat <
- (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1),
- (elemType elemClass:$sub2), (elemType elemClass:$sub3),
- (elemType elemClass:$sub4), (elemType elemClass:$sub5),
- (elemType elemClass:$sub6), (elemType elemClass:$sub7))),
- (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
+class Vector8_Build <ValueType vecType, ValueType elemType> : Pat <
+ (vecType (build_vector elemType:$sub0, elemType:$sub1,
+ elemType:$sub2, elemType:$sub3,
+ elemType:$sub4, elemType:$sub5,
+ elemType:$sub6, elemType:$sub7)),
(INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
- (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1),
- elemClass:$sub2, sub2), elemClass:$sub3, sub3),
- elemClass:$sub4, sub4), elemClass:$sub5, sub5),
- elemClass:$sub6, sub6), elemClass:$sub7, sub7)
+ (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
+ (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1),
+ $sub2, sub2), $sub3, sub3),
+ $sub4, sub4), $sub5, sub5),
+ $sub6, sub6), $sub7, sub7)
>;
-class Vector16_Build <ValueType vecType, RegisterClass vectorClass,
- ValueType elemType, RegisterClass elemClass> : Pat <
- (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1),
- (elemType elemClass:$sub2), (elemType elemClass:$sub3),
- (elemType elemClass:$sub4), (elemType elemClass:$sub5),
- (elemType elemClass:$sub6), (elemType elemClass:$sub7),
- (elemType elemClass:$sub8), (elemType elemClass:$sub9),
- (elemType elemClass:$sub10), (elemType elemClass:$sub11),
- (elemType elemClass:$sub12), (elemType elemClass:$sub13),
- (elemType elemClass:$sub14), (elemType elemClass:$sub15))),
- (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
- (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
- (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
+class Vector16_Build <ValueType vecType, ValueType elemType> : Pat <
+ (vecType (build_vector elemType:$sub0, elemType:$sub1,
+ elemType:$sub2, elemType:$sub3,
+ elemType:$sub4, elemType:$sub5,
+ elemType:$sub6, elemType:$sub7,
+ elemType:$sub8, elemType:$sub9,
+ elemType:$sub10, elemType:$sub11,
+ elemType:$sub12, elemType:$sub13,
+ elemType:$sub14, elemType:$sub15)),
(INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
- (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1),
- elemClass:$sub2, sub2), elemClass:$sub3, sub3),
- elemClass:$sub4, sub4), elemClass:$sub5, sub5),
- elemClass:$sub6, sub6), elemClass:$sub7, sub7),
- elemClass:$sub8, sub8), elemClass:$sub9, sub9),
- elemClass:$sub10, sub10), elemClass:$sub11, sub11),
- elemClass:$sub12, sub12), elemClass:$sub13, sub13),
- elemClass:$sub14, sub14), elemClass:$sub15, sub15)
+ (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
+ (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
+ (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
+ (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1),
+ $sub2, sub2), $sub3, sub3),
+ $sub4, sub4), $sub5, sub5),
+ $sub6, sub6), $sub7, sub7),
+ $sub8, sub8), $sub9, sub9),
+ $sub10, sub10), $sub11, sub11),
+ $sub12, sub12), $sub13, sub13),
+ $sub14, sub14), $sub15, sub15)
>;
+// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
+// can handle COPY instructions.
// bitconvert pattern
class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat <
(dt (bitconvert (st rc:$src0))),
(dt rc:$src0)
>;
+// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
+// can handle COPY instructions.
class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat <
(vt (AMDGPUdwordaddr (vt rc:$addr))),
(vt rc:$addr)
>;
+// BFI_INT patterns
+
+multiclass BFIPatterns <Instruction BFI_INT> {
+
+ // Definition from ISA doc:
+ // (y & x) | (z & ~x)
+ def : Pat <
+ (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
+ (BFI_INT $x, $y, $z)
+ >;
+
+ // SHA-256 Ch function
+ // z ^ (x & (y ^ z))
+ def : Pat <
+ (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
+ (BFI_INT $x, $y, $z)
+ >;
+
+}
+
+// SHA-256 Ma patterns
+
+// ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y
+class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat <
+ (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
+ (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
+>;
+
+// Bitfield extract patterns
+
+def legalshift32 : ImmLeaf <i32, [{return Imm >= 0 && Imm < 32;}]>;
+def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}],
+ SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>;
+
+class BFEPattern <Instruction BFE> : Pat <
+ (and (srl i32:$x, legalshift32:$y), bfemask:$z),
+ (BFE $x, $y, $z)
+>;
+
include "R600Instructions.td"
include "SIInstrInfo.td"
diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp
index 0223ec8..0461025 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.cpp
+++ b/lib/Target/R600/AMDGPUMachineFunction.cpp
@@ -1,4 +1,5 @@
#include "AMDGPUMachineFunction.h"
+#include "AMDGPU.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
@@ -8,6 +9,7 @@ const char *AMDGPUMachineFunction::ShaderTypeAttribute = "ShaderType";
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo() {
+ ShaderType = ShaderType::COMPUTE;
AttributeSet Set = MF.getFunction()->getAttributes();
Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
ShaderTypeAttribute);
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index 0f356a1..a7e1d7b 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -33,6 +33,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
DefaultSize[0] = 64;
DefaultSize[1] = 1;
DefaultSize[2] = 1;
+ HasVertexCache = false;
ParseSubtargetFeatures(GPU, FS);
DevName = GPU;
Device = AMDGPUDeviceInfo::getDeviceFromName(DevName, this, Is64bit);
@@ -53,6 +54,10 @@ AMDGPUSubtarget::is64bit() const {
return Is64bit;
}
bool
+AMDGPUSubtarget::hasVertexCache() const {
+ return HasVertexCache;
+}
+bool
AMDGPUSubtarget::isTargetELF() const {
return false;
}
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 1973fc6..b6501a4 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -36,6 +36,7 @@ private:
bool Is32on64bit;
bool DumpCode;
bool R600ALUInst;
+ bool HasVertexCache;
InstrItineraryData InstrItins;
@@ -48,6 +49,7 @@ public:
bool isOverride(AMDGPUDeviceInfo::Caps) const;
bool is64bit() const;
+ bool hasVertexCache() const;
// Helper functions to simplify if statements
bool isTargetELF() const;
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index e7ea876..31fbf32 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -115,7 +115,6 @@ AMDGPUPassConfig::addPreISel() {
}
bool AMDGPUPassConfig::addInstSelector() {
- addPass(createAMDGPUPeepholeOpt(*TM));
addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
@@ -153,8 +152,9 @@ bool AMDGPUPassConfig::addPreEmitPass() {
addPass(createAMDGPUCFGStructurizerPass(*TM));
addPass(createR600EmitClauseMarkers(*TM));
addPass(createR600ExpandSpecialInstrsPass(*TM));
- addPass(createR600ControlFlowFinalizer(*TM));
addPass(&FinalizeMachineBundlesID);
+ addPass(createR600Packetizer(*TM));
+ addPass(createR600ControlFlowFinalizer(*TM));
} else {
addPass(createSILowerControlFlowPass(*TM));
}
diff --git a/lib/Target/R600/AMDILBase.td b/lib/Target/R600/AMDILBase.td
index c12cedc..e221110 100644
--- a/lib/Target/R600/AMDILBase.td
+++ b/lib/Target/R600/AMDILBase.td
@@ -74,6 +74,10 @@ def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
"false",
"Older version of ALU instructions encoding.">;
+def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
+ "HasVertexCache",
+ "true",
+ "Specify use of dedicated vertex cache.">;
//===----------------------------------------------------------------------===//
// Register File, Calling Conv, Instruction Descriptions
diff --git a/lib/Target/R600/AMDILDeviceInfo.cpp b/lib/Target/R600/AMDILDeviceInfo.cpp
index 9605fbe..126514b 100644
--- a/lib/Target/R600/AMDILDeviceInfo.cpp
+++ b/lib/Target/R600/AMDILDeviceInfo.cpp
@@ -44,7 +44,7 @@ AMDGPUDevice* getDeviceFromName(const std::string &deviceName,
" on 32bit pointers!");
#endif
return new AMDGPUEvergreenDevice(ptr);
- } else if (deviceName == "redwood") {
+ } else if (deviceName == "redwood" || deviceName == "sumo") {
#if DEBUG
assert(!is64bit && "This device does not support 64bit pointers!");
assert(!is64on32bit && "This device does not support 64bit"
@@ -79,7 +79,10 @@ AMDGPUDevice* getDeviceFromName(const std::string &deviceName,
" on 32bit pointers!");
#endif
return new AMDGPUNIDevice(ptr);
- } else if (deviceName == "SI") {
+ } else if (deviceName == "SI" ||
+ deviceName == "tahiti" || deviceName == "pitcairn" ||
+ deviceName == "verde" || deviceName == "oland" ||
+ deviceName == "hainan") {
return new AMDGPUSIDevice(ptr);
} else {
#if DEBUG
diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp
index fa8f62d..ba75a44 100644
--- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
@@ -191,6 +191,29 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
RegSeqArgs, 2 * N->getNumOperands() + 1);
}
+ case ISD::BUILD_PAIR: {
+ SDValue RC, SubReg0, SubReg1;
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+ if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+ break;
+ }
+ if (N->getValueType(0) == MVT::i128) {
+ RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32);
+ SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32);
+ SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32);
+ } else if (N->getValueType(0) == MVT::i64) {
+ RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32);
+ SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
+ SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
+ } else {
+ llvm_unreachable("Unhandled value type for BUILD_PAIR");
+ }
+ const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
+ N->getOperand(1), SubReg1 };
+ return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
+ N->getDebugLoc(), N->getValueType(0), Ops);
+ }
+
case ISD::ConstantFP:
case ISD::Constant: {
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
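
The BUILD_PAIR case above selects a REG_SEQUENCE whose operand list is { RC, Op0, sub0, Op1, sub1 }, and by the usual LLVM convention BUILD_PAIR's first operand is the low half. A sketch of the value semantics for the i64 case only, not the MI-level representation:

#include <cassert>
#include <cstdint>

static uint64_t buildPair(uint32_t Lo, uint32_t Hi) {
  // Mirrors Ops[] above: Lo lands in sub0, Hi in sub1.
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  assert(buildPair(0xdeadbeefu, 0x01234567u) == 0x01234567deadbeefULL);
  return 0;
}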
diff --git a/lib/Target/R600/AMDILPeepholeOptimizer.cpp b/lib/Target/R600/AMDILPeepholeOptimizer.cpp
deleted file mode 100644
index 3a28038..0000000
--- a/lib/Target/R600/AMDILPeepholeOptimizer.cpp
+++ /dev/null
@@ -1,1215 +0,0 @@
-//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "PeepholeOpt"
-#ifdef DEBUG
-#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
-#else
-#define DEBUGME 0
-#endif
-
-#include "AMDILDevices.h"
-#include "AMDGPUInstrInfo.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-
-#include <sstream>
-
-#if 0
-STATISTIC(PointerAssignments, "Number of dynamic pointer "
- "assigments discovered");
-STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
-#endif
-
-using namespace llvm;
-// The Peephole optimization pass is used to do simple last minute optimizations
-// that are required for correct code or to remove redundant functions
-namespace {
-
-class OpaqueType;
-
-class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
-public:
- TargetMachine &TM;
- static char ID;
- AMDGPUPeepholeOpt(TargetMachine &tm);
- ~AMDGPUPeepholeOpt();
- const char *getPassName() const;
- bool runOnFunction(Function &F);
- bool doInitialization(Module &M);
- bool doFinalization(Module &M);
- void getAnalysisUsage(AnalysisUsage &AU) const;
-protected:
-private:
- // Function to initiate all of the instruction level optimizations.
- bool instLevelOptimizations(BasicBlock::iterator *inst);
- // Quick check to see if we need to dump all of the pointers into the
- // arena. If this is correct, then we set all pointers to exist in arena. This
- // is a workaround for aliasing of pointers in a struct/union.
- bool dumpAllIntoArena(Function &F);
- // Because I don't want to invalidate any pointers while in the
- // safeNestedForEachFunction. I push atomic conversions to a vector and handle
- // it later. This function does the conversions if required.
- void doAtomicConversionIfNeeded(Function &F);
- // Because __amdil_is_constant cannot be properly evaluated if
- // optimizations are disabled, the call's are placed in a vector
- // and evaluated after the __amdil_image* functions are evaluated
- // which should allow the __amdil_is_constant function to be
- // evaluated correctly.
- void doIsConstCallConversionIfNeeded();
- bool mChanged;
- bool mDebug;
- bool mConvertAtomics;
- CodeGenOpt::Level optLevel;
- // Run a series of tests to see if we can optimize a CALL instruction.
- bool optimizeCallInst(BasicBlock::iterator *bbb);
- // A peephole optimization to optimize bit extract sequences.
- bool optimizeBitExtract(Instruction *inst);
- // A peephole optimization to optimize bit insert sequences.
- bool optimizeBitInsert(Instruction *inst);
- bool setupBitInsert(Instruction *base,
- Instruction *&src,
- Constant *&mask,
- Constant *&shift);
- // Expand the bit field insert instruction on versions of OpenCL that
- // don't support it.
- bool expandBFI(CallInst *CI);
- // Expand the bit field mask instruction on version of OpenCL that
- // don't support it.
- bool expandBFM(CallInst *CI);
- // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
- // this case we need to expand them. These functions check for 24bit functions
- // and then expand.
- bool isSigned24BitOps(CallInst *CI);
- void expandSigned24BitOps(CallInst *CI);
- // One optimization that can occur is that if the required workgroup size is
- // specified then the result of get_local_size is known at compile time and
- // can be returned accordingly.
- bool isRWGLocalOpt(CallInst *CI);
- // On northern island cards, the division is slightly less accurate than on
- // previous generations, so we need to utilize a more accurate division. So we
- // can translate the accurate divide to a normal divide on all other cards.
- bool convertAccurateDivide(CallInst *CI);
- void expandAccurateDivide(CallInst *CI);
- // If the alignment is set incorrectly, it can produce really inefficient
- // code. This checks for this scenario and fixes it if possible.
- bool correctMisalignedMemOp(Instruction *inst);
-
- // If we are in no opt mode, then we need to make sure that
- // local samplers are properly propagated as constant propagation
- // doesn't occur and we need to know the value of kernel defined
- // samplers at compile time.
- bool propagateSamplerInst(CallInst *CI);
-
- // Helper functions
-
- // Group of functions that recursively calculate the size of a structure based
- // on it's sub-types.
- size_t getTypeSize(Type * const T, bool dereferencePtr = false);
- size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
- size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
- size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
- size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
- size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
- size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
- size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
-
- LLVMContext *mCTX;
- Function *mF;
- const AMDGPUSubtarget *mSTM;
- SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
- SmallVector<CallInst *, 16> isConstVec;
-}; // class AMDGPUPeepholeOpt
- char AMDGPUPeepholeOpt::ID = 0;
-
-// A template function that has two levels of looping before calling the
-// function with a pointer to the current iterator.
-template<class InputIterator, class SecondIterator, class Function>
-Function safeNestedForEach(InputIterator First, InputIterator Last,
- SecondIterator S, Function F) {
- for ( ; First != Last; ++First) {
- SecondIterator sf, sl;
- for (sf = First->begin(), sl = First->end();
- sf != sl; ) {
- if (!F(&sf)) {
- ++sf;
- }
- }
- }
- return F;
-}
-
-} // anonymous namespace
-
-namespace llvm {
- FunctionPass *
- createAMDGPUPeepholeOpt(TargetMachine &tm) {
- return new AMDGPUPeepholeOpt(tm);
- }
-} // llvm namespace
-
-AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
- : FunctionPass(ID), TM(tm) {
- mDebug = DEBUGME;
- optLevel = TM.getOptLevel();
-
-}
-
-AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() {
-}
-
-const char *
-AMDGPUPeepholeOpt::getPassName() const {
- return "AMDGPU PeepHole Optimization Pass";
-}
-
-bool
-containsPointerType(Type *Ty) {
- if (!Ty) {
- return false;
- }
- switch(Ty->getTypeID()) {
- default:
- return false;
- case Type::StructTyID: {
- const StructType *ST = dyn_cast<StructType>(Ty);
- for (StructType::element_iterator stb = ST->element_begin(),
- ste = ST->element_end(); stb != ste; ++stb) {
- if (!containsPointerType(*stb)) {
- continue;
- }
- return true;
- }
- break;
- }
- case Type::VectorTyID:
- case Type::ArrayTyID:
- return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
- case Type::PointerTyID:
- return true;
- };
- return false;
-}
-
-bool
-AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) {
- bool dumpAll = false;
- for (Function::const_arg_iterator cab = F.arg_begin(),
- cae = F.arg_end(); cab != cae; ++cab) {
- const Argument *arg = cab;
- const PointerType *PT = dyn_cast<PointerType>(arg->getType());
- if (!PT) {
- continue;
- }
- Type *DereferencedType = PT->getElementType();
- if (!dyn_cast<StructType>(DereferencedType)
- ) {
- continue;
- }
- if (!containsPointerType(DereferencedType)) {
- continue;
- }
- // FIXME: Because a pointer inside of a struct/union may be aliased to
- // another pointer we need to take the conservative approach and place all
- // pointers into the arena until more advanced detection is implemented.
- dumpAll = true;
- }
- return dumpAll;
-}
-void
-AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
- if (isConstVec.empty()) {
- return;
- }
- for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
- CallInst *CI = isConstVec[x];
- Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
- Type *aType = Type::getInt32Ty(*mCTX);
- Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
- : ConstantInt::get(aType, 0);
- CI->replaceAllUsesWith(Val);
- CI->eraseFromParent();
- }
- isConstVec.clear();
-}
-void
-AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) {
- // Don't do anything if we don't have any atomic operations.
- if (atomicFuncs.empty()) {
- return;
- }
- // Change the function name for the atomic if it is required
- uint32_t size = atomicFuncs.size();
- for (uint32_t x = 0; x < size; ++x) {
- atomicFuncs[x].first->setOperand(
- atomicFuncs[x].first->getNumOperands()-1,
- atomicFuncs[x].second);
-
- }
- mChanged = true;
- if (mConvertAtomics) {
- return;
- }
-}
-
-bool
-AMDGPUPeepholeOpt::runOnFunction(Function &MF) {
- mChanged = false;
- mF = &MF;
- mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
- if (mDebug) {
- MF.dump();
- }
- mCTX = &MF.getType()->getContext();
- mConvertAtomics = true;
- safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
- std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
- this));
-
- doAtomicConversionIfNeeded(MF);
- doIsConstCallConversionIfNeeded();
-
- if (mDebug) {
- MF.dump();
- }
- return mChanged;
-}
-
-bool
-AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) {
- Instruction *inst = (*bbb);
- CallInst *CI = dyn_cast<CallInst>(inst);
- if (!CI) {
- return false;
- }
- if (isSigned24BitOps(CI)) {
- expandSigned24BitOps(CI);
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
- if (propagateSamplerInst(CI)) {
- return false;
- }
- if (expandBFI(CI) || expandBFM(CI)) {
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
- if (convertAccurateDivide(CI)) {
- expandAccurateDivide(CI);
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
-
- StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
- if (calleeName.startswith("__amdil_is_constant")) {
- // If we do not have optimizations, then this
- // cannot be properly evaluated, so we add the
- // call instruction to a vector and process
- // them at the end of processing after the
- // samplers have been correctly handled.
- if (optLevel == CodeGenOpt::None) {
- isConstVec.push_back(CI);
- return false;
- } else {
- Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
- Type *aType = Type::getInt32Ty(*mCTX);
- Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
- : ConstantInt::get(aType, 0);
- CI->replaceAllUsesWith(Val);
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
- }
-
- if (calleeName.equals("__amdil_is_asic_id_i32")) {
- ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
- Type *aType = Type::getInt32Ty(*mCTX);
- Value *Val = CV;
- if (Val) {
- Val = ConstantInt::get(aType,
- mSTM->device()->getDeviceFlag() & CV->getZExtValue());
- } else {
- Val = ConstantInt::get(aType, 0);
- }
- CI->replaceAllUsesWith(Val);
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
- Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
- if (!F) {
- return false;
- }
- if (F->getName().startswith("__atom") && !CI->getNumUses()
- && F->getName().find("_xchg") == StringRef::npos) {
- std::string buffer(F->getName().str() + "_noret");
- F = dyn_cast<Function>(
- F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
- atomicFuncs.push_back(std::make_pair(CI, F));
- }
-
- if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
- && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
- return false;
- }
- if (!mConvertAtomics) {
- return false;
- }
- StringRef name = F->getName();
- if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
- mConvertAtomics = false;
- }
- return false;
-}
-
-bool
-AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
- Instruction *&src,
- Constant *&mask,
- Constant *&shift) {
- if (!base) {
- if (mDebug) {
- dbgs() << "Null pointer passed into function.\n";
- }
- return false;
- }
- bool andOp = false;
- if (base->getOpcode() == Instruction::Shl) {
- shift = dyn_cast<Constant>(base->getOperand(1));
- } else if (base->getOpcode() == Instruction::And) {
- mask = dyn_cast<Constant>(base->getOperand(1));
- andOp = true;
- } else {
- if (mDebug) {
- dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
- }
- // If the base is neither a Shl or a And, we don't fit any of the patterns above.
- return false;
- }
- src = dyn_cast<Instruction>(base->getOperand(0));
- if (!src) {
- if (mDebug) {
- dbgs() << "Failed setup since the base operand is not an instruction!\n";
- }
- return false;
- }
- // If we find an 'and' operation, then we don't need to
- // find the next operation as we already know the
- // bits that are valid at this point.
- if (andOp) {
- return true;
- }
- if (src->getOpcode() == Instruction::Shl && !shift) {
- shift = dyn_cast<Constant>(src->getOperand(1));
- src = dyn_cast<Instruction>(src->getOperand(0));
- } else if (src->getOpcode() == Instruction::And && !mask) {
- mask = dyn_cast<Constant>(src->getOperand(1));
- }
- if (!mask && !shift) {
- if (mDebug) {
- dbgs() << "Failed setup since both mask and shift are NULL!\n";
- }
- // Did not find a constant mask or a shift.
- return false;
- }
- return true;
-}
-bool
-AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) {
- if (!inst) {
- return false;
- }
- if (!inst->isBinaryOp()) {
- return false;
- }
- if (inst->getOpcode() != Instruction::Or) {
- return false;
- }
- if (optLevel == CodeGenOpt::None) {
- return false;
- }
- // We want to do an optimization on a sequence of ops that in the end equals a
- // single ISA instruction.
- // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
- // Some simplified versions of this pattern are as follows:
- // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
- // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
- // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
- // (A & B) | (D << F) when (1 << F) >= B
- // (A << C) | (D & E) when (1 << C) >= E
- if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
- // The HD4XXX hardware doesn't support the ubit_insert instruction.
- return false;
- }
- Type *aType = inst->getType();
- bool isVector = aType->isVectorTy();
- int numEle = 1;
- // This optimization only works on 32bit integers.
- if (aType->getScalarType()
- != Type::getInt32Ty(inst->getContext())) {
- return false;
- }
- if (isVector) {
- const VectorType *VT = dyn_cast<VectorType>(aType);
- numEle = VT->getNumElements();
- // We currently cannot support more than 4 elements in a intrinsic and we
- // cannot support Vec3 types.
- if (numEle > 4 || numEle == 3) {
- return false;
- }
- }
- // TODO: Handle vectors.
- if (isVector) {
- if (mDebug) {
- dbgs() << "!!! Vectors are not supported yet!\n";
- }
- return false;
- }
- Instruction *LHSSrc = NULL, *RHSSrc = NULL;
- Constant *LHSMask = NULL, *RHSMask = NULL;
- Constant *LHSShift = NULL, *RHSShift = NULL;
- Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
- Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
- if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
- if (mDebug) {
- dbgs() << "Found an OR Operation that failed setup!\n";
- inst->dump();
- if (LHS) { LHS->dump(); }
- if (LHSSrc) { LHSSrc->dump(); }
- if (LHSMask) { LHSMask->dump(); }
- if (LHSShift) { LHSShift->dump(); }
- }
- // There was an issue with the setup for BitInsert.
- return false;
- }
- if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
- if (mDebug) {
- dbgs() << "Found an OR Operation that failed setup!\n";
- inst->dump();
- if (RHS) { RHS->dump(); }
- if (RHSSrc) { RHSSrc->dump(); }
- if (RHSMask) { RHSMask->dump(); }
- if (RHSShift) { RHSShift->dump(); }
- }
- // There was an issue with the setup for BitInsert.
- return false;
- }
- if (mDebug) {
- dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
- dbgs() << "Op: "; inst->dump();
- dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
- }
- Constant *offset = NULL;
- Constant *width = NULL;
- uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
- uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
- uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
- uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
- lhsMaskVal = (LHSMask
- ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
- rhsMaskVal = (RHSMask
- ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
- lhsShiftVal = (LHSShift
- ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
- rhsShiftVal = (RHSShift
- ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
- lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
- rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
- lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
- rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
- // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
- if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
- return false;
- }
- if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
- offset = ConstantInt::get(aType, lhsMaskOffset, false);
- width = ConstantInt::get(aType, lhsMaskWidth, false);
- RHSSrc = RHS;
- if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
- return false;
- }
- if (!LHSShift) {
- LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
- "MaskShr", LHS);
- } else if (lhsShiftVal != lhsMaskOffset) {
- LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
- "MaskShr", LHS);
- }
- if (mDebug) {
- dbgs() << "Optimizing LHS!\n";
- }
- } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
- offset = ConstantInt::get(aType, rhsMaskOffset, false);
- width = ConstantInt::get(aType, rhsMaskWidth, false);
- LHSSrc = RHSSrc;
- RHSSrc = LHS;
- if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
- return false;
- }
- if (!RHSShift) {
- LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
- "MaskShr", RHS);
- } else if (rhsShiftVal != rhsMaskOffset) {
- LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
- "MaskShr", RHS);
- }
- if (mDebug) {
- dbgs() << "Optimizing RHS!\n";
- }
- } else {
- if (mDebug) {
- dbgs() << "Failed constraint 3!\n";
- }
- return false;
- }
- if (mDebug) {
- dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
- dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
- dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
- dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
- }
- if (!offset || !width) {
- if (mDebug) {
- dbgs() << "Either width or offset are NULL, failed detection!\n";
- }
- return false;
- }
- // Lets create the function signature.
- std::vector<Type *> callTypes;
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- FunctionType *funcType = FunctionType::get(aType, callTypes, false);
- std::string name = "__amdil_ubit_insert";
- if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
- Function *Func =
- dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
- getOrInsertFunction(StringRef(name), funcType));
- Value *Operands[4] = {
- width,
- offset,
- LHSSrc,
- RHSSrc
- };
- CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
- if (mDebug) {
- dbgs() << "Old Inst: ";
- inst->dump();
- dbgs() << "New Inst: ";
- CI->dump();
- dbgs() << "\n\n";
- }
- CI->insertBefore(inst);
- inst->replaceAllUsesWith(CI);
- return true;
-}
-
-bool
-AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) {
- if (!inst) {
- return false;
- }
- if (!inst->isBinaryOp()) {
- return false;
- }
- if (inst->getOpcode() != Instruction::And) {
- return false;
- }
- if (optLevel == CodeGenOpt::None) {
- return false;
- }
- // We want to do some simple optimizations on Shift right/And patterns. The
- // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
- // value smaller than 32 and C is a mask. If C is a constant value, then the
- // following transformation can occur. For signed integers, it turns into the
- // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
- // integers, it turns into the function call dst =
- // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
- // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
- // Evergreen hardware.
- if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
- // This does not work on HD4XXX hardware.
- return false;
- }
- Type *aType = inst->getType();
- bool isVector = aType->isVectorTy();
-
- // XXX Support vector types
- if (isVector) {
- return false;
- }
- int numEle = 1;
- // This only works on 32bit integers
- if (aType->getScalarType()
- != Type::getInt32Ty(inst->getContext())) {
- return false;
- }
- if (isVector) {
- const VectorType *VT = dyn_cast<VectorType>(aType);
- numEle = VT->getNumElements();
- // We currently cannot support more than 4 elements in a intrinsic and we
- // cannot support Vec3 types.
- if (numEle > 4 || numEle == 3) {
- return false;
- }
- }
- BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
- // If the first operand is not a shift instruction, then we can return as it
- // doesn't match this pattern.
- if (!ShiftInst || !ShiftInst->isShift()) {
- return false;
- }
- // If we are a shift left, then we need don't match this pattern.
- if (ShiftInst->getOpcode() == Instruction::Shl) {
- return false;
- }
- bool isSigned = ShiftInst->isArithmeticShift();
- Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
- Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
- // Lets make sure that the shift value and the and mask are constant integers.
- if (!AndMask || !ShrVal) {
- return false;
- }
- Constant *newMaskConst;
- Constant *shiftValConst;
- if (isVector) {
- // Handle the vector case
- std::vector<Constant *> maskVals;
- std::vector<Constant *> shiftVals;
- ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
- ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
- Type *scalarType = AndMaskVec->getType()->getScalarType();
- assert(AndMaskVec->getNumOperands() ==
- ShrValVec->getNumOperands() && "cannot have a "
- "combination where the number of elements to a "
- "shift and an and are different!");
- for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
- ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
- ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
- if (!AndCI || !ShiftIC) {
- return false;
- }
- uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
- if (!isMask_32(maskVal)) {
- return false;
- }
- maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
- uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
- // If the mask or shiftval is greater than the bitcount, then break out.
- if (maskVal >= 32 || shiftVal >= 32) {
- return false;
- }
- // If the mask val is greater than the the number of original bits left
- // then this optimization is invalid.
- if (maskVal > (32 - shiftVal)) {
- return false;
- }
- maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
- shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
- }
- newMaskConst = ConstantVector::get(maskVals);
- shiftValConst = ConstantVector::get(shiftVals);
- } else {
- // Handle the scalar case
- uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
- // This must be a mask value where all lower bits are set to 1 and then any
- // bit higher is set to 0.
- if (!isMask_32(maskVal)) {
- return false;
- }
- maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
- // Count the number of bits set in the mask, this is the width of the
- // resulting bit set that is extracted from the source value.
- uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
- // If the mask or shift val is greater than the bitcount, then break out.
- if (maskVal >= 32 || shiftVal >= 32) {
- return false;
- }
- // If the mask val is greater than the the number of original bits left then
- // this optimization is invalid.
- if (maskVal > (32 - shiftVal)) {
- return false;
- }
- newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
- shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
- }
- // Lets create the function signature.
- std::vector<Type *> callTypes;
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- FunctionType *funcType = FunctionType::get(aType, callTypes, false);
- std::string name = "llvm.AMDGPU.bit.extract.u32";
- if (isVector) {
- name += ".v" + itostr(numEle) + "i32";
- } else {
- name += ".";
- }
- // Lets create the function.
- Function *Func =
- dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
- getOrInsertFunction(StringRef(name), funcType));
- Value *Operands[3] = {
- ShiftInst->getOperand(0),
- shiftValConst,
- newMaskConst
- };
- // Lets create the Call with the operands
- CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
- CI->setDoesNotAccessMemory();
- CI->insertBefore(inst);
- inst->replaceAllUsesWith(CI);
- return true;
-}
-
-bool
-AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
- if (!CI) {
- return false;
- }
- Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
- if (!LHS->getName().startswith("__amdil_bfi")) {
- return false;
- }
- Type* type = CI->getOperand(0)->getType();
- Constant *negOneConst = NULL;
- if (type->isVectorTy()) {
- std::vector<Constant *> negOneVals;
- negOneConst = ConstantInt::get(CI->getContext(),
- APInt(32, StringRef("-1"), 10));
- for (size_t x = 0,
- y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
- negOneVals.push_back(negOneConst);
- }
- negOneConst = ConstantVector::get(negOneVals);
- } else {
- negOneConst = ConstantInt::get(CI->getContext(),
- APInt(32, StringRef("-1"), 10));
- }
- // __amdil_bfi => (A & B) | (~A & C)
- BinaryOperator *lhs =
- BinaryOperator::Create(Instruction::And, CI->getOperand(0),
- CI->getOperand(1), "bfi_and", CI);
- BinaryOperator *rhs =
- BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
- "bfi_not", CI);
- rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
- "bfi_and", CI);
- lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
- CI->replaceAllUsesWith(lhs);
- return true;
-}
-
-bool
-AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
- if (!CI) {
- return false;
- }
- Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
- if (!LHS->getName().startswith("__amdil_bfm")) {
- return false;
- }
- // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
- Constant *newMaskConst = NULL;
- Constant *newShiftConst = NULL;
- Type* type = CI->getOperand(0)->getType();
- if (type->isVectorTy()) {
- std::vector<Constant*> newMaskVals, newShiftVals;
- newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
- newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
- for (size_t x = 0,
- y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
- newMaskVals.push_back(newMaskConst);
- newShiftVals.push_back(newShiftConst);
- }
- newMaskConst = ConstantVector::get(newMaskVals);
- newShiftConst = ConstantVector::get(newShiftVals);
- } else {
- newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
- newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
- }
- BinaryOperator *lhs =
- BinaryOperator::Create(Instruction::And, CI->getOperand(0),
- newMaskConst, "bfm_mask", CI);
- lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
- lhs, "bfm_shl", CI);
- lhs = BinaryOperator::Create(Instruction::Sub, lhs,
- newShiftConst, "bfm_sub", CI);
- BinaryOperator *rhs =
- BinaryOperator::Create(Instruction::And, CI->getOperand(1),
- newMaskConst, "bfm_mask", CI);
- lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
- CI->replaceAllUsesWith(lhs);
- return true;
-}
-
-bool
-AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) {
- Instruction *inst = (*bbb);
- if (optimizeCallInst(bbb)) {
- return true;
- }
- if (optimizeBitExtract(inst)) {
- return false;
- }
- if (optimizeBitInsert(inst)) {
- return false;
- }
- if (correctMisalignedMemOp(inst)) {
- return false;
- }
- return false;
-}
-bool
-AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
- LoadInst *linst = dyn_cast<LoadInst>(inst);
- StoreInst *sinst = dyn_cast<StoreInst>(inst);
- unsigned alignment;
- Type* Ty = inst->getType();
- if (linst) {
- alignment = linst->getAlignment();
- Ty = inst->getType();
- } else if (sinst) {
- alignment = sinst->getAlignment();
- Ty = sinst->getValueOperand()->getType();
- } else {
- return false;
- }
- unsigned size = getTypeSize(Ty);
- if (size == alignment || size < alignment) {
- return false;
- }
- if (!Ty->isStructTy()) {
- return false;
- }
- if (alignment < 4) {
- if (linst) {
- linst->setAlignment(0);
- return true;
- } else if (sinst) {
- sinst->setAlignment(0);
- return true;
- }
- }
- return false;
-}
-bool
-AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) {
- if (!CI) {
- return false;
- }
- Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
- std::string namePrefix = LHS->getName().substr(0, 14);
- if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
- && namePrefix != "__amdil__imul24_high") {
- return false;
- }
- if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
- return false;
- }
- return true;
-}
-
-void
-AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) {
- assert(isSigned24BitOps(CI) && "Must be a "
- "signed 24 bit operation to call this function!");
- Value *LHS = CI->getOperand(CI->getNumOperands()-1);
- // On 7XX and 8XX we do not have signed 24bit, so we need to
- // expand it to the following:
- // imul24 turns into 32bit imul
- // imad24 turns into 32bit imad
- // imul24_high turns into 32bit imulhigh
- if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
- Type *aType = CI->getOperand(0)->getType();
- bool isVector = aType->isVectorTy();
- int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
- std::vector<Type*> callTypes;
- callTypes.push_back(CI->getOperand(0)->getType());
- callTypes.push_back(CI->getOperand(1)->getType());
- callTypes.push_back(CI->getOperand(2)->getType());
- FunctionType *funcType =
- FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
- std::string name = "__amdil_imad";
- if (isVector) {
- name += "_v" + itostr(numEle) + "i32";
- } else {
- name += "_i32";
- }
- Function *Func = dyn_cast<Function>(
- CI->getParent()->getParent()->getParent()->
- getOrInsertFunction(StringRef(name), funcType));
- Value *Operands[3] = {
- CI->getOperand(0),
- CI->getOperand(1),
- CI->getOperand(2)
- };
- CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
- nCI->insertBefore(CI);
- CI->replaceAllUsesWith(nCI);
- } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
- BinaryOperator *mulOp =
- BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
- CI->getOperand(1), "imul24", CI);
- CI->replaceAllUsesWith(mulOp);
- } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
- Type *aType = CI->getOperand(0)->getType();
-
- bool isVector = aType->isVectorTy();
- int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
- std::vector<Type*> callTypes;
- callTypes.push_back(CI->getOperand(0)->getType());
- callTypes.push_back(CI->getOperand(1)->getType());
- FunctionType *funcType =
- FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
- std::string name = "__amdil_imul_high";
- if (isVector) {
- name += "_v" + itostr(numEle) + "i32";
- } else {
- name += "_i32";
- }
- Function *Func = dyn_cast<Function>(
- CI->getParent()->getParent()->getParent()->
- getOrInsertFunction(StringRef(name), funcType));
- Value *Operands[2] = {
- CI->getOperand(0),
- CI->getOperand(1)
- };
- CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
- nCI->insertBefore(CI);
- CI->replaceAllUsesWith(nCI);
- }
-}
-
-bool
-AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) {
- return (CI != NULL
- && CI->getOperand(CI->getNumOperands() - 1)->getName()
- == "__amdil_get_local_size_int");
-}
-
-bool
-AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) {
- if (!CI) {
- return false;
- }
- if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
- && (mSTM->getDeviceName() == "cayman")) {
- return false;
- }
- return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
- == "__amdil_improved_div";
-}
-
-void
-AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) {
- assert(convertAccurateDivide(CI)
- && "expanding accurate divide can only happen if it is expandable!");
- BinaryOperator *divOp =
- BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
- CI->getOperand(1), "fdiv32", CI);
- CI->replaceAllUsesWith(divOp);
-}
-
-bool
-AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
- if (optLevel != CodeGenOpt::None) {
- return false;
- }
-
- if (!CI) {
- return false;
- }
-
- unsigned funcNameIdx = 0;
- funcNameIdx = CI->getNumOperands() - 1;
- StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
- if (calleeName != "__amdil_image2d_read_norm"
- && calleeName != "__amdil_image2d_read_unnorm"
- && calleeName != "__amdil_image3d_read_norm"
- && calleeName != "__amdil_image3d_read_unnorm") {
- return false;
- }
-
- unsigned samplerIdx = 2;
- samplerIdx = 1;
- Value *sampler = CI->getOperand(samplerIdx);
- LoadInst *lInst = dyn_cast<LoadInst>(sampler);
- if (!lInst) {
- return false;
- }
-
- if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
- return false;
- }
-
- GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
- // If we are loading from what is not a global value, then we
- // fail and return.
- if (!gv) {
- return false;
- }
-
- // If we don't have an initializer or we have an initializer and
- // the initializer is not a 32bit integer, we fail.
- if (!gv->hasInitializer()
- || !gv->getInitializer()->getType()->isIntegerTy(32)) {
- return false;
- }
-
- // Now that we have the global variable initializer, lets replace
- // all uses of the load instruction with the samplerVal and
- // reparse the __amdil_is_constant() function.
- Constant *samplerVal = gv->getInitializer();
- lInst->replaceAllUsesWith(samplerVal);
- return true;
-}
-
-bool
-AMDGPUPeepholeOpt::doInitialization(Module &M) {
- return false;
-}
-
-bool
-AMDGPUPeepholeOpt::doFinalization(Module &M) {
- return false;
-}
-
-void
-AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<MachineFunctionAnalysis>();
- FunctionPass::getAnalysisUsage(AU);
- AU.setPreservesAll();
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
- size_t size = 0;
- if (!T) {
- return size;
- }
- switch (T->getTypeID()) {
- case Type::X86_FP80TyID:
- case Type::FP128TyID:
- case Type::PPC_FP128TyID:
- case Type::LabelTyID:
- assert(0 && "These types are not supported by this backend");
- default:
- case Type::FloatTyID:
- case Type::DoubleTyID:
- size = T->getPrimitiveSizeInBits() >> 3;
- break;
- case Type::PointerTyID:
- size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
- break;
- case Type::IntegerTyID:
- size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
- break;
- case Type::StructTyID:
- size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
- break;
- case Type::ArrayTyID:
- size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
- break;
- case Type::FunctionTyID:
- size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
- break;
- case Type::VectorTyID:
- size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
- break;
- };
- return size;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
- bool dereferencePtr) {
- size_t size = 0;
- if (!ST) {
- return size;
- }
- Type *curType;
- StructType::element_iterator eib;
- StructType::element_iterator eie;
- for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
- curType = *eib;
- size += getTypeSize(curType, dereferencePtr);
- }
- return size;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
- bool dereferencePtr) {
- return IT ? (IT->getBitWidth() >> 3) : 0;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
- bool dereferencePtr) {
- assert(0 && "Should not be able to calculate the size of an function type");
- return 0;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
- bool dereferencePtr) {
- return (size_t)(AT ? (getTypeSize(AT->getElementType(),
- dereferencePtr) * AT->getNumElements())
- : 0);
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
- bool dereferencePtr) {
- return VT ? (VT->getBitWidth() >> 3) : 0;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
- bool dereferencePtr) {
- if (!PT) {
- return 0;
- }
- Type *CT = PT->getElementType();
- if (CT->getTypeID() == Type::StructTyID &&
- PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
- return getTypeSize(dyn_cast<StructType>(CT));
- } else if (dereferencePtr) {
- size_t size = 0;
- for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
- size += getTypeSize(PT->getContainedType(x), dereferencePtr);
- }
- return size;
- } else {
- return 4;
- }
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
- bool dereferencePtr) {
- //assert(0 && "Should not be able to calculate the size of an opaque type");
- return 4;
-}
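
The bulk of the deleted pass is the __amdil_bfm lowering shown above, which implements ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1F). As a reading aid, a minimal scalar sketch of that arithmetic (hypothetical helper, not code from this patch):

#include <cstdint>

// Scalar model of the removed IR expansion: the "bfm_mask"/"bfm_shl"/"bfm_sub"
// instructions build the mask, then the final "bfm_shl" positions it.
static uint32_t expandBFM(uint32_t src0, uint32_t src1) {
  uint32_t width = src0 & 0x1F;        // "bfm_mask" on operand 0
  uint32_t mask = (1u << width) - 1;   // "bfm_shl" then "bfm_sub"
  return mask << (src1 & 0x1F);        // "bfm_mask" on operand 1, final shift
}
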
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 8efba58..97f0a40 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -21,7 +21,6 @@ add_llvm_target(R600CodeGen
AMDILISelDAGToDAG.cpp
AMDILISelLowering.cpp
AMDILNIDevice.cpp
- AMDILPeepholeOptimizer.cpp
AMDILSIDevice.cpp
AMDGPUAsmPrinter.cpp
AMDGPUFrameLowering.cpp
@@ -42,6 +41,7 @@ add_llvm_target(R600CodeGen
R600ISelLowering.cpp
R600MachineFunctionInfo.cpp
R600MachineScheduler.cpp
+ R600Packetizer.cpp
R600RegisterInfo.cpp
SIAnnotateControlFlow.cpp
SIInsertWaits.cpp
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 10547a5..303cdf2 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -17,6 +17,7 @@ using namespace llvm;
void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
StringRef Annot) {
+ OS.flush();
printInstruction(MI, OS);
printAnnotation(OS, Annot);
@@ -67,11 +68,14 @@ void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
}
void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
- raw_ostream &O, StringRef Asm) {
+ raw_ostream &O, StringRef Asm,
+ StringRef Default) {
const MCOperand &Op = MI->getOperand(OpNo);
assert(Op.isImm());
if (Op.getImm() == 1) {
O << Asm;
+ } else {
+ O << Default;
}
}
@@ -98,7 +102,7 @@ void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- printIfSet(MI, OpNo, O, " *");
+ printIfSet(MI, OpNo, O.indent(20 - O.GetNumBytesInBuffer()), "*", " ");
}
void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
@@ -169,4 +173,41 @@ void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
O << "." << chans[chan];
}
+void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ int BankSwizzle = MI->getOperand(OpNo).getImm();
+ switch (BankSwizzle) {
+ case 1:
+ O << "BS:VEC_021";
+ break;
+ case 2:
+ O << "BS:VEC_120";
+ break;
+ case 3:
+ O << "BS:VEC_102";
+ break;
+ case 4:
+ O << "BS:VEC_201";
+ break;
+ case 5:
+ O << "BS:VEC_210";
+ break;
+ default:
+ break;
+ }
+ return;
+}
+
+void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ int KCacheMode = MI->getOperand(OpNo).getImm();
+ if (KCacheMode > 0) {
+ int KCacheBank = MI->getOperand(OpNo - 2).getImm();
+ O << "CB" << KCacheBank <<":";
+ int KCacheAddr = MI->getOperand(OpNo + 2).getImm();
+ int LineSize = (KCacheMode == 1)?16:32;
+ O << KCacheAddr * 16 << "-" << KCacheAddr * 16 + LineSize;
+ }
+}
+
#include "AMDGPUGenAsmWriter.inc"
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
index 767a708..c6fd053 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@@ -35,7 +35,8 @@ private:
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm);
+ void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+ StringRef Asm, StringRef Default = "");
void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
@@ -47,6 +48,8 @@ private:
void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
};
} // End namespace llvm
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index 98fca43..a3397f3 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -44,7 +44,6 @@ public:
AMDGPUAsmBackend(const Target &T)
: MCAsmBackend() {}
- virtual AMDGPUMCObjectWriter *createObjectWriter(raw_ostream &OS) const;
virtual unsigned getNumFixupKinds() const { return 0; };
virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
uint64_t Value) const;
@@ -71,16 +70,6 @@ void AMDGPUMCObjectWriter::WriteObject(MCAssembler &Asm,
}
}
-MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT,
- StringRef CPU) {
- return new AMDGPUAsmBackend(T);
-}
-
-AMDGPUMCObjectWriter * AMDGPUAsmBackend::createObjectWriter(
- raw_ostream &OS) const {
- return new AMDGPUMCObjectWriter(OS);
-}
-
void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value) const {
@@ -88,3 +77,21 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
assert(Fixup.getKind() == FK_PCRel_4);
*Dst = (Value - 4) / 4;
}
+
+//===----------------------------------------------------------------------===//
+// ELFAMDGPUAsmBackend class
+//===----------------------------------------------------------------------===//
+
+class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
+public:
+ ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { }
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ return createAMDGPUELFObjectWriter(OS);
+ }
+};
+
+MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT,
+ StringRef CPU) {
+ return new ELFAMDGPUAsmBackend(T);
+}
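
applyFixup stores the FK_PCRel_4 value in 4-byte instruction words, measured past the fixed-up word itself, hence (Value - 4) / 4. A quick standalone check of that math (assumed semantics, not patch code):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Value = 12;                 // byte distance recorded for FK_PCRel_4
  uint32_t Encoded = (Value - 4) / 4;  // offset in 4-byte words past the fixup
  assert(Encoded == 2);
  return 0;
}
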
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp
new file mode 100644
index 0000000..48fac9f
--- /dev/null
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -0,0 +1,39 @@
+//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ AMDGPUELFObjectWriter();
+protected:
+ virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel, bool IsRelocWithSymbol,
+ int64_t Addend) const {
+ llvm_unreachable("Not implemented");
+ }
+
+};
+
+
+} // End anonymous namespace
+
+AMDGPUELFObjectWriter::AMDGPUELFObjectWriter()
+ : MCELFObjectTargetWriter(false, 0, 0, false) { }
+
+MCObjectWriter *llvm::createAMDGPUELFObjectWriter(raw_ostream &OS) {
+ MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter();
+ return createELFObjectWriter(MOTW, OS, true);
+}
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index b7cdd7c..2aae26a 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -68,8 +68,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() {
//===--- Dwarf Emission Directives -----------------------------------===//
HasLEB128 = true;
SupportsDebugInformation = true;
- DwarfSectionOffsetDirective = ".offset";
-
}
const char*
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 072ee49..61d70bb 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -78,7 +78,7 @@ static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII,
if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) {
return createSIMCCodeEmitter(MCII, MRI, STI, Ctx);
} else {
- return createR600MCCodeEmitter(MCII, MRI, STI, Ctx);
+ return createR600MCCodeEmitter(MCII, MRI, STI);
}
}
@@ -88,7 +88,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
MCCodeEmitter *_Emitter,
bool RelaxAll,
bool NoExecStack) {
- return createPureStreamer(Ctx, MAB, _OS, _Emitter);
+ return createELFStreamer(Ctx, MAB, _OS, _Emitter, false, false);
}
extern "C" void LLVMInitializeR600TargetMC() {
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
index 363a4af..abb0320 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -23,16 +23,17 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
+class MCObjectWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class Target;
+class raw_ostream;
extern Target TheAMDGPUTarget;
MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI,
- MCContext &Ctx);
+ const MCSubtargetInfo &STI);
MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
@@ -41,6 +42,8 @@ MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
MCAsmBackend *createAMDGPUAsmBackend(const Target &T, StringRef TT,
StringRef CPU);
+
+MCObjectWriter *createAMDGPUELFObjectWriter(raw_ostream &OS);
} // End llvm namespace
#define GET_REGINFO_ENUM
diff --git a/lib/Target/R600/MCTargetDesc/CMakeLists.txt b/lib/Target/R600/MCTargetDesc/CMakeLists.txt
index 37e714c..3ccdf42 100644
--- a/lib/Target/R600/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/R600/MCTargetDesc/CMakeLists.txt
@@ -1,6 +1,7 @@
add_llvm_library(LLVMR600Desc
AMDGPUAsmBackend.cpp
+ AMDGPUELFObjectWriter.cpp
AMDGPUMCTargetDesc.cpp
AMDGPUMCAsmInfo.cpp
R600MCCodeEmitter.cpp
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 927bcbd..cb4cf0c 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -9,12 +9,8 @@
//
/// \file
///
-/// This code emitter outputs bytecode that is understood by the r600g driver
-/// in the Mesa [1] project. The bytecode is very similar to the hardware's ISA,
-/// but it still needs to be run through a finalizer in order to be executed
-/// by the GPU.
-///
-/// [1] http://www.mesa3d.org/
+/// \brief The R600 code emitter produces machine code that can be executed
+/// directly on the GPU device.
//
//===----------------------------------------------------------------------===//
@@ -30,9 +26,6 @@
#include "llvm/Support/raw_ostream.h"
#include <stdio.h>
-#define SRC_BYTE_COUNT 11
-#define DST_BYTE_COUNT 5
-
using namespace llvm;
namespace {
@@ -43,13 +36,12 @@ class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
const MCInstrInfo &MCII;
const MCRegisterInfo &MRI;
const MCSubtargetInfo &STI;
- MCContext &Ctx;
public:
R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
- const MCSubtargetInfo &sti, MCContext &ctx)
- : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { }
+ const MCSubtargetInfo &sti)
+ : MCII(mcii), MRI(mri), STI(sti) { }
/// \brief Encode the instruction and write it to the OS.
virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -60,30 +52,14 @@ public:
SmallVectorImpl<MCFixup> &Fixups) const;
private:
- void EmitALUInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
- raw_ostream &OS) const;
- void EmitSrc(const MCInst &MI, unsigned OpIdx, raw_ostream &OS) const;
- void EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, unsigned SelOpIdx,
- raw_ostream &OS) const;
- void EmitDst(const MCInst &MI, raw_ostream &OS) const;
- void EmitFCInstr(const MCInst &MI, raw_ostream &OS) const;
-
- void EmitNullBytes(unsigned int byteCount, raw_ostream &OS) const;
-
void EmitByte(unsigned int byte, raw_ostream &OS) const;
- void EmitTwoBytes(uint32_t bytes, raw_ostream &OS) const;
-
void Emit(uint32_t value, raw_ostream &OS) const;
void Emit(uint64_t value, raw_ostream &OS) const;
unsigned getHWRegChan(unsigned reg) const;
unsigned getHWReg(unsigned regNo) const;
- bool isFCOp(unsigned opcode) const;
- bool isTexOp(unsigned opcode) const;
- bool isFlagSet(const MCInst &MI, unsigned Operand, unsigned Flag) const;
-
};
} // End anonymous namespace
@@ -95,16 +71,6 @@ enum RegElement {
ELEMENT_W
};
-enum InstrTypes {
- INSTR_ALU = 0,
- INSTR_TEX,
- INSTR_FC,
- INSTR_NATIVE,
- INSTR_VTX,
- INSTR_EXPORT,
- INSTR_CFALU
-};
-
enum FCInstr {
FC_IF_PREDICATE = 0,
FC_ELSE,
@@ -132,355 +98,95 @@ enum TextureTypes {
MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI,
- MCContext &Ctx) {
- return new R600MCCodeEmitter(MCII, MRI, STI, Ctx);
+ const MCSubtargetInfo &STI) {
+ return new R600MCCodeEmitter(MCII, MRI, STI);
}
void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups) const {
- if (isFCOp(MI.getOpcode())){
- EmitFCInstr(MI, OS);
- } else if (MI.getOpcode() == AMDGPU::RETURN ||
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ if (MI.getOpcode() == AMDGPU::RETURN ||
+ MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
+ MI.getOpcode() == AMDGPU::ALU_CLAUSE ||
MI.getOpcode() == AMDGPU::BUNDLE ||
MI.getOpcode() == AMDGPU::KILL) {
return;
- } else {
- switch(MI.getOpcode()) {
- case AMDGPU::STACK_SIZE: {
- EmitByte(MI.getOperand(0).getImm(), OS);
- break;
- }
- case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
- case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
- uint64_t inst = getBinaryCodeForInstr(MI, Fixups);
- EmitByte(INSTR_NATIVE, OS);
- Emit(inst, OS);
- break;
- }
- case AMDGPU::CONSTANT_LOAD_eg:
- case AMDGPU::VTX_READ_PARAM_8_eg:
- case AMDGPU::VTX_READ_PARAM_16_eg:
- case AMDGPU::VTX_READ_PARAM_32_eg:
- case AMDGPU::VTX_READ_PARAM_128_eg:
- case AMDGPU::VTX_READ_GLOBAL_8_eg:
- case AMDGPU::VTX_READ_GLOBAL_32_eg:
- case AMDGPU::VTX_READ_GLOBAL_128_eg:
- case AMDGPU::TEX_VTX_CONSTBUF:
- case AMDGPU::TEX_VTX_TEXBUF : {
- uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
- uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
-
- EmitByte(INSTR_VTX, OS);
- Emit(InstWord01, OS);
- Emit(InstWord2, OS);
- break;
- }
- case AMDGPU::TEX_LD:
- case AMDGPU::TEX_GET_TEXTURE_RESINFO:
- case AMDGPU::TEX_SAMPLE:
- case AMDGPU::TEX_SAMPLE_C:
- case AMDGPU::TEX_SAMPLE_L:
- case AMDGPU::TEX_SAMPLE_C_L:
- case AMDGPU::TEX_SAMPLE_LB:
- case AMDGPU::TEX_SAMPLE_C_LB:
- case AMDGPU::TEX_SAMPLE_G:
- case AMDGPU::TEX_SAMPLE_C_G:
- case AMDGPU::TEX_GET_GRADIENTS_H:
- case AMDGPU::TEX_GET_GRADIENTS_V:
- case AMDGPU::TEX_SET_GRADIENTS_H:
- case AMDGPU::TEX_SET_GRADIENTS_V: {
- unsigned Opcode = MI.getOpcode();
- bool HasOffsets = (Opcode == AMDGPU::TEX_LD);
- unsigned OpOffset = HasOffsets ? 3 : 0;
- int64_t Sampler = MI.getOperand(OpOffset + 3).getImm();
- int64_t TextureType = MI.getOperand(OpOffset + 4).getImm();
-
- uint32_t SrcSelect[4] = {0, 1, 2, 3};
- uint32_t Offsets[3] = {0, 0, 0};
- uint64_t CoordType[4] = {1, 1, 1, 1};
-
- if (HasOffsets)
- for (unsigned i = 0; i < 3; i++) {
- int SignedOffset = MI.getOperand(i + 2).getImm();
- Offsets[i] = (SignedOffset & 0x1F);
- }
-
-
- if (TextureType == TEXTURE_RECT ||
- TextureType == TEXTURE_SHADOWRECT) {
- CoordType[ELEMENT_X] = 0;
- CoordType[ELEMENT_Y] = 0;
- }
-
- if (TextureType == TEXTURE_1D_ARRAY ||
- TextureType == TEXTURE_SHADOW1D_ARRAY) {
- if (Opcode == AMDGPU::TEX_SAMPLE_C_L ||
- Opcode == AMDGPU::TEX_SAMPLE_C_LB) {
- CoordType[ELEMENT_Y] = 0;
- } else {
- CoordType[ELEMENT_Z] = 0;
- SrcSelect[ELEMENT_Z] = ELEMENT_Y;
- }
- } else if (TextureType == TEXTURE_2D_ARRAY ||
- TextureType == TEXTURE_SHADOW2D_ARRAY) {
- CoordType[ELEMENT_Z] = 0;
+ } else if (IS_VTX(Desc)) {
+ uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
+ uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
+ InstWord2 |= 1 << 19;
+
+ Emit(InstWord01, OS);
+ Emit(InstWord2, OS);
+ Emit((uint32_t) 0, OS);
+ } else if (IS_TEX(Desc)) {
+ unsigned Opcode = MI.getOpcode();
+ bool HasOffsets = (Opcode == AMDGPU::TEX_LD);
+ unsigned OpOffset = HasOffsets ? 3 : 0;
+ int64_t Sampler = MI.getOperand(OpOffset + 3).getImm();
+ int64_t TextureType = MI.getOperand(OpOffset + 4).getImm();
+
+ uint32_t SrcSelect[4] = {0, 1, 2, 3};
+ uint32_t Offsets[3] = {0, 0, 0};
+ uint64_t CoordType[4] = {1, 1, 1, 1};
+
+ if (HasOffsets)
+ for (unsigned i = 0; i < 3; i++) {
+ int SignedOffset = MI.getOperand(i + 2).getImm();
+ Offsets[i] = (SignedOffset & 0x1F);
}
-
- if ((TextureType == TEXTURE_SHADOW1D ||
- TextureType == TEXTURE_SHADOW2D ||
- TextureType == TEXTURE_SHADOWRECT ||
- TextureType == TEXTURE_SHADOW1D_ARRAY) &&
- Opcode != AMDGPU::TEX_SAMPLE_C_L &&
- Opcode != AMDGPU::TEX_SAMPLE_C_LB) {
- SrcSelect[ELEMENT_W] = ELEMENT_Z;
- }
-
- uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups) |
- CoordType[ELEMENT_X] << 60 | CoordType[ELEMENT_Y] << 61 |
- CoordType[ELEMENT_Z] << 62 | CoordType[ELEMENT_W] << 63;
- uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 |
- SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 |
- SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 |
- Offsets[2] << 10;
-
- EmitByte(INSTR_TEX, OS);
- Emit(Word01, OS);
- Emit(Word2, OS);
- break;
- }
- case AMDGPU::EG_ExportSwz:
- case AMDGPU::R600_ExportSwz:
- case AMDGPU::EG_ExportBuf:
- case AMDGPU::R600_ExportBuf: {
- uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
- EmitByte(INSTR_EXPORT, OS);
- Emit(Inst, OS);
- break;
- }
- case AMDGPU::CF_ALU:
- case AMDGPU::CF_ALU_PUSH_BEFORE: {
- uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
- EmitByte(INSTR_CFALU, OS);
- Emit(Inst, OS);
- break;
- }
- case AMDGPU::CF_TC:
- case AMDGPU::CF_VC:
- case AMDGPU::CF_CALL_FS:
- return;
- case AMDGPU::WHILE_LOOP:
- case AMDGPU::END_LOOP:
- case AMDGPU::LOOP_BREAK:
- case AMDGPU::CF_CONTINUE:
- case AMDGPU::CF_JUMP:
- case AMDGPU::CF_ELSE:
- case AMDGPU::POP: {
- uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
- EmitByte(INSTR_NATIVE, OS);
- Emit(Inst, OS);
- break;
+ if (TextureType == TEXTURE_RECT ||
+ TextureType == TEXTURE_SHADOWRECT) {
+ CoordType[ELEMENT_X] = 0;
+ CoordType[ELEMENT_Y] = 0;
}
- default:
- EmitALUInstr(MI, Fixups, OS);
- break;
- }
- }
-}
-
-void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI,
- SmallVectorImpl<MCFixup> &Fixups,
- raw_ostream &OS) const {
- const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode());
-
- // Emit instruction type
- EmitByte(INSTR_ALU, OS);
-
- uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
-
- //older alu have different encoding for instructions with one or two src
- //parameters.
- if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) &&
- !(MCDesc.TSFlags & R600_InstFlag::OP3)) {
- uint64_t ISAOpCode = InstWord01 & (0x3FFULL << 39);
- InstWord01 &= ~(0x3FFULL << 39);
- InstWord01 |= ISAOpCode << 1;
- }
-
- unsigned SrcNum = MCDesc.TSFlags & R600_InstFlag::OP3 ? 3 :
- MCDesc.TSFlags & R600_InstFlag::OP2 ? 2 : 1;
-
- EmitByte(SrcNum, OS);
-
- const unsigned SrcOps[3][2] = {
- {R600Operands::SRC0, R600Operands::SRC0_SEL},
- {R600Operands::SRC1, R600Operands::SRC1_SEL},
- {R600Operands::SRC2, R600Operands::SRC2_SEL}
- };
- for (unsigned SrcIdx = 0; SrcIdx < SrcNum; ++SrcIdx) {
- unsigned RegOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][0]];
- unsigned SelOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][1]];
- EmitSrcISA(MI, RegOpIdx, SelOpIdx, OS);
- }
-
- Emit(InstWord01, OS);
- return;
-}
-
-void R600MCCodeEmitter::EmitSrc(const MCInst &MI, unsigned OpIdx,
- raw_ostream &OS) const {
- const MCOperand &MO = MI.getOperand(OpIdx);
- union {
- float f;
- uint32_t i;
- } Value;
- Value.i = 0;
- // Emit the source select (2 bytes). For GPRs, this is the register index.
- // For other potential instruction operands, (e.g. constant registers) the
- // value of the source select is defined in the r600isa docs.
- if (MO.isReg()) {
- unsigned reg = MO.getReg();
- EmitTwoBytes(getHWReg(reg), OS);
- if (reg == AMDGPU::ALU_LITERAL_X) {
- unsigned ImmOpIndex = MI.getNumOperands() - 1;
- MCOperand ImmOp = MI.getOperand(ImmOpIndex);
- if (ImmOp.isFPImm()) {
- Value.f = ImmOp.getFPImm();
+ if (TextureType == TEXTURE_1D_ARRAY ||
+ TextureType == TEXTURE_SHADOW1D_ARRAY) {
+ if (Opcode == AMDGPU::TEX_SAMPLE_C_L ||
+ Opcode == AMDGPU::TEX_SAMPLE_C_LB) {
+ CoordType[ELEMENT_Y] = 0;
} else {
- assert(ImmOp.isImm());
- Value.i = ImmOp.getImm();
+ CoordType[ELEMENT_Z] = 0;
+ SrcSelect[ELEMENT_Z] = ELEMENT_Y;
}
+ } else if (TextureType == TEXTURE_2D_ARRAY ||
+ TextureType == TEXTURE_SHADOW2D_ARRAY) {
+ CoordType[ELEMENT_Z] = 0;
}
- } else {
- // XXX: Handle other operand types.
- EmitTwoBytes(0, OS);
- }
-
- // Emit the source channel (1 byte)
- if (MO.isReg()) {
- EmitByte(getHWRegChan(MO.getReg()), OS);
- } else {
- EmitByte(0, OS);
- }
-
- // XXX: Emit isNegated (1 byte)
- if ((!(isFlagSet(MI, OpIdx, MO_FLAG_ABS)))
- && (isFlagSet(MI, OpIdx, MO_FLAG_NEG) ||
- (MO.isReg() &&
- (MO.getReg() == AMDGPU::NEG_ONE || MO.getReg() == AMDGPU::NEG_HALF)))){
- EmitByte(1, OS);
- } else {
- EmitByte(0, OS);
- }
-
- // Emit isAbsolute (1 byte)
- if (isFlagSet(MI, OpIdx, MO_FLAG_ABS)) {
- EmitByte(1, OS);
- } else {
- EmitByte(0, OS);
- }
-
- // XXX: Emit relative addressing mode (1 byte)
- EmitByte(0, OS);
-
- // Emit kc_bank, This will be adjusted later by r600_asm
- EmitByte(0, OS);
- // Emit the literal value, if applicable (4 bytes).
- Emit(Value.i, OS);
-}
-
-void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned RegOpIdx,
- unsigned SelOpIdx, raw_ostream &OS) const {
- const MCOperand &RegMO = MI.getOperand(RegOpIdx);
- const MCOperand &SelMO = MI.getOperand(SelOpIdx);
-
- union {
- float f;
- uint32_t i;
- } InlineConstant;
- InlineConstant.i = 0;
- // Emit source type (1 byte) and source select (4 bytes). For GPRs type is 0
- // and select is 0 (GPR index is encoded in the instr encoding. For constants
- // type is 1 and select is the original const select passed from the driver.
- unsigned Reg = RegMO.getReg();
- if (Reg == AMDGPU::ALU_CONST) {
- EmitByte(1, OS);
- uint32_t Sel = SelMO.getImm();
- Emit(Sel, OS);
- } else {
- EmitByte(0, OS);
- Emit((uint32_t)0, OS);
- }
-
- if (Reg == AMDGPU::ALU_LITERAL_X) {
- unsigned ImmOpIndex = MI.getNumOperands() - 1;
- MCOperand ImmOp = MI.getOperand(ImmOpIndex);
- if (ImmOp.isFPImm()) {
- InlineConstant.f = ImmOp.getFPImm();
- } else {
- assert(ImmOp.isImm());
- InlineConstant.i = ImmOp.getImm();
+ if ((TextureType == TEXTURE_SHADOW1D ||
+ TextureType == TEXTURE_SHADOW2D ||
+ TextureType == TEXTURE_SHADOWRECT ||
+ TextureType == TEXTURE_SHADOW1D_ARRAY) &&
+ Opcode != AMDGPU::TEX_SAMPLE_C_L &&
+ Opcode != AMDGPU::TEX_SAMPLE_C_LB) {
+ SrcSelect[ELEMENT_W] = ELEMENT_Z;
}
- }
-
- // Emit the literal value, if applicable (4 bytes).
- Emit(InlineConstant.i, OS);
-}
-
-void R600MCCodeEmitter::EmitFCInstr(const MCInst &MI, raw_ostream &OS) const {
-
- // Emit instruction type
- EmitByte(INSTR_FC, OS);
- // Emit SRC
- unsigned NumOperands = MI.getNumOperands();
- if (NumOperands > 0) {
- assert(NumOperands == 1);
- EmitSrc(MI, 0, OS);
+ uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups) |
+ CoordType[ELEMENT_X] << 60 | CoordType[ELEMENT_Y] << 61 |
+ CoordType[ELEMENT_Z] << 62 | CoordType[ELEMENT_W] << 63;
+ uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 |
+ SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 |
+ SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 |
+ Offsets[2] << 10;
+
+ Emit(Word01, OS);
+ Emit(Word2, OS);
+ Emit((uint32_t) 0, OS);
} else {
- EmitNullBytes(SRC_BYTE_COUNT, OS);
- }
-
- // Emit FC Instruction
- enum FCInstr instr;
- switch (MI.getOpcode()) {
- case AMDGPU::PREDICATED_BREAK:
- instr = FC_BREAK_PREDICATE;
- break;
- case AMDGPU::CONTINUE:
- instr = FC_CONTINUE;
- break;
- case AMDGPU::IF_PREDICATE_SET:
- instr = FC_IF_PREDICATE;
- break;
- case AMDGPU::ELSE:
- instr = FC_ELSE;
- break;
- case AMDGPU::ENDIF:
- instr = FC_ENDIF;
- break;
- case AMDGPU::ENDLOOP:
- instr = FC_ENDLOOP;
- break;
- case AMDGPU::WHILELOOP:
- instr = FC_BGNLOOP;
- break;
- default:
- abort();
- break;
- }
- EmitByte(instr, OS);
-}
-
-void R600MCCodeEmitter::EmitNullBytes(unsigned int ByteCount,
- raw_ostream &OS) const {
-
- for (unsigned int i = 0; i < ByteCount; i++) {
- EmitByte(0, OS);
+ uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
+ if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) &&
+ ((Desc.TSFlags & R600_InstFlag::OP1) ||
+ Desc.TSFlags & R600_InstFlag::OP2)) {
+ uint64_t ISAOpCode = Inst & (0x3FFULL << 39);
+ Inst &= ~(0x3FFULL << 39);
+ Inst |= ISAOpCode << 1;
+ }
+ Emit(Inst, OS);
}
}
@@ -488,12 +194,6 @@ void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const {
OS.write((uint8_t) Byte & 0xff);
}
-void R600MCCodeEmitter::EmitTwoBytes(unsigned int Bytes,
- raw_ostream &OS) const {
- OS.write((uint8_t) (Bytes & 0xff));
- OS.write((uint8_t) ((Bytes >> 8) & 0xff));
-}
-
void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
for (unsigned i = 0; i < 4; i++) {
OS.write((uint8_t) ((Value >> (8 * i)) & 0xff));
@@ -531,55 +231,4 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
}
}
-//===----------------------------------------------------------------------===//
-// Encoding helper functions
-//===----------------------------------------------------------------------===//
-
-bool R600MCCodeEmitter::isFCOp(unsigned opcode) const {
- switch(opcode) {
- default: return false;
- case AMDGPU::PREDICATED_BREAK:
- case AMDGPU::CONTINUE:
- case AMDGPU::IF_PREDICATE_SET:
- case AMDGPU::ELSE:
- case AMDGPU::ENDIF:
- case AMDGPU::ENDLOOP:
- case AMDGPU::WHILELOOP:
- return true;
- }
-}
-
-bool R600MCCodeEmitter::isTexOp(unsigned opcode) const {
- switch(opcode) {
- default: return false;
- case AMDGPU::TEX_LD:
- case AMDGPU::TEX_GET_TEXTURE_RESINFO:
- case AMDGPU::TEX_SAMPLE:
- case AMDGPU::TEX_SAMPLE_C:
- case AMDGPU::TEX_SAMPLE_L:
- case AMDGPU::TEX_SAMPLE_C_L:
- case AMDGPU::TEX_SAMPLE_LB:
- case AMDGPU::TEX_SAMPLE_C_LB:
- case AMDGPU::TEX_SAMPLE_G:
- case AMDGPU::TEX_SAMPLE_C_G:
- case AMDGPU::TEX_GET_GRADIENTS_H:
- case AMDGPU::TEX_GET_GRADIENTS_V:
- case AMDGPU::TEX_SET_GRADIENTS_H:
- case AMDGPU::TEX_SET_GRADIENTS_V:
- return true;
- }
-}
-
-bool R600MCCodeEmitter::isFlagSet(const MCInst &MI, unsigned Operand,
- unsigned Flag) const {
- const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode());
- unsigned FlagIndex = GET_FLAG_OPERAND_IDX(MCDesc.TSFlags);
- if (FlagIndex == 0) {
- return false;
- }
- assert(MI.getOperand(FlagIndex).isImm());
- return !!((MI.getOperand(FlagIndex).getImm() >>
- (NUM_MO_FLAGS * Operand)) & Flag);
-}
-
#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index 868810c..0cbe919 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -1,4 +1,4 @@
-//===-- Processors.td - TODO: Add brief description -------===//
+//===-- Processors.td - R600 Processor definitions ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,25 +6,43 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// AMDIL processors supported.
-//
-//===----------------------------------------------------------------------===//
class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features>
: Processor<Name, itin, Features>;
-def : Proc<"", R600_EG_Itin, [FeatureR600ALUInst]>;
-def : Proc<"r600", R600_EG_Itin, [FeatureR600ALUInst]>;
-def : Proc<"rv710", R600_EG_Itin, []>;
-def : Proc<"rv730", R600_EG_Itin, []>;
-def : Proc<"rv770", R600_EG_Itin, [FeatureFP64]>;
-def : Proc<"cedar", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
-def : Proc<"redwood", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
-def : Proc<"juniper", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
-def : Proc<"cypress", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
-def : Proc<"barts", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
-def : Proc<"turks", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
-def : Proc<"caicos", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
-def : Proc<"cayman", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
-def : Proc<"SI", SI_Itin, [Feature64BitPtr]>;
-
+def : Proc<"", R600_VLIW5_Itin,
+ [FeatureR600ALUInst, FeatureVertexCache]>;
+def : Proc<"r600", R600_VLIW5_Itin,
+ [FeatureR600ALUInst, FeatureVertexCache]>;
+def : Proc<"rs880", R600_VLIW5_Itin,
+ [FeatureR600ALUInst]>;
+def : Proc<"rv670", R600_VLIW5_Itin,
+ [FeatureR600ALUInst, FeatureFP64, FeatureVertexCache]>;
+def : Proc<"rv710", R600_VLIW5_Itin,
+ [FeatureVertexCache]>;
+def : Proc<"rv730", R600_VLIW5_Itin,
+ [FeatureVertexCache]>;
+def : Proc<"rv770", R600_VLIW5_Itin,
+ [FeatureFP64, FeatureVertexCache]>;
+def : Proc<"cedar", R600_VLIW5_Itin,
+ [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+def : Proc<"redwood", R600_VLIW5_Itin,
+ [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+def : Proc<"sumo", R600_VLIW5_Itin,
+ [FeatureByteAddress, FeatureImages]>;
+def : Proc<"juniper", R600_VLIW5_Itin,
+ [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+def : Proc<"cypress", R600_VLIW5_Itin,
+ [FeatureByteAddress, FeatureImages, FeatureFP64, FeatureVertexCache]>;
+def : Proc<"barts", R600_VLIW5_Itin,
+ [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+def : Proc<"turks", R600_VLIW5_Itin,
+ [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+def : Proc<"caicos", R600_VLIW5_Itin,
+ [FeatureByteAddress, FeatureImages]>;
+def : Proc<"cayman", R600_VLIW4_Itin,
+ [FeatureByteAddress, FeatureImages, FeatureFP64]>;
+def : Proc<"SI", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
+def : Proc<"tahiti", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
+def : Proc<"pitcairn", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
+def : Proc<"verde", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
+def : Proc<"oland", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
+def : Proc<"hainan", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 3a6c7ea..ffe3414 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -30,35 +30,27 @@ namespace llvm {
class R600ControlFlowFinalizer : public MachineFunctionPass {
private:
+ typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;
+
+ enum ControlFlowInstruction {
+ CF_TC,
+ CF_VC,
+ CF_CALL_FS,
+ CF_WHILE_LOOP,
+ CF_END_LOOP,
+ CF_LOOP_BREAK,
+ CF_LOOP_CONTINUE,
+ CF_JUMP,
+ CF_ELSE,
+ CF_POP,
+ CF_END
+ };
+
static char ID;
const R600InstrInfo *TII;
+ const R600RegisterInfo &TRI;
unsigned MaxFetchInst;
-
- bool isFetch(const MachineInstr *MI) const {
- switch (MI->getOpcode()) {
- case AMDGPU::TEX_VTX_CONSTBUF:
- case AMDGPU::TEX_VTX_TEXBUF:
- case AMDGPU::TEX_LD:
- case AMDGPU::TEX_GET_TEXTURE_RESINFO:
- case AMDGPU::TEX_GET_GRADIENTS_H:
- case AMDGPU::TEX_GET_GRADIENTS_V:
- case AMDGPU::TEX_SET_GRADIENTS_H:
- case AMDGPU::TEX_SET_GRADIENTS_V:
- case AMDGPU::TEX_SAMPLE:
- case AMDGPU::TEX_SAMPLE_C:
- case AMDGPU::TEX_SAMPLE_L:
- case AMDGPU::TEX_SAMPLE_C_L:
- case AMDGPU::TEX_SAMPLE_LB:
- case AMDGPU::TEX_SAMPLE_C_LB:
- case AMDGPU::TEX_SAMPLE_G:
- case AMDGPU::TEX_SAMPLE_C_G:
- case AMDGPU::TXD:
- case AMDGPU::TXD_SHADOW:
- return true;
- default:
- return false;
- }
- }
+ const AMDGPUSubtarget &ST;
bool IsTrivialInst(MachineInstr *MI) const {
switch (MI->getOpcode()) {
@@ -70,26 +62,226 @@ private:
}
}
- MachineBasicBlock::iterator
- MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned CfAddress) const {
+ const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
+ unsigned Opcode = 0;
+ bool isEg = (ST.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX);
+ switch (CFI) {
+ case CF_TC:
+ Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
+ break;
+ case CF_VC:
+ Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
+ break;
+ case CF_CALL_FS:
+ Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
+ break;
+ case CF_WHILE_LOOP:
+ Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
+ break;
+ case CF_END_LOOP:
+ Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
+ break;
+ case CF_LOOP_BREAK:
+ Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
+ break;
+ case CF_LOOP_CONTINUE:
+ Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
+ break;
+ case CF_JUMP:
+ Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
+ break;
+ case CF_ELSE:
+ Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
+ break;
+ case CF_POP:
+ Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
+ break;
+ case CF_END:
+ if (ST.device()->getDeviceFlag() == OCL_DEVICE_CAYMAN) {
+ Opcode = AMDGPU::CF_END_CM;
+ break;
+ }
+ Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
+ break;
+ }
+ assert(Opcode && "No opcode selected");
+ return TII->get(Opcode);
+ }
+
+ bool isCompatibleWithClause(const MachineInstr *MI,
+ std::set<unsigned> &DstRegs, std::set<unsigned> &SrcRegs) const {
+ unsigned DstMI, SrcMI;
+ for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
+ E = MI->operands_end(); I != E; ++I) {
+ const MachineOperand &MO = *I;
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef())
+ DstMI = MO.getReg();
+ if (MO.isUse()) {
+ unsigned Reg = MO.getReg();
+ if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+ SrcMI = Reg;
+ else
+ SrcMI = TRI.getMatchingSuperReg(Reg,
+ TRI.getSubRegFromChannel(TRI.getHWRegChan(Reg)),
+ &AMDGPU::R600_Reg128RegClass);
+ }
+ }
+ if ((DstRegs.find(SrcMI) == DstRegs.end()) &&
+ (SrcRegs.find(DstMI) == SrcRegs.end())) {
+ SrcRegs.insert(SrcMI);
+ DstRegs.insert(DstMI);
+ return true;
+ } else
+ return false;
+ }
+
+ ClauseFile
+ MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
+ const {
MachineBasicBlock::iterator ClauseHead = I;
+ std::vector<MachineInstr *> ClauseContent;
unsigned AluInstCount = 0;
+ bool IsTex = TII->usesTextureCache(ClauseHead);
+ std::set<unsigned> DstRegs, SrcRegs;
for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
if (IsTrivialInst(I))
continue;
- if (!isFetch(I))
+ if (AluInstCount > MaxFetchInst)
+ break;
+ if ((IsTex && !TII->usesTextureCache(I)) ||
+ (!IsTex && !TII->usesVertexCache(I)))
+ break;
+ if (!isCompatibleWithClause(I, DstRegs, SrcRegs))
break;
AluInstCount ++;
- if (AluInstCount > MaxFetchInst)
+ ClauseContent.push_back(I);
+ }
+ MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
+ getHWInstrDesc(IsTex?CF_TC:CF_VC))
+ .addImm(0) // ADDR
+ .addImm(AluInstCount - 1); // COUNT
+ return ClauseFile(MIb, ClauseContent);
+ }
+
+ void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
+ unsigned LiteralRegs[] = {
+ AMDGPU::ALU_LITERAL_X,
+ AMDGPU::ALU_LITERAL_Y,
+ AMDGPU::ALU_LITERAL_Z,
+ AMDGPU::ALU_LITERAL_W
+ };
+ for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ if (MO.getReg() != AMDGPU::ALU_LITERAL_X)
+ continue;
+ unsigned ImmIdx = TII->getOperandIdx(MI->getOpcode(), R600Operands::IMM);
+ int64_t Imm = MI->getOperand(ImmIdx).getImm();
+ std::vector<int64_t>::iterator It =
+ std::find(Lits.begin(), Lits.end(), Imm);
+ if (It != Lits.end()) {
+ unsigned Index = It - Lits.begin();
+ MO.setReg(LiteralRegs[Index]);
+ } else {
+ assert(Lits.size() < 4 && "Too many literals in Instruction Group");
+ MO.setReg(LiteralRegs[Lits.size()]);
+ Lits.push_back(Imm);
+ }
+ }
+ }
+
+ MachineBasicBlock::iterator insertLiterals(
+ MachineBasicBlock::iterator InsertPos,
+ const std::vector<unsigned> &Literals) const {
+ MachineBasicBlock *MBB = InsertPos->getParent();
+ for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
+ unsigned LiteralPair0 = Literals[i];
+ unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0;
+ InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
+ TII->get(AMDGPU::LITERALS))
+ .addImm(LiteralPair0)
+ .addImm(LiteralPair1);
+ }
+ return InsertPos;
+ }
+
+ ClauseFile
+ MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
+ const {
+ MachineBasicBlock::iterator ClauseHead = I;
+ std::vector<MachineInstr *> ClauseContent;
+ I++;
+ for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
+ if (IsTrivialInst(I)) {
+ ++I;
+ continue;
+ }
+ if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
break;
+ std::vector<int64_t> Literals;
+ if (I->isBundle()) {
+ MachineInstr *DeleteMI = I;
+ MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
+ while (++BI != E && BI->isBundledWithPred()) {
+ BI->unbundleFromPred();
+ for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = BI->getOperand(i);
+ if (MO.isReg() && MO.isInternalRead())
+ MO.setIsInternalRead(false);
+ }
+ getLiteral(BI, Literals);
+ ClauseContent.push_back(BI);
+ }
+ I = BI;
+ DeleteMI->eraseFromParent();
+ } else {
+ getLiteral(I, Literals);
+ ClauseContent.push_back(I);
+ I++;
+ }
+ for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
+ unsigned literal0 = Literals[i];
+ unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0;
+ MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
+ TII->get(AMDGPU::LITERALS))
+ .addImm(literal0)
+ .addImm(literal2);
+ ClauseContent.push_back(MILit);
+ }
}
- BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
- TII->get(AMDGPU::CF_TC))
- .addImm(CfAddress) // ADDR
- .addImm(AluInstCount); // COUNT
- return I;
+ ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
+ return ClauseFile(ClauseHead, ClauseContent);
}
+
+ void
+ EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
+ unsigned &CfCount) {
+ CounterPropagateAddr(Clause.first, CfCount);
+ MachineBasicBlock *BB = Clause.first->getParent();
+ BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
+ .addImm(CfCount);
+ for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
+ BB->splice(InsertPos, BB, Clause.second[i]);
+ }
+ CfCount += 2 * Clause.second.size();
+ }
+
+ void
+ EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
+ unsigned &CfCount) {
+ CounterPropagateAddr(Clause.first, CfCount);
+ MachineBasicBlock *BB = Clause.first->getParent();
+ BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
+ .addImm(CfCount);
+ for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
+ BB->splice(InsertPos, BB, Clause.second[i]);
+ }
+ CfCount += Clause.second.size();
+ }
+
void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
}
@@ -102,9 +294,27 @@ private:
}
}
+ unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const {
+ switch (ST.device()->getGeneration()) {
+ case AMDGPUDeviceInfo::HD4XXX:
+ if (hasPush)
+ StackSubEntry += 2;
+ break;
+ case AMDGPUDeviceInfo::HD5XXX:
+ if (hasPush)
+ StackSubEntry ++;
+ case AMDGPUDeviceInfo::HD6XXX:
+ StackSubEntry += 2;
+ break;
+ }
+ return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4
+ }
+
public:
R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
- TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) {
+ TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())),
+ TRI(TII->getRegisterInfo()),
+ ST(tm.getSubtarget<AMDGPUSubtarget>()) {
const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX)
MaxFetchInst = 8;
@@ -115,6 +325,7 @@ public:
virtual bool runOnMachineFunction(MachineFunction &MF) {
unsigned MaxStack = 0;
unsigned CurrentStack = 0;
+ bool HasPush = false;
for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
++MB) {
MachineBasicBlock &MBB = *MB;
@@ -124,14 +335,16 @@ public:
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
if (MFI->ShaderType == 1) {
BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
- TII->get(AMDGPU::CF_CALL_FS));
+ getHWInstrDesc(CF_CALL_FS));
CfCount++;
+ MaxStack = 1;
}
+ std::vector<ClauseFile> FetchClauses, AluClauses;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E;) {
- if (isFetch(I)) {
+ if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
DEBUG(dbgs() << CfCount << ":"; I->dump(););
- I = MakeFetchClause(MBB, I, 0);
+ FetchClauses.push_back(MakeFetchClause(MBB, I));
CfCount++;
continue;
}
@@ -142,20 +355,25 @@ public:
case AMDGPU::CF_ALU_PUSH_BEFORE:
CurrentStack++;
MaxStack = std::max(MaxStack, CurrentStack);
+ HasPush = true;
case AMDGPU::CF_ALU:
+ I = MI;
+ AluClauses.push_back(MakeALUClause(MBB, I));
case AMDGPU::EG_ExportBuf:
case AMDGPU::EG_ExportSwz:
case AMDGPU::R600_ExportBuf:
case AMDGPU::R600_ExportSwz:
+ case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+ case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
DEBUG(dbgs() << CfCount << ":"; MI->dump(););
CfCount++;
break;
case AMDGPU::WHILELOOP: {
- CurrentStack++;
+ CurrentStack+=4;
MaxStack = std::max(MaxStack, CurrentStack);
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
- TII->get(AMDGPU::WHILE_LOOP))
- .addImm(2);
+ getHWInstrDesc(CF_WHILE_LOOP))
+ .addImm(1);
std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
std::set<MachineInstr *>());
Pair.second.insert(MIb);
@@ -165,12 +383,12 @@ public:
break;
}
case AMDGPU::ENDLOOP: {
- CurrentStack--;
+ CurrentStack-=4;
std::pair<unsigned, std::set<MachineInstr *> > Pair =
LoopStack.back();
LoopStack.pop_back();
CounterPropagateAddr(Pair.second, CfCount);
- BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::END_LOOP))
+ BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
.addImm(Pair.first + 1);
MI->eraseFromParent();
CfCount++;
@@ -178,7 +396,7 @@ public:
}
case AMDGPU::IF_PREDICATE_SET: {
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
- TII->get(AMDGPU::CF_JUMP))
+ getHWInstrDesc(CF_JUMP))
.addImm(0)
.addImm(0);
IfThenElseStack.push_back(MIb);
@@ -192,7 +410,7 @@ public:
IfThenElseStack.pop_back();
CounterPropagateAddr(JumpInst, CfCount);
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
- TII->get(AMDGPU::CF_ELSE))
+ getHWInstrDesc(CF_ELSE))
.addImm(0)
.addImm(1);
DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
@@ -207,9 +425,10 @@ public:
IfThenElseStack.pop_back();
CounterPropagateAddr(IfOrElseInst, CfCount + 1);
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
- TII->get(AMDGPU::POP))
+ getHWInstrDesc(CF_POP))
.addImm(CfCount + 1)
.addImm(1);
+ (void)MIb;
DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
MI->eraseFromParent();
CfCount++;
@@ -218,13 +437,13 @@ public:
case AMDGPU::PREDICATED_BREAK: {
CurrentStack--;
CfCount += 3;
- BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_JUMP))
+ BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP))
.addImm(CfCount)
.addImm(1);
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
- TII->get(AMDGPU::LOOP_BREAK))
+ getHWInstrDesc(CF_LOOP_BREAK))
.addImm(0);
- BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::POP))
+ BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_POP))
.addImm(CfCount)
.addImm(1);
LoopStack.back().second.insert(MIb);
@@ -233,20 +452,31 @@ public:
}
case AMDGPU::CONTINUE: {
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
- TII->get(AMDGPU::CF_CONTINUE))
+ getHWInstrDesc(CF_LOOP_CONTINUE))
.addImm(0);
LoopStack.back().second.insert(MIb);
MI->eraseFromParent();
CfCount++;
break;
}
+ case AMDGPU::RETURN: {
+ BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
+ CfCount++;
+ MI->eraseFromParent();
+ if (CfCount % 2) {
+ BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
+ CfCount++;
+ }
+ for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
+ EmitFetchClause(I, FetchClauses[i], CfCount);
+ for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
+ EmitALUClause(I, AluClauses[i], CfCount);
+ }
default:
break;
}
}
- BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
- TII->get(AMDGPU::STACK_SIZE))
- .addImm(MaxStack);
+ MFI->StackSize = getHWStackSize(MaxStack, HasPush);
}
return false;
@@ -265,4 +495,3 @@ char R600ControlFlowFinalizer::ID = 0;
llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
return new R600ControlFlowFinalizer(TM);
}
-
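
getLiteral above maps each immediate in an instruction group onto one of four literal slots (ALU_LITERAL_X/Y/Z/W), reusing a slot when a value repeats. The slot-assignment logic on its own (hypothetical helper under the same four-slot limit):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Returns the slot index (0..3) for Imm, appending it if not seen yet.
static unsigned assignLiteralSlot(std::vector<int64_t> &Lits, int64_t Imm) {
  std::vector<int64_t>::iterator It = std::find(Lits.begin(), Lits.end(), Imm);
  if (It != Lits.end())
    return It - Lits.begin();
  assert(Lits.size() < 4 && "Too many literals in Instruction Group");
  Lits.push_back(Imm);
  return Lits.size() - 1;
}
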
diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
index 16cfcf5..36bfb18 100644
--- a/lib/Target/R600/R600Defines.h
+++ b/lib/Target/R600/R600Defines.h
@@ -39,7 +39,9 @@ namespace R600_InstFlag {
//FlagOperand bits 7, 8
NATIVE_OPERANDS = (1 << 9),
OP1 = (1 << 10),
- OP2 = (1 << 11)
+ OP2 = (1 << 11),
+ VTX_INST = (1 << 12),
+ TEX_INST = (1 << 13)
};
}
@@ -52,6 +54,9 @@ namespace R600_InstFlag {
#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT)
#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK)
+#define IS_VTX(desc) ((desc).TSFlags & R600_InstFlag::VTX_INST)
+#define IS_TEX(desc) ((desc).TSFlags & R600_InstFlag::TEX_INST)
+
namespace R600Operands {
enum Ops {
DST,
@@ -78,6 +83,7 @@ namespace R600Operands {
LAST,
PRED_SEL,
IMM,
+ BANK_SWIZZLE,
COUNT
};
@@ -85,13 +91,39 @@ namespace R600Operands {
// W C S S S S S S S S S S S
// R O D L S R R R R S R R R R S R R R L P
// D U I M R A R C C C C R C C C C R C C C A R I
-// S E U T O E M C 0 0 0 0 C 1 1 1 1 C 2 2 2 S E M
-// T M P E D L P 0 N R A S 1 N R A S 2 N R S T D M
- {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12},
- {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19},
- {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17}
+// S E U T O E M C 0 0 0 0 C 1 1 1 1 C 2 2 2 S E M B
+// T M P E D L P 0 N R A S 1 N R A S 2 N R S T D M S
+ {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12,13},
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19,20},
+ {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17,18}
};
}
+//===----------------------------------------------------------------------===//
+// Config register definitions
+//===----------------------------------------------------------------------===//
+
+#define R_02880C_DB_SHADER_CONTROL 0x02880C
+#define S_02880C_KILL_ENABLE(x) (((x) & 0x1) << 6)
+
+// These fields are the same for all shader types and families.
+#define S_NUM_GPRS(x) (((x) & 0xFF) << 0)
+#define S_STACK_SIZE(x) (((x) & 0xFF) << 8)
+//===----------------------------------------------------------------------===//
+// R600, R700 Registers
+//===----------------------------------------------------------------------===//
+
+#define R_028850_SQ_PGM_RESOURCES_PS 0x028850
+#define R_028868_SQ_PGM_RESOURCES_VS 0x028868
+
+//===----------------------------------------------------------------------===//
+// Evergreen, Northern Islands Registers
+//===----------------------------------------------------------------------===//
+
+#define R_028844_SQ_PGM_RESOURCES_PS 0x028844
+#define R_028860_SQ_PGM_RESOURCES_VS 0x028860
+#define R_028878_SQ_PGM_RESOURCES_GS 0x028878
+#define R_0288D4_SQ_PGM_RESOURCES_LS 0x0288d4
+
#endif // R600DEFINES_H_
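
The new config-register macros pack per-shader resource counts into a single dword: S_NUM_GPRS occupies bits 0-7 and S_STACK_SIZE bits 8-15. A small usage sketch (the counts are made up for illustration):

#include <cstdint>

#define S_NUM_GPRS(x) (((x) & 0xFF) << 0)
#define S_STACK_SIZE(x) (((x) & 0xFF) << 8)

// 12 GPRs and a stack size of 2 pack into 0x20C.
static const uint32_t Resources = S_NUM_GPRS(12) | S_STACK_SIZE(2);
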
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 53e6e51..7252235 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -43,11 +43,25 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::AND, MVT::v4i32, Expand);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
+ setOperationAction(ISD::MUL, MVT::v2i32, Expand);
+ setOperationAction(ISD::MUL, MVT::v4i32, Expand);
+ setOperationAction(ISD::OR, MVT::v4i32, Expand);
+ setOperationAction(ISD::OR, MVT::v2i32, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
+ setOperationAction(ISD::SHL, MVT::v4i32, Expand);
+ setOperationAction(ISD::SHL, MVT::v2i32, Expand);
+ setOperationAction(ISD::SRL, MVT::v4i32, Expand);
+ setOperationAction(ISD::SRL, MVT::v2i32, Expand);
+ setOperationAction(ISD::SRA, MVT::v4i32, Expand);
+ setOperationAction(ISD::SRA, MVT::v2i32, Expand);
+ setOperationAction(ISD::SUB, MVT::v4i32, Expand);
+ setOperationAction(ISD::SUB, MVT::v2i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
setOperationAction(ISD::UREM, MVT::v4i32, Expand);
setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
+ setOperationAction(ISD::XOR, MVT::v4i32, Expand);
+ setOperationAction(ISD::XOR, MVT::v2i32, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
@@ -70,6 +84,9 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SELECT, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
+ setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);
+
// Legalize loads and stores to the private address space.
setOperationAction(ISD::LOAD, MVT::i32, Custom);
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
@@ -93,6 +110,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setTargetDAGCombine(ISD::SELECT_CC);
setBooleanContents(ZeroOrNegativeOneBooleanContent);
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
setSchedulingPreference(Sched::VLIW);
}
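
Marking these vector operations Expand asks the SelectionDAG legalizer to
unroll them into scalar ops, since the R600 ALU operates on 32-bit lanes. As
a rough intuition only (an illustration of legalizer behavior, not code from
this patch), a v4i32 multiply ends up as four lane-wise multiplies that the
VLIW scheduler can then spread across slots:

  #include <cstdio>

  int main() {
    // Conceptual scalarization of a v4i32 MUL: one 32-bit multiply per lane.
    int A[4] = {1, 2, 3, 4}, B[4] = {5, 6, 7, 8}, C[4];
    for (int i = 0; i < 4; i++)
      C[i] = A[i] * B[i];
    printf("%d %d %d %d\n", C[0], C[1], C[2], C[3]);
    return 0;
  }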
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index b232188..37150c4 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "R600InstrInfo.h"
+#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "R600Defines.h"
@@ -29,7 +30,8 @@ using namespace llvm;
R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm)
: AMDGPUInstrInfo(tm),
- RI(tm, *this)
+ RI(tm, *this),
+ ST(tm.getSubtarget<AMDGPUSubtarget>())
{ }
const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const {
@@ -139,6 +141,33 @@ bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
(TargetFlags & R600_InstFlag::OP3));
}
+bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
+ return (get(Opcode).TSFlags & R600_InstFlag::TRANS_ONLY);
+}
+
+bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const {
+ return isTransOnly(MI->getOpcode());
+}
+
+bool R600InstrInfo::usesVertexCache(unsigned Opcode) const {
+ return ST.hasVertexCache() && IS_VTX(get(Opcode));
+}
+
+bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const {
+  const R600MachineFunctionInfo *MFI =
+      MI->getParent()->getParent()->getInfo<R600MachineFunctionInfo>();
+  return MFI->ShaderType != ShaderType::COMPUTE &&
+         usesVertexCache(MI->getOpcode());
+}
+
+bool R600InstrInfo::usesTextureCache(unsigned Opcode) const {
+ return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode));
+}
+
+bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const {
+  const R600MachineFunctionInfo *MFI =
+      MI->getParent()->getParent()->getInfo<R600MachineFunctionInfo>();
+ return (MFI->ShaderType == ShaderType::COMPUTE && usesVertexCache(MI->getOpcode())) ||
+ usesTextureCache(MI->getOpcode());
+}
+
bool
R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
const {
@@ -183,10 +212,19 @@ R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) const {
int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
if (SrcIdx < 0)
break;
- if (MI->getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) {
+ unsigned Reg = MI->getOperand(SrcIdx).getReg();
+ if (Reg == AMDGPU::ALU_CONST) {
unsigned Const = MI->getOperand(
getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
Consts.push_back(Const);
+ continue;
+ }
+ if (AMDGPU::R600_KC0RegClass.contains(Reg) ||
+ AMDGPU::R600_KC1RegClass.contains(Reg)) {
+ unsigned Index = RI.getEncodingValue(Reg) & 0xff;
+ unsigned Chan = RI.getHWRegChan(Reg);
+ Consts.push_back((Index << 2) | Chan);
+ continue;
}
}
}
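
The (Index << 2) | Chan packing above gives every constant word a unique id:
each 128-bit kcache constant holds four 32-bit channels, so the index and
channel fold into a single integer for the read-limit bookkeeping. A small
sketch with a made-up helper name (packKCacheConst is not in this patch):

  #include <cstdio>

  // Mirrors the (Index << 2) | Chan packing above: four channels per
  // 128-bit kcache constant fold into one id.
  unsigned packKCacheConst(unsigned Index, unsigned Chan) {
    return (Index << 2) | (Chan & 0x3);
  }

  int main() {
    printf("%u\n", packKCacheConst(5, 2)); // constant 5, channel Z -> 22
    return 0;
  }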
@@ -684,7 +722,8 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB
//scheduling to the backend, we can change the default to 0.
MIB.addImm(1) // $last
.addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel
- .addImm(0); // $literal
+ .addImm(0) // $literal
+ .addImm(0); // $bank_swizzle
return MIB;
}
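
A standalone sketch of the cache-routing decision encoded by usesVertexCache
and usesTextureCache above: fetches go through the vertex cache only when the
subtarget has one and the shader is not a compute kernel; otherwise they fall
back to the texture cache. The bools and helper names here are stand-ins for
ST.hasVertexCache(), the IS_VTX/IS_TEX flag tests, and the shader-type check:

  #include <cstdio>

  bool routesToVertexCache(bool HasVertexCache, bool IsVTX, bool IsCompute) {
    return HasVertexCache && IsVTX && !IsCompute;
  }

  bool routesToTextureCache(bool HasVertexCache, bool IsVTX, bool IsTEX,
                            bool IsCompute) {
    return (!HasVertexCache && IsVTX) || IsTEX ||
           (HasVertexCache && IsVTX && IsCompute);
  }

  int main() {
    // A VTX fetch in a compute kernel still routes through the texture
    // cache even when the chip has a vertex cache.
    printf("%d\n", routesToTextureCache(true, true, false, true)); // 1
    return 0;
  }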
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index dbae900..babe4b8 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -33,6 +33,7 @@ namespace llvm {
class R600InstrInfo : public AMDGPUInstrInfo {
private:
const R600RegisterInfo RI;
+ const AMDGPUSubtarget &ST;
int getBranchInstr(const MachineOperand &op) const;
@@ -53,6 +54,14 @@ namespace llvm {
/// \returns true if this \p Opcode represents an ALU instruction.
bool isALUInstr(unsigned Opcode) const;
+ bool isTransOnly(unsigned Opcode) const;
+ bool isTransOnly(const MachineInstr *MI) const;
+
+ bool usesVertexCache(unsigned Opcode) const;
+ bool usesVertexCache(const MachineInstr *MI) const;
+ bool usesTextureCache(unsigned Opcode) const;
+ bool usesTextureCache(const MachineInstr *MI) const;
+
bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
bool canBundle(const std::vector<MachineInstr *> &) const;
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 663b41a..8f47523 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -13,11 +13,12 @@
include "R600Intrinsics.td"
-class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern,
+class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
InstrItinClass itin>
: AMDGPUInst <outs, ins, asm, pattern> {
field bits<64> Inst;
+ bit TransOnly = 0;
bit Trig = 0;
bit Op3 = 0;
bit isVector = 0;
@@ -25,9 +26,9 @@ class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern,
bit Op1 = 0;
bit Op2 = 0;
bit HasNativeOperands = 0;
+ bit VTXInst = 0;
+ bit TEXInst = 0;
- bits<11> op_code = inst;
- //let Inst = inst;
let Namespace = "AMDGPU";
let OutOperandList = outs;
let InOperandList = ins;
@@ -35,6 +36,7 @@ class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern,
let Pattern = pattern;
let Itinerary = itin;
+ let TSFlags{0} = TransOnly;
let TSFlags{4} = Trig;
let TSFlags{5} = Op3;
@@ -45,11 +47,12 @@ class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern,
let TSFlags{9} = HasNativeOperands;
let TSFlags{10} = Op1;
let TSFlags{11} = Op2;
+ let TSFlags{12} = VTXInst;
+ let TSFlags{13} = TEXInst;
}
class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
- AMDGPUInst <outs, ins, asm, pattern> {
- field bits<64> Inst;
+ InstR600 <outs, ins, asm, pattern, NullALU> {
let Namespace = "AMDGPU";
}
@@ -74,6 +77,9 @@ class InstFlag<string PM = "printOperand", int Default = 0>
def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> {
let PrintMethod = "printSel";
}
+def BANK_SWIZZLE : OperandWithDefaultOps <i32, (ops (i32 0))> {
+ let PrintMethod = "printBankSwizzle";
+}
def LITERAL : InstFlag<"printLiteral">;
@@ -137,7 +143,7 @@ class R600ALU_Word1 {
field bits<32> Word1;
bits<11> dst;
- bits<3> bank_swizzle = 0;
+ bits<3> bank_swizzle;
bits<1> dst_rel;
bits<1> clamp;
@@ -346,15 +352,15 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
// and R600InstrInfo::getOperandIdx().
class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
InstrItinClass itin = AnyALU> :
- InstR600 <0,
- (outs R600_Reg32:$dst),
+ InstR600 <(outs R600_Reg32:$dst),
(ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
- LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
+ LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
+ BANK_SWIZZLE:$bank_swizzle),
!strconcat(" ", opName,
- "$clamp $dst$write$dst_rel$omod, "
+ "$last$clamp $dst$write$dst_rel$omod, "
"$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
- "$literal $pred_sel$last"),
+ "$pred_sel $bank_swizzle"),
pattern,
itin>,
R600ALU_Word0,
@@ -385,18 +391,18 @@ class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx().
class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
InstrItinClass itin = AnyALU> :
- InstR600 <inst,
- (outs R600_Reg32:$dst),
+ InstR600 <(outs R600_Reg32:$dst),
(ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write,
OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel,
- LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
+ LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
+ BANK_SWIZZLE:$bank_swizzle),
!strconcat(" ", opName,
- "$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
+ "$last$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
"$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
"$src1_neg$src1_abs$src1$src1_abs$src1_rel, "
- "$literal $pred_sel$last"),
+ "$pred_sel $bank_swizzle"),
pattern,
itin>,
R600ALU_Word0,
@@ -423,18 +429,19 @@ class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
// R600InstrInfo::getOperandIdx().
class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
InstrItinClass itin = AnyALU> :
- InstR600 <0,
- (outs R600_Reg32:$dst),
+ InstR600 <(outs R600_Reg32:$dst),
(ins REL:$dst_rel, CLAMP:$clamp,
R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel,
R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel,
R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel,
- LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
- !strconcat(" ", opName, "$clamp $dst$dst_rel, "
+ LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
+ BANK_SWIZZLE:$bank_swizzle),
+ !strconcat(" ", opName, "$last$clamp $dst$dst_rel, "
"$src0_neg$src0$src0_rel, "
"$src1_neg$src1$src1_rel, "
"$src2_neg$src2$src2_rel, "
- "$literal $pred_sel$last"),
+ "$pred_sel"
+ "$bank_swizzle"),
pattern,
itin>,
R600ALU_Word0,
@@ -450,8 +457,7 @@ class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
InstrItinClass itin = VecALU> :
- InstR600 <inst,
- (outs R600_Reg32:$dst),
+ InstR600 <(outs R600_Reg32:$dst),
ins,
asm,
pattern,
@@ -459,8 +465,7 @@ class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
class R600_TEX <bits<11> inst, string opName, list<dag> pattern,
InstrItinClass itin = AnyALU> :
- InstR600 <inst,
- (outs R600_Reg128:$DST_GPR),
+ InstR600 <(outs R600_Reg128:$DST_GPR),
(ins R600_Reg128:$SRC_GPR, i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, i32imm:$textureTarget),
!strconcat(opName, "$DST_GPR, $SRC_GPR, $RESOURCE_ID, $SAMPLER_ID, $textureTarget"),
pattern,
@@ -481,11 +486,14 @@ class R600_TEX <bits<11> inst, string opName, list<dag> pattern,
let FETCH_WHOLE_QUAD = 0;
let ALT_CONST = 0;
let SAMPLER_INDEX_MODE = 0;
+ let RESOURCE_INDEX_MODE = 0;
let COORD_TYPE_X = 0;
let COORD_TYPE_Y = 0;
let COORD_TYPE_Z = 0;
let COORD_TYPE_W = 0;
+
+ let TEXInst = 1;
}
} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0
@@ -738,7 +746,9 @@ multiclass SteamOutputExportPattern<Instruction ExportInst,
4095, imm:$mask, buf3inst, 0)>;
}
-let usesCustomInserter = 1 in {
+// Export instructions should not be duplicated by the TailDuplication pass
+// (which assumes that duplicable instructions are affected by the exec mask).
+let usesCustomInserter = 1, isNotDuplicable = 1 in {
class ExportSwzInst : InstR600ISA<(
outs),
@@ -805,12 +815,15 @@ class CF_ALU_WORD1 {
let Word1{31} = BARRIER;
}
+def KCACHE : InstFlag<"printKCache">;
+
class ALU_CLAUSE<bits<4> inst, string OpName> : AMDGPUInst <(outs),
-(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1, i32imm:$KCACHE_MODE0, i32imm:$KCACHE_MODE1,
-i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, i32imm:$COUNT),
+(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1,
+KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1,
+i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1,
+i32imm:$COUNT),
!strconcat(OpName, " $COUNT, @$ADDR, "
-"KC0[CB$KCACHE_BANK0:$KCACHE_ADDR0-$KCACHE_ADDR0+32]"
-", KC1[CB$KCACHE_BANK1:$KCACHE_ADDR1-$KCACHE_ADDR1+32]"),
+"KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"),
[] >, CF_ALU_WORD0, CF_ALU_WORD1 {
field bits<64> Inst;
@@ -823,109 +836,139 @@ i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, i32imm:$COUNT),
let Inst{63-32} = Word1;
}
-class CF_WORD0 {
+class CF_WORD0_R600 {
field bits<32> Word0;
- bits<24> ADDR;
- bits<3> JUMPTABLE_SEL;
+ bits<32> ADDR;
- let Word0{23-0} = ADDR;
- let Word0{26-24} = JUMPTABLE_SEL;
+ let Word0 = ADDR;
}
-class CF_WORD1 {
+class CF_WORD1_R600 {
field bits<32> Word1;
bits<3> POP_COUNT;
bits<5> CF_CONST;
bits<2> COND;
- bits<6> COUNT;
+ bits<3> COUNT;
+ bits<6> CALL_COUNT;
+ bits<1> COUNT_3;
+ bits<1> END_OF_PROGRAM;
bits<1> VALID_PIXEL_MODE;
- bits<8> CF_INST;
+ bits<7> CF_INST;
+ bits<1> WHOLE_QUAD_MODE;
bits<1> BARRIER;
let Word1{2-0} = POP_COUNT;
let Word1{7-3} = CF_CONST;
let Word1{9-8} = COND;
- let Word1{15-10} = COUNT;
- let Word1{20} = VALID_PIXEL_MODE;
- let Word1{29-22} = CF_INST;
+ let Word1{12-10} = COUNT;
+ let Word1{18-13} = CALL_COUNT;
+ let Word1{19} = COUNT_3;
+ let Word1{21} = END_OF_PROGRAM;
+ let Word1{22} = VALID_PIXEL_MODE;
+ let Word1{29-23} = CF_INST;
+ let Word1{30} = WHOLE_QUAD_MODE;
let Word1{31} = BARRIER;
}
-class CF_CLAUSE <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
-ins, AsmPrint, [] >, CF_WORD0, CF_WORD1 {
+class CF_CLAUSE_R600 <bits<7> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
+ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 {
field bits<64> Inst;
let CF_INST = inst;
let BARRIER = 1;
- let JUMPTABLE_SEL = 0;
let CF_CONST = 0;
let VALID_PIXEL_MODE = 0;
let COND = 0;
+ let CALL_COUNT = 0;
+ let COUNT_3 = 0;
+ let END_OF_PROGRAM = 0;
+ let WHOLE_QUAD_MODE = 0;
let Inst{31-0} = Word0;
let Inst{63-32} = Word1;
}
-def CF_TC : CF_CLAUSE<1, (ins i32imm:$ADDR, i32imm:$COUNT),
-"TEX $COUNT @$ADDR"> {
- let POP_COUNT = 0;
-}
-
-def CF_VC : CF_CLAUSE<2, (ins i32imm:$ADDR, i32imm:$COUNT),
-"VTX $COUNT @$ADDR"> {
- let POP_COUNT = 0;
-}
+class CF_WORD0_EG {
+ field bits<32> Word0;
-def WHILE_LOOP : CF_CLAUSE<6, (ins i32imm:$ADDR), "LOOP_START_DX10 @$ADDR"> {
- let POP_COUNT = 0;
- let COUNT = 0;
-}
+ bits<24> ADDR;
+ bits<3> JUMPTABLE_SEL;
-def END_LOOP : CF_CLAUSE<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> {
- let POP_COUNT = 0;
- let COUNT = 0;
+ let Word0{23-0} = ADDR;
+ let Word0{26-24} = JUMPTABLE_SEL;
}
-def LOOP_BREAK : CF_CLAUSE<9, (ins i32imm:$ADDR), "LOOP_BREAK @$ADDR"> {
- let POP_COUNT = 0;
- let COUNT = 0;
-}
+class CF_WORD1_EG {
+ field bits<32> Word1;
-def CF_CONTINUE : CF_CLAUSE<8, (ins i32imm:$ADDR), "CONTINUE @$ADDR"> {
- let POP_COUNT = 0;
- let COUNT = 0;
-}
+ bits<3> POP_COUNT;
+ bits<5> CF_CONST;
+ bits<2> COND;
+ bits<6> COUNT;
+ bits<1> VALID_PIXEL_MODE;
+ bits<1> END_OF_PROGRAM;
+ bits<8> CF_INST;
+ bits<1> BARRIER;
-def CF_JUMP : CF_CLAUSE<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "JUMP @$ADDR POP:$POP_COUNT"> {
- let COUNT = 0;
+ let Word1{2-0} = POP_COUNT;
+ let Word1{7-3} = CF_CONST;
+ let Word1{9-8} = COND;
+ let Word1{15-10} = COUNT;
+ let Word1{20} = VALID_PIXEL_MODE;
+ let Word1{21} = END_OF_PROGRAM;
+ let Word1{29-22} = CF_INST;
+ let Word1{31} = BARRIER;
}
-def CF_ELSE : CF_CLAUSE<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "ELSE @$ADDR POP:$POP_COUNT"> {
- let COUNT = 0;
-}
+class CF_CLAUSE_EG <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
+ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG {
+ field bits<64> Inst;
-def CF_CALL_FS : CF_CLAUSE<19, (ins), "CALL_FS"> {
- let ADDR = 0;
- let COUNT = 0;
- let POP_COUNT = 0;
-}
+ let CF_INST = inst;
+ let BARRIER = 1;
+ let JUMPTABLE_SEL = 0;
+ let CF_CONST = 0;
+ let VALID_PIXEL_MODE = 0;
+ let COND = 0;
+ let END_OF_PROGRAM = 0;
-def POP : CF_CLAUSE<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "POP @$ADDR POP:$POP_COUNT"> {
- let COUNT = 0;
+ let Inst{31-0} = Word0;
+ let Inst{63-32} = Word1;
}
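
For orientation, a hedged sketch of how the CF_WORD1_EG bit layout declared
above packs into a 32-bit word; packCFWord1EG is a made-up helper, and the
inputs are assumed to already fit their declared field widths:

  #include <cstdint>
  #include <cstdio>

  // Packs the Evergreen CF_WORD1 fields at the bit positions declared above.
  uint32_t packCFWord1EG(uint32_t PopCount, uint32_t CFConst, uint32_t Cond,
                         uint32_t Count, uint32_t ValidPixelMode,
                         uint32_t EndOfProgram, uint32_t CFInst,
                         uint32_t Barrier) {
    return (PopCount << 0) | (CFConst << 3) | (Cond << 8) | (Count << 10) |
           (ValidPixelMode << 20) | (EndOfProgram << 21) | (CFInst << 22) |
           (Barrier << 31);
  }

  int main() {
    // CF_END_EG: CF_INST = 0, END_OF_PROGRAM = 1, BARRIER = 1.
    printf("0x%08X\n", (unsigned)packCFWord1EG(0, 0, 0, 0, 0, 1, 0, 1));
    return 0;
  }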
def CF_ALU : ALU_CLAUSE<8, "ALU">;
def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">;
-def STACK_SIZE : AMDGPUInst <(outs),
-(ins i32imm:$num), "nstack $num", [] > {
+def FETCH_CLAUSE : AMDGPUInst <(outs),
+(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > {
field bits<8> Inst;
bits<8> num;
let Inst = num;
}
+def ALU_CLAUSE : AMDGPUInst <(outs),
+(ins i32imm:$addr), "ALU clause starting at $addr:", [] > {
+ field bits<8> Inst;
+ bits<8> num;
+ let Inst = num;
+}
+
+def LITERALS : AMDGPUInst <(outs),
+(ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > {
+ field bits<64> Inst;
+ bits<32> literal1;
+ bits<32> literal2;
+
+ let Inst{31-0} = literal1;
+ let Inst{63-32} = literal2;
+}
+
+def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > {
+ field bits<64> Inst;
+}
+
let Predicates = [isR600toCayman] in {
//===----------------------------------------------------------------------===//
@@ -944,58 +987,42 @@ def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>;
// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
def SETE : R600_2OP <
0x08, "SETE",
- [(set R600_Reg32:$dst,
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
- COND_EQ))]
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_EQ))]
>;
def SGT : R600_2OP <
0x09, "SETGT",
- [(set R600_Reg32:$dst,
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
- COND_GT))]
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_GT))]
>;
def SGE : R600_2OP <
0xA, "SETGE",
- [(set R600_Reg32:$dst,
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
- COND_GE))]
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_GE))]
>;
def SNE : R600_2OP <
0xB, "SETNE",
- [(set R600_Reg32:$dst,
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
- COND_NE))]
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_NE))]
>;
def SETE_DX10 : R600_2OP <
0xC, "SETE_DX10",
- [(set R600_Reg32:$dst,
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0),
- COND_EQ))]
+ [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_EQ))]
>;
def SETGT_DX10 : R600_2OP <
0xD, "SETGT_DX10",
- [(set R600_Reg32:$dst,
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0),
- COND_GT))]
+ [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_GT))]
>;
def SETGE_DX10 : R600_2OP <
0xE, "SETGE_DX10",
- [(set R600_Reg32:$dst,
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0),
- COND_GE))]
+ [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_GE))]
>;
def SETNE_DX10 : R600_2OP <
0xF, "SETNE_DX10",
- [(set R600_Reg32:$dst,
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0),
- COND_NE))]
+ [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_NE))]
>;
def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
@@ -1053,38 +1080,32 @@ def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", AMDGPUumin>;
def SETE_INT : R600_2OP <
0x3A, "SETE_INT",
- [(set (i32 R600_Reg32:$dst),
- (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETEQ))]
+ [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETEQ))]
>;
def SETGT_INT : R600_2OP <
0x3B, "SETGT_INT",
- [(set (i32 R600_Reg32:$dst),
- (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGT))]
+ [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGT))]
>;
def SETGE_INT : R600_2OP <
0x3C, "SETGE_INT",
- [(set (i32 R600_Reg32:$dst),
- (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGE))]
+ [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETGE))]
>;
def SETNE_INT : R600_2OP <
0x3D, "SETNE_INT",
- [(set (i32 R600_Reg32:$dst),
- (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETNE))]
+ [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETNE))]
>;
def SETGT_UINT : R600_2OP <
0x3E, "SETGT_UINT",
- [(set (i32 R600_Reg32:$dst),
- (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGT))]
+ [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGT))]
>;
def SETGE_UINT : R600_2OP <
0x3F, "SETGE_UINT",
- [(set (i32 R600_Reg32:$dst),
- (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGE))]
+ [(set i32:$dst, (selectcc i32:$src0, i32:$src1, -1, 0, SETUGE))]
>;
def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>;
@@ -1094,26 +1115,17 @@ def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>;
def CNDE_INT : R600_3OP <
0x1C, "CNDE_INT",
- [(set (i32 R600_Reg32:$dst),
- (selectcc (i32 R600_Reg32:$src0), 0,
- (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
- COND_EQ))]
+ [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_EQ))]
>;
def CNDGE_INT : R600_3OP <
0x1E, "CNDGE_INT",
- [(set (i32 R600_Reg32:$dst),
- (selectcc (i32 R600_Reg32:$src0), 0,
- (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
- COND_GE))]
+ [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_GE))]
>;
def CNDGT_INT : R600_3OP <
0x1D, "CNDGT_INT",
- [(set (i32 R600_Reg32:$dst),
- (selectcc (i32 R600_Reg32:$src0), 0,
- (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
- COND_GT))]
+ [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_GT))]
>;
//===----------------------------------------------------------------------===//
@@ -1122,7 +1134,7 @@ def CNDGT_INT : R600_3OP <
def TEX_LD : R600_TEX <
0x03, "TEX_LD",
- [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txf R600_Reg128:$SRC_GPR,
+ [(set v4f32:$DST_GPR, (int_AMDGPU_txf v4f32:$SRC_GPR,
imm:$OFFSET_X, imm:$OFFSET_Y, imm:$OFFSET_Z, imm:$RESOURCE_ID,
imm:$SAMPLER_ID, imm:$textureTarget))]
> {
@@ -1135,19 +1147,19 @@ let InOperandList = (ins R600_Reg128:$SRC_GPR, i32imm:$OFFSET_X,
def TEX_GET_TEXTURE_RESINFO : R600_TEX <
0x04, "TEX_GET_TEXTURE_RESINFO",
- [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txq R600_Reg128:$SRC_GPR,
+ [(set v4f32:$DST_GPR, (int_AMDGPU_txq v4f32:$SRC_GPR,
imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
>;
def TEX_GET_GRADIENTS_H : R600_TEX <
0x07, "TEX_GET_GRADIENTS_H",
- [(set R600_Reg128:$DST_GPR, (int_AMDGPU_ddx R600_Reg128:$SRC_GPR,
+ [(set v4f32:$DST_GPR, (int_AMDGPU_ddx v4f32:$SRC_GPR,
imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
>;
def TEX_GET_GRADIENTS_V : R600_TEX <
0x08, "TEX_GET_GRADIENTS_V",
- [(set R600_Reg128:$DST_GPR, (int_AMDGPU_ddy R600_Reg128:$SRC_GPR,
+ [(set v4f32:$DST_GPR, (int_AMDGPU_ddy v4f32:$SRC_GPR,
imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
>;
@@ -1163,37 +1175,37 @@ def TEX_SET_GRADIENTS_V : R600_TEX <
def TEX_SAMPLE : R600_TEX <
0x10, "TEX_SAMPLE",
- [(set R600_Reg128:$DST_GPR, (int_AMDGPU_tex R600_Reg128:$SRC_GPR,
+ [(set v4f32:$DST_GPR, (int_AMDGPU_tex v4f32:$SRC_GPR,
imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
>;
def TEX_SAMPLE_C : R600_TEX <
0x18, "TEX_SAMPLE_C",
- [(set R600_Reg128:$DST_GPR, (int_AMDGPU_tex R600_Reg128:$SRC_GPR,
+ [(set v4f32:$DST_GPR, (int_AMDGPU_tex v4f32:$SRC_GPR,
imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))]
>;
def TEX_SAMPLE_L : R600_TEX <
0x11, "TEX_SAMPLE_L",
- [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txl R600_Reg128:$SRC_GPR,
+ [(set v4f32:$DST_GPR, (int_AMDGPU_txl v4f32:$SRC_GPR,
imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
>;
def TEX_SAMPLE_C_L : R600_TEX <
0x19, "TEX_SAMPLE_C_L",
- [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txl R600_Reg128:$SRC_GPR,
+ [(set v4f32:$DST_GPR, (int_AMDGPU_txl v4f32:$SRC_GPR,
imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))]
>;
def TEX_SAMPLE_LB : R600_TEX <
0x12, "TEX_SAMPLE_LB",
- [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txb R600_Reg128:$SRC_GPR,
+ [(set v4f32:$DST_GPR, (int_AMDGPU_txb v4f32:$SRC_GPR,
imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
>;
def TEX_SAMPLE_C_LB : R600_TEX <
0x1A, "TEX_SAMPLE_C_LB",
- [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txb R600_Reg128:$SRC_GPR,
+ [(set v4f32:$DST_GPR, (int_AMDGPU_txb v4f32:$SRC_GPR,
imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))]
>;
@@ -1223,32 +1235,22 @@ class MULADD_Common <bits<5> inst> : R600_3OP <
class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
inst, "MULADD_IEEE",
- [(set (f32 R600_Reg32:$dst),
- (fadd (fmul R600_Reg32:$src0, R600_Reg32:$src1), R600_Reg32:$src2))]
+ [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))]
>;
class CNDE_Common <bits<5> inst> : R600_3OP <
inst, "CNDE",
- [(set R600_Reg32:$dst,
- (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
- (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
- COND_EQ))]
+ [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_EQ))]
>;
class CNDGT_Common <bits<5> inst> : R600_3OP <
inst, "CNDGT",
- [(set R600_Reg32:$dst,
- (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
- (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
- COND_GT))]
+ [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GT))]
>;
class CNDGE_Common <bits<5> inst> : R600_3OP <
inst, "CNDGE",
- [(set R600_Reg32:$dst,
- (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
- (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
- COND_GE))]
+ [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GE))]
>;
multiclass DOT4_Common <bits<11> inst> {
@@ -1256,7 +1258,7 @@ multiclass DOT4_Common <bits<11> inst> {
def _pseudo : R600_REDUCTION <inst,
(ins R600_Reg128:$src0, R600_Reg128:$src1),
"DOT4 $dst $src0, $src1",
- [(set R600_Reg32:$dst, (int_AMDGPU_dp4 R600_Reg128:$src0, R600_Reg128:$src1))]
+ [(set f32:$dst, (int_AMDGPU_dp4 v4f32:$src0, v4f32:$src1))]
>;
def _real : R600_2OP <inst, "DOT4", []>;
@@ -1266,11 +1268,10 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
multiclass CUBE_Common <bits<11> inst> {
def _pseudo : InstR600 <
- inst,
(outs R600_Reg128:$dst),
(ins R600_Reg128:$src),
"CUBE $dst $src",
- [(set R600_Reg128:$dst, (int_AMDGPU_cube R600_Reg128:$src))],
+ [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src))],
VecALU
> {
let isPseudo = 1;
@@ -1282,23 +1283,38 @@ multiclass CUBE_Common <bits<11> inst> {
class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
inst, "EXP_IEEE", fexp2
->;
+> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper <
inst, "FLT_TO_INT", fp_to_sint
->;
+> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
inst, "INT_TO_FLT", sint_to_fp
->;
+> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper <
inst, "FLT_TO_UINT", fp_to_uint
->;
+> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
inst, "UINT_TO_FLT", uint_to_fp
->;
+> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP <
inst, "LOG_CLAMPED", []
@@ -1306,50 +1322,84 @@ class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP <
class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
inst, "LOG_IEEE", flog2
->;
+> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>;
class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>;
class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>;
class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper <
inst, "MULHI_INT", mulhs
->;
+> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper <
inst, "MULHI", mulhu
->;
+> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper <
inst, "MULLO_INT", mul
->;
-class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []>;
+> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
+class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP <
inst, "RECIP_CLAMPED", []
->;
+> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
class RECIP_IEEE_Common <bits<11> inst> : R600_1OP <
- inst, "RECIP_IEEE", [(set R600_Reg32:$dst, (fdiv FP_ONE, R600_Reg32:$src0))]
->;
+ inst, "RECIP_IEEE", [(set f32:$dst, (fdiv FP_ONE, f32:$src0))]
+> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
inst, "RECIP_UINT", AMDGPUurecip
->;
+> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq
->;
+> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP <
inst, "RECIPSQRT_IEEE", []
->;
+> {
+ let TransOnly = 1;
+ let Itinerary = TransALU;
+}
class SIN_Common <bits<11> inst> : R600_1OP <
inst, "SIN", []>{
let Trig = 1;
+ let TransOnly = 1;
+ let Itinerary = TransALU;
}
class COS_Common <bits<11> inst> : R600_1OP <
inst, "COS", []> {
let Trig = 1;
+ let TransOnly = 1;
+ let Itinerary = TransALU;
}
//===----------------------------------------------------------------------===//
@@ -1358,19 +1408,20 @@ class COS_Common <bits<11> inst> : R600_1OP <
multiclass DIV_Common <InstR600 recip_ieee> {
def : Pat<
- (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1),
- (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
+ (int_AMDGPU_div f32:$src0, f32:$src1),
+ (MUL_IEEE $src0, (recip_ieee $src1))
>;
def : Pat<
- (fdiv R600_Reg32:$src0, R600_Reg32:$src1),
- (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
+ (fdiv f32:$src0, f32:$src1),
+ (MUL_IEEE $src0, (recip_ieee $src1))
>;
}
-class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> : Pat <
- (int_TGSI_lit_z R600_Reg32:$src_x, R600_Reg32:$src_y, R600_Reg32:$src_w),
- (exp_ieee (mul_lit (log_clamped (MAX R600_Reg32:$src_y, (f32 ZERO))), R600_Reg32:$src_w, R600_Reg32:$src_x))
+class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee>
+ : Pat <
+ (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w),
+ (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x))
>;
//===----------------------------------------------------------------------===//
@@ -1410,14 +1461,13 @@ let Predicates = [isR600] in {
def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>;
defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
- def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL, R600_Reg32>;
+ def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
- def : Pat<(fsqrt R600_Reg32:$src),
- (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_r600 R600_Reg32:$src))>;
+ def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
def R600_ExportSwz : ExportSwzInst {
- let Word1{20-17} = 1; // BURST_COUNT
+ let Word1{20-17} = 0; // BURST_COUNT
let Word1{21} = eop;
let Word1{22} = 1; // VALID_PIXEL_MODE
let Word1{30-23} = inst;
@@ -1426,25 +1476,77 @@ let Predicates = [isR600] in {
defm : ExportPattern<R600_ExportSwz, 39>;
def R600_ExportBuf : ExportBufInst {
- let Word1{20-17} = 1; // BURST_COUNT
+ let Word1{20-17} = 0; // BURST_COUNT
let Word1{21} = eop;
let Word1{22} = 1; // VALID_PIXEL_MODE
let Word1{30-23} = inst;
let Word1{31} = 1; // BARRIER
}
defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 0x22, 0x23>;
+
+ def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$COUNT),
+ "TEX $COUNT @$ADDR"> {
+ let POP_COUNT = 0;
+ }
+ def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$COUNT),
+ "VTX $COUNT @$ADDR"> {
+ let POP_COUNT = 0;
+ }
+ def WHILE_LOOP_R600 : CF_CLAUSE_R600<6, (ins i32imm:$ADDR),
+ "LOOP_START_DX10 @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+ }
+ def END_LOOP_R600 : CF_CLAUSE_R600<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+ }
+ def LOOP_BREAK_R600 : CF_CLAUSE_R600<9, (ins i32imm:$ADDR),
+ "LOOP_BREAK @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+ }
+ def CF_CONTINUE_R600 : CF_CLAUSE_R600<8, (ins i32imm:$ADDR),
+ "CONTINUE @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+ }
+ def CF_JUMP_R600 : CF_CLAUSE_R600<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "JUMP @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+ }
+ def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "ELSE @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+ }
+ def CF_CALL_FS_R600 : CF_CLAUSE_R600<19, (ins), "CALL_FS"> {
+ let ADDR = 0;
+ let COUNT = 0;
+ let POP_COUNT = 0;
+ }
+ def POP_R600 : CF_CLAUSE_R600<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "POP @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+ }
+ def CF_END_R600 : CF_CLAUSE_R600<0, (ins), "CF_END"> {
+ let COUNT = 0;
+ let POP_COUNT = 0;
+ let ADDR = 0;
+ let END_OF_PROGRAM = 1;
+ }
+
}
// Helper pattern for normalizing inputs to trigonometric instructions for
// R700+ cards.
class COS_PAT <InstR600 trig> : Pat<
- (fcos R600_Reg32:$src),
- (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
+ (fcos f32:$src),
+ (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), $src))
>;
class SIN_PAT <InstR600 trig> : Pat<
- (fsin R600_Reg32:$src),
- (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
+ (fsin f32:$src),
+ (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), $src))
>;
//===----------------------------------------------------------------------===//
@@ -1482,11 +1584,10 @@ def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
def SIN_eg : SIN_Common<0x8D>;
def COS_eg : COS_Common<0x8E>;
-def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL, R600_Reg32>;
+def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
def : SIN_PAT <SIN_eg>;
def : COS_PAT <COS_eg>;
-def : Pat<(fsqrt R600_Reg32:$src),
- (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_eg R600_Reg32:$src))>;
+def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
} // End Predicates = [isEG]
//===----------------------------------------------------------------------===//
@@ -1510,15 +1611,17 @@ let Predicates = [isEGorCayman] in {
// (16,8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16
// (24,8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24
def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT",
- [(set R600_Reg32:$dst, (int_AMDIL_bit_extract_u32 R600_Reg32:$src0,
- R600_Reg32:$src1,
- R600_Reg32:$src2))],
+ [(set i32:$dst, (int_AMDIL_bit_extract_u32 i32:$src0, i32:$src1,
+ i32:$src2))],
VecALU
>;
+ def : BFEPattern <BFE_UINT_eg>;
+
+ def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>;
+ defm : BFIPatterns <BFI_INT_eg>;
def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT",
- [(set R600_Reg32:$dst, (AMDGPUbitalign R600_Reg32:$src0, R600_Reg32:$src1,
- R600_Reg32:$src2))],
+ [(set i32:$dst, (AMDGPUbitalign i32:$src0, i32:$src1, i32:$src2))],
VecALU
>;
@@ -1563,14 +1666,15 @@ let hasSideEffects = 1 in {
// XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes,
// which do not need to be truncated since the fp values are 0.0f or 1.0f.
// We should look into handling these cases separately.
- def : Pat<(fp_to_sint R600_Reg32:$src0),
- (FLT_TO_INT_eg (TRUNC R600_Reg32:$src0))>;
+ def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>;
+
+ def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
- def : Pat<(fp_to_uint R600_Reg32:$src0),
- (FLT_TO_UINT_eg (TRUNC R600_Reg32:$src0))>;
+ // SHA-256 Patterns
+ def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
def EG_ExportSwz : ExportSwzInst {
- let Word1{19-16} = 1; // BURST_COUNT
+ let Word1{19-16} = 0; // BURST_COUNT
let Word1{20} = 1; // VALID_PIXEL_MODE
let Word1{21} = eop;
let Word1{29-22} = inst;
@@ -1580,7 +1684,7 @@ let hasSideEffects = 1 in {
defm : ExportPattern<EG_ExportSwz, 83>;
def EG_ExportBuf : ExportBufInst {
- let Word1{19-16} = 1; // BURST_COUNT
+ let Word1{19-16} = 0; // BURST_COUNT
let Word1{20} = 1; // VALID_PIXEL_MODE
let Word1{21} = eop;
let Word1{29-22} = inst;
@@ -1589,6 +1693,57 @@ let hasSideEffects = 1 in {
}
defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>;
+ def CF_TC_EG : CF_CLAUSE_EG<1, (ins i32imm:$ADDR, i32imm:$COUNT),
+ "TEX $COUNT @$ADDR"> {
+ let POP_COUNT = 0;
+ }
+ def CF_VC_EG : CF_CLAUSE_EG<2, (ins i32imm:$ADDR, i32imm:$COUNT),
+ "VTX $COUNT @$ADDR"> {
+ let POP_COUNT = 0;
+ }
+ def WHILE_LOOP_EG : CF_CLAUSE_EG<6, (ins i32imm:$ADDR),
+ "LOOP_START_DX10 @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+ }
+ def END_LOOP_EG : CF_CLAUSE_EG<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+ }
+ def LOOP_BREAK_EG : CF_CLAUSE_EG<9, (ins i32imm:$ADDR),
+ "LOOP_BREAK @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+ }
+ def CF_CONTINUE_EG : CF_CLAUSE_EG<8, (ins i32imm:$ADDR),
+ "CONTINUE @$ADDR"> {
+ let POP_COUNT = 0;
+ let COUNT = 0;
+ }
+ def CF_JUMP_EG : CF_CLAUSE_EG<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "JUMP @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+ }
+ def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "ELSE @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+ }
+ def CF_CALL_FS_EG : CF_CLAUSE_EG<19, (ins), "CALL_FS"> {
+ let ADDR = 0;
+ let COUNT = 0;
+ let POP_COUNT = 0;
+ }
+ def POP_EG : CF_CLAUSE_EG<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "POP @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+ }
+ def CF_END_EG : CF_CLAUSE_EG<0, (ins), "CF_END"> {
+ let COUNT = 0;
+ let POP_COUNT = 0;
+ let ADDR = 0;
+ let END_OF_PROGRAM = 1;
+ }
+
//===----------------------------------------------------------------------===//
// Memory read/write instructions
//===----------------------------------------------------------------------===//
@@ -1618,14 +1773,14 @@ class RAT_WRITE_CACHELESS_eg <dag ins, bits<4> comp_mask, string name,
def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
(ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
0x1, "RAT_WRITE_CACHELESS_32_eg",
- [(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)]
+ [(global_store i32:$rw_gpr, i32:$index_gpr)]
>;
//128-bit store
def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
0xf, "RAT_WRITE_CACHELESS_128",
- [(global_store (v4i32 R600_Reg128:$rw_gpr), R600_TReg32_X:$index_gpr)]
+ [(global_store v4i32:$rw_gpr, i32:$index_gpr)]
>;
class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
@@ -1679,6 +1834,8 @@ class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
// VTX_WORD3 (Padding)
//
// Inst{127-96} = 0;
+
+ let VTXInst = 1;
}
class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
@@ -1748,19 +1905,19 @@ class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
//===----------------------------------------------------------------------===//
def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0,
- [(set (i32 R600_TReg32_X:$dst), (load_param_zexti8 ADDRVTX_READ:$ptr))]
+ [(set i32:$dst, (load_param_zexti8 ADDRVTX_READ:$ptr))]
>;
def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0,
- [(set (i32 R600_TReg32_X:$dst), (load_param_zexti16 ADDRVTX_READ:$ptr))]
+ [(set i32:$dst, (load_param_zexti16 ADDRVTX_READ:$ptr))]
>;
def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
- [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))]
+ [(set i32:$dst, (load_param ADDRVTX_READ:$ptr))]
>;
def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
- [(set (v4i32 R600_Reg128:$dst), (load_param ADDRVTX_READ:$ptr))]
+ [(set v4i32:$dst, (load_param ADDRVTX_READ:$ptr))]
>;
//===----------------------------------------------------------------------===//
@@ -1769,17 +1926,17 @@ def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
// 8-bit reads
def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1,
- [(set (i32 R600_TReg32_X:$dst), (zextloadi8_global ADDRVTX_READ:$ptr))]
+ [(set i32:$dst, (zextloadi8_global ADDRVTX_READ:$ptr))]
>;
// 32-bit reads
def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
- [(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))]
+ [(set i32:$dst, (global_load ADDRVTX_READ:$ptr))]
>;
// 128-bit reads
def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
- [(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))]
+ [(set v4i32:$dst, (global_load ADDRVTX_READ:$ptr))]
>;
//===----------------------------------------------------------------------===//
@@ -1788,7 +1945,7 @@ def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
//===----------------------------------------------------------------------===//
def CONSTANT_LOAD_eg : VTX_READ_32_eg <1,
- [(set (i32 R600_TReg32_X:$dst), (constant_load ADDRVTX_READ:$ptr))]
+ [(set i32:$dst, (constant_load ADDRVTX_READ:$ptr))]
>;
}
@@ -1818,22 +1975,27 @@ def SIN_cm : SIN_Common<0x8D>;
def COS_cm : COS_Common<0x8E>;
} // End isVector = 1
-def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL, R600_Reg32>;
+def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
def : SIN_PAT <SIN_cm>;
def : COS_PAT <COS_cm>;
defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
// RECIP_UINT emulation for Cayman
+// Multiplying by 2^32 scales the floating-point reciprocal in (0,1] up to
+// the unsigned integer range.
def : Pat <
- (AMDGPUurecip R600_Reg32:$src0),
- (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg R600_Reg32:$src0)),
- (MOV_IMM_I32 0x4f800000)))
+ (AMDGPUurecip i32:$src0),
+ (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)),
+ (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1)))
>;
+ def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
+ let ADDR = 0;
+ let POP_COUNT = 0;
+ let COUNT = 0;
+ }
-def : Pat<(fsqrt R600_Reg32:$src),
- (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm R600_Reg32:$src))>;
+def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_cm $src))>;
} // End isCayman
@@ -1855,21 +2017,21 @@ def PREDICATED_BREAK : ILFormat<(outs), (ins GPRI32:$src),
let isPseudo = 1 in {
def PRED_X : InstR600 <
- 0, (outs R600_Predicate_Bit:$dst),
+ (outs R600_Predicate_Bit:$dst),
(ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
"", [], NullALU> {
let FlagOperandIdx = 3;
}
let isTerminator = 1, isBranch = 1 in {
-def JUMP_COND : InstR600 <0x10,
+def JUMP_COND : InstR600 <
(outs),
(ins brtarget:$target, R600_Predicate_Bit:$p),
"JUMP $target ($p)",
[], AnyALU
>;
-def JUMP : InstR600 <0x10,
+def JUMP : InstR600 <
(outs),
(ins brtarget:$target),
"JUMP $target",
@@ -1896,20 +2058,28 @@ def MASK_WRITE : AMDGPUShaderInst <
} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
-def TXD: AMDGPUShaderInst <
+def TXD: InstR600 <
(outs R600_Reg128:$dst),
- (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
+ (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
+ i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
"TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
- [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
->;
+ [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2,
+ imm:$resourceId, imm:$samplerId, imm:$textureTarget))],
+ NullALU > {
+ let TEXInst = 1;
+}
-def TXD_SHADOW: AMDGPUShaderInst <
+def TXD_SHADOW: InstR600 <
(outs R600_Reg128:$dst),
- (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
+ (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2,
+ i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
"TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
- [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
->;
-
+ [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2,
+ imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))],
+ NullALU
+> {
+ let TEXInst = 1;
+}
} // End isPseudo = 1
} // End usesCustomInserter = 1
@@ -1946,7 +2116,7 @@ def CONST_COPY : Instruction {
def TEX_VTX_CONSTBUF :
InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr",
- [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>,
+ [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>,
VTX_WORD1_GPR, VTX_WORD0 {
let VC_INST = 0;
@@ -1995,11 +2165,12 @@ def TEX_VTX_CONSTBUF :
// VTX_WORD3 (Padding)
//
// Inst{127-96} = 0;
+ let VTXInst = 1;
}
def TEX_VTX_TEXBUF:
InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr",
- [(set R600_Reg128:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>,
+ [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>,
VTX_WORD1_GPR, VTX_WORD0 {
let VC_INST = 0;
@@ -2048,6 +2219,7 @@ let Inst{63-32} = Word1;
// VTX_WORD3 (Padding)
//
// Inst{127-96} = 0;
+ let VTXInst = 1;
}
@@ -2124,9 +2296,8 @@ let isTerminator=1 in {
// CND*_INT patterns for f32 true / false values
class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat <
- (selectcc (i32 R600_Reg32:$src0), 0, (f32 R600_Reg32:$src1),
- R600_Reg32:$src2, cc),
- (cnd R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2)
+ (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc),
+ (cnd $src0, $src1, $src2)
>;
def : CND_INT_f32 <CNDE_INT, SETEQ>;
@@ -2135,9 +2306,8 @@ def : CND_INT_f32 <CNDGE_INT, SETGE>;
//CNDGE_INT extra pattern
def : Pat <
- (selectcc (i32 R600_Reg32:$src0), -1, (i32 R600_Reg32:$src1),
- (i32 R600_Reg32:$src2), COND_GT),
- (CNDGE_INT R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2)
+ (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_GT),
+ (CNDGE_INT $src0, $src1, $src2)
>;
// KIL Patterns
@@ -2147,56 +2317,56 @@ def KILP : Pat <
>;
def KIL : Pat <
- (int_AMDGPU_kill R600_Reg32:$src0),
- (MASK_WRITE (KILLGT (f32 ZERO), (f32 R600_Reg32:$src0)))
+ (int_AMDGPU_kill f32:$src0),
+ (MASK_WRITE (KILLGT (f32 ZERO), $src0))
>;
// SGT Reverse args
def : Pat <
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LT),
- (SGT R600_Reg32:$src1, R600_Reg32:$src0)
+ (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_LT),
+ (SGT $src1, $src0)
>;
// SGE Reverse args
def : Pat <
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LE),
- (SGE R600_Reg32:$src1, R600_Reg32:$src0)
+ (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_LE),
+ (SGE $src1, $src0)
>;
// SETGT_DX10 reverse args
def : Pat <
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LT),
- (SETGT_DX10 R600_Reg32:$src1, R600_Reg32:$src0)
+ (selectcc f32:$src0, f32:$src1, -1, 0, COND_LT),
+ (SETGT_DX10 $src1, $src0)
>;
// SETGE_DX10 reverse args
def : Pat <
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LE),
- (SETGE_DX10 R600_Reg32:$src1, R600_Reg32:$src0)
+ (selectcc f32:$src0, f32:$src1, -1, 0, COND_LE),
+ (SETGE_DX10 $src1, $src0)
>;
// SETGT_INT reverse args
def : Pat <
- (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLT),
- (SETGT_INT R600_Reg32:$src1, R600_Reg32:$src0)
+ (selectcc i32:$src0, i32:$src1, -1, 0, SETLT),
+ (SETGT_INT $src1, $src0)
>;
// SETGE_INT reverse args
def : Pat <
- (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLE),
- (SETGE_INT R600_Reg32:$src1, R600_Reg32:$src0)
+ (selectcc i32:$src0, i32:$src1, -1, 0, SETLE),
+ (SETGE_INT $src1, $src0)
>;
// SETGT_UINT reverse args
def : Pat <
- (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULT),
- (SETGT_UINT R600_Reg32:$src1, R600_Reg32:$src0)
+ (selectcc i32:$src0, i32:$src1, -1, 0, SETULT),
+ (SETGT_UINT $src1, $src0)
>;
// SETGE_UINT reverse args
def : Pat <
- (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULE),
- (SETGE_UINT R600_Reg32:$src1, R600_Reg32:$src0)
+ (selectcc i32:$src0, i32:$src1, -1, 0, SETULE),
+ (SETGE_UINT $src1, $src0)
>;
// The next two patterns are special cases for handling 'true if ordered' and
@@ -2209,50 +2379,50 @@ def : Pat <
//SETE - 'true if ordered'
def : Pat <
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETO),
- (SETE R600_Reg32:$src0, R600_Reg32:$src1)
+ (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, SETO),
+ (SETE $src0, $src1)
>;
//SETE_DX10 - 'true if ordered'
def : Pat <
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETO),
- (SETE_DX10 R600_Reg32:$src0, R600_Reg32:$src1)
+ (selectcc f32:$src0, f32:$src1, -1, 0, SETO),
+ (SETE_DX10 $src0, $src1)
>;
//SNE - 'true if unordered'
def : Pat <
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETUO),
- (SNE R600_Reg32:$src0, R600_Reg32:$src1)
+ (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, SETUO),
+ (SNE $src0, $src1)
>;
//SETNE_DX10 - 'true if unordered'
def : Pat <
- (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUO),
- (SETNE_DX10 R600_Reg32:$src0, R600_Reg32:$src1)
+ (selectcc f32:$src0, f32:$src1, -1, 0, SETUO),
+ (SETNE_DX10 $src0, $src1)
>;
-def : Extract_Element <f32, v4f32, R600_Reg128, 0, sub0>;
-def : Extract_Element <f32, v4f32, R600_Reg128, 1, sub1>;
-def : Extract_Element <f32, v4f32, R600_Reg128, 2, sub2>;
-def : Extract_Element <f32, v4f32, R600_Reg128, 3, sub3>;
+def : Extract_Element <f32, v4f32, 0, sub0>;
+def : Extract_Element <f32, v4f32, 1, sub1>;
+def : Extract_Element <f32, v4f32, 2, sub2>;
+def : Extract_Element <f32, v4f32, 3, sub3>;
-def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 0, sub0>;
-def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 1, sub1>;
-def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 2, sub2>;
-def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 3, sub3>;
+def : Insert_Element <f32, v4f32, 0, sub0>;
+def : Insert_Element <f32, v4f32, 1, sub1>;
+def : Insert_Element <f32, v4f32, 2, sub2>;
+def : Insert_Element <f32, v4f32, 3, sub3>;
-def : Extract_Element <i32, v4i32, R600_Reg128, 0, sub0>;
-def : Extract_Element <i32, v4i32, R600_Reg128, 1, sub1>;
-def : Extract_Element <i32, v4i32, R600_Reg128, 2, sub2>;
-def : Extract_Element <i32, v4i32, R600_Reg128, 3, sub3>;
+def : Extract_Element <i32, v4i32, 0, sub0>;
+def : Extract_Element <i32, v4i32, 1, sub1>;
+def : Extract_Element <i32, v4i32, 2, sub2>;
+def : Extract_Element <i32, v4i32, 3, sub3>;
-def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 0, sub0>;
-def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sub1>;
-def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, sub2>;
-def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>;
+def : Insert_Element <i32, v4i32, 0, sub0>;
+def : Insert_Element <i32, v4i32, 1, sub1>;
+def : Insert_Element <i32, v4i32, 2, sub2>;
+def : Insert_Element <i32, v4i32, 3, sub3>;
-def : Vector4_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
-def : Vector4_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
+def : Vector4_Build <v4f32, f32>;
+def : Vector4_Build <v4i32, i32>;
// bitconvert patterns
diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h
index 99c1f91..70fddbb 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.h
+++ b/lib/Target/R600/R600MachineFunctionInfo.h
@@ -25,6 +25,7 @@ public:
R600MachineFunctionInfo(const MachineFunction &MF);
SmallVector<unsigned, 4> LiveOuts;
std::vector<unsigned> IndirectRegs;
+ unsigned StackSize;
};
} // End llvm namespace
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
new file mode 100644
index 0000000..cd7b7d0
--- /dev/null
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -0,0 +1,459 @@
+//===----- R600Packetizer.cpp - VLIW packetizer ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass implements instruction packetization for R600. It unsets the
+/// isLast bit of instructions inside a bundle and substitutes src registers
+/// with their PreviousVector (PV) equivalents when applicable.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600PACKETIZER_CPP
+#define R600PACKETIZER_CPP
+
+#define DEBUG_TYPE "packets"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "AMDGPU.h"
+#include "R600InstrInfo.h"
+
+namespace llvm {
+
+class R600Packetizer : public MachineFunctionPass {
+
+public:
+ static char ID;
+ R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ const char *getPassName() const {
+ return "R600 Packetizer";
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn);
+};
+char R600Packetizer::ID = 0;
+
+class R600PacketizerList : public VLIWPacketizerList {
+
+private:
+ const R600InstrInfo *TII;
+ const R600RegisterInfo &TRI;
+
+ enum BankSwizzle {
+ ALU_VEC_012 = 0,
+ ALU_VEC_021,
+ ALU_VEC_120,
+ ALU_VEC_102,
+ ALU_VEC_201,
+ ALU_VEC_210
+ };
+
+ unsigned getSlot(const MachineInstr *MI) const {
+ return TRI.getHWRegChan(MI->getOperand(0).getReg());
+ }
+
+  /// \returns the register-to-PV-channel mapping for the bundle or single
+  /// instruction that immediately precedes I.
+ DenseMap<unsigned, unsigned> getPreviousVector(MachineBasicBlock::iterator I)
+ const {
+ DenseMap<unsigned, unsigned> Result;
+ I--;
+ if (!TII->isALUInstr(I->getOpcode()) && !I->isBundle())
+ return Result;
+ MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
+ if (I->isBundle())
+ BI++;
+ do {
+ if (TII->isPredicated(BI))
+ continue;
+ if (TII->isTransOnly(BI))
+ continue;
+ int OperandIdx = TII->getOperandIdx(BI->getOpcode(), R600Operands::WRITE);
+ if (OperandIdx < 0)
+ continue;
+ if (BI->getOperand(OperandIdx).getImm() == 0)
+ continue;
+ unsigned Dst = BI->getOperand(0).getReg();
+ if (BI->getOpcode() == AMDGPU::DOT4_r600_real) {
+ Result[Dst] = AMDGPU::PV_X;
+ continue;
+ }
+ unsigned PVReg = 0;
+ switch (TRI.getHWRegChan(Dst)) {
+ case 0:
+ PVReg = AMDGPU::PV_X;
+ break;
+ case 1:
+ PVReg = AMDGPU::PV_Y;
+ break;
+ case 2:
+ PVReg = AMDGPU::PV_Z;
+ break;
+ case 3:
+ PVReg = AMDGPU::PV_W;
+ break;
+ default:
+ llvm_unreachable("Invalid Chan");
+ }
+ Result[Dst] = PVReg;
+ } while ((++BI)->isBundledWithPred());
+ return Result;
+ }
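
A self-contained illustration of the PV forwarding that getPreviousVector and
substitutePV (below) implement together: if the previous instruction group
wrote T0.X, a consumer in the next group reads that value from PV.X instead
of the register file. The integer ids below are hypothetical stand-ins for
real register handles:

  #include <cstdio>
  #include <map>

  int main() {
    // Hypothetical register ids: 100 stands for T0.X, 200 for PV.X.
    std::map<unsigned, unsigned> PV;
    PV[100] = 200;                  // the previous group wrote T0.X

    unsigned Src = 100;             // operand of the instruction being packed
    std::map<unsigned, unsigned>::const_iterator It = PV.find(Src);
    if (It != PV.end())
      Src = It->second;             // substitutePV rewrites it to PV.X
    printf("src reg = %u\n", Src);  // 200
    return 0;
  }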
+
+ void substitutePV(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PVs)
+ const {
+ R600Operands::Ops Ops[] = {
+ R600Operands::SRC0,
+ R600Operands::SRC1,
+ R600Operands::SRC2
+ };
+ for (unsigned i = 0; i < 3; i++) {
+ int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]);
+ if (OperandIdx < 0)
+ continue;
+ unsigned Src = MI->getOperand(OperandIdx).getReg();
+ const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src);
+ if (It != PVs.end())
+ MI->getOperand(OperandIdx).setReg(It->second);
+ }
+ }
+public:
+ // Ctor.
+ R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
+ MachineDominatorTree &MDT)
+ : VLIWPacketizerList(MF, MLI, MDT, true),
+ TII (static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo())),
+ TRI(TII->getRegisterInfo()) { }
+
+ // initPacketizerState - initialize some internal flags.
+ void initPacketizerState() { }
+
+ // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+ bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB) {
+ return false;
+ }
+
+  // isSoloInstruction - return true if instruction MI cannot be packetized
+  // with any other instruction, which means that MI itself is a packet.
+ bool isSoloInstruction(MachineInstr *MI) {
+ if (TII->isVector(*MI))
+ return true;
+ if (!TII->isALUInstr(MI->getOpcode()))
+ return true;
+    if (TII->isTransOnly(MI))
+      return true;
+ return false;
+ }
+
+ // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
+ // together.
+ bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
+ MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr();
+ if (getSlot(MII) <= getSlot(MIJ))
+ return false;
+    // Do MII and MIJ share the same pred_sel?
+ int OpI = TII->getOperandIdx(MII->getOpcode(), R600Operands::PRED_SEL),
+ OpJ = TII->getOperandIdx(MIJ->getOpcode(), R600Operands::PRED_SEL);
+ unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0,
+ PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0;
+ if (PredI != PredJ)
+ return false;
+ if (SUJ->isSucc(SUI)) {
+ for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) {
+ const SDep &Dep = SUJ->Succs[i];
+ if (Dep.getSUnit() != SUI)
+ continue;
+ if (Dep.getKind() == SDep::Anti)
+ continue;
+ if (Dep.getKind() == SDep::Output)
+ if (MII->getOperand(0).getReg() != MIJ->getOperand(0).getReg())
+ continue;
+ return false;
+ }
+ }
+ return true;
+ }
+
+  // isLegalToPruneDependencies - Is it legal to prune the dependence between
+  // SUI and SUJ?
+ bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {return false;}
+
+ void setIsLastBit(MachineInstr *MI, unsigned Bit) const {
+ unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), R600Operands::LAST);
+ MI->getOperand(LastOp).setImm(Bit);
+ }
+
+ MachineBasicBlock::iterator addToPacket(MachineInstr *MI) {
+ CurrentPacketMIs.push_back(MI);
+ bool FitsConstLimits = TII->canBundle(CurrentPacketMIs);
+ DEBUG(
+ if (!FitsConstLimits) {
+ dbgs() << "Couldn't pack :\n";
+ MI->dump();
+ dbgs() << "with the following packets :\n";
+ for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
+ CurrentPacketMIs[i]->dump();
+ dbgs() << "\n";
+ }
+ dbgs() << "because of Consts read limitations\n";
+ });
+ const DenseMap<unsigned, unsigned> &PV =
+ getPreviousVector(CurrentPacketMIs.front());
+ bool FitsReadPortLimits = fitsReadPortLimitation(CurrentPacketMIs, PV);
+ DEBUG(
+ if (!FitsReadPortLimits) {
+        dbgs() << "Couldn't pack:\n";
+        MI->dump();
+        dbgs() << "with the following packet:\n";
+ for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
+ CurrentPacketMIs[i]->dump();
+ dbgs() << "\n";
+ }
+        dbgs() << "because of read port limitations\n";
+ });
+ bool isBundlable = FitsConstLimits && FitsReadPortLimits;
+ CurrentPacketMIs.pop_back();
+ if (!isBundlable) {
+ endPacket(MI->getParent(), MI);
+ substitutePV(MI, getPreviousVector(MI));
+ return VLIWPacketizerList::addToPacket(MI);
+ }
+ if (!CurrentPacketMIs.empty())
+ setIsLastBit(CurrentPacketMIs.back(), 0);
+ substitutePV(MI, PV);
+ return VLIWPacketizerList::addToPacket(MI);
+ }
+private:
+ std::vector<std::pair<int, unsigned> >
+ ExtractSrcs(const MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV)
+ const {
+ R600Operands::Ops Ops[] = {
+ R600Operands::SRC0,
+ R600Operands::SRC1,
+ R600Operands::SRC2
+ };
+ std::vector<std::pair<int, unsigned> > Result;
+ for (unsigned i = 0; i < 3; i++) {
+ int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]);
+      if (OperandIdx < 0) {
+        Result.push_back(std::pair<int, unsigned>(-1, 0));
+ continue;
+ }
+ unsigned Src = MI->getOperand(OperandIdx).getReg();
+ if (PV.find(Src) != PV.end()) {
+        Result.push_back(std::pair<int, unsigned>(-1, 0));
+ continue;
+ }
+ unsigned Reg = TRI.getEncodingValue(Src) & 0xff;
+ if (Reg > 127) {
+        Result.push_back(std::pair<int, unsigned>(-1, 0));
+ continue;
+ }
+ unsigned Chan = TRI.getHWRegChan(Src);
+ Result.push_back(std::pair<int, unsigned>(Reg, Chan));
+ }
+ return Result;
+ }
+
+ std::vector<std::pair<int, unsigned> >
+ Swizzle(std::vector<std::pair<int, unsigned> > Src,
+ BankSwizzle Swz) const {
+ switch (Swz) {
+ case ALU_VEC_012:
+ break;
+ case ALU_VEC_021:
+ std::swap(Src[1], Src[2]);
+ break;
+ case ALU_VEC_102:
+ std::swap(Src[0], Src[1]);
+ break;
+ case ALU_VEC_120:
+ std::swap(Src[0], Src[1]);
+ std::swap(Src[0], Src[2]);
+ break;
+ case ALU_VEC_201:
+ std::swap(Src[0], Src[2]);
+ std::swap(Src[0], Src[1]);
+ break;
+ case ALU_VEC_210:
+ std::swap(Src[0], Src[2]);
+ break;
+ }
+ return Src;
+ }
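+  // Trace, starting from {s0, s1, s2}:
+  //   ALU_VEC_021 -> {s0, s2, s1}    ALU_VEC_102 -> {s1, s0, s2}
+  //   ALU_VEC_120 -> {s2, s0, s1}    ALU_VEC_201 -> {s1, s2, s0}
+  //   ALU_VEC_210 -> {s2, s1, s0}
+  // i.e. ALU_VEC_abc routes src0 to position a, src1 to b and src2 to c.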
+
+ bool isLegal(const std::vector<MachineInstr *> &IG,
+ const std::vector<BankSwizzle> &Swz,
+ const DenseMap<unsigned, unsigned> &PV) const {
+    assert(Swz.size() == IG.size());
+ int Vector[4][3];
+ memset(Vector, -1, sizeof(Vector));
+ for (unsigned i = 0, e = IG.size(); i < e; i++) {
+ const std::vector<std::pair<int, unsigned> > &Srcs =
+ Swizzle(ExtractSrcs(IG[i], PV), Swz[i]);
+ for (unsigned j = 0; j < 3; j++) {
+ const std::pair<int, unsigned> &Src = Srcs[j];
+ if (Src.first < 0)
+ continue;
+ if (Vector[Src.second][j] < 0)
+ Vector[Src.second][j] = Src.first;
+ if (Vector[Src.second][j] != Src.first)
+ return false;
+ }
+ }
+ return true;
+ }
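+  // Vector[chan][j] records which register index source slot j reads from
+  // bank channel chan. For example, if one instruction reads T0.X as src0
+  // and another reads T1.X as src0, both claim Vector[X][0] with different
+  // indices and this swizzle candidate is rejected.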
+
+ bool recursiveFitsFPLimitation(
+ std::vector<MachineInstr *> IG,
+ const DenseMap<unsigned, unsigned> &PV,
+ std::vector<BankSwizzle> &SwzCandidate,
+ std::vector<MachineInstr *> CurrentlyChecked)
+ const {
+ if (!isLegal(CurrentlyChecked, SwzCandidate, PV))
+ return false;
+ if (IG.size() == CurrentlyChecked.size()) {
+ return true;
+ }
+ BankSwizzle AvailableSwizzle[] = {
+ ALU_VEC_012,
+ ALU_VEC_021,
+ ALU_VEC_120,
+ ALU_VEC_102,
+ ALU_VEC_201,
+ ALU_VEC_210
+ };
+ CurrentlyChecked.push_back(IG[CurrentlyChecked.size()]);
+ for (unsigned i = 0; i < 6; i++) {
+ SwzCandidate.push_back(AvailableSwizzle[i]);
+ if (recursiveFitsFPLimitation(IG, PV, SwzCandidate, CurrentlyChecked))
+ return true;
+ SwzCandidate.pop_back();
+ }
+ return false;
+ }
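+  // In the worst case this explores 6^n swizzle assignments for an
+  // n-instruction group (n is at most 4 here, one instruction per channel),
+  // but the isLegal check above prunes every inconsistent prefix early.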
+
+ bool fitsReadPortLimitation(
+ std::vector<MachineInstr *> IG,
+ const DenseMap<unsigned, unsigned> &PV)
+ const {
+    // TODO: support operands shared between src0 and src1
+ std::vector<BankSwizzle> SwzCandidate;
+ bool Result = recursiveFitsFPLimitation(IG, PV, SwzCandidate,
+ std::vector<MachineInstr *>());
+ if (!Result)
+ return false;
+ for (unsigned i = 0, e = IG.size(); i < e; i++) {
+ MachineInstr *MI = IG[i];
+ unsigned Op = TII->getOperandIdx(MI->getOpcode(),
+ R600Operands::BANK_SWIZZLE);
+ MI->getOperand(Op).setImm(SwzCandidate[i]);
+ }
+ return true;
+ }
+};
+
+bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
+ const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+ MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+
+ // Instantiate the packetizer.
+ R600PacketizerList Packetizer(Fn, MLI, MDT);
+
+ // DFA state table should not be empty.
+ assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+
+ //
+  // Loop over all basic blocks and remove KILL pseudo-instructions.
+ // These instructions confuse the dependence analysis. Consider:
+ // D0 = ... (Insn 0)
+ // R0 = KILL R0, D0 (Insn 1)
+ // R0 = ... (Insn 2)
+ // Here, Insn 1 will result in the dependence graph not emitting an output
+ // dependence between Insn 0 and Insn 2. This can lead to incorrect
+  // packetization.
+ //
+ for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+ MBB != MBBe; ++MBB) {
+ MachineBasicBlock::iterator End = MBB->end();
+ MachineBasicBlock::iterator MI = MBB->begin();
+ while (MI != End) {
+ if (MI->isKill()) {
+ MachineBasicBlock::iterator DeleteMI = MI;
+ ++MI;
+ MBB->erase(DeleteMI);
+ End = MBB->end();
+ continue;
+ }
+ ++MI;
+ }
+ }
+
+ // Loop over all of the basic blocks.
+ for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+ MBB != MBBe; ++MBB) {
+ // Find scheduling regions and schedule / packetize each region.
+ unsigned RemainingCount = MBB->size();
+    for (MachineBasicBlock::iterator RegionEnd = MBB->end();
+ RegionEnd != MBB->begin();) {
+ // The next region starts above the previous region. Look backward in the
+ // instruction stream until we find the nearest boundary.
+ MachineBasicBlock::iterator I = RegionEnd;
+      for (; I != MBB->begin(); --I, --RemainingCount) {
+ if (TII->isSchedulingBoundary(llvm::prior(I), MBB, Fn))
+ break;
+ }
+ I = MBB->begin();
+
+ // Skip empty scheduling regions.
+ if (I == RegionEnd) {
+ RegionEnd = llvm::prior(RegionEnd);
+ --RemainingCount;
+ continue;
+ }
+ // Skip regions with one instruction.
+ if (I == llvm::prior(RegionEnd)) {
+ RegionEnd = llvm::prior(RegionEnd);
+ continue;
+ }
+
+ Packetizer.PacketizeMIs(MBB, I, RegionEnd);
+ RegionEnd = I;
+ }
+ }
+
+  return true;
+}
+
+}
+
+llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) {
+ return new R600Packetizer(tm);
+}
+
+#endif // R600PACKETIZER_CPP
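For context, createR600Packetizer (defined just above) is how the rest of the
backend instantiates this pass; below is a minimal sketch of the kind of hook
that would schedule it, assuming the TargetPassConfig conventions of this LLVM
branch. The placement shown is an assumption for illustration, not part of
this patch:

    bool AMDGPUPassConfig::addPreEmitPass() {
      // Hypothetical placement: bundle R600 ALU instructions into VLIW
      // groups late, just before emission.
      addPass(createR600Packetizer(*TM));
      return false;
    }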
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index 03f4976..bfc546b 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -88,8 +88,14 @@ def NEG_ONE : R600Reg<"-1.0", 249>;
def ONE_INT : R600Reg<"1", 250>;
def HALF : R600Reg<"0.5", 252>;
def NEG_HALF : R600Reg<"-0.5", 252>;
-def ALU_LITERAL_X : R600Reg<"literal.x", 253>;
-def PV_X : R600Reg<"pv.x", 254>;
+def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">;
+def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">;
+def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">;
+def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">;
+def PV_X : R600RegWithChan<"PV.x", 254, "X">;
+def PV_Y : R600RegWithChan<"PV.y", 254, "Y">;
+def PV_Z : R600RegWithChan<"PV.z", 254, "Z">;
+def PV_W : R600RegWithChan<"PV.w", 254, "W">;
def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
diff --git a/lib/Target/R600/R600Schedule.td b/lib/Target/R600/R600Schedule.td
index 7ede181..78a460a 100644
--- a/lib/Target/R600/R600Schedule.td
+++ b/lib/Target/R600/R600Schedule.td
@@ -24,7 +24,7 @@ def AnyALU : InstrItinClass;
def VecALU : InstrItinClass;
def TransALU : InstrItinClass;
-def R600_EG_Itin : ProcessorItineraries <
+def R600_VLIW5_Itin : ProcessorItineraries <
[ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL],
[],
[
@@ -34,3 +34,14 @@ def R600_EG_Itin : ProcessorItineraries <
InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
]
>;
+
+def R600_VLIW4_Itin : ProcessorItineraries <
+ [ALU_X, ALU_Y, ALU_Z, ALU_W, ALU_NULL],
+ [],
+ [
+ InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
+    InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
+ InstrItinData<TransALU, [InstrStage<1, [ALU_NULL]>]>,
+ InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
+ ]
+>;
diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
new file mode 100644
index 0000000..716b093
--- /dev/null
+++ b/lib/Target/R600/SIDefines.h
@@ -0,0 +1,22 @@
+//===-- SIDefines.h - SI Helper Macros ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef SIDEFINES_H_
+#define SIDEFINES_H_
+
+#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
+#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128
+#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228
+#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848
+#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
+#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
+#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
+
+#endif // SIDEFINES_H_
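The R_* constants above are hardware register offsets and the S_* helpers pack
bit-fields into their values; here is a hedged sketch of how an asm printer
might compose and emit the PS RSRC1 dword, assuming SI's allocation
granularity of four VGPRs and eight SGPRs per block (MaxVGPR and MaxSGPR are
hypothetical names for the highest register indices used by the shader):

    // Sketch only; the field granularity and variable names are assumptions.
    unsigned Rsrc1 = S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8);
    OutStreamer.EmitIntValue(R_00B028_SPI_SHADER_PGM_RSRC1_PS, 4);
    OutStreamer.EmitIntValue(Rsrc1, 4);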
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 6f0c307..6bd82a5 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -49,6 +49,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+ addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
@@ -70,6 +71,10 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+
+ setOperationAction(ISD::STORE, MVT::i32, Custom);
+ setOperationAction(ISD::STORE, MVT::i64, Custom);
+
setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::SETCC);
@@ -234,6 +239,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::STORE: return LowerSTORE(Op, DAG);
}
return SDValue();
}
@@ -332,6 +338,32 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
return Chain;
}
+#define RSRC_DATA_FORMAT 0xf00000000000
+
+SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Value = Op.getOperand(1);
+ SDValue VirtualAddress = Op.getOperand(2);
+ DebugLoc DL = Op.getDebugLoc();
+
+ if (StoreNode->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) {
+ return SDValue();
+ }
+
+ SDValue SrcSrc = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
+ DAG.getConstant(0, MVT::i64),
+ DAG.getConstant(RSRC_DATA_FORMAT, MVT::i64));
+
+ SDValue Ops[2];
+ Ops[0] = DAG.getNode(AMDGPUISD::BUFFER_STORE, DL, MVT::Other, Chain,
+ Value, SrcSrc, VirtualAddress);
+ Ops[1] = Chain;
+
+  return DAG.getMergeValues(Ops, 2, DL);
+}
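+// Note: SrcSrc is a minimal buffer resource descriptor, with a zero base
+// address in the low 64 bits and only the data-format bits
+// (RSRC_DATA_FORMAT) in the high 64 bits; with ADDR64 addressing the full
+// pointer is supplied by VirtualAddress.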
+
SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
@@ -424,9 +456,12 @@ int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
float F;
} Imm;
- if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N))
+ if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
+ if (Node->getZExtValue() >> 32) {
+ return -1;
+ }
Imm.I = Node->getSExtValue();
- else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N))
+ } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N))
Imm.F = Node->getValueAPF().convertToFloat();
else
return -1; // It isn't an immediate
@@ -534,8 +569,9 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
Operand = SDValue(Node, 0);
}
-SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
- SelectionDAG &DAG) const {
+/// \brief Try to fold the Node's operands into the Node
+SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
+ SelectionDAG &DAG) const {
// Original encoding (either e32 or e64)
int Opcode = Node->getMachineOpcode();
@@ -666,5 +702,116 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
// Create a complete new instruction
return DAG.getMachineNode(Desc->Opcode, Node->getDebugLoc(),
- Node->getVTList(), Ops.data(), Ops.size());
+ Node->getVTList(), Ops);
+}
+
+/// \brief Helper function for adjustWritemask
+unsigned SubIdx2Lane(unsigned Idx) {
+ switch (Idx) {
+ default: return 0;
+ case AMDGPU::sub0: return 0;
+ case AMDGPU::sub1: return 1;
+ case AMDGPU::sub2: return 2;
+ case AMDGPU::sub3: return 3;
+ }
+}
+
+/// \brief Adjust the writemask of MIMG instructions
+void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
+ SelectionDAG &DAG) const {
+ SDNode *Users[4] = { };
+ unsigned Writemask = 0, Lane = 0;
+
+ // Try to figure out the used register components
+ for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
+ I != E; ++I) {
+
+ // Abort if we can't understand the usage
+ if (!I->isMachineOpcode() ||
+ I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
+ return;
+
+ Lane = SubIdx2Lane(I->getConstantOperandVal(1));
+
+ // Abort if we have more than one user per component
+ if (Users[Lane])
+ return;
+
+ Users[Lane] = *I;
+ Writemask |= 1 << Lane;
+ }
+
+ // Abort if all components are used
+ if (Writemask == 0xf)
+ return;
+
+ // Adjust the writemask in the node
+ std::vector<SDValue> Ops;
+ Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32));
+ for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
+ Ops.push_back(Node->getOperand(i));
+ Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());
+
+ // If we only got one lane, replace it with a copy
+ if (Writemask == (1U << Lane)) {
+ SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
+ SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ DebugLoc(), Users[Lane]->getValueType(0),
+ SDValue(Node, 0), RC);
+ DAG.ReplaceAllUsesWith(Users[Lane], Copy);
+ return;
+ }
+
+ // Update the users of the node with the new indices
+ for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
+
+ SDNode *User = Users[i];
+ if (!User)
+ continue;
+
+ SDValue Op = DAG.getTargetConstant(Idx, MVT::i32);
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
+
+ switch (Idx) {
+ default: break;
+ case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
+ case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
+ case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
+ }
+ }
+}
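+// Worked example: if only the .x and .w lanes of an image sample are
+// extracted, Writemask becomes 0x9, the dmask operand is rewritten to 9,
+// and the two surviving EXTRACT_SUBREG users are renumbered to sub0 and
+// sub1 to match the packed layout a two-bit dmask produces.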
+
+/// \brief Fold the instructions after selecting them
+SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
+ SelectionDAG &DAG) const {
+
+ if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1)
+ adjustWritemask(Node, DAG);
+
+ return foldOperands(Node, DAG);
+}
+
+/// \brief Assign the register class depending on the number of
+/// bits set in the writemask
+void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+ SDNode *Node) const {
+ if (AMDGPU::isMIMG(MI->getOpcode()) == -1)
+ return;
+
+ unsigned VReg = MI->getOperand(0).getReg();
+ unsigned Writemask = MI->getOperand(1).getImm();
+ unsigned BitsSet = 0;
+ for (unsigned i = 0; i < 4; ++i)
+ BitsSet += Writemask & (1 << i) ? 1 : 0;
+
+ const TargetRegisterClass *RC;
+ switch (BitsSet) {
+ default: return;
+ case 1: RC = &AMDGPU::VReg_32RegClass; break;
+ case 2: RC = &AMDGPU::VReg_64RegClass; break;
+ case 3: RC = &AMDGPU::VReg_96RegClass; break;
+ }
+
+ MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ MRI.setRegClass(VReg, RC);
}
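The BitsSet loop above is a hand-rolled population count: a writemask of 0x6,
for instance, has two bits set and selects VReg_64. Assuming this tree's
llvm/Support/MathExtras.h still provides CountPopulation_32, an equivalent
one-liner would be:

    // Count the lanes enabled in the 4-bit writemask.
    unsigned BitsSet = CountPopulation_32(Writemask & 0xf);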
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index 5ad2f40..de637be 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -24,6 +24,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
const SIInstrInfo * TII;
const TargetRegisterInfo * TRI;
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -33,6 +34,9 @@ class SITargetLowering : public AMDGPUTargetLowering {
void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
unsigned RegClass, bool &ScalarSlotUsed) const;
+ SDNode *foldOperands(MachineSDNode *N, SelectionDAG &DAG) const;
+ void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+
public:
SITargetLowering(TargetMachine &tm);
@@ -49,6 +53,8 @@ public:
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
+ virtual void AdjustInstrPostInstrSelection(MachineInstr *MI,
+ SDNode *Node) const;
int32_t analyzeImmediate(const SDNode *N) const;
};
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 3891ddb..f737ddd 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -284,33 +284,33 @@ let Uses = [EXEC] in {
class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
Enc64<outs, ins, asm, pattern> {
- bits<8> VDATA;
- bits<12> OFFSET;
- bits<1> OFFEN;
- bits<1> IDXEN;
- bits<1> GLC;
- bits<1> ADDR64;
- bits<1> LDS;
- bits<8> VADDR;
- bits<7> SRSRC;
- bits<1> SLC;
- bits<1> TFE;
- bits<8> SOFFSET;
-
- let Inst{11-0} = OFFSET;
- let Inst{12} = OFFEN;
- let Inst{13} = IDXEN;
- let Inst{14} = GLC;
- let Inst{15} = ADDR64;
- let Inst{16} = LDS;
+ bits<12> offset;
+ bits<1> offen;
+ bits<1> idxen;
+ bits<1> glc;
+ bits<1> addr64;
+ bits<1> lds;
+ bits<8> vaddr;
+ bits<8> vdata;
+ bits<7> srsrc;
+ bits<1> slc;
+ bits<1> tfe;
+ bits<8> soffset;
+
+ let Inst{11-0} = offset;
+ let Inst{12} = offen;
+ let Inst{13} = idxen;
+ let Inst{14} = glc;
+ let Inst{15} = addr64;
+ let Inst{16} = lds;
let Inst{24-18} = op;
let Inst{31-26} = 0x38; //encoding
- let Inst{39-32} = VADDR;
- let Inst{47-40} = VDATA;
- let Inst{52-48} = SRSRC{6-2};
- let Inst{54} = SLC;
- let Inst{55} = TFE;
- let Inst{63-56} = SOFFSET;
+ let Inst{39-32} = vaddr;
+ let Inst{47-40} = vdata;
+ let Inst{52-48} = srsrc{6-2};
+ let Inst{54} = slc;
+ let Inst{55} = tfe;
+ let Inst{63-56} = soffset;
let VM_CNT = 1;
let EXP_CNT = 1;
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 0bfcef5..9a04c60 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -58,6 +58,10 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
};
+ const int16_t Sub0_2[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0
+ };
+
const int16_t Sub0_1[] = {
AMDGPU::sub0, AMDGPU::sub1, 0
};
@@ -125,6 +129,11 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opcode = AMDGPU::V_MOV_B32_e32;
SubIndices = Sub0_1;
+ } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
+ assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
+ Opcode = AMDGPU::V_MOV_B32_e32;
+ SubIndices = Sub0_2;
+
} else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
AMDGPU::SReg_128RegClass.contains(SrcReg));
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index d4e60e5..87eff4d 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -80,6 +80,7 @@ namespace AMDGPU {
int getVOPe64(uint16_t Opcode);
int getCommuteRev(uint16_t Opcode);
int getCommuteOrig(uint16_t Opcode);
+ int isMIMG(uint16_t Opcode);
} // End namespace AMDGPU
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 617f0b8..c8aecb7 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -26,6 +26,10 @@ def HI32 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() >> 32, MVT::i32);
}]>;
+def SIbuffer_store : SDNode<"AMDGPUISD::BUFFER_STORE",
+ SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
+ [SDNPHasChain, SDNPMayStore]>;
+
def IMM8bitDWORD : ImmLeaf <
i32, [{
return (Imm & ~0x3FC) == 0;
@@ -255,14 +259,14 @@ multiclass VOPC_64 <bits<8> op, string opName,
class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
op, (outs VReg_32:$dst),
(ins VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2,
- i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg),
+ InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
>, VOP <opName>;
class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
op, (outs VReg_64:$dst),
(ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2,
- i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg),
+ InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
>, VOP <opName>;
@@ -285,17 +289,39 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU
class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF <
op,
- (outs regClass:$dst),
+ (outs regClass:$vdata),
(ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
i1imm:$lds, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc,
i1imm:$tfe, SSrc_32:$soffset),
- asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, "
+ asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, "
#"$lds, $vaddr, $srsrc, $slc, $tfe, $soffset",
[]> {
let mayLoad = 1;
let mayStore = 0;
}
+class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
+ ValueType VT> :
+ MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr),
+ name#" $vdata, $srsrc + $vaddr",
+ [(SIbuffer_store (VT vdataClass:$vdata), (i128 SReg_128:$srsrc),
+ (i64 VReg_64:$vaddr))]> {
+
+ let mayLoad = 0;
+ let mayStore = 1;
+
+ // Encoding
+ let offset = 0;
+ let offen = 0;
+ let idxen = 0;
+ let glc = 0;
+ let addr64 = 1;
+ let lds = 0;
+ let slc = 0;
+ let tfe = 0;
+ let soffset = 128; // ZERO
+}
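+// The fixed fields above select ADDR64 addressing: the 64-bit $vaddr carries
+// the whole address, the resource descriptor only supplies the data format,
+// and soffset = 128 is the encoding of the inline constant zero.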
+
class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
op,
(outs regClass:$dst),
@@ -309,7 +335,22 @@ class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF
let mayStore = 0;
}
-class MIMG_Load_Helper <bits<7> op, string asm> : MIMG <
+class MIMG_NoSampler_Helper <bits<7> op, string asm> : MIMG <
+ op,
+ (outs VReg_128:$vdata),
+ (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
+ i1imm:$tfe, i1imm:$lwe, i1imm:$slc, unknown:$vaddr,
+ SReg_256:$srsrc),
+ asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
+ #" $tfe, $lwe, $slc, $vaddr, $srsrc",
+ []> {
+ let SSAMP = 0;
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasPostISelHook = 1;
+}
+
+class MIMG_Sampler_Helper <bits<7> op, string asm> : MIMG <
op,
(outs VReg_128:$vdata),
(ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
@@ -320,6 +361,7 @@ class MIMG_Load_Helper <bits<7> op, string asm> : MIMG <
[]> {
let mayLoad = 1;
let mayStore = 0;
+ let hasPostISelHook = 1;
}
//===----------------------------------------------------------------------===//
@@ -353,4 +395,13 @@ def getCommuteOrig : InstrMapping {
let ValueCols = [["1"]];
}
+// Test if the supplied opcode is an MIMG instruction
+def isMIMG : InstrMapping {
+ let FilterClass = "MIMG";
+ let RowFields = ["Inst"];
+ let ColFields = ["Size"];
+ let KeyCol = ["8"];
+ let ValueCols = [["8"]];
+}
+
include "SIInstructions.td"
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 4f734f9..0d50c5d 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -108,7 +108,7 @@ VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1
def S_CMPK_EQ_I32 : SOPK <
0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1),
"S_CMPK_EQ_I32",
- [(set SCCReg:$dst, (setcc SReg_32:$src0, imm:$src1, SETEQ))]
+ [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))]
>;
*/
@@ -408,8 +408,14 @@ def BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2",
def BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
-//def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>;
-//def BUFFER_STORE_DWORDX2 : MUBUF_DWORDX2 <0x0000001d, "BUFFER_STORE_DWORDX2", []>;
+
+def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
+ 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32
+>;
+
+def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
+ 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, i64
+>;
//def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>;
//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
@@ -489,7 +495,7 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>;
-//def IMAGE_LOAD_MIP : MIMG_NoPattern_ <"IMAGE_LOAD_MIP", 0x00000001>;
+def IMAGE_LOAD_MIP : MIMG_NoSampler_Helper <0x00000001, "IMAGE_LOAD_MIP">;
//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>;
//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>;
@@ -498,7 +504,7 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>;
//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>;
//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>;
-//def IMAGE_GET_RESINFO : MIMG_NoPattern_ <"IMAGE_GET_RESINFO", 0x0000000e>;
+def IMAGE_GET_RESINFO : MIMG_NoSampler_Helper <0x0000000e, "IMAGE_GET_RESINFO">;
//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>;
//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>;
//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>;
@@ -516,20 +522,20 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>;
//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>;
//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>;
-def IMAGE_SAMPLE : MIMG_Load_Helper <0x00000020, "IMAGE_SAMPLE">;
+def IMAGE_SAMPLE : MIMG_Sampler_Helper <0x00000020, "IMAGE_SAMPLE">;
//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>;
-def IMAGE_SAMPLE_D : MIMG_Load_Helper <0x00000022, "IMAGE_SAMPLE_D">;
+def IMAGE_SAMPLE_D : MIMG_Sampler_Helper <0x00000022, "IMAGE_SAMPLE_D">;
//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>;
-def IMAGE_SAMPLE_L : MIMG_Load_Helper <0x00000024, "IMAGE_SAMPLE_L">;
-def IMAGE_SAMPLE_B : MIMG_Load_Helper <0x00000025, "IMAGE_SAMPLE_B">;
+def IMAGE_SAMPLE_L : MIMG_Sampler_Helper <0x00000024, "IMAGE_SAMPLE_L">;
+def IMAGE_SAMPLE_B : MIMG_Sampler_Helper <0x00000025, "IMAGE_SAMPLE_B">;
//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>;
//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>;
-def IMAGE_SAMPLE_C : MIMG_Load_Helper <0x00000028, "IMAGE_SAMPLE_C">;
+def IMAGE_SAMPLE_C : MIMG_Sampler_Helper <0x00000028, "IMAGE_SAMPLE_C">;
//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>;
//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>;
//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>;
-def IMAGE_SAMPLE_C_L : MIMG_Load_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">;
-def IMAGE_SAMPLE_C_B : MIMG_Load_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">;
+def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">;
+def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">;
//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>;
//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>;
//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>;
@@ -594,12 +600,14 @@ defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>;
//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>;
//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>;
defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32",
- [(set VReg_32:$dst, (sint_to_fp VSrc_32:$src0))]
+ [(set f32:$dst, (sint_to_fp i32:$src0))]
+>;
+defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32",
+ [(set f32:$dst, (uint_to_fp i32:$src0))]
>;
-//defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>;
-//defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>;
+defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>;
defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
- [(set (i32 VReg_32:$dst), (fp_to_sint VSrc_32:$src0))]
+ [(set i32:$dst, (fp_to_sint f32:$src0))]
>;
defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>;
@@ -616,35 +624,37 @@ defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>;
//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>;
defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32",
- [(set VReg_32:$dst, (AMDGPUfract VSrc_32:$src0))]
+ [(set f32:$dst, (AMDGPUfract f32:$src0))]
+>;
+defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32",
+ [(set f32:$dst, (int_AMDGPU_trunc f32:$src0))]
>;
-defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>;
defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32",
- [(set VReg_32:$dst, (fceil VSrc_32:$src0))]
+ [(set f32:$dst, (fceil f32:$src0))]
>;
defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32",
- [(set VReg_32:$dst, (frint VSrc_32:$src0))]
+ [(set f32:$dst, (frint f32:$src0))]
>;
defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32",
- [(set VReg_32:$dst, (ffloor VSrc_32:$src0))]
+ [(set f32:$dst, (ffloor f32:$src0))]
>;
defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32",
- [(set VReg_32:$dst, (fexp2 VSrc_32:$src0))]
+ [(set f32:$dst, (fexp2 f32:$src0))]
>;
defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>;
defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32",
- [(set VReg_32:$dst, (flog2 VSrc_32:$src0))]
+ [(set f32:$dst, (flog2 f32:$src0))]
>;
defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>;
defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>;
defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",
- [(set VReg_32:$dst, (fdiv FP_ONE, VSrc_32:$src0))]
+ [(set f32:$dst, (fdiv FP_ONE, f32:$src0))]
>;
defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>;
defm V_RSQ_LEGACY_F32 : VOP1_32 <
0x0000002d, "V_RSQ_LEGACY_F32",
- [(set VReg_32:$dst, (int_AMDGPU_rsq VSrc_32:$src0))]
+ [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))]
>;
defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>;
defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>;
@@ -787,14 +797,13 @@ def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst),
(ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2,
InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
"V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg",
- [(set (i32 VReg_32:$dst), (select (i1 SSrc_64:$src2),
- VSrc_32:$src1, VSrc_32:$src0))]
+ [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))]
>;
//f32 pattern for V_CNDMASK_B32_e64
def : Pat <
- (f32 (select (i1 SSrc_64:$src2), VSrc_32:$src1, VSrc_32:$src0)),
- (V_CNDMASK_B32_e64 VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2)
+ (f32 (select i1:$src2, f32:$src1, f32:$src0)),
+ (V_CNDMASK_B32_e64 $src0, $src1, $src2)
>;
defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>;
@@ -802,11 +811,11 @@ defm V_WRITELANE_B32 : VOP2_32 <0x00000002, "V_WRITELANE_B32", []>;
let isCommutable = 1 in {
defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32",
- [(set VReg_32:$dst, (fadd VSrc_32:$src0, VReg_32:$src1))]
+ [(set f32:$dst, (fadd f32:$src0, f32:$src1))]
>;
defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32",
- [(set VReg_32:$dst, (fsub VSrc_32:$src0, VReg_32:$src1))]
+ [(set f32:$dst, (fsub f32:$src0, f32:$src1))]
>;
defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", [], "V_SUB_F32">;
} // End isCommutable = 1
@@ -817,11 +826,11 @@ let isCommutable = 1 in {
defm V_MUL_LEGACY_F32 : VOP2_32 <
0x00000007, "V_MUL_LEGACY_F32",
- [(set VReg_32:$dst, (int_AMDGPU_mul VSrc_32:$src0, VReg_32:$src1))]
+ [(set f32:$dst, (int_AMDGPU_mul f32:$src0, f32:$src1))]
>;
defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32",
- [(set VReg_32:$dst, (fmul VSrc_32:$src0, VReg_32:$src1))]
+ [(set f32:$dst, (fmul f32:$src0, f32:$src1))]
>;
} // End isCommutable = 1
@@ -834,43 +843,51 @@ defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32",
let isCommutable = 1 in {
defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32",
- [(set VReg_32:$dst, (AMDGPUfmin VSrc_32:$src0, VReg_32:$src1))]
+ [(set f32:$dst, (AMDGPUfmin f32:$src0, f32:$src1))]
>;
defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32",
- [(set VReg_32:$dst, (AMDGPUfmax VSrc_32:$src0, VReg_32:$src1))]
+ [(set f32:$dst, (AMDGPUfmax f32:$src0, f32:$src1))]
>;
defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>;
defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>;
-defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>;
-defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>;
-defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>;
-defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>;
+defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32",
+ [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]
+>;
+defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32",
+ [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]
+>;
+defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32",
+ [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]
+>;
+defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32",
+ [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]
+>;
defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32",
- [(set VReg_32:$dst, (srl VSrc_32:$src0, (i32 VReg_32:$src1)))]
+ [(set i32:$dst, (srl i32:$src0, i32:$src1))]
>;
defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">;
defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32",
- [(set VReg_32:$dst, (sra VSrc_32:$src0, (i32 VReg_32:$src1)))]
+ [(set i32:$dst, (sra i32:$src0, i32:$src1))]
>;
defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">;
defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
- [(set VReg_32:$dst, (shl VSrc_32:$src0, (i32 VReg_32:$src1)))]
+ [(set i32:$dst, (shl i32:$src0, i32:$src1))]
>;
defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">;
defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
- [(set VReg_32:$dst, (and VSrc_32:$src0, VReg_32:$src1))]
+ [(set i32:$dst, (and i32:$src0, i32:$src1))]
>;
defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32",
- [(set VReg_32:$dst, (or VSrc_32:$src0, VReg_32:$src1))]
+ [(set i32:$dst, (or i32:$src0, i32:$src1))]
>;
defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32",
- [(set VReg_32:$dst, (xor VSrc_32:$src0, VReg_32:$src1))]
+ [(set i32:$dst, (xor i32:$src0, i32:$src1))]
>;
} // End isCommutable = 1
@@ -885,11 +902,11 @@ defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
- [(set VReg_32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
+ [(set i32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
>;
defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32",
- [(set VReg_32:$dst, (sub (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
+ [(set i32:$dst, (sub i32:$src0, i32:$src1))]
>;
defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], "V_SUB_I32">;
@@ -905,7 +922,7 @@ defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>;
////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>;
////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>;
defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32",
- [(set VReg_32:$dst, (int_SI_packf16 VSrc_32:$src0, VReg_32:$src1))]
+ [(set i32:$dst, (int_SI_packf16 f32:$src0, f32:$src1))]
>;
////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>;
////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>;
@@ -942,6 +959,7 @@ def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
+defm : BFIPatterns <V_BFI_B32>;
def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>;
def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>;
//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
@@ -983,18 +1001,18 @@ def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
} // isCommutable = 1
def : Pat <
- (mul VSrc_32:$src0, VReg_32:$src1),
- (V_MUL_LO_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0)
+ (mul i32:$src0, i32:$src1),
+ (V_MUL_LO_I32 $src0, $src1, (i32 0))
>;
def : Pat <
- (mulhu VSrc_32:$src0, VReg_32:$src1),
- (V_MUL_HI_U32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0)
+ (mulhu i32:$src0, i32:$src1),
+ (V_MUL_HI_U32 $src0, $src1, (i32 0))
>;
def : Pat <
- (mulhs VSrc_32:$src0, VReg_32:$src1),
- (V_MUL_HI_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0)
+ (mulhs i32:$src0, i32:$src1),
+ (V_MUL_HI_I32 $src0, $src1, (i32 0))
>;
def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
@@ -1019,34 +1037,27 @@ def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", []>;
def S_CSELECT_B32 : SOP2 <
0x0000000a, (outs SReg_32:$dst),
(ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32",
- [(set (i32 SReg_32:$dst), (select (i1 SCCReg:$scc),
- SReg_32:$src0, SReg_32:$src1))]
+ []
>;
def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>;
-// f32 pattern for S_CSELECT_B32
-def : Pat <
- (f32 (select (i1 SCCReg:$scc), SReg_32:$src0, SReg_32:$src1)),
- (S_CSELECT_B32 SReg_32:$src0, SReg_32:$src1, SCCReg:$scc)
->;
-
def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>;
def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64",
- [(set SReg_64:$dst, (i64 (and SSrc_64:$src0, SSrc_64:$src1)))]
+ [(set i64:$dst, (and i64:$src0, i64:$src1))]
>;
def : Pat <
- (i1 (and SSrc_64:$src0, SSrc_64:$src1)),
- (S_AND_B64 SSrc_64:$src0, SSrc_64:$src1)
+ (i1 (and i1:$src0, i1:$src1)),
+ (S_AND_B64 $src0, $src1)
>;
def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>;
def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>;
def : Pat <
- (i1 (or SSrc_64:$src0, SSrc_64:$src1)),
- (S_OR_B64 SSrc_64:$src0, SSrc_64:$src1)
+ (i1 (or i1:$src0, i1:$src1)),
+ (S_OR_B64 $src0, $src1)
>;
def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>;
def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>;
@@ -1097,14 +1108,14 @@ def SI_IF : InstSI <
(outs SReg_64:$dst),
(ins SReg_64:$vcc, brtarget:$target),
"SI_IF $dst, $vcc, $target",
- [(set SReg_64:$dst, (int_SI_if SReg_64:$vcc, bb:$target))]
+ [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))]
>;
def SI_ELSE : InstSI <
(outs SReg_64:$dst),
(ins SReg_64:$src, brtarget:$target),
"SI_ELSE $dst, $src, $target",
- [(set SReg_64:$dst, (int_SI_else SReg_64:$src, bb:$target))]> {
+ [(set i64:$dst, (int_SI_else i64:$src, bb:$target))]> {
let Constraints = "$src = $dst";
}
@@ -1113,7 +1124,7 @@ def SI_LOOP : InstSI <
(outs),
(ins SReg_64:$saved, brtarget:$target),
"SI_LOOP $saved, $target",
- [(int_SI_loop SReg_64:$saved, bb:$target)]
+ [(int_SI_loop i64:$saved, bb:$target)]
>;
} // end isBranch = 1, isTerminator = 1
@@ -1122,35 +1133,35 @@ def SI_BREAK : InstSI <
(outs SReg_64:$dst),
(ins SReg_64:$src),
"SI_ELSE $dst, $src",
- [(set SReg_64:$dst, (int_SI_break SReg_64:$src))]
+ [(set i64:$dst, (int_SI_break i64:$src))]
>;
def SI_IF_BREAK : InstSI <
(outs SReg_64:$dst),
(ins SReg_64:$vcc, SReg_64:$src),
"SI_IF_BREAK $dst, $vcc, $src",
- [(set SReg_64:$dst, (int_SI_if_break SReg_64:$vcc, SReg_64:$src))]
+ [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))]
>;
def SI_ELSE_BREAK : InstSI <
(outs SReg_64:$dst),
(ins SReg_64:$src0, SReg_64:$src1),
"SI_ELSE_BREAK $dst, $src0, $src1",
- [(set SReg_64:$dst, (int_SI_else_break SReg_64:$src0, SReg_64:$src1))]
+ [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))]
>;
def SI_END_CF : InstSI <
(outs),
(ins SReg_64:$saved),
"SI_END_CF $saved",
- [(int_SI_end_cf SReg_64:$saved)]
+ [(int_SI_end_cf i64:$saved)]
>;
def SI_KILL : InstSI <
(outs),
(ins VReg_32:$src),
"SI_KIL $src",
- [(int_AMDGPU_kill VReg_32:$src)]
+ [(int_AMDGPU_kill f32:$src)]
>;
} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
@@ -1184,8 +1195,8 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
} // end IsCodeGenOnly, isPseudo
def : Pat<
- (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2),
- (V_CNDMASK_B32_e64 VReg_32:$src2, VReg_32:$src1, (V_CMP_GT_F32_e64 0, VReg_32:$src0))
+ (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2),
+ (V_CNDMASK_B32_e64 $src2, $src1, (V_CMP_GT_F32_e64 0, $src0))
>;
def : Pat <
@@ -1195,93 +1206,110 @@ def : Pat <
/* int_SI_vs_load_input */
def : Pat<
- (int_SI_vs_load_input SReg_128:$tlst, IMM12bit:$attr_offset,
- VReg_32:$buf_idx_vgpr),
+ (int_SI_vs_load_input v16i8:$tlst, IMM12bit:$attr_offset,
+ i32:$buf_idx_vgpr),
(BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0,
- VReg_32:$buf_idx_vgpr, SReg_128:$tlst,
- 0, 0, 0)
+ $buf_idx_vgpr, $tlst, 0, 0, 0)
>;
/* int_SI_export */
def : Pat <
(int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
- VReg_32:$src0,VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
+ f32:$src0, f32:$src1, f32:$src2, f32:$src3),
(EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm,
- VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3)
+ $src0, $src1, $src2, $src3)
>;
+/********** ======================= **********/
+/********** Image sampling patterns **********/
+/********** ======================= **********/
/* int_SI_sample for simple 1D texture lookup */
def : Pat <
- (int_SI_sample imm:$writemask, VReg_32:$addr,
- SReg_256:$rsrc, SReg_128:$sampler, imm),
- (IMAGE_SAMPLE imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_32:$addr,
- SReg_256:$rsrc, SReg_128:$sampler)
+ (int_SI_sample v1i32:$addr, v32i8:$rsrc, v16i8:$sampler, imm),
+ (IMAGE_SAMPLE 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
-class SamplePattern<Intrinsic name, MIMG opcode, RegisterClass addr_class,
- ValueType addr_type> : Pat <
- (name imm:$writemask, (addr_type addr_class:$addr),
- SReg_256:$rsrc, SReg_128:$sampler, imm),
- (opcode imm:$writemask, 0, 0, 0, 0, 0, 0, 0, addr_class:$addr,
- SReg_256:$rsrc, SReg_128:$sampler)
+class SamplePattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, imm),
+ (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
-class SampleRectPattern<Intrinsic name, MIMG opcode, RegisterClass addr_class,
- ValueType addr_type> : Pat <
- (name imm:$writemask, (addr_type addr_class:$addr),
- SReg_256:$rsrc, SReg_128:$sampler, TEX_RECT),
- (opcode imm:$writemask, 1, 0, 0, 0, 0, 0, 0, addr_class:$addr,
- SReg_256:$rsrc, SReg_128:$sampler)
+class SampleRectPattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_RECT),
+ (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
-class SampleArrayPattern<Intrinsic name, MIMG opcode, RegisterClass addr_class,
- ValueType addr_type> : Pat <
- (name imm:$writemask, (addr_type addr_class:$addr),
- SReg_256:$rsrc, SReg_128:$sampler, TEX_ARRAY),
- (opcode imm:$writemask, 0, 0, 1, 0, 0, 0, 0, addr_class:$addr,
- SReg_256:$rsrc, SReg_128:$sampler)
+class SampleArrayPattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_ARRAY),
+ (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
class SampleShadowPattern<Intrinsic name, MIMG opcode,
- RegisterClass addr_class, ValueType addr_type> : Pat <
- (name imm:$writemask, (addr_type addr_class:$addr),
- SReg_256:$rsrc, SReg_128:$sampler, TEX_SHADOW),
- (opcode imm:$writemask, 0, 0, 0, 0, 0, 0, 0, addr_class:$addr,
- SReg_256:$rsrc, SReg_128:$sampler)
+ ValueType vt> : Pat <
+ (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW),
+ (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
class SampleShadowArrayPattern<Intrinsic name, MIMG opcode,
- RegisterClass addr_class, ValueType addr_type> : Pat <
- (name imm:$writemask, (addr_type addr_class:$addr),
- SReg_256:$rsrc, SReg_128:$sampler, TEX_SHADOW_ARRAY),
- (opcode imm:$writemask, 0, 0, 1, 0, 0, 0, 0, addr_class:$addr,
- SReg_256:$rsrc, SReg_128:$sampler)
+ ValueType vt> : Pat <
+ (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW_ARRAY),
+ (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
/* int_SI_sample* for texture lookups consuming more address parameters */
-multiclass SamplePatterns<RegisterClass addr_class, ValueType addr_type> {
- def : SamplePattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>;
- def : SampleRectPattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>;
- def : SampleArrayPattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>;
- def : SampleShadowPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_class, addr_type>;
- def : SampleShadowArrayPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_class, addr_type>;
-
- def : SamplePattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_class, addr_type>;
- def : SampleArrayPattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_class, addr_type>;
- def : SampleShadowPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_class, addr_type>;
- def : SampleShadowArrayPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_class, addr_type>;
-
- def : SamplePattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_class, addr_type>;
- def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_class, addr_type>;
- def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_class, addr_type>;
- def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_class, addr_type>;
+multiclass SamplePatterns<ValueType addr_type> {
+ def : SamplePattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
+ def : SampleRectPattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
+ def : SampleArrayPattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
+ def : SampleShadowPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_type>;
+ def : SampleShadowArrayPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_type>;
+
+ def : SamplePattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_type>;
+ def : SampleArrayPattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_type>;
+ def : SampleShadowPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_type>;
+ def : SampleShadowArrayPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_type>;
+
+ def : SamplePattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>;
+ def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>;
+ def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
+ def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
+}
+
+defm : SamplePatterns<v2i32>;
+defm : SamplePatterns<v4i32>;
+defm : SamplePatterns<v8i32>;
+defm : SamplePatterns<v16i32>;
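+// Each defm above expands to the thirteen patterns of the multiclass for one
+// address-vector width, e.g. SamplePatterns<v4i32> covers lookups whose
+// address coordinates pack into four dwords.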
+
+/* int_SI_imageload for texture fetches consuming varying address parameters */
+class ImageLoadPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
+ (name addr_type:$addr, v32i8:$rsrc, imm),
+ (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc)
+>;
+
+class ImageLoadArrayPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
+ (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY),
+ (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
+>;
+
+multiclass ImageLoadPatterns<ValueType addr_type> {
+ def : ImageLoadPattern <int_SI_imageload, IMAGE_LOAD_MIP, addr_type>;
+ def : ImageLoadArrayPattern <int_SI_imageload, IMAGE_LOAD_MIP, addr_type>;
}
-defm : SamplePatterns<VReg_64, v2i32>;
-defm : SamplePatterns<VReg_128, v4i32>;
-defm : SamplePatterns<VReg_256, v8i32>;
-defm : SamplePatterns<VReg_512, v16i32>;
+defm : ImageLoadPatterns<v2i32>;
+defm : ImageLoadPatterns<v4i32>;
+
+/* Image resource information */
+def : Pat <
+ (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm),
+ (IMAGE_GET_RESINFO 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+>;
+
+def : Pat <
+ (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY),
+ (IMAGE_GET_RESINFO 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+>;
/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting **********/
@@ -1289,77 +1317,77 @@ defm : SamplePatterns<VReg_512, v16i32>;
foreach Index = 0-1 in {
def Extract_Element_v2i32_#Index : Extract_Element <
- i32, v2i32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+ i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v2i32_#Index : Insert_Element <
- i32, v2i32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+ i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v2f32_#Index : Extract_Element <
- f32, v2f32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+ f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v2f32_#Index : Insert_Element <
- f32, v2f32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+ f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
>;
}
foreach Index = 0-3 in {
def Extract_Element_v4i32_#Index : Extract_Element <
- i32, v4i32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+ i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v4i32_#Index : Insert_Element <
- i32, v4i32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+ i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v4f32_#Index : Extract_Element <
- f32, v4f32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+ f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v4f32_#Index : Insert_Element <
- f32, v4f32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+ f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
>;
}
foreach Index = 0-7 in {
def Extract_Element_v8i32_#Index : Extract_Element <
- i32, v8i32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+ i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v8i32_#Index : Insert_Element <
- i32, v8i32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+ i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v8f32_#Index : Extract_Element <
- f32, v8f32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+ f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v8f32_#Index : Insert_Element <
- f32, v8f32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+ f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
>;
}
foreach Index = 0-15 in {
def Extract_Element_v16i32_#Index : Extract_Element <
- i32, v16i32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+ i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v16i32_#Index : Insert_Element <
- i32, v16i32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+ i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Extract_Element_v16f32_#Index : Extract_Element <
- f32, v16f32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+ f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
>;
def Insert_Element_v16f32_#Index : Insert_Element <
- f32, v16f32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+ f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
>;
}
-def : Vector1_Build <v1i32, VReg_32, i32, VReg_32>;
-def : Vector2_Build <v2i32, VReg_64, i32, VReg_32>;
-def : Vector2_Build <v2f32, VReg_64, f32, VReg_32>;
-def : Vector4_Build <v4i32, VReg_128, i32, VReg_32>;
-def : Vector4_Build <v4f32, VReg_128, f32, VReg_32>;
-def : Vector8_Build <v8i32, VReg_256, i32, VReg_32>;
-def : Vector8_Build <v8f32, VReg_256, f32, VReg_32>;
-def : Vector16_Build <v16i32, VReg_512, i32, VReg_32>;
-def : Vector16_Build <v16f32, VReg_512, f32, VReg_32>;
+def : Vector1_Build <v1i32, i32, VReg_32>;
+def : Vector2_Build <v2i32, i32>;
+def : Vector2_Build <v2f32, f32>;
+def : Vector4_Build <v4i32, i32>;
+def : Vector4_Build <v4f32, f32>;
+def : Vector8_Build <v8i32, i32>;
+def : Vector8_Build <v8f32, f32>;
+def : Vector16_Build <v16i32, i32>;
+def : Vector16_Build <v16f32, f32>;
def : BitConvert <i32, f32, SReg_32>;
def : BitConvert <i32, f32, VReg_32>;
@@ -1372,20 +1400,20 @@ def : BitConvert <f32, i32, VReg_32>;
/********** =================== **********/
def : Pat <
- (int_AMDIL_clamp VReg_32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
- (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */),
+ (int_AMDIL_clamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
+ (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */),
0 /* ABS */, 1 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */)
>;
def : Pat <
- (fabs VReg_32:$src),
- (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */),
+ (fabs f32:$src),
+ (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */),
1 /* ABS */, 0 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */)
>;
def : Pat <
- (fneg VReg_32:$src),
- (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */),
+ (fneg f32:$src),
+ (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */),
0 /* ABS */, 0 /* CLAMP */, 0 /* OMOD */, 1 /* NEG */)
>;
@@ -1426,16 +1454,16 @@ def : Pat <
/********** ===================== **********/
def : Pat <
- (int_SI_fs_constant imm:$attr_chan, imm:$attr, M0Reg:$params),
- (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, M0Reg:$params)
+ (int_SI_fs_constant imm:$attr_chan, imm:$attr, i32:$params),
+ (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, $params)
>;
def : Pat <
- (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, VReg_64:$ij),
- (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG VReg_64:$ij, sub0),
- imm:$attr_chan, imm:$attr, M0Reg:$params),
- (EXTRACT_SUBREG VReg_64:$ij, sub1),
- imm:$attr_chan, imm:$attr, M0Reg:$params)
+ (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, v2i32:$ij),
+ (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG v2i32:$ij, sub0),
+ imm:$attr_chan, imm:$attr, i32:$params),
+ (EXTRACT_SUBREG $ij, sub1),
+ imm:$attr_chan, imm:$attr, $params)
>;
/********** ================== **********/
@@ -1443,101 +1471,111 @@ def : Pat <
/********** ================== **********/
/* llvm.AMDGPU.pow */
-def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32, VReg_32>;
+def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
def : Pat <
- (int_AMDGPU_div VSrc_32:$src0, VSrc_32:$src1),
- (V_MUL_LEGACY_F32_e32 VSrc_32:$src0, (V_RCP_LEGACY_F32_e32 VSrc_32:$src1))
+ (int_AMDGPU_div f32:$src0, f32:$src1),
+ (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1))
>;
def : Pat<
- (fdiv VSrc_32:$src0, VSrc_32:$src1),
- (V_MUL_F32_e32 VSrc_32:$src0, (V_RCP_F32_e32 VSrc_32:$src1))
+ (fdiv f32:$src0, f32:$src1),
+ (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1))
>;
def : Pat <
- (fcos VSrc_32:$src0),
- (V_COS_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
+ (fcos f32:$src0),
+ (V_COS_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
>;
def : Pat <
- (fsin VSrc_32:$src0),
- (V_SIN_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
+ (fsin f32:$src0),
+ (V_SIN_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
>;
def : Pat <
- (int_AMDGPU_cube VReg_128:$src),
+ (int_AMDGPU_cube v4f32:$src),
(INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
- (V_CUBETC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
- (EXTRACT_SUBREG VReg_128:$src, sub1),
- (EXTRACT_SUBREG VReg_128:$src, sub2),
- 0, 0, 0, 0), sub0),
- (V_CUBESC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
- (EXTRACT_SUBREG VReg_128:$src, sub1),
- (EXTRACT_SUBREG VReg_128:$src, sub2),
- 0, 0, 0, 0), sub1),
- (V_CUBEMA_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
- (EXTRACT_SUBREG VReg_128:$src, sub1),
- (EXTRACT_SUBREG VReg_128:$src, sub2),
- 0, 0, 0, 0), sub2),
- (V_CUBEID_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
- (EXTRACT_SUBREG VReg_128:$src, sub1),
- (EXTRACT_SUBREG VReg_128:$src, sub2),
- 0, 0, 0, 0), sub3)
+ (V_CUBETC_F32 (EXTRACT_SUBREG $src, sub0),
+ (EXTRACT_SUBREG $src, sub1),
+ (EXTRACT_SUBREG $src, sub2)),
+ sub0),
+ (V_CUBESC_F32 (EXTRACT_SUBREG $src, sub0),
+ (EXTRACT_SUBREG $src, sub1),
+ (EXTRACT_SUBREG $src, sub2)),
+ sub1),
+ (V_CUBEMA_F32 (EXTRACT_SUBREG $src, sub0),
+ (EXTRACT_SUBREG $src, sub1),
+ (EXTRACT_SUBREG $src, sub2)),
+ sub2),
+ (V_CUBEID_F32 (EXTRACT_SUBREG $src, sub0),
+ (EXTRACT_SUBREG $src, sub1),
+ (EXTRACT_SUBREG $src, sub2)),
+ sub3)
>;
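// The cube pattern assembles a v4f32 from the four cube helper opcodes:
// V_CUBETC/V_CUBESC compute the T and S face coordinates, V_CUBEMA the
// scaled major-axis magnitude, and V_CUBEID the face index.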
def : Pat <
- (i32 (sext (i1 SReg_64:$src0))),
- (V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0)
+ (i32 (sext i1:$src0)),
+ (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
>;
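// Sign-extending an i1 held in a condition mask materializes 0 or -1 by
// selecting between the two immediates with V_CNDMASK per active lane.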
// 1. Offset as an 8-bit DWORD immediate
def : Pat <
- (int_SI_load_const SReg_128:$sbase, IMM8bitDWORD:$offset),
- (S_BUFFER_LOAD_DWORD_IMM SReg_128:$sbase, IMM8bitDWORD:$offset)
+ (int_SI_load_const v16i8:$sbase, IMM8bitDWORD:$offset),
+ (S_BUFFER_LOAD_DWORD_IMM $sbase, IMM8bitDWORD:$offset)
>;
// 2. Offset loaded in a 32-bit SGPR
def : Pat <
- (int_SI_load_const SReg_128:$sbase, imm:$offset),
- (S_BUFFER_LOAD_DWORD_SGPR SReg_128:$sbase, (S_MOV_B32 imm:$offset))
+ (int_SI_load_const v16i8:$sbase, imm:$offset),
+ (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
>;
// 3. Offset in a 32-bit VGPR
def : Pat <
- (int_SI_load_const SReg_128:$sbase, VReg_32:$voff),
- (BUFFER_LOAD_DWORD 0, 1, 0, 0, 0, 0, VReg_32:$voff, SReg_128:$sbase, 0, 0, 0)
+ (int_SI_load_const v16i8:$sbase, i32:$voff),
+ (BUFFER_LOAD_DWORD 0, 1, 0, 0, 0, 0, $voff, $sbase, 0, 0, 0)
+>;
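// Three-way selection for the constant-buffer load: an offset that fits the
// 8-bit DWORD-scaled immediate uses the IMM form, a larger uniform offset is
// first moved into an SGPR, and a per-lane (VGPR) offset has to fall back to
// a regular buffer load.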
+
+// The multiplication scales from [0,1] to the unsigned integer range
+def : Pat <
+ (AMDGPUurecip i32:$src0),
+ (V_CVT_U32_F32_e32
+ (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1,
+ (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;
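// Scalar equivalent of the pattern above (illustrative sketch only;
// CONST.FP_UINT_MAX_PLUS_1 is 2^32 encoded as a float):
//   uint32_t urecip(uint32_t x) {
//     return (uint32_t)(4294967296.0f * (1.0f / (float)x));
//   }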
/********** ================== **********/
/********** VOP3 Patterns **********/
/********** ================== **********/
-def : Pat <(f32 (fadd (fmul VSrc_32:$src0, VSrc_32:$src1), VSrc_32:$src2)),
- (V_MAD_F32 VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2,
- 0, 0, 0, 0)>;
+def : Pat <
+ (f32 (fadd (fmul f32:$src0, f32:$src1), f32:$src2)),
+ (V_MAD_F32 $src0, $src1, $src2)
+>;
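// A multiply followed by an add is combined into a single V_MAD_F32;
// as a scalar sketch: mad(a, b, c) == a * b + c.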
/********** ================== **********/
/********** SMRD Patterns **********/
/********** ================== **********/
multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
+
// 1. Offset as an 8-bit DWORD immediate
def : Pat <
- (constant_load (SIadd64bit32bit SReg_64:$sbase, IMM8bitDWORD:$offset)),
- (vt (Instr_IMM SReg_64:$sbase, IMM8bitDWORD:$offset))
+ (constant_load (SIadd64bit32bit i64:$sbase, IMM8bitDWORD:$offset)),
+ (vt (Instr_IMM $sbase, IMM8bitDWORD:$offset))
>;
// 2. Offset loaded in a 32-bit SGPR
def : Pat <
- (constant_load (SIadd64bit32bit SReg_64:$sbase, imm:$offset)),
- (vt (Instr_SGPR SReg_64:$sbase, (S_MOV_B32 imm:$offset)))
+ (constant_load (SIadd64bit32bit i64:$sbase, imm:$offset)),
+ (vt (Instr_SGPR $sbase, (S_MOV_B32 imm:$offset)))
>;
// 3. No offset at all
def : Pat <
- (constant_load SReg_64:$sbase),
- (vt (Instr_IMM SReg_64:$sbase, 0))
+ (constant_load i64:$sbase),
+ (vt (Instr_IMM $sbase, 0))
>;
}
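// The three cases mirror the int_SI_load_const selection above and are
// instantiated once per SMRD load width below.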
@@ -1550,45 +1588,37 @@ defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
/**********   Indirect addressing   **********/
/********** ====================== **********/
-multiclass SI_INDIRECT_Pattern <RegisterClass rc, ValueType vt,
- SI_INDIRECT_DST IndDst> {
+multiclass SI_INDIRECT_Pattern <ValueType vt, SI_INDIRECT_DST IndDst> {
+
// 1. Extract with offset
def : Pat<
- (vector_extract (vt rc:$vec),
- (i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
- ),
- (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off))
+ (vector_extract vt:$vec, (i64 (zext (add i32:$idx, imm:$off)))),
+ (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
>;
// 2. Extract without offset
def : Pat<
- (vector_extract (vt rc:$vec),
- (i64 (zext (i32 VReg_32:$idx)))
- ),
- (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0))
+ (vector_extract vt:$vec, (i64 (zext i32:$idx))),
+ (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
>;
// 3. Insert with offset
def : Pat<
- (vector_insert (vt rc:$vec), (f32 VReg_32:$val),
- (i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
- ),
- (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off, VReg_32:$val))
+ (vector_insert vt:$vec, f32:$val, (i64 (zext (add i32:$idx, imm:$off)))),
+ (IndDst (IMPLICIT_DEF), $vec, $idx, imm:$off, $val)
>;
// 4. Insert without offset
def : Pat<
- (vector_insert (vt rc:$vec), (f32 VReg_32:$val),
- (i64 (zext (i32 VReg_32:$idx)))
- ),
- (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0, VReg_32:$val))
+ (vector_insert vt:$vec, f32:$val, (i64 (zext i32:$idx))),
+ (IndDst (IMPLICIT_DEF), $vec, $idx, 0, $val)
>;
}
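// Dynamic vector indexing: extract/insert through a run-time i32 index is
// funneled into the SI_INDIRECT_SRC/DST pseudos, which are presumably
// expanded later into M0-relative V_MOVREL moves.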
-defm : SI_INDIRECT_Pattern <VReg_64, v2f32, SI_INDIRECT_DST_V2>;
-defm : SI_INDIRECT_Pattern <VReg_128, v4f32, SI_INDIRECT_DST_V4>;
-defm : SI_INDIRECT_Pattern <VReg_256, v8f32, SI_INDIRECT_DST_V8>;
-defm : SI_INDIRECT_Pattern <VReg_512, v16f32, SI_INDIRECT_DST_V16>;
+defm : SI_INDIRECT_Pattern <v2f32, SI_INDIRECT_DST_V2>;
+defm : SI_INDIRECT_Pattern <v4f32, SI_INDIRECT_DST_V4>;
+defm : SI_INDIRECT_Pattern <v8f32, SI_INDIRECT_DST_V8>;
+defm : SI_INDIRECT_Pattern <v16f32, SI_INDIRECT_DST_V16>;
/********** =============== **********/
/********** Conditions **********/
@@ -1596,12 +1626,18 @@ defm : SI_INDIRECT_Pattern <VReg_512, v16f32, SI_INDIRECT_DST_V16>;
def : Pat<
(i1 (setcc f32:$src0, f32:$src1, SETO)),
- (V_CMP_O_F32_e64 f32:$src0, f32:$src1)
+ (V_CMP_O_F32_e64 $src0, $src1)
>;
def : Pat<
(i1 (setcc f32:$src0, f32:$src1, SETUO)),
- (V_CMP_U_F32_e64 f32:$src0, f32:$src1)
+ (V_CMP_U_F32_e64 $src0, $src1)
>;
+//============================================================================//
+// Miscellaneous Optimization Patterns
+//============================================================================//
+
+def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e32>;
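+// Matches the SHA-256 majority function Ma(x, y, z) = (x & z) | (y & (x | z))
+// to a single bitfield insert; as a scalar sketch (illustrative only):
+//   ma(x, y, z) == v_bfi_b32(x ^ y, z, y)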
+
} // End isSI predicate
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 0af378e..224cd2f 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -19,12 +19,16 @@ let TargetPrefix = "SI", isTarget = 1 in {
def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
- class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_SI_sample : Sample;
def int_SI_sampleb : Sample;
def int_SI_samplel : Sample;
+ def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
+ def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
/* Interpolation Intrinsics */
def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 4f14931..244d4c00 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -94,6 +94,12 @@ def VGPR_64 : RegisterTuples<[sub0, sub1],
[(add (trunc VGPR_32, 255)),
(add (shl VGPR_32, 1))]>;
+// VGPR 96-bit registers
+def VGPR_96 : RegisterTuples<[sub0, sub1, sub2],
+ [(add (trunc VGPR_32, 254)),
+ (add (shl VGPR_32, 1)),
+ (add (shl VGPR_32, 2))]>;
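+// Each 96-bit tuple overlaps the next at a stride of one VGPR; the sub0 list
+// is truncated to 254 entries so that sub2 (start + 2) stays within the
+// 256-register VGPR file.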
+
// VGPR 128-bit registers
def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
[(add (trunc VGPR_32, 253)),
@@ -151,7 +157,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [i64, i1], 64,
(add SGPR_64, VCCReg, EXECReg)
>;
-def SReg_128 : RegisterClass<"AMDGPU", [v16i8], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v16i8, i128], 128, (add SGPR_128)>;
def SReg_256 : RegisterClass<"AMDGPU", [v32i8], 256, (add SGPR_256)>;
@@ -162,6 +168,10 @@ def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>;
def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
+def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
+ let Size = 96;
+}
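+// There is no 96-bit ValueType, so the class is marked untyped with an
+// explicit size; presumably for three-dword operands such as MIMG sample
+// addresses.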
+
def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 256, (add VGPR_256)>;